diff --git a/src/backend/tsearch/ts_selfuncs.c b/src/backend/tsearch/ts_selfuncs.c index 68d67c7a4e66cff45c4361ca67f751e4dce959e9..3948ef9367789ff403815153cce53892b06d1876 100644 --- a/src/backend/tsearch/ts_selfuncs.c +++ b/src/backend/tsearch/ts_selfuncs.c @@ -7,7 +7,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/tsearch/ts_selfuncs.c,v 1.8 2010/07/31 03:27:40 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/tsearch/ts_selfuncs.c,v 1.9 2010/08/01 21:31:08 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -257,25 +257,23 @@ mcelem_tsquery_selec(TSQuery query, Datum *mcelem, int nmcelem, * * 1 - select(oper) in NOT nodes * - * freq[val] in VAL nodes, if the value is in MCELEM + * histogram-based estimation in prefix VAL nodes + * + * freq[val] in exact VAL nodes, if the value is in MCELEM * min(freq[MCELEM]) / 2 in VAL nodes, if it is not * * The MCELEM array is already sorted (see ts_typanalyze.c), so we can use * binary search for determining freq[MCELEM]. * * If we don't have stats for the tsvector, we still use this logic, - * except we always use DEFAULT_TS_MATCH_SEL for VAL nodes. This case - * is signaled by lookup == NULL. + * except we use default estimates for VAL nodes. This case is signaled + * by lookup == NULL. */ static Selectivity tsquery_opr_selec(QueryItem *item, char *operand, TextFreq *lookup, int length, float4 minfreq) { - LexemeKey key; - TextFreq *searchres; - Selectivity selec, - s1, - s2; + Selectivity selec; /* since this function recurses, it could be driven to stack overflow */ check_stack_depth(); @@ -283,10 +281,7 @@ tsquery_opr_selec(QueryItem *item, char *operand, if (item->type == QI_VAL) { QueryOperand *oper = (QueryOperand *) item; - - /* If no stats for the variable, use DEFAULT_TS_MATCH_SEL */ - if (lookup == NULL) - return (Selectivity) DEFAULT_TS_MATCH_SEL; + LexemeKey key; /* * Prepare the key for bsearch(). @@ -294,56 +289,115 @@ tsquery_opr_selec(QueryItem *item, char *operand, key.lexeme = operand + oper->distance; key.length = oper->length; - searchres = (TextFreq *) bsearch(&key, lookup, length, - sizeof(TextFreq), - compare_lexeme_textfreq); - - if (searchres) + if (oper->prefix) { + /* Prefix match, ie the query item is lexeme:* */ + Selectivity matched, + allmcvs; + int i; + + /* + * Our strategy is to scan through the MCV list and add up the + * frequencies of the ones that match the prefix, thereby + * assuming that the MCVs are representative of the whole lexeme + * population in this respect. Compare histogram_selectivity(). + * + * This is only a good plan if we have a pretty fair number of + * MCVs available; we set the threshold at 100. If no stats or + * insufficient stats, arbitrarily use DEFAULT_TS_MATCH_SEL*4. + */ + if (lookup == NULL || length < 100) + return (Selectivity) (DEFAULT_TS_MATCH_SEL * 4); + + matched = allmcvs = 0; + for (i = 0; i < length; i++) + { + TextFreq *t = lookup + i; + int tlen = VARSIZE_ANY_EXHDR(t->element); + + if (tlen >= key.length && + strncmp(key.lexeme, VARDATA_ANY(t->element), + key.length) == 0) + matched += t->frequency; + allmcvs += t->frequency; + } + + if (allmcvs > 0) /* paranoia about zero divide */ + selec = matched / allmcvs; + else + selec = (Selectivity) (DEFAULT_TS_MATCH_SEL * 4); + /* - * The element is in MCELEM. Return precise selectivity (or at - * least as precise as ANALYZE could find out). + * In any case, never believe that a prefix match has selectivity + * less than DEFAULT_TS_MATCH_SEL. */ - return (Selectivity) searchres->frequency; + selec = Max(DEFAULT_TS_MATCH_SEL, selec); } else { - /* - * The element is not in MCELEM. Punt, but assume that the - * selectivity cannot be more than minfreq / 2. - */ - return (Selectivity) Min(DEFAULT_TS_MATCH_SEL, minfreq / 2); + /* Regular exact lexeme match */ + TextFreq *searchres; + + /* If no stats for the variable, use DEFAULT_TS_MATCH_SEL */ + if (lookup == NULL) + return (Selectivity) DEFAULT_TS_MATCH_SEL; + + searchres = (TextFreq *) bsearch(&key, lookup, length, + sizeof(TextFreq), + compare_lexeme_textfreq); + + if (searchres) + { + /* + * The element is in MCELEM. Return precise selectivity (or + * at least as precise as ANALYZE could find out). + */ + selec = searchres->frequency; + } + else + { + /* + * The element is not in MCELEM. Punt, but assume that the + * selectivity cannot be more than minfreq / 2. + */ + selec = Min(DEFAULT_TS_MATCH_SEL, minfreq / 2); + } } } - - /* Current TSQuery node is an operator */ - switch (item->qoperator.oper) + else { - case OP_NOT: - selec = 1.0 - tsquery_opr_selec(item + 1, operand, - lookup, length, minfreq); - break; - - case OP_AND: - s1 = tsquery_opr_selec(item + 1, operand, - lookup, length, minfreq); - s2 = tsquery_opr_selec(item + item->qoperator.left, operand, - lookup, length, minfreq); - selec = s1 * s2; - break; - - case OP_OR: - s1 = tsquery_opr_selec(item + 1, operand, - lookup, length, minfreq); - s2 = tsquery_opr_selec(item + item->qoperator.left, operand, - lookup, length, minfreq); - selec = s1 + s2 - s1 * s2; - break; - - default: - elog(ERROR, "unrecognized operator: %d", item->qoperator.oper); - selec = 0; /* keep compiler quiet */ - break; + /* Current TSQuery node is an operator */ + Selectivity s1, + s2; + + switch (item->qoperator.oper) + { + case OP_NOT: + selec = 1.0 - tsquery_opr_selec(item + 1, operand, + lookup, length, minfreq); + break; + + case OP_AND: + s1 = tsquery_opr_selec(item + 1, operand, + lookup, length, minfreq); + s2 = tsquery_opr_selec(item + item->qoperator.left, operand, + lookup, length, minfreq); + selec = s1 * s2; + break; + + case OP_OR: + s1 = tsquery_opr_selec(item + 1, operand, + lookup, length, minfreq); + s2 = tsquery_opr_selec(item + item->qoperator.left, operand, + lookup, length, minfreq); + selec = s1 + s2 - s1 * s2; + break; + + default: + elog(ERROR, "unrecognized operator: %d", item->qoperator.oper); + selec = 0; /* keep compiler quiet */ + break; + } } /* Clamp intermediate results to stay sane despite roundoff error */