diff --git a/src/backend/utils/adt/selfuncs.c b/src/backend/utils/adt/selfuncs.c index d06809e7675a370d2b25886ad80ab29628248c34..abef8abc3826ebbad00d7ed06dbabdfbdc0bdcd3 100644 --- a/src/backend/utils/adt/selfuncs.c +++ b/src/backend/utils/adt/selfuncs.c @@ -6575,6 +6575,20 @@ spgcostestimate(PG_FUNCTION_ARGS) PG_RETURN_VOID(); } + +/* + * Support routines for gincostestimate + */ + +typedef struct +{ + bool haveFullScan; + double partialEntries; + double exactEntries; + double searchEntries; + double arrayScans; +} GinQualCounts; + /* Find the index column matching "op"; return its index, or -1 if no match */ static int find_index_column(Node *op, IndexOptInfo *index) @@ -6590,6 +6604,277 @@ find_index_column(Node *op, IndexOptInfo *index) return -1; } +/* + * Estimate the number of index terms that need to be searched for while + * testing the given GIN query, and increment the counts in *counts + * appropriately. If the query is unsatisfiable, return false. + */ +static bool +gincost_pattern(IndexOptInfo *index, int indexcol, + Oid clause_op, Datum query, + GinQualCounts *counts) +{ + Oid extractProcOid; + int strategy_op; + Oid lefttype, + righttype; + int32 nentries = 0; + bool *partial_matches = NULL; + Pointer *extra_data = NULL; + bool *nullFlags = NULL; + int32 searchMode = GIN_SEARCH_MODE_DEFAULT; + int32 i; + + /* + * Get the operator's strategy number and declared input data types + * within the index opfamily. (We don't need the latter, but we use + * get_op_opfamily_properties because it will throw error if it fails + * to find a matching pg_amop entry.) + */ + get_op_opfamily_properties(clause_op, index->opfamily[indexcol], false, + &strategy_op, &lefttype, &righttype); + + /* + * GIN always uses the "default" support functions, which are those + * with lefttype == righttype == the opclass' opcintype (see + * IndexSupportInitialize in relcache.c). + */ + extractProcOid = get_opfamily_proc(index->opfamily[indexcol], + index->opcintype[indexcol], + index->opcintype[indexcol], + GIN_EXTRACTQUERY_PROC); + + if (!OidIsValid(extractProcOid)) + { + /* should not happen; throw same error as index_getprocinfo */ + elog(ERROR, "missing support function %d for attribute %d of index \"%s\"", + GIN_EXTRACTQUERY_PROC, indexcol + 1, + get_rel_name(index->indexoid)); + } + + OidFunctionCall7(extractProcOid, + query, + PointerGetDatum(&nentries), + UInt16GetDatum(strategy_op), + PointerGetDatum(&partial_matches), + PointerGetDatum(&extra_data), + PointerGetDatum(&nullFlags), + PointerGetDatum(&searchMode)); + + if (nentries <= 0 && searchMode == GIN_SEARCH_MODE_DEFAULT) + { + /* No match is possible */ + return false; + } + + for (i = 0; i < nentries; i++) + { + /* + * For partial match we haven't any information to estimate number of + * matched entries in index, so, we just estimate it as 100 + */ + if (partial_matches && partial_matches[i]) + counts->partialEntries += 100; + else + counts->exactEntries++; + + counts->searchEntries++; + } + + if (searchMode == GIN_SEARCH_MODE_INCLUDE_EMPTY) + { + /* Treat "include empty" like an exact-match item */ + counts->exactEntries++; + counts->searchEntries++; + } + else if (searchMode != GIN_SEARCH_MODE_DEFAULT) + { + /* It's GIN_SEARCH_MODE_ALL */ + counts->haveFullScan = true; + } + + return true; +} + +/* + * Estimate the number of index terms that need to be searched for while + * testing the given GIN index clause, and increment the counts in *counts + * appropriately. If the query is unsatisfiable, return false. + */ +static bool +gincost_opexpr(IndexOptInfo *index, OpExpr *clause, GinQualCounts *counts) +{ + Node *leftop = get_leftop((Expr *) clause); + Node *rightop = get_rightop((Expr *) clause); + Oid clause_op = clause->opno; + int indexcol; + Node *operand; + + /* Locate the operand being compared to the index column */ + if ((indexcol = find_index_column(leftop, index)) >= 0) + { + operand = rightop; + } + else if ((indexcol = find_index_column(rightop, index)) >= 0) + { + operand = leftop; + clause_op = get_commutator(clause_op); + } + else + { + elog(ERROR, "could not match index to operand"); + operand = NULL; /* keep compiler quiet */ + } + + if (IsA(operand, RelabelType)) + operand = (Node *) ((RelabelType *) operand)->arg; + + /* + * It's impossible to call extractQuery method for unknown operand. So + * unless operand is a Const we can't do much; just assume there will + * be one ordinary search entry from the operand at runtime. + */ + if (!IsA(operand, Const)) + { + counts->exactEntries++; + counts->searchEntries++; + return true; + } + + /* If Const is null, there can be no matches */ + if (((Const *) operand)->constisnull) + return false; + + /* Otherwise, apply extractQuery and get the actual term counts */ + return gincost_pattern(index, indexcol, clause_op, + ((Const *) operand)->constvalue, + counts); +} + +/* + * Estimate the number of index terms that need to be searched for while + * testing the given GIN index clause, and increment the counts in *counts + * appropriately. If the query is unsatisfiable, return false. + * + * A ScalarArrayOpExpr will give rise to N separate indexscans at runtime, + * each of which involves one value from the RHS array, plus all the + * non-array quals (if any). To model this, we average the counts across + * the RHS elements, and add the averages to the counts in *counts (which + * correspond to per-indexscan costs). We also multiply counts->arrayScans + * by N, causing gincostestimate to scale up its estimates accordingly. + */ +static bool +gincost_scalararrayopexpr(IndexOptInfo *index, ScalarArrayOpExpr *clause, + double numIndexEntries, + GinQualCounts *counts) +{ + Node *leftop = (Node *) linitial(clause->args); + Node *rightop = (Node *) lsecond(clause->args); + Oid clause_op = clause->opno; + int indexcol; + ArrayType *arrayval; + int16 elmlen; + bool elmbyval; + char elmalign; + int numElems; + Datum *elemValues; + bool *elemNulls; + GinQualCounts arraycounts; + int numPossible = 0; + int i; + + Assert(clause->useOr); + + /* index column must be on the left */ + if ((indexcol = find_index_column(leftop, index)) < 0) + elog(ERROR, "could not match index to operand"); + + if (IsA(rightop, RelabelType)) + rightop = (Node *) ((RelabelType *) rightop)->arg; + + /* + * It's impossible to call extractQuery method for unknown operand. So + * unless operand is a Const we can't do much; just assume there will + * be one ordinary search entry from each array entry at runtime, and + * fall back on a probably-bad estimate of the number of array entries. + */ + if (!IsA(rightop, Const)) + { + counts->exactEntries++; + counts->searchEntries++; + counts->arrayScans *= estimate_array_length(rightop); + return true; + } + + /* If Const is null, there can be no matches */ + if (((Const *) rightop)->constisnull) + return false; + + /* Otherwise, extract the array elements and iterate over them */ + arrayval = DatumGetArrayTypeP(((Const *) rightop)->constvalue); + get_typlenbyvalalign(ARR_ELEMTYPE(arrayval), + &elmlen, &elmbyval, &elmalign); + deconstruct_array(arrayval, + ARR_ELEMTYPE(arrayval), + elmlen, elmbyval, elmalign, + &elemValues, &elemNulls, &numElems); + + memset(&arraycounts, 0, sizeof(arraycounts)); + + for (i = 0; i < numElems; i++) + { + GinQualCounts elemcounts; + + /* NULL can't match anything, so ignore, as the executor will */ + if (elemNulls[i]) + continue; + + /* Otherwise, apply extractQuery and get the actual term counts */ + memset(&elemcounts, 0, sizeof(elemcounts)); + + if (gincost_pattern(index, indexcol, clause_op, elemValues[i], + &elemcounts)) + { + /* We ignore array elements that are unsatisfiable patterns */ + numPossible++; + + if (elemcounts.haveFullScan) + { + /* + * Full index scan will be required. We treat this as if + * every key in the index had been listed in the query; is + * that reasonable? + */ + elemcounts.partialEntries = 0; + elemcounts.exactEntries = numIndexEntries; + elemcounts.searchEntries = numIndexEntries; + } + arraycounts.partialEntries += elemcounts.partialEntries; + arraycounts.exactEntries += elemcounts.exactEntries; + arraycounts.searchEntries += elemcounts.searchEntries; + } + } + + if (numPossible == 0) + { + /* No satisfiable patterns in the array */ + return false; + } + + /* + * Now add the averages to the global counts. This will give us an + * estimate of the average number of terms searched for in each indexscan, + * including contributions from both array and non-array quals. + */ + counts->partialEntries += arraycounts.partialEntries / numPossible; + counts->exactEntries += arraycounts.exactEntries / numPossible; + counts->searchEntries += arraycounts.searchEntries / numPossible; + + counts->arrayScans *= numPossible; + + return true; +} + /* * GIN has search behavior completely different from other index types */ @@ -6613,17 +6898,15 @@ gincostestimate(PG_FUNCTION_ARGS) numDataPages, numPendingPages, numEntries; - bool haveFullScan = false; - double partialEntriesInQuals = 0.0; - double searchEntriesInQuals = 0.0; - double exactEntriesInQuals = 0.0; + GinQualCounts counts; + bool matchPossible; double entryPagesFetched, dataPagesFetched, dataPagesFetchedBySel; double qual_op_cost, qual_arg_cost, spc_random_page_cost, - num_scans; + outer_scans; QualCost index_qual_cost; Relation indexRel; GinStatsData ginStats; @@ -6709,199 +6992,113 @@ gincostestimate(PG_FUNCTION_ARGS) /* * Examine quals to estimate number of search entries & partial matches */ + memset(&counts, 0, sizeof(counts)); + counts.arrayScans = 1; + matchPossible = true; + foreach(l, indexQuals) { RestrictInfo *rinfo = (RestrictInfo *) lfirst(l); Expr *clause; - Node *leftop, - *rightop, - *operand; - Oid extractProcOid; - Oid clause_op; - int strategy_op; - Oid lefttype, - righttype; - int32 nentries = 0; - bool *partial_matches = NULL; - Pointer *extra_data = NULL; - bool *nullFlags = NULL; - int32 searchMode = GIN_SEARCH_MODE_DEFAULT; - int indexcol; Assert(IsA(rinfo, RestrictInfo)); clause = rinfo->clause; - Assert(IsA(clause, OpExpr)); - leftop = get_leftop(clause); - rightop = get_rightop(clause); - clause_op = ((OpExpr *) clause)->opno; - - if ((indexcol = find_index_column(leftop, index)) >= 0) - { - operand = rightop; - } - else if ((indexcol = find_index_column(rightop, index)) >= 0) - { - operand = leftop; - clause_op = get_commutator(clause_op); - } - else - { - elog(ERROR, "could not match index to operand"); - operand = NULL; /* keep compiler quiet */ - } - - if (IsA(operand, RelabelType)) - operand = (Node *) ((RelabelType *) operand)->arg; - - /* - * It's impossible to call extractQuery method for unknown operand. So - * unless operand is a Const we can't do much; just assume there will - * be one ordinary search entry from the operand at runtime. - */ - if (!IsA(operand, Const)) - { - searchEntriesInQuals++; - continue; - } - - /* If Const is null, there can be no matches */ - if (((Const *) operand)->constisnull) - { - *indexStartupCost = 0; - *indexTotalCost = 0; - *indexSelectivity = 0; - PG_RETURN_VOID(); - } - - /* - * Get the operator's strategy number and declared input data types - * within the index opfamily. (We don't need the latter, but we use - * get_op_opfamily_properties because it will throw error if it fails - * to find a matching pg_amop entry.) - */ - get_op_opfamily_properties(clause_op, index->opfamily[indexcol], false, - &strategy_op, &lefttype, &righttype); - - /* - * GIN always uses the "default" support functions, which are those - * with lefttype == righttype == the opclass' opcintype (see - * IndexSupportInitialize in relcache.c). - */ - extractProcOid = get_opfamily_proc(index->opfamily[indexcol], - index->opcintype[indexcol], - index->opcintype[indexcol], - GIN_EXTRACTQUERY_PROC); - - if (!OidIsValid(extractProcOid)) + if (IsA(clause, OpExpr)) { - /* should not happen; throw same error as index_getprocinfo */ - elog(ERROR, "missing support function %d for attribute %d of index \"%s\"", - GIN_EXTRACTQUERY_PROC, indexcol + 1, - get_rel_name(index->indexoid)); + matchPossible = gincost_opexpr(index, + (OpExpr *) clause, + &counts); + if (!matchPossible) + break; } - - OidFunctionCall7(extractProcOid, - ((Const *) operand)->constvalue, - PointerGetDatum(&nentries), - UInt16GetDatum(strategy_op), - PointerGetDatum(&partial_matches), - PointerGetDatum(&extra_data), - PointerGetDatum(&nullFlags), - PointerGetDatum(&searchMode)); - - if (nentries <= 0 && searchMode == GIN_SEARCH_MODE_DEFAULT) + else if (IsA(clause, ScalarArrayOpExpr)) { - /* No match is possible */ - *indexStartupCost = 0; - *indexTotalCost = 0; - *indexSelectivity = 0; - PG_RETURN_VOID(); + matchPossible = gincost_scalararrayopexpr(index, + (ScalarArrayOpExpr *) clause, + numEntries, + &counts); + if (!matchPossible) + break; } else { - int32 i; - - for (i = 0; i < nentries; i++) - { - /* - * For partial match we haven't any information to estimate - * number of matched entries in index, so, we just estimate it - * as 100 - */ - if (partial_matches && partial_matches[i]) - partialEntriesInQuals += 100; - else - exactEntriesInQuals++; - - searchEntriesInQuals++; - } + /* shouldn't be anything else for a GIN index */ + elog(ERROR, "unsupported GIN indexqual type: %d", + (int) nodeTag(clause)); } + } - if (searchMode == GIN_SEARCH_MODE_INCLUDE_EMPTY) - { - /* Treat "include empty" like an exact-match item */ - exactEntriesInQuals++; - searchEntriesInQuals++; - } - else if (searchMode != GIN_SEARCH_MODE_DEFAULT) - { - /* It's GIN_SEARCH_MODE_ALL */ - haveFullScan = true; - } + /* Fall out if there were any provably-unsatisfiable quals */ + if (!matchPossible) + { + *indexStartupCost = 0; + *indexTotalCost = 0; + *indexSelectivity = 0; + PG_RETURN_VOID(); } - if (haveFullScan || indexQuals == NIL) + if (counts.haveFullScan || indexQuals == NIL) { /* * Full index scan will be required. We treat this as if every key in * the index had been listed in the query; is that reasonable? */ - searchEntriesInQuals = numEntries; + counts.partialEntries = 0; + counts.exactEntries = numEntries; + counts.searchEntries = numEntries; } /* Will we have more than one iteration of a nestloop scan? */ if (outer_rel != NULL && outer_rel->rows > 1) - num_scans = outer_rel->rows; + outer_scans = outer_rel->rows; else - num_scans = 1; + outer_scans = 1; /* - * cost to begin scan, first of all, pay attention to pending list. + * Compute cost to begin scan, first of all, pay attention to pending list. */ entryPagesFetched = numPendingPages; /* * Estimate number of entry pages read. We need to do - * searchEntriesInQuals searches. Use a power function as it should be, + * counts.searchEntries searches. Use a power function as it should be, * but tuples on leaf pages usually is much greater. Here we include all * searches in entry tree, including search of first entry in partial * match algorithm */ - entryPagesFetched += ceil(searchEntriesInQuals * rint(pow(numEntryPages, 0.15))); + entryPagesFetched += ceil(counts.searchEntries * rint(pow(numEntryPages, 0.15))); /* * Add an estimate of entry pages read by partial match algorithm. It's a * scan over leaf pages in entry tree. We haven't any useful stats here, * so estimate it as proportion. */ - entryPagesFetched += ceil(numEntryPages * partialEntriesInQuals / numEntries); + entryPagesFetched += ceil(numEntryPages * counts.partialEntries / numEntries); /* * Partial match algorithm reads all data pages before doing actual scan, - * so it's a startup cost. Again, we havn't any useful stats here, so, + * so it's a startup cost. Again, we haven't any useful stats here, so, * estimate it as proportion */ - dataPagesFetched = ceil(numDataPages * partialEntriesInQuals / numEntries); + dataPagesFetched = ceil(numDataPages * counts.partialEntries / numEntries); - /* calculate cache effects */ - if (num_scans > 1 || searchEntriesInQuals > 1) + /* + * Calculate cache effects if more than one scan due to nestloops or array + * quals. The result is pro-rated per nestloop scan, but the array qual + * factor shouldn't be pro-rated (compare genericcostestimate). + */ + if (outer_scans > 1 || counts.arrayScans > 1) { + entryPagesFetched *= outer_scans * counts.arrayScans; entryPagesFetched = index_pages_fetched(entryPagesFetched, (BlockNumber) numEntryPages, numEntryPages, root); + entryPagesFetched /= outer_scans; + dataPagesFetched *= outer_scans * counts.arrayScans; dataPagesFetched = index_pages_fetched(dataPagesFetched, (BlockNumber) numDataPages, numDataPages, root); + dataPagesFetched /= outer_scans; } /* @@ -6910,8 +7107,12 @@ gincostestimate(PG_FUNCTION_ARGS) */ *indexStartupCost = (entryPagesFetched + dataPagesFetched) * spc_random_page_cost; - /* cost to scan data pages for each exact (non-partial) matched entry */ - dataPagesFetched = ceil(numDataPages * exactEntriesInQuals / numEntries); + /* + * Now we compute the number of data pages fetched while the scan proceeds. + */ + + /* data pages scanned for each exact (non-partial) matched entry */ + dataPagesFetched = ceil(numDataPages * counts.exactEntries / numEntries); /* * Estimate number of data pages read, using selectivity estimation and @@ -6932,10 +7133,17 @@ gincostestimate(PG_FUNCTION_ARGS) dataPagesFetched = dataPagesFetchedBySel; } - if (num_scans > 1) + /* Account for cache effects, the same as above */ + if (outer_scans > 1 || counts.arrayScans > 1) + { + dataPagesFetched *= outer_scans * counts.arrayScans; dataPagesFetched = index_pages_fetched(dataPagesFetched, (BlockNumber) numDataPages, numDataPages, root); + dataPagesFetched /= outer_scans; + } + + /* And apply random_page_cost as the cost per page */ *indexTotalCost = *indexStartupCost + dataPagesFetched * spc_random_page_cost; diff --git a/src/test/regress/expected/tsearch.out b/src/test/regress/expected/tsearch.out index e1d7646c0d087714a09dd3213abce665bc1e6932..9341dbe0d77e1723f0c98f7837b5b69b7c748fc3 100644 --- a/src/test/regress/expected/tsearch.out +++ b/src/test/regress/expected/tsearch.out @@ -142,6 +142,12 @@ SELECT count(*) FROM test_tsvector WHERE a @@ 'w:*|q:*'; 494 (1 row) +SELECT count(*) FROM test_tsvector WHERE a @@ any ('{wr,qh}'); + count +------- + 158 +(1 row) + RESET enable_seqscan; DROP INDEX wowidx; CREATE INDEX wowidx ON test_tsvector USING gin (a); @@ -188,6 +194,12 @@ SELECT count(*) FROM test_tsvector WHERE a @@ 'w:*|q:*'; 494 (1 row) +SELECT count(*) FROM test_tsvector WHERE a @@ any ('{wr,qh}'); + count +------- + 158 +(1 row) + RESET enable_seqscan; INSERT INTO test_tsvector VALUES ('???', 'DFG:1A,2B,6C,10 FGH'); SELECT * FROM ts_stat('SELECT a FROM test_tsvector') ORDER BY ndoc DESC, nentry DESC, word LIMIT 10; diff --git a/src/test/regress/sql/tsearch.sql b/src/test/regress/sql/tsearch.sql index d261da21048fef2a0cf343c7cad3cd7fb45774e1..9fd12076acedb0b7c2a617466070b49e76f40778 100644 --- a/src/test/regress/sql/tsearch.sql +++ b/src/test/regress/sql/tsearch.sql @@ -60,6 +60,7 @@ SELECT count(*) FROM test_tsvector WHERE a @@ 'eq|yt'; SELECT count(*) FROM test_tsvector WHERE a @@ '(eq&yt)|(wr&qh)'; SELECT count(*) FROM test_tsvector WHERE a @@ '(eq|yt)&(wr|qh)'; SELECT count(*) FROM test_tsvector WHERE a @@ 'w:*|q:*'; +SELECT count(*) FROM test_tsvector WHERE a @@ any ('{wr,qh}'); RESET enable_seqscan; @@ -76,6 +77,7 @@ SELECT count(*) FROM test_tsvector WHERE a @@ 'eq|yt'; SELECT count(*) FROM test_tsvector WHERE a @@ '(eq&yt)|(wr&qh)'; SELECT count(*) FROM test_tsvector WHERE a @@ '(eq|yt)&(wr|qh)'; SELECT count(*) FROM test_tsvector WHERE a @@ 'w:*|q:*'; +SELECT count(*) FROM test_tsvector WHERE a @@ any ('{wr,qh}'); RESET enable_seqscan; INSERT INTO test_tsvector VALUES ('???', 'DFG:1A,2B,6C,10 FGH');