diff --git a/src/backend/parser/parse_expr.c b/src/backend/parser/parse_expr.c index 7829bcbac160b80186736e725d1b0053881be7a3..130e52b26c4f20fe909811baa70d5a854f12792c 100644 --- a/src/backend/parser/parse_expr.c +++ b/src/backend/parser/parse_expr.c @@ -556,7 +556,8 @@ transformColumnRef(ParseState *pstate, ColumnRef *cref) colname = strVal(field2); /* Try to identify as a column of the RTE */ - node = scanRTEForColumn(pstate, rte, colname, cref->location); + node = scanRTEForColumn(pstate, rte, colname, cref->location, + 0, NULL); if (node == NULL) { /* Try it as a function call on the whole row */ @@ -601,7 +602,8 @@ transformColumnRef(ParseState *pstate, ColumnRef *cref) colname = strVal(field3); /* Try to identify as a column of the RTE */ - node = scanRTEForColumn(pstate, rte, colname, cref->location); + node = scanRTEForColumn(pstate, rte, colname, cref->location, + 0, NULL); if (node == NULL) { /* Try it as a function call on the whole row */ @@ -659,7 +661,8 @@ transformColumnRef(ParseState *pstate, ColumnRef *cref) colname = strVal(field4); /* Try to identify as a column of the RTE */ - node = scanRTEForColumn(pstate, rte, colname, cref->location); + node = scanRTEForColumn(pstate, rte, colname, cref->location, + 0, NULL); if (node == NULL) { /* Try it as a function call on the whole row */ diff --git a/src/backend/parser/parse_func.c b/src/backend/parser/parse_func.c index a20080400e42dea0a473f264284995f88b162d99..53bbaecac39e6319fe5af1dfe92b611a8684bc5e 100644 --- a/src/backend/parser/parse_func.c +++ b/src/backend/parser/parse_func.c @@ -1779,7 +1779,7 @@ ParseComplexProjection(ParseState *pstate, char *funcname, Node *first_arg, ((Var *) first_arg)->varno, ((Var *) first_arg)->varlevelsup); /* Return a Var if funcname matches a column, else NULL */ - return scanRTEForColumn(pstate, rte, funcname, location); + return scanRTEForColumn(pstate, rte, funcname, location, 0, NULL); } /* diff --git a/src/backend/parser/parse_relation.c b/src/backend/parser/parse_relation.c index f416fc2a45e61a4662b3b77367c0fd0b641d8d54..80daeb9dc4a34a9aa06e9729ecb65022bebe924e 100644 --- a/src/backend/parser/parse_relation.c +++ b/src/backend/parser/parse_relation.c @@ -33,6 +33,8 @@ #include "utils/syscache.h" +#define MAX_FUZZY_DISTANCE 3 + static RangeTblEntry *scanNameSpaceForRefname(ParseState *pstate, const char *refname, int location); static RangeTblEntry *scanNameSpaceForRelid(ParseState *pstate, Oid relid, @@ -519,6 +521,101 @@ GetCTEForRTE(ParseState *pstate, RangeTblEntry *rte, int rtelevelsup) return NULL; /* keep compiler quiet */ } +/* + * updateFuzzyAttrMatchState + * Using Levenshtein distance, consider if column is best fuzzy match. + */ +static void +updateFuzzyAttrMatchState(int fuzzy_rte_penalty, + FuzzyAttrMatchState *fuzzystate, RangeTblEntry *rte, + const char *actual, const char *match, int attnum) +{ + int columndistance; + int matchlen; + + /* Bail before computing the Levenshtein distance if there's no hope. */ + if (fuzzy_rte_penalty > fuzzystate->distance) + return; + + /* + * Outright reject dropped columns, which can appear here with apparent + * empty actual names, per remarks within scanRTEForColumn(). + */ + if (actual[0] == '\0') + return; + + /* Use Levenshtein to compute match distance. */ + matchlen = strlen(match); + columndistance = + varstr_levenshtein_less_equal(actual, strlen(actual), match, matchlen, + 1, 1, 1, + fuzzystate->distance + 1 + - fuzzy_rte_penalty); + + /* + * If more than half the characters are different, don't treat it as a + * match, to avoid making ridiculous suggestions. + */ + if (columndistance > matchlen / 2) + return; + + /* + * From this point on, we can ignore the distinction between the + * RTE-name distance and the column-name distance. + */ + columndistance += fuzzy_rte_penalty; + + /* + * If the new distance is less than or equal to that of the best match + * found so far, update fuzzystate. + */ + if (columndistance < fuzzystate->distance) + { + /* Store new lowest observed distance for RTE */ + fuzzystate->distance = columndistance; + fuzzystate->rfirst = rte; + fuzzystate->first = attnum; + fuzzystate->rsecond = NULL; + fuzzystate->second = InvalidAttrNumber; + } + else if (columndistance == fuzzystate->distance) + { + /* + * This match distance may equal a prior match within this same + * range table. When that happens, the prior match may also be + * given, but only if there is no more than two equally distant + * matches from the RTE (in turn, our caller will only accept + * two equally distant matches overall). + */ + if (AttributeNumberIsValid(fuzzystate->second)) + { + /* Too many RTE-level matches */ + fuzzystate->rfirst = NULL; + fuzzystate->first = InvalidAttrNumber; + fuzzystate->rsecond = NULL; + fuzzystate->second = InvalidAttrNumber; + /* Clearly, distance is too low a bar (for *any* RTE) */ + fuzzystate->distance = columndistance - 1; + } + else if (AttributeNumberIsValid(fuzzystate->first)) + { + /* Record as provisional second match for RTE */ + fuzzystate->rsecond = rte; + fuzzystate->second = attnum; + } + else if (fuzzystate->distance <= MAX_FUZZY_DISTANCE) + { + /* + * Record as provisional first match (this can occasionally + * occur because previous lowest distance was "too low a + * bar", rather than being associated with a real match) + */ + fuzzystate->rfirst = rte; + fuzzystate->first = attnum; + } + } +} + /* * scanRTEForColumn * Search the column names of a single RTE for the given name. @@ -527,10 +624,14 @@ GetCTEForRTE(ParseState *pstate, RangeTblEntry *rte, int rtelevelsup) * * Side effect: if we find a match, mark the RTE as requiring read access * for the column. + * + * Additional side effect: if fuzzystate is non-NULL, check non-system columns + * for an approximate match and update fuzzystate accordingly. */ Node * scanRTEForColumn(ParseState *pstate, RangeTblEntry *rte, char *colname, - int location) + int location, int fuzzy_rte_penalty, + FuzzyAttrMatchState *fuzzystate) { Node *result = NULL; int attnum = 0; @@ -548,12 +649,16 @@ scanRTEForColumn(ParseState *pstate, RangeTblEntry *rte, char *colname, * Should this somehow go wrong and we try to access a dropped column, * we'll still catch it by virtue of the checks in * get_rte_attribute_type(), which is called by make_var(). That routine - * has to do a cache lookup anyway, so the check there is cheap. + * has to do a cache lookup anyway, so the check there is cheap. Callers + * interested in finding match with shortest distance need to defend + * against this directly, though. */ foreach(c, rte->eref->colnames) { + const char *attcolname = strVal(lfirst(c)); + attnum++; - if (strcmp(strVal(lfirst(c)), colname) == 0) + if (strcmp(attcolname, colname) == 0) { if (result) ereport(ERROR, @@ -566,6 +671,11 @@ scanRTEForColumn(ParseState *pstate, RangeTblEntry *rte, char *colname, markVarForSelectPriv(pstate, var, rte); result = (Node *) var; } + + /* Updating fuzzy match state, if provided. */ + if (fuzzystate != NULL) + updateFuzzyAttrMatchState(fuzzy_rte_penalty, fuzzystate, + rte, attcolname, colname, attnum); } /* @@ -642,7 +752,8 @@ colNameToVar(ParseState *pstate, char *colname, bool localonly, continue; /* use orig_pstate here to get the right sublevels_up */ - newresult = scanRTEForColumn(orig_pstate, rte, colname, location); + newresult = scanRTEForColumn(orig_pstate, rte, colname, location, + 0, NULL); if (newresult) { @@ -668,8 +779,8 @@ colNameToVar(ParseState *pstate, char *colname, bool localonly, /* * searchRangeTableForCol - * See if any RangeTblEntry could possibly provide the given column name. - * If so, return a pointer to the RangeTblEntry; else return NULL. + * See if any RangeTblEntry could possibly provide the given column name (or + * find the best match available). Returns state with relevant details. * * This is different from colNameToVar in that it considers every entry in * the ParseState's rangetable(s), not only those that are currently visible @@ -677,11 +788,31 @@ colNameToVar(ParseState *pstate, char *colname, bool localonly, * and it may give ambiguous results (there might be multiple equally valid * matches, but only one will be returned). This must be used ONLY as a * heuristic in giving suitable error messages. See errorMissingColumn. + * + * This function is also different in that it will consider approximate + * matches -- if the user entered an alias/column pair that is only slightly + * different from a valid pair, we may be able to infer what they meant to + * type and provide a reasonable hint. + * + * The FuzzyAttrMatchState will have 'rfirst' pointing to the best RTE + * containing the most promising match for the alias and column name. If + * the alias and column names match exactly, 'first' will be InvalidAttrNumber; + * otherwise, it will be the attribute number for the match. In the latter + * case, 'rsecond' may point to a second, equally close approximate match, + * and 'second' will contain the attribute number for the second match. */ -static RangeTblEntry * -searchRangeTableForCol(ParseState *pstate, char *colname, int location) +static FuzzyAttrMatchState * +searchRangeTableForCol(ParseState *pstate, const char *alias, char *colname, + int location) { ParseState *orig_pstate = pstate; + FuzzyAttrMatchState *fuzzystate = palloc(sizeof(FuzzyAttrMatchState)); + + fuzzystate->distance = MAX_FUZZY_DISTANCE + 1; + fuzzystate->rfirst = NULL; + fuzzystate->rsecond = NULL; + fuzzystate->first = InvalidAttrNumber; + fuzzystate->second = InvalidAttrNumber; while (pstate != NULL) { @@ -689,15 +820,51 @@ searchRangeTableForCol(ParseState *pstate, char *colname, int location) foreach(l, pstate->p_rtable) { - RangeTblEntry *rte = (RangeTblEntry *) lfirst(l); + RangeTblEntry *rte = (RangeTblEntry *) lfirst(l); + int fuzzy_rte_penalty = 0; - if (scanRTEForColumn(orig_pstate, rte, colname, location)) - return rte; + /* + * Typically, it is not useful to look for matches within join + * RTEs; they effectively duplicate other RTEs for our purposes, + * and if a match is chosen from a join RTE, an unhelpful alias is + * displayed in the final diagnostic message. + */ + if (rte->rtekind == RTE_JOIN) + continue; + + /* + * If the user didn't specify an alias, then matches against one + * RTE are as good as another. But if the user did specify an + * alias, then we want at least a fuzzy - and preferably an exact + * - match for the range table entry. + */ + if (alias != NULL) + fuzzy_rte_penalty = + varstr_levenshtein(alias, strlen(alias), + rte->eref->aliasname, + strlen(rte->eref->aliasname), + 1, 1, 1); + + /* + * Scan for a matching column; if we find an exact match, we're + * done. Otherwise, update fuzzystate. + */ + if (scanRTEForColumn(orig_pstate, rte, colname, location, + fuzzy_rte_penalty, fuzzystate) + && fuzzy_rte_penalty == 0) + { + fuzzystate->rfirst = rte; + fuzzystate->first = InvalidAttrNumber; + fuzzystate->rsecond = NULL; + fuzzystate->second = InvalidAttrNumber; + return fuzzystate; + } } pstate = pstate->parentParseState; } - return NULL; + + return fuzzystate; } /* @@ -2860,34 +3027,67 @@ void errorMissingColumn(ParseState *pstate, char *relname, char *colname, int location) { - RangeTblEntry *rte; + FuzzyAttrMatchState *state; + char *closestfirst = NULL; /* - * If relname was given, just play dumb and report it. (In practice, a - * bad qualification name should end up at errorMissingRTE, not here, so - * no need to work hard on this case.) + * Search the entire rtable looking for possible matches. If we find one, + * emit a hint about it. + * + * TODO: improve this code (and also errorMissingRTE) to mention using + * LATERAL if appropriate. */ - if (relname) - ereport(ERROR, - (errcode(ERRCODE_UNDEFINED_COLUMN), - errmsg("column %s.%s does not exist", relname, colname), - parser_errposition(pstate, location))); + state = searchRangeTableForCol(pstate, relname, colname, location); /* - * Otherwise, search the entire rtable looking for possible matches. If - * we find one, emit a hint about it. + * Extract closest col string for best match, if any. * - * TODO: improve this code (and also errorMissingRTE) to mention using - * LATERAL if appropriate. + * Infer an exact match referenced despite not being visible from the fact + * that an attribute number was not present in state passed back -- this is + * what is reported when !closestfirst. There might also be an exact match + * that was qualified with an incorrect alias, in which case closestfirst + * will be set (so hint is the same as generic fuzzy case). */ - rte = searchRangeTableForCol(pstate, colname, location); - - ereport(ERROR, - (errcode(ERRCODE_UNDEFINED_COLUMN), - errmsg("column \"%s\" does not exist", colname), - rte ? errhint("There is a column named \"%s\" in table \"%s\", but it cannot be referenced from this part of the query.", - colname, rte->eref->aliasname) : 0, - parser_errposition(pstate, location))); + if (state->rfirst && AttributeNumberIsValid(state->first)) + closestfirst = strVal(list_nth(state->rfirst->eref->colnames, + state->first - 1)); + + if (!state->rsecond) + { + /* + * Handle case where there is zero or one column suggestions to hint, + * including exact matches referenced but not visible. + */ + ereport(ERROR, + (errcode(ERRCODE_UNDEFINED_COLUMN), + relname ? + errmsg("column %s.%s does not exist", relname, colname): + errmsg("column \"%s\" does not exist", colname), + state->rfirst ? closestfirst ? + errhint("Perhaps you meant to reference the column \"%s\".\"%s\".", + state->rfirst->eref->aliasname, closestfirst): + errhint("There is a column named \"%s\" in table \"%s\", but it cannot be referenced from this part of the query.", + colname, state->rfirst->eref->aliasname): 0, + parser_errposition(pstate, location))); + } + else + { + /* Handle case where there are two equally useful column hints */ + char *closestsecond; + + closestsecond = strVal(list_nth(state->rsecond->eref->colnames, + state->second - 1)); + + ereport(ERROR, + (errcode(ERRCODE_UNDEFINED_COLUMN), + relname ? + errmsg("column %s.%s does not exist", relname, colname): + errmsg("column \"%s\" does not exist", colname), + errhint("Perhaps you meant to reference the column \"%s\".\"%s\" or the column \"%s\".\"%s\".", + state->rfirst->eref->aliasname, closestfirst, + state->rsecond->eref->aliasname, closestsecond), + parser_errposition(pstate, location))); + } } diff --git a/src/backend/utils/adt/levenshtein.c b/src/backend/utils/adt/levenshtein.c index 3669adc18a0d50f903d0b31ed39913d08187bb0a..f6e2ca6452a0ad8dbdeaf5ed1d4b4b08f20e07d7 100644 --- a/src/backend/utils/adt/levenshtein.c +++ b/src/backend/utils/adt/levenshtein.c @@ -95,6 +95,15 @@ varstr_levenshtein(const char *source, int slen, const char *target, int tlen, #define STOP_COLUMN m #endif + /* + * A common use for Levenshtein distance is to match attributes when building + * diagnostic, user-visible messages. Restrict the size of + * MAX_LEVENSHTEIN_STRLEN at compile time so that this is guaranteed to + * work. + */ + StaticAssertStmt(NAMEDATALEN <= MAX_LEVENSHTEIN_STRLEN, + "Levenshtein hinting mechanism restricts NAMEDATALEN"); + m = pg_mbstrlen_with_len(source, slen); n = pg_mbstrlen_with_len(target, tlen); diff --git a/src/include/parser/parse_relation.h b/src/include/parser/parse_relation.h index fc0a2577f31438f02c01fdf4d149be33a4190ecc..9dc0d5846b05bc376abf829259c0c0a006c1e558 100644 --- a/src/include/parser/parse_relation.h +++ b/src/include/parser/parse_relation.h @@ -16,6 +16,24 @@ #include "parser/parse_node.h" + +/* + * Support for fuzzily matching column. + * + * This is for building diagnostic messages, where non-exact matching + * attributes are suggested to the user. The struct's fields may be facets of + * a particular RTE, or of an entire range table, depending on context. + */ +typedef struct +{ + int distance; /* Weighted distance (lowest so far) */ + RangeTblEntry *rfirst; /* RTE of first */ + AttrNumber first; /* Closest attribute so far */ + RangeTblEntry *rsecond; /* RTE of second */ + AttrNumber second; /* Second closest attribute so far */ +} FuzzyAttrMatchState; + + extern RangeTblEntry *refnameRangeTblEntry(ParseState *pstate, const char *schemaname, const char *refname, @@ -35,7 +53,8 @@ extern RangeTblEntry *GetRTEByRangeTablePosn(ParseState *pstate, extern CommonTableExpr *GetCTEForRTE(ParseState *pstate, RangeTblEntry *rte, int rtelevelsup); extern Node *scanRTEForColumn(ParseState *pstate, RangeTblEntry *rte, - char *colname, int location); + char *colname, int location, + int fuzzy_rte_penalty, FuzzyAttrMatchState *fuzzystate); extern Node *colNameToVar(ParseState *pstate, char *colname, bool localonly, int location); extern void markVarForSelectPriv(ParseState *pstate, Var *var, diff --git a/src/test/regress/expected/alter_table.out b/src/test/regress/expected/alter_table.out index d23371087156383f7125a06ffd50db896a820d07..51db1b679279544b6aadfa995fad2c9b771f6d8f 100644 --- a/src/test/regress/expected/alter_table.out +++ b/src/test/regress/expected/alter_table.out @@ -536,6 +536,7 @@ create table atacc1 ( test int ); -- add a check constraint (fails) alter table atacc1 add constraint atacc_test1 check (test1>3); ERROR: column "test1" does not exist +HINT: Perhaps you meant to reference the column "atacc1"."test". drop table atacc1; -- something a little more complicated create table atacc1 ( test int, test2 int, test3 int); @@ -1342,6 +1343,7 @@ select f1 from c1; ERROR: column "f1" does not exist LINE 1: select f1 from c1; ^ +HINT: Perhaps you meant to reference the column "c1"."f2". drop table p1 cascade; NOTICE: drop cascades to table c1 create table p1 (f1 int, f2 int); @@ -1355,6 +1357,7 @@ select f1 from c1; ERROR: column "f1" does not exist LINE 1: select f1 from c1; ^ +HINT: Perhaps you meant to reference the column "c1"."f2". drop table p1 cascade; NOTICE: drop cascades to table c1 create table p1 (f1 int, f2 int); diff --git a/src/test/regress/expected/join.out b/src/test/regress/expected/join.out index ca3a17b283ed876798f256fff1b7da8ec59d72aa..1e3fe073504225e8528bc3f456199002ebcdfc83 100644 --- a/src/test/regress/expected/join.out +++ b/src/test/regress/expected/join.out @@ -2222,6 +2222,12 @@ select * from t1 left join t2 on (t1.a = t2.a); 200 | 1000 | 200 | 2001 (5 rows) +-- Test matching of column name with wrong alias +select t1.x from t1 join t3 on (t1.a = t3.x); +ERROR: column t1.x does not exist +LINE 1: select t1.x from t1 join t3 on (t1.a = t3.x); + ^ +HINT: Perhaps you meant to reference the column "t3"."x". -- -- regression test for 8.1 merge right join bug -- @@ -3433,6 +3439,38 @@ select * from ----+----+----+---- (0 rows) +-- +-- Test hints given on incorrect column references are useful +-- +select t1.uunique1 from + tenk1 t1 join tenk2 t2 on t1.two = t2.two; -- error, prefer "t1" suggestipn +ERROR: column t1.uunique1 does not exist +LINE 1: select t1.uunique1 from + ^ +HINT: Perhaps you meant to reference the column "t1"."unique1". +select t2.uunique1 from + tenk1 t1 join tenk2 t2 on t1.two = t2.two; -- error, prefer "t2" suggestion +ERROR: column t2.uunique1 does not exist +LINE 1: select t2.uunique1 from + ^ +HINT: Perhaps you meant to reference the column "t2"."unique1". +select uunique1 from + tenk1 t1 join tenk2 t2 on t1.two = t2.two; -- error, suggest both at once +ERROR: column "uunique1" does not exist +LINE 1: select uunique1 from + ^ +HINT: Perhaps you meant to reference the column "t1"."unique1" or the column "t2"."unique1". +-- +-- Take care to reference the correct RTE +-- +select atts.relid::regclass, s.* from pg_stats s join + pg_attribute a on s.attname = a.attname and s.tablename = + a.attrelid::regclass::text join (select unnest(indkey) attnum, + indexrelid from pg_index i) atts on atts.attnum = a.attnum where + schemaname != 'pg_catalog'; +ERROR: column atts.relid does not exist +LINE 1: select atts.relid::regclass, s.* from pg_stats s join + ^ -- -- Test LATERAL -- diff --git a/src/test/regress/sql/join.sql b/src/test/regress/sql/join.sql index 6005476c16289a96a6e24b81c3763701bcea03da..7a08bdff7bfb0e159e35fc10dd8d5ee0069fd74a 100644 --- a/src/test/regress/sql/join.sql +++ b/src/test/regress/sql/join.sql @@ -397,6 +397,10 @@ insert into t2a values (200, 2001); select * from t1 left join t2 on (t1.a = t2.a); +-- Test matching of column name with wrong alias + +select t1.x from t1 join t3 on (t1.a = t3.x); + -- -- regression test for 8.1 merge right join bug -- @@ -1060,6 +1064,26 @@ select * from int8_tbl x join (int4_tbl x cross join int4_tbl y(ff)) j on q1 = f1; -- ok -- +-- Test hints given on incorrect column references are useful +-- + +select t1.uunique1 from + tenk1 t1 join tenk2 t2 on t1.two = t2.two; -- error, prefer "t1" suggestipn +select t2.uunique1 from + tenk1 t1 join tenk2 t2 on t1.two = t2.two; -- error, prefer "t2" suggestion +select uunique1 from + tenk1 t1 join tenk2 t2 on t1.two = t2.two; -- error, suggest both at once + +-- +-- Take care to reference the correct RTE +-- + +select atts.relid::regclass, s.* from pg_stats s join + pg_attribute a on s.attname = a.attname and s.tablename = + a.attrelid::regclass::text join (select unnest(indkey) attnum, + indexrelid from pg_index i) atts on atts.attnum = a.attnum where + schemaname != 'pg_catalog'; +-- -- Test LATERAL --