diff --git a/contrib/fuzzystrmatch/Makefile b/contrib/fuzzystrmatch/Makefile index 024265d47900b6ac3e432daf8457ec7708d689b0..0327d9510a50f37e90d2a73880a14fbb2bab29c4 100644 --- a/contrib/fuzzystrmatch/Makefile +++ b/contrib/fuzzystrmatch/Makefile @@ -17,6 +17,3 @@ top_builddir = ../.. include $(top_builddir)/src/Makefile.global include $(top_srcdir)/contrib/contrib-global.mk endif - -# levenshtein.c is #included by fuzzystrmatch.c -fuzzystrmatch.o: fuzzystrmatch.c levenshtein.c diff --git a/contrib/fuzzystrmatch/fuzzystrmatch.c b/contrib/fuzzystrmatch/fuzzystrmatch.c index 7a53d8a008e1193e07ca5ad38766ad7915d25434..f0df08032b697d94ead57dcecc097ffce675e60b 100644 --- a/contrib/fuzzystrmatch/fuzzystrmatch.c +++ b/contrib/fuzzystrmatch/fuzzystrmatch.c @@ -154,23 +154,6 @@ getcode(char c) /* These prevent GH from becoming F */ #define NOGHTOF(c) (getcode(c) & 16) /* BDH */ -/* Faster than memcmp(), for this use case. */ -static inline bool -rest_of_char_same(const char *s1, const char *s2, int len) -{ - while (len > 0) - { - len--; - if (s1[len] != s2[len]) - return false; - } - return true; -} - -#include "levenshtein.c" -#define LEVENSHTEIN_LESS_EQUAL -#include "levenshtein.c" - PG_FUNCTION_INFO_V1(levenshtein_with_costs); Datum levenshtein_with_costs(PG_FUNCTION_ARGS) @@ -180,8 +163,20 @@ levenshtein_with_costs(PG_FUNCTION_ARGS) int ins_c = PG_GETARG_INT32(2); int del_c = PG_GETARG_INT32(3); int sub_c = PG_GETARG_INT32(4); - - PG_RETURN_INT32(levenshtein_internal(src, dst, ins_c, del_c, sub_c)); + const char *s_data; + const char *t_data; + int s_bytes, + t_bytes; + + /* Extract a pointer to the actual character data */ + s_data = VARDATA_ANY(src); + t_data = VARDATA_ANY(dst); + /* Determine length of each string in bytes */ + s_bytes = VARSIZE_ANY_EXHDR(src); + t_bytes = VARSIZE_ANY_EXHDR(dst); + + PG_RETURN_INT32(varstr_levenshtein(s_data, s_bytes, t_data, t_bytes, ins_c, + del_c, sub_c)); } @@ -191,8 +186,20 @@ levenshtein(PG_FUNCTION_ARGS) { text
*src = PG_GETARG_TEXT_PP(0); text *dst = PG_GETARG_TEXT_PP(1); - - PG_RETURN_INT32(levenshtein_internal(src, dst, 1, 1, 1)); + const char *s_data; + const char *t_data; + int s_bytes, + t_bytes; + + /* Extract a pointer to the actual character data */ + s_data = VARDATA_ANY(src); + t_data = VARDATA_ANY(dst); + /* Determine length of each string in bytes */ + s_bytes = VARSIZE_ANY_EXHDR(src); + t_bytes = VARSIZE_ANY_EXHDR(dst); + + PG_RETURN_INT32(varstr_levenshtein(s_data, s_bytes, t_data, t_bytes, 1, 1, + 1)); } @@ -206,8 +213,21 @@ levenshtein_less_equal_with_costs(PG_FUNCTION_ARGS) int del_c = PG_GETARG_INT32(3); int sub_c = PG_GETARG_INT32(4); int max_d = PG_GETARG_INT32(5); - - PG_RETURN_INT32(levenshtein_less_equal_internal(src, dst, ins_c, del_c, sub_c, max_d)); + const char *s_data; + const char *t_data; + int s_bytes, + t_bytes; + + /* Extract a pointer to the actual character data */ + s_data = VARDATA_ANY(src); + t_data = VARDATA_ANY(dst); + /* Determine length of each string in bytes */ + s_bytes = VARSIZE_ANY_EXHDR(src); + t_bytes = VARSIZE_ANY_EXHDR(dst); + + PG_RETURN_INT32(varstr_levenshtein_less_equal(s_data, s_bytes, t_data, + t_bytes, ins_c, del_c, + sub_c, max_d)); } @@ -218,8 +238,20 @@ levenshtein_less_equal(PG_FUNCTION_ARGS) text *src = PG_GETARG_TEXT_PP(0); text *dst = PG_GETARG_TEXT_PP(1); int max_d = PG_GETARG_INT32(2); - - PG_RETURN_INT32(levenshtein_less_equal_internal(src, dst, 1, 1, 1, max_d)); + const char *s_data; + const char *t_data; + int s_bytes, + t_bytes; + + /* Extract a pointer to the actual character data */ + s_data = VARDATA_ANY(src); + t_data = VARDATA_ANY(dst); + /* Determine length of each string in bytes */ + s_bytes = VARSIZE_ANY_EXHDR(src); + t_bytes = VARSIZE_ANY_EXHDR(dst); + + PG_RETURN_INT32(varstr_levenshtein_less_equal(s_data, s_bytes, t_data, + t_bytes, 1, 1, 1, max_d)); } diff --git a/src/backend/utils/adt/Makefile b/src/backend/utils/adt/Makefile index
7b4391bba179082c21e9ff526b1f8276687dfce4..3ea9bf435a31c7d68ec6f142044fd36540866854 100644 --- a/src/backend/utils/adt/Makefile +++ b/src/backend/utils/adt/Makefile @@ -38,4 +38,6 @@ OBJS = acl.o arrayfuncs.o array_selfuncs.o array_typanalyze.o \ like.o: like.c like_match.c +varlena.o: varlena.c levenshtein.c + include $(top_srcdir)/src/backend/common.mk diff --git a/contrib/fuzzystrmatch/levenshtein.c b/src/backend/utils/adt/levenshtein.c similarity index 85% rename from contrib/fuzzystrmatch/levenshtein.c rename to src/backend/utils/adt/levenshtein.c index 4f37a54b1e446338e79da6686b4646a2fcf32fed..a8670e9a85bd7a070ee7f614725c3f6ab11399fe 100644 --- a/contrib/fuzzystrmatch/levenshtein.c +++ b/src/backend/utils/adt/levenshtein.c @@ -1,41 +1,34 @@ -/* +/*------------------------------------------------------------------------- + * * levenshtein.c + * Levenshtein distance implementation. * - * Functions for "fuzzy" comparison of strings + * Original author: Joe Conway <mail@joeconway.com> * - * Joe Conway <mail@joeconway.com> + * This file is included by varlena.c twice, to provide matching code for (1) + * Levenshtein distance with custom costings, and (2) Levenshtein distance with + * custom costings and a "max" value above which exact distances are not + * interesting. Before the inclusion, we rely on the presence of the inline + * function rest_of_char_same(). + * + * Written based on a description of the algorithm by Michael Gilleland found + * at http://www.merriampark.com/ld.htm. Also looked at levenshtein.c in the + * PHP 4.0.6 distribution for inspiration. Configurable penalty costs + * extension is introduced by Volkan YAZICI <volkan.yazici@gmail.com>.
* * Copyright (c) 2001-2014, PostgreSQL Global Development Group - * ALL RIGHTS RESERVED; * - * levenshtein() - * ------------- - * Written based on a description of the algorithm by Michael Gilleland - * found at http://www.merriampark.com/ld.htm - * Also looked at levenshtein.c in the PHP 4.0.6 distribution for - * inspiration. - * Configurable penalty costs extension is introduced by Volkan - * YAZICI <volkan.yazici@gmail.com>. - */ - -/* - * External declarations for exported functions + * IDENTIFICATION + * src/backend/utils/adt/levenshtein.c + * + *------------------------------------------------------------------------- */ -#ifdef LEVENSHTEIN_LESS_EQUAL -static int levenshtein_less_equal_internal(text *s, text *t, - int ins_c, int del_c, int sub_c, int max_d); -#else -static int levenshtein_internal(text *s, text *t, - int ins_c, int del_c, int sub_c); -#endif - #define MAX_LEVENSHTEIN_STRLEN 255 - /* - * Calculates Levenshtein distance metric between supplied strings. Generally - * (1, 1, 1) penalty costs suffices for common cases, but your mileage may - * vary. + * Calculates Levenshtein distance metric between supplied strings, which are + * not necessarily null-terminated. Generally (1, 1, 1) penalty costs suffices + * for common cases, but your mileage may vary. * * One way to compute Levenshtein distance is to incrementally construct * an (m+1)x(n+1) matrix where cell (i, j) represents the minimum number @@ -63,30 +56,27 @@ static int levenshtein_internal(text *s, text *t, * identify the portion of the matrix close to the diagonal which can still * affect the final answer.
*/ -static int +int #ifdef LEVENSHTEIN_LESS_EQUAL -levenshtein_less_equal_internal(text *s, text *t, - int ins_c, int del_c, int sub_c, int max_d) +varstr_levenshtein_less_equal(const char *source, int slen, const char *target, + int tlen, int ins_c, int del_c, int sub_c, + int max_d) #else -levenshtein_internal(text *s, text *t, - int ins_c, int del_c, int sub_c) +varstr_levenshtein(const char *source, int slen, const char *target, int tlen, + int ins_c, int del_c, int sub_c) #endif { int m, - n, - s_bytes, - t_bytes; + n; int *prev; int *curr; int *s_char_len = NULL; int i, j; - const char *s_data; - const char *t_data; const char *y; /* - * For levenshtein_less_equal_internal, we have real variables called + * For varstr_levenshtein_less_equal, we have real variables called * start_column and stop_column; otherwise it's just short-hand for 0 and * m. */ @@ -105,15 +95,8 @@ levenshtein_internal(text *s, text *t, #define STOP_COLUMN m #endif - /* Extract a pointer to the actual character data. */ - s_data = VARDATA_ANY(s); - t_data = VARDATA_ANY(t); - - /* Determine length of each string in bytes and characters. */ - s_bytes = VARSIZE_ANY_EXHDR(s); - t_bytes = VARSIZE_ANY_EXHDR(t); - m = pg_mbstrlen_with_len(s_data, s_bytes); - n = pg_mbstrlen_with_len(t_data, t_bytes); + m = pg_mbstrlen_with_len(source, slen); + n = pg_mbstrlen_with_len(target, tlen); /* * We can transform an empty s into t with n insertions, or a non-empty t @@ -193,10 +176,10 @@ levenshtein_internal(text *s, text *t, * multi-byte characters, we still build the array, so that the fast-path * needn't deal with the case where the array hasn't been initialized. 
*/ - if (m != s_bytes || n != t_bytes) + if (m != slen || n != tlen) { int i; - const char *cp = s_data; + const char *cp = source; s_char_len = (int *) palloc((m + 1) * sizeof(int)); for (i = 0; i < m; ++i) @@ -223,11 +206,11 @@ levenshtein_internal(text *s, text *t, prev[i] = i * del_c; /* Loop through rows of the notional array */ - for (y = t_data, j = 1; j < n; j++) + for (y = target, j = 1; j < n; j++) { int *temp; - const char *x = s_data; - int y_char_len = n != t_bytes + 1 ? pg_mblen(y) : 1; + const char *x = source; + int y_char_len = n != tlen + 1 ? pg_mblen(y) : 1; #ifdef LEVENSHTEIN_LESS_EQUAL @@ -384,7 +367,7 @@ levenshtein_internal(text *s, text *t, prev[start_column] = max_d + 1; curr[start_column] = max_d + 1; if (start_column != 0) - s_data += (s_char_len != NULL) ? s_char_len[start_column - 1] : 1; + source += (s_char_len != NULL) ? s_char_len[start_column - 1] : 1; start_column++; } diff --git a/src/backend/utils/adt/varlena.c b/src/backend/utils/adt/varlena.c index c3171b549a6feb9ef625388bd8f78c33a5a157b6..b3f397e9595270750181029542e2f0fc83e821a8 100644 --- a/src/backend/utils/adt/varlena.c +++ b/src/backend/utils/adt/varlena.c @@ -1546,7 +1546,6 @@ varstr_cmp(char *arg1, int len1, char *arg2, int len2, Oid collid) return result; } - /* text_cmp() * Internal comparison function for text strings. * Returns -1, 0 or 1 @@ -4747,3 +4746,24 @@ text_format_nv(PG_FUNCTION_ARGS) { return text_format(fcinfo); } + +/* + * Helper function for Levenshtein distance functions. Faster than memcmp(), + * for this use case. 
+ */ +static inline bool +rest_of_char_same(const char *s1, const char *s2, int len) +{ + while (len > 0) + { + len--; + if (s1[len] != s2[len]) + return false; + } + return true; +} + +/* Expand each Levenshtein distance variant */ +#include "levenshtein.c" +#define LEVENSHTEIN_LESS_EQUAL +#include "levenshtein.c" diff --git a/src/include/utils/builtins.h b/src/include/utils/builtins.h index 3ba34f88eec7be1b26c8d481990a8b91ec2d5b2b..417fd1771a8ed2efab0ae3822d0d09570b382c80 100644 --- a/src/include/utils/builtins.h +++ b/src/include/utils/builtins.h @@ -786,6 +786,11 @@ extern Datum textoverlay_no_len(PG_FUNCTION_ARGS); extern Datum name_text(PG_FUNCTION_ARGS); extern Datum text_name(PG_FUNCTION_ARGS); extern int varstr_cmp(char *arg1, int len1, char *arg2, int len2, Oid collid); +extern int varstr_levenshtein(const char *source, int slen, const char *target, + int tlen, int ins_c, int del_c, int sub_c); +extern int varstr_levenshtein_less_equal(const char *source, int slen, + const char *target, int tlen, int ins_c, + int del_c, int sub_c, int max_d); extern List *textToQualifiedNameList(text *textval); extern bool SplitIdentifierString(char *rawstring, char separator, List **namelist);