diff --git a/src/backend/tsearch/to_tsany.c b/src/backend/tsearch/to_tsany.c index ee4b61d44bfe12fa3ab4830a0b80231337a9411a..d0b1bcc19def7f546644da8c2571c8467eb91ab1 100644 --- a/src/backend/tsearch/to_tsany.c +++ b/src/backend/tsearch/to_tsany.c @@ -7,7 +7,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/tsearch/to_tsany.c,v 1.1 2007/08/21 01:11:18 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/tsearch/to_tsany.c,v 1.2 2007/09/07 15:09:55 teodor Exp $ * *------------------------------------------------------------------------- */ @@ -225,10 +225,17 @@ to_tsvector(PG_FUNCTION_ARGS) /* - * This function is used for morph parsing + * This function is used for morph parsing. + * + * The value is passed to parsetext which will call the right dictionary to + * lexize the word. If it turns out to be a stopword, we push a QI_VALSTOP + * to the stack. + * + * All words belonging to the same variant are pushed as an ANDed list, + * and different variants are ORred together. */ static void -pushval_morph(TSQueryParserState * state, int typeval, char *strval, int lenval, int2 weight) +pushval_morph(void *opaque, TSQueryParserState state, char *strval, int lenval, int2 weight) { int4 count = 0; ParsedText prs; @@ -237,13 +244,14 @@ pushval_morph(TSQueryParserState * state, int typeval, char *strval, int lenval, cntvar = 0, cntpos = 0, cnt = 0; + Oid cfg_id = (Oid) opaque; /* the input is actually an Oid, not a pointer */ prs.lenwords = 4; prs.curwords = 0; prs.pos = 0; prs.words = (ParsedWord *) palloc(sizeof(ParsedWord) * prs.lenwords); - parsetext(state->cfg_id, &prs, strval, lenval); + parsetext(cfg_id, &prs, strval, lenval); if (prs.curwords > 0) { @@ -260,21 +268,21 @@ pushval_morph(TSQueryParserState * state, int typeval, char *strval, int lenval, while (count < prs.curwords && pos == prs.words[count].pos.pos && variant == prs.words[count].nvariant) { - pushval_asis(state, VAL, prs.words[count].word, prs.words[count].len, weight); + pushValue(state, 
prs.words[count].word, prs.words[count].len, weight); pfree(prs.words[count].word); if (cnt) - pushquery(state, OPR, (int4) '&', 0, 0, 0); + pushOperator(state, OP_AND); cnt++; count++; } if (cntvar) - pushquery(state, OPR, (int4) '|', 0, 0, 0); + pushOperator(state, OP_OR); cntvar++; } if (cntpos) - pushquery(state, OPR, (int4) '&', 0, 0, 0); + pushOperator(state, OP_AND); cntpos++; } @@ -283,7 +291,7 @@ pushval_morph(TSQueryParserState * state, int typeval, char *strval, int lenval, } else - pushval_asis(state, VALSTOP, NULL, 0, 0); + pushStop(state); } Datum @@ -295,7 +303,7 @@ to_tsquery_byid(PG_FUNCTION_ARGS) QueryItem *res; int4 len; - query = parse_tsquery(TextPGetCString(in), pushval_morph, cfgid, false); + query = parse_tsquery(TextPGetCString(in), pushval_morph, (void *) cfgid, false); if (query->size == 0) PG_RETURN_TSQUERY(query); @@ -333,7 +341,7 @@ plainto_tsquery_byid(PG_FUNCTION_ARGS) QueryItem *res; int4 len; - query = parse_tsquery(TextPGetCString(in), pushval_morph, cfgid, true); + query = parse_tsquery(TextPGetCString(in), pushval_morph, (void *)cfgid, true); if (query->size == 0) PG_RETURN_TSQUERY(query); diff --git a/src/backend/tsearch/ts_parse.c b/src/backend/tsearch/ts_parse.c index 47e18fc1ac5b3c75137208621c40676ae401d321..22c5f2b86eaf3b8dd37ea69ef078433bd92a7ea3 100644 --- a/src/backend/tsearch/ts_parse.c +++ b/src/backend/tsearch/ts_parse.c @@ -7,7 +7,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/tsearch/ts_parse.c,v 1.2 2007/08/25 00:03:59 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/tsearch/ts_parse.c,v 1.3 2007/09/07 15:09:55 teodor Exp $ * *------------------------------------------------------------------------- */ @@ -344,10 +344,12 @@ LexizeExec(LexizeData * ld, ParsedLex ** correspondLexem) } /* - * Parse string and lexize words + * Parse string and lexize words. + * + * prs will be filled in. 
*/ void -parsetext(Oid cfgId, ParsedText * prs, char *buf, int4 buflen) +parsetext(Oid cfgId, ParsedText * prs, char *buf, int buflen) { int type, lenlemm; @@ -427,7 +429,7 @@ parsetext(Oid cfgId, ParsedText * prs, char *buf, int4 buflen) * Headline framework */ static void -hladdword(HeadlineParsedText * prs, char *buf, int4 buflen, int type) +hladdword(HeadlineParsedText * prs, char *buf, int buflen, int type) { while (prs->curwords >= prs->lenwords) { @@ -458,17 +460,19 @@ hlfinditem(HeadlineParsedText * prs, TSQuery query, char *buf, int buflen) word = &(prs->words[prs->curwords - 1]); for (i = 0; i < query->size; i++) { - if (item->type == VAL && item->length == buflen && strncmp(GETOPERAND(query) + item->distance, buf, buflen) == 0) + if (item->type == QI_VAL && + item->operand.length == buflen && + strncmp(GETOPERAND(query) + item->operand.distance, buf, buflen) == 0) { if (word->item) { memcpy(&(prs->words[prs->curwords]), word, sizeof(HeadlineWordEntry)); - prs->words[prs->curwords].item = item; + prs->words[prs->curwords].item = &item->operand; prs->words[prs->curwords].repeated = 1; prs->curwords++; } else - word->item = item; + word->item = &item->operand; } item++; } @@ -511,7 +515,7 @@ addHLParsedLex(HeadlineParsedText * prs, TSQuery query, ParsedLex * lexs, TSLexe } void -hlparsetext(Oid cfgId, HeadlineParsedText * prs, TSQuery query, char *buf, int4 buflen) +hlparsetext(Oid cfgId, HeadlineParsedText * prs, TSQuery query, char *buf, int buflen) { int type, lenlemm; diff --git a/src/backend/tsearch/wparser_def.c b/src/backend/tsearch/wparser_def.c index 5b47f66d07fdf77c93e21cf4bdbed3eeb0f8d3d6..5f65cbc9fb239a07aa15fb28fa8356c5f0553bb1 100644 --- a/src/backend/tsearch/wparser_def.c +++ b/src/backend/tsearch/wparser_def.c @@ -7,7 +7,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/tsearch/wparser_def.c,v 1.2 2007/08/22 01:39:45 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/tsearch/wparser_def.c,v 1.3 2007/09/07 15:09:55 teodor Exp $ * 
*------------------------------------------------------------------------- */ @@ -1575,7 +1575,7 @@ typedef struct } hlCheck; static bool -checkcondition_HL(void *checkval, QueryItem * val) +checkcondition_HL(void *checkval, QueryOperand * val) { int i; @@ -1601,14 +1601,14 @@ hlCover(HeadlineParsedText * prs, TSQuery query, int *p, int *q) for (j = 0; j < query->size; j++) { - if (item->type != VAL) + if (item->type != QI_VAL) { item++; continue; } for (i = pos; i < prs->curwords; i++) { - if (prs->words[i].item == item) + if (prs->words[i].item == &item->operand) { if (i > *q) *q = i; @@ -1624,14 +1624,14 @@ hlCover(HeadlineParsedText * prs, TSQuery query, int *p, int *q) item = GETQUERY(query); for (j = 0; j < query->size; j++) { - if (item->type != VAL) + if (item->type != QI_VAL) { item++; continue; } for (i = *q; i >= pos; i--) { - if (prs->words[i].item == item) + if (prs->words[i].item == &item->operand) { if (i < *p) *p = i; diff --git a/src/backend/utils/adt/Makefile b/src/backend/utils/adt/Makefile index a1f233dca8279ce0efa422ce0bfe9e301ee812ee..9a75c736df650dbf215a6c9f1bfcb35e70a4d1a8 100644 --- a/src/backend/utils/adt/Makefile +++ b/src/backend/utils/adt/Makefile @@ -1,7 +1,7 @@ # # Makefile for utils/adt # -# $PostgreSQL: pgsql/src/backend/utils/adt/Makefile,v 1.66 2007/08/27 01:39:24 tgl Exp $ +# $PostgreSQL: pgsql/src/backend/utils/adt/Makefile,v 1.67 2007/09/07 15:09:56 teodor Exp $ # subdir = src/backend/utils/adt @@ -28,7 +28,7 @@ OBJS = acl.o arrayfuncs.o array_userfuncs.o arrayutils.o bool.o \ ascii.o quote.o pgstatfuncs.o encode.o dbsize.o genfile.o \ tsginidx.o tsgistidx.o tsquery.o tsquery_cleanup.o tsquery_gist.o \ tsquery_op.o tsquery_rewrite.o tsquery_util.o tsrank.o \ - tsvector.o tsvector_op.o \ + tsvector.o tsvector_op.o tsvector_parser.o\ uuid.o xml.o like.o: like.c like_match.c diff --git a/src/backend/utils/adt/tsginidx.c b/src/backend/utils/adt/tsginidx.c index 
491dd21aa81b40fae9c88bf0f762e9e71cfbf49c..10b80dc9566d304ed5ab768f6bde609edf2b1523 100644 --- a/src/backend/utils/adt/tsginidx.c +++ b/src/backend/utils/adt/tsginidx.c @@ -7,7 +7,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/utils/adt/tsginidx.c,v 1.1 2007/08/21 01:11:19 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/utils/adt/tsginidx.c,v 1.2 2007/09/07 15:09:56 teodor Exp $ * *------------------------------------------------------------------------- */ @@ -77,24 +77,25 @@ gin_extract_query(PG_FUNCTION_ARGS) item = GETQUERY(query); for (i = 0; i < query->size; i++) - if (item[i].type == VAL) + if (item[i].type == QI_VAL) (*nentries)++; entries = (Datum *) palloc(sizeof(Datum) * (*nentries)); for (i = 0; i < query->size; i++) - if (item[i].type == VAL) + if (item[i].type == QI_VAL) { text *txt; + QueryOperand *val = &item[i].operand; - txt = (text *) palloc(VARHDRSZ + item[i].length); + txt = (text *) palloc(VARHDRSZ + val->length); - SET_VARSIZE(txt, VARHDRSZ + item[i].length); - memcpy(VARDATA(txt), GETOPERAND(query) + item[i].distance, item[i].length); + SET_VARSIZE(txt, VARHDRSZ + val->length); + memcpy(VARDATA(txt), GETOPERAND(query) + val->distance, val->length); entries[j++] = PointerGetDatum(txt); - if (strategy != TSearchWithClassStrategyNumber && item[i].weight != 0) + if (strategy != TSearchWithClassStrategyNumber && val->weight != 0) ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), errmsg("@@ operator does not support lexeme class restrictions"), @@ -116,11 +117,11 @@ typedef struct } GinChkVal; static bool -checkcondition_gin(void *checkval, QueryItem * val) +checkcondition_gin(void *checkval, QueryOperand * val) { GinChkVal *gcv = (GinChkVal *) checkval; - return gcv->mapped_check[val - gcv->frst]; + return gcv->mapped_check[((QueryItem *) val) - gcv->frst]; } Datum @@ -142,7 +143,7 @@ gin_ts_consistent(PG_FUNCTION_ARGS) gcv.mapped_check = (bool *) palloc(sizeof(bool) * query->size); for (i = 0; i < query->size; i++) - if 
(item[i].type == VAL) + if (item[i].type == QI_VAL) gcv.mapped_check[i] = check[j++]; res = TS_execute( diff --git a/src/backend/utils/adt/tsgistidx.c b/src/backend/utils/adt/tsgistidx.c index 6c262521ef49b56c2be2ee4d5c9e7aa740b0d65d..4fc51378b4bf5c70cbfc5e3e7d16195e7597f79c 100644 --- a/src/backend/utils/adt/tsgistidx.c +++ b/src/backend/utils/adt/tsgistidx.c @@ -7,7 +7,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/utils/adt/tsgistidx.c,v 1.2 2007/08/21 06:34:42 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/utils/adt/tsgistidx.c,v 1.3 2007/09/07 15:09:56 teodor Exp $ * *------------------------------------------------------------------------- */ @@ -293,7 +293,7 @@ typedef struct * is there value 'val' in array or not ? */ static bool -checkcondition_arr(void *checkval, QueryItem * val) +checkcondition_arr(void *checkval, QueryOperand * val) { int4 *StopLow = ((CHKVAL *) checkval)->arrb; int4 *StopHigh = ((CHKVAL *) checkval)->arre; @@ -304,9 +304,9 @@ checkcondition_arr(void *checkval, QueryItem * val) while (StopLow < StopHigh) { StopMiddle = StopLow + (StopHigh - StopLow) / 2; - if (*StopMiddle == val->val) + if (*StopMiddle == val->valcrc) return (true); - else if (*StopMiddle < val->val) + else if (*StopMiddle < val->valcrc) StopLow = StopMiddle + 1; else StopHigh = StopMiddle; @@ -316,9 +316,9 @@ checkcondition_arr(void *checkval, QueryItem * val) } static bool -checkcondition_bit(void *checkval, QueryItem * val) +checkcondition_bit(void *checkval, QueryOperand * val) { - return GETBIT(checkval, HASHVAL(val->val)); + return GETBIT(checkval, HASHVAL(val->valcrc)); } Datum diff --git a/src/backend/utils/adt/tsquery.c b/src/backend/utils/adt/tsquery.c index 83759728ff96c97e3134e9da1a624a09ca9823c9..27b93eb64d7725fa0ec9795a31fd576e188ae13d 100644 --- a/src/backend/utils/adt/tsquery.c +++ b/src/backend/utils/adt/tsquery.c @@ -7,7 +7,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/utils/adt/tsquery.c,v 1.2 2007/08/31 02:26:29 tgl Exp $ + 
* $PostgreSQL: pgsql/src/backend/utils/adt/tsquery.c,v 1.3 2007/09/07 15:09:56 teodor Exp $ * *------------------------------------------------------------------------- */ @@ -23,6 +23,29 @@ #include "utils/pg_crc.h" +struct TSQueryParserStateData +{ + /* State for gettoken_query */ + char *buffer; /* entire string we are scanning */ + char *buf; /* current scan point */ + int state; + int count; /* nesting count, incremented by (, + decremented by ) */ + + /* polish (prefix) notation in list, filled in by push* functions */ + List *polstr; + + /* Strings from operands are collected in op. curop is a pointer to + * the end of used space of op. */ + char *op; + char *curop; + int lenop; /* allocated size of op */ + int sumlen; /* used size of op */ + + /* state for value's parser */ + TSVectorParseState valstate; +}; + /* parser's states */ #define WAITOPERAND 1 #define WAITOPERATOR 2 @@ -30,21 +53,10 @@ #define WAITSINGLEOPERAND 4 /* - * node of query tree, also used - * for storing polish notation in parser + * subroutine to parse the weight part, like ':1AB' of a query. 
*/ -typedef struct ParseQueryNode -{ - int2 weight; - int2 type; - int4 val; - int2 distance; - int2 length; - struct ParseQueryNode *next; -} ParseQueryNode; - static char * -get_weight(char *buf, int2 *weight) +get_weight(char *buf, int16 *weight) { *weight = 0; @@ -81,11 +93,28 @@ get_weight(char *buf, int2 *weight) return buf; } +/* + * token types for parsing + */ +typedef enum { + PT_END = 0, + PT_ERR = 1, + PT_VAL = 2, + PT_OPR = 3, + PT_OPEN = 4, + PT_CLOSE = 5, +} ts_tokentype; + /* * get token from query string + * + * *operator is filled in with OP_* when return value is PT_OPR + * *strval, *lenval and *weight are filled in when return value is PT_VAL */ -static int4 -gettoken_query(TSQueryParserState * state, int4 *val, int4 *lenval, char **strval, int2 *weight) +static ts_tokentype +gettoken_query(TSQueryParserState state, + int8 *operator, + int *lenval, char **strval, int16 *weight) { while (1) { @@ -97,16 +126,16 @@ gettoken_query(TSQueryParserState * state, int4 *val, int4 *lenval, char **strva { (state->buf)++; /* can safely ++, t_iseq guarantee * that pg_mblen()==1 */ - *val = (int4) '!'; + *operator = OP_NOT; state->state = WAITOPERAND; - return OPR; + return PT_OPR; } else if (t_iseq(state->buf, '(')) { state->count++; (state->buf)++; state->state = WAITOPERAND; - return OPEN; + return PT_OPEN; } else if (t_iseq(state->buf, ':')) { @@ -117,17 +146,16 @@ gettoken_query(TSQueryParserState * state, int4 *val, int4 *lenval, char **strva } else if (!t_isspace(state->buf)) { - state->valstate.prsbuf = state->buf; - if (gettoken_tsvector(&(state->valstate))) + /* We rely on the tsvector parser to parse the value for us */ + reset_tsvector_parser(state->valstate, state->buf); + if (gettoken_tsvector(state->valstate, strval, lenval, NULL, NULL, &state->buf)) { - *strval = state->valstate.word; - *lenval = state->valstate.curpos - state->valstate.word; - state->buf = get_weight(state->valstate.prsbuf, weight); + state->buf = get_weight(state->buf, 
weight); state->state = WAITOPERATOR; - return VAL; + return PT_VAL; } else if (state->state == WAITFIRSTOPERAND) - return END; + return PT_END; else ereport(ERROR, (errcode(ERRCODE_SYNTAX_ERROR), @@ -136,52 +164,71 @@ gettoken_query(TSQueryParserState * state, int4 *val, int4 *lenval, char **strva } break; case WAITOPERATOR: - if (t_iseq(state->buf, '&') || t_iseq(state->buf, '|')) + if (t_iseq(state->buf, '&')) + { + state->state = WAITOPERAND; + *operator = OP_AND; + (state->buf)++; + return PT_OPR; + } + if (t_iseq(state->buf, '|')) { state->state = WAITOPERAND; - *val = (int4) *(state->buf); + *operator = OP_OR; (state->buf)++; - return OPR; + return PT_OPR; } else if (t_iseq(state->buf, ')')) { (state->buf)++; state->count--; - return (state->count < 0) ? ERR : CLOSE; + return (state->count < 0) ? PT_ERR : PT_CLOSE; } else if (*(state->buf) == '\0') - return (state->count) ? ERR : END; + return (state->count) ? PT_ERR : PT_END; else if (!t_isspace(state->buf)) - return ERR; + return PT_ERR; break; case WAITSINGLEOPERAND: if (*(state->buf) == '\0') - return END; + return PT_END; *strval = state->buf; *lenval = strlen(state->buf); state->buf += strlen(state->buf); state->count++; - return VAL; + return PT_VAL; default: - return ERR; + return PT_ERR; break; } state->buf += pg_mblen(state->buf); } - return END; + return PT_END; } /* - * push new one in polish notation reverse view + * Push an operator to state->polstr */ void -pushquery(TSQueryParserState * state, int4 type, int4 val, int4 distance, int4 lenval, int2 weight) +pushOperator(TSQueryParserState state, int8 oper) { - ParseQueryNode *tmp = (ParseQueryNode *) palloc(sizeof(ParseQueryNode)); + QueryOperator *tmp; + + Assert(oper == OP_NOT || oper == OP_AND || oper == OP_OR); + + tmp = (QueryOperator *) palloc(sizeof(QueryOperator)); + tmp->type = QI_OPR; + tmp->oper = oper; + /* left is filled in later with findoprnd */ + + state->polstr = lcons(tmp, state->polstr); +} + +static void 
+pushValue_internal(TSQueryParserState state, pg_crc32 valcrc, int distance, int lenval, int weight) +{ + QueryOperand *tmp; - tmp->weight = weight; - tmp->type = type; - tmp->val = val; if (distance >= MAXSTRPOS) ereport(ERROR, (errcode(ERRCODE_SYNTAX_ERROR), @@ -192,20 +239,27 @@ pushquery(TSQueryParserState * state, int4 type, int4 val, int4 distance, int4 l (errcode(ERRCODE_SYNTAX_ERROR), errmsg("operand is too long in tsearch query: \"%s\"", state->buffer))); - tmp->distance = distance; + + tmp = (QueryOperand *) palloc(sizeof(QueryOperand)); + tmp->type = QI_VAL; + tmp->weight = weight; + tmp->valcrc = (int32) valcrc; tmp->length = lenval; - tmp->next = state->str; - state->str = tmp; - state->num++; + tmp->distance = distance; + + state->polstr = lcons(tmp, state->polstr); } /* - * This function is used for tsquery parsing + * Push an operand to state->polstr. + * + * strval must point to a string equal to state->curop. lenval is the length + * of the string. */ void -pushval_asis(TSQueryParserState * state, int type, char *strval, int lenval, int2 weight) +pushValue(TSQueryParserState state, char *strval, int lenval, int2 weight) { - pg_crc32 c; + pg_crc32 valcrc; if (lenval >= MAXSTRLEN) ereport(ERROR, @@ -213,162 +267,202 @@ pushval_asis(TSQueryParserState * state, int type, char *strval, int lenval, int errmsg("word is too long in tsearch query: \"%s\"", state->buffer))); - INIT_CRC32(c); - COMP_CRC32(c, strval, lenval); - FIN_CRC32(c); - pushquery(state, type, *(int4 *) &c, - state->curop - state->op, lenval, weight); + INIT_CRC32(valcrc); + COMP_CRC32(valcrc, strval, lenval); + FIN_CRC32(valcrc); + pushValue_internal(state, valcrc, state->curop - state->op, lenval, weight); + /* append the value string to state.op, enlarging buffer if needed first */ while (state->curop - state->op + lenval + 1 >= state->lenop) { - int4 tmp = state->curop - state->op; + int used = state->curop - state->op; state->lenop *= 2; state->op = (char *) repalloc((void *) 
state->op, state->lenop); - state->curop = state->op + tmp; + state->curop = state->op + used; } memcpy((void *) state->curop, (void *) strval, lenval); state->curop += lenval; *(state->curop) = '\0'; state->curop++; state->sumlen += lenval + 1 /* \0 */ ; - return; } + +/* + * Push a stopword placeholder to state->polstr + */ +void +pushStop(TSQueryParserState state) +{ + QueryOperand *tmp; + + tmp = (QueryOperand *) palloc(sizeof(QueryOperand)); + tmp->type = QI_VALSTOP; + + state->polstr = lcons(tmp, state->polstr); +} + + #define STACKDEPTH 32 /* - * make polish notation of query + * Make polish (prefix) notation of query. + * + * See parse_tsquery for explanation of pushval. */ -static int4 -makepol(TSQueryParserState * state, - void (*pushval) (TSQueryParserState *, int, char *, int, int2)) +static void +makepol(TSQueryParserState state, + PushFunction pushval, + void *opaque) { - int4 val = 0, - type; - int4 lenval = 0; + int8 operator = 0; + ts_tokentype type; + int lenval = 0; char *strval = NULL; - int4 stack[STACKDEPTH]; - int4 lenstack = 0; - int2 weight = 0; + int8 opstack[STACKDEPTH]; + int lenstack = 0; + int16 weight = 0; /* since this function recurses, it could be driven to stack overflow */ check_stack_depth(); - while ((type = gettoken_query(state, &val, &lenval, &strval, &weight)) != END) + while ((type = gettoken_query(state, &operator, &lenval, &strval, &weight)) != PT_END) { switch (type) { - case VAL: - pushval(state, VAL, strval, lenval, weight); - while (lenstack && (stack[lenstack - 1] == (int4) '&' || - stack[lenstack - 1] == (int4) '!')) + case PT_VAL: + pushval(opaque, state, strval, lenval, weight); + while (lenstack && (opstack[lenstack - 1] == OP_AND || + opstack[lenstack - 1] == OP_NOT)) { lenstack--; - pushquery(state, OPR, stack[lenstack], 0, 0, 0); + pushOperator(state, opstack[lenstack]); } break; - case OPR: - if (lenstack && val == (int4) '|') - pushquery(state, OPR, val, 0, 0, 0); + case PT_OPR: + if (lenstack && operator == 
OP_OR) + pushOperator(state, OP_OR); else { if (lenstack == STACKDEPTH) /* internal error */ elog(ERROR, "tsquery stack too small"); - stack[lenstack] = val; + opstack[lenstack] = operator; lenstack++; } break; - case OPEN: - if (makepol(state, pushval) == ERR) - return ERR; - if (lenstack && (stack[lenstack - 1] == (int4) '&' || - stack[lenstack - 1] == (int4) '!')) + case PT_OPEN: + makepol(state, pushval, opaque); + + if (lenstack && (opstack[lenstack - 1] == OP_AND || + opstack[lenstack - 1] == OP_NOT)) { lenstack--; - pushquery(state, OPR, stack[lenstack], 0, 0, 0); + pushOperator(state, opstack[lenstack]); } break; - case CLOSE: + case PT_CLOSE: while (lenstack) { lenstack--; - pushquery(state, OPR, stack[lenstack], 0, 0, 0); + pushOperator(state, opstack[lenstack]); }; - return END; - break; - case ERR: + return; + case PT_ERR: default: ereport(ERROR, (errcode(ERRCODE_SYNTAX_ERROR), errmsg("syntax error in tsearch query: \"%s\"", state->buffer))); - return ERR; - } } while (lenstack) { lenstack--; - pushquery(state, OPR, stack[lenstack], 0, 0, 0); - }; - return END; + pushOperator(state, opstack[lenstack]); + } } +/* + * Fills in the left-fields previously left unfilled. The input + * QueryItems must be in polish (prefix) notation. + */ static void -findoprnd(QueryItem * ptr, int4 *pos) +findoprnd(QueryItem *ptr, int *pos) { - if (ptr[*pos].type == VAL || ptr[*pos].type == VALSTOP) - { - ptr[*pos].left = 0; - (*pos)++; - } - else if (ptr[*pos].val == (int4) '!') + /* since this function recurses, it could be driven to stack overflow. */ + check_stack_depth(); + + if (ptr[*pos].type == QI_VAL || + ptr[*pos].type == QI_VALSTOP) /* need to handle VALSTOP here, + * they haven't been cleansed + * away yet. 
+ */ { - ptr[*pos].left = 1; (*pos)++; - findoprnd(ptr, pos); } - else + else { - QueryItem *curitem = &ptr[*pos]; - int4 tmp = *pos; + Assert(ptr[*pos].type == QI_OPR); - (*pos)++; - findoprnd(ptr, pos); - curitem->left = *pos - tmp; - findoprnd(ptr, pos); + if (ptr[*pos].operator.oper == OP_NOT) + { + ptr[*pos].operator.left = 1; + (*pos)++; + findoprnd(ptr, pos); + } + else + { + QueryOperator *curitem = &ptr[*pos].operator; + int tmp = *pos; + + Assert(curitem->oper == OP_AND || curitem->oper == OP_OR); + + (*pos)++; + findoprnd(ptr, pos); + curitem->left = *pos - tmp; + findoprnd(ptr, pos); + } } } - /* - * input + * Each value (operand) in the query is passed to pushval. pushval can + * transform the simple value to an arbitrarily complex expression using + * pushValue and pushOperator. It must push a single value with pushValue, + * a complete expression with all operands, or a stopword placeholder + * with pushStop, otherwise the prefix notation representation will be broken, + * having an operator with no operand. + * + * opaque is passed on to pushval as is, pushval can use it to store its + * private state. + * + * The returned query might contain QI_VALSTOP nodes. The caller is responsible + * for cleaning them up (with clean_fakeval) */ TSQuery -parse_tsquery(char *buf, void (*pushval) (TSQueryParserState *, int, char *, int, int2), Oid cfg_id, bool isplain) +parse_tsquery(char *buf, + PushFunction pushval, + void *opaque, + bool isplain) { - TSQueryParserState state; - int4 i; + struct TSQueryParserStateData state; + int i; TSQuery query; - int4 commonlen; + int commonlen; QueryItem *ptr; - ParseQueryNode *tmp; - int4 pos = 0; + int pos = 0; + ListCell *cell; /* init state */ state.buffer = buf; state.buf = buf; state.state = (isplain) ? 
WAITSINGLEOPERAND : WAITFIRSTOPERAND; state.count = 0; - state.num = 0; - state.str = NULL; - state.cfg_id = cfg_id; + state.polstr = NIL; /* init value parser's state */ - state.valstate.oprisdelim = true; - state.valstate.len = 32; - state.valstate.word = (char *) palloc(state.valstate.len); + state.valstate = init_tsvector_parser(NULL, true); /* init list of operand */ state.sumlen = 0; @@ -377,9 +471,11 @@ parse_tsquery(char *buf, void (*pushval) (TSQueryParserState *, int, char *, int *(state.curop) = '\0'; /* parse query & make polish notation (postfix, but in reverse order) */ - makepol(&state, pushval); - pfree(state.valstate.word); - if (!state.num) + makepol(&state, pushval, opaque); + + close_tsvector_parser(state.valstate); + + if (list_length(state.polstr) == 0) { ereport(NOTICE, (errmsg("tsearch query doesn't contain lexeme(s): \"%s\"", @@ -390,37 +486,54 @@ parse_tsquery(char *buf, void (*pushval) (TSQueryParserState *, int, char *, int return query; } - /* make finish struct */ - commonlen = COMPUTESIZE(state.num, state.sumlen); - query = (TSQuery) palloc(commonlen); + /* Pack the QueryItems in the final TSQuery struct to return to caller */ + commonlen = COMPUTESIZE(list_length(state.polstr), state.sumlen); + query = (TSQuery) palloc0(commonlen); SET_VARSIZE(query, commonlen); - query->size = state.num; + query->size = list_length(state.polstr); ptr = GETQUERY(query); - /* set item in polish notation */ - for (i = 0; i < state.num; i++) + /* Copy QueryItems to TSQuery */ + i = 0; + foreach(cell, state.polstr) { - ptr[i].weight = state.str->weight; - ptr[i].type = state.str->type; - ptr[i].val = state.str->val; - ptr[i].distance = state.str->distance; - ptr[i].length = state.str->length; - tmp = state.str->next; - pfree(state.str); - state.str = tmp; + QueryItem *item = (QueryItem *) lfirst(cell); + + switch(item->type) + { + case QI_VAL: + memcpy(&ptr[i], item, sizeof(QueryOperand)); + break; + case QI_VALSTOP: + ptr[i].type = QI_VALSTOP; + break; 
+ case QI_OPR: + memcpy(&ptr[i], item, sizeof(QueryOperator)); + break; + default: + elog(ERROR, "unknown QueryItem type %d", item->type); + } + i++; } - /* set user friendly-operand view */ + /* Copy all the operand strings to TSQuery */ memcpy((void *) GETOPERAND(query), (void *) state.op, state.sumlen); pfree(state.op); - /* set left operand's position for every operator */ + /* Set left operand pointers for every operator. */ pos = 0; findoprnd(ptr, &pos); return query; } +static void +pushval_asis(void *opaque, TSQueryParserState state, char *strval, int lenval, + int16 weight) +{ + pushValue(state, strval, lenval, weight); +} + /* * in without morphology */ @@ -431,7 +544,7 @@ tsqueryin(PG_FUNCTION_ARGS) pg_verifymbstr(in, strlen(in), false); - PG_RETURN_TSQUERY(parse_tsquery(in, pushval_asis, InvalidOid, false)); + PG_RETURN_TSQUERY(parse_tsquery(in, pushval_asis, NULL, false)); } /* @@ -443,13 +556,14 @@ typedef struct char *buf; char *cur; char *op; - int4 buflen; + int buflen; } INFIX; -#define RESIZEBUF(inf,addsize) \ +/* Makes sure inf->buf is large enough for adding 'addsize' bytes */ +#define RESIZEBUF(inf, addsize) \ while( ( (inf)->cur - (inf)->buf ) + (addsize) + 1 >= (inf)->buflen ) \ { \ - int4 len = (inf)->cur - (inf)->buf; \ + int len = (inf)->cur - (inf)->buf; \ (inf)->buflen *= 2; \ (inf)->buf = (char*) repalloc( (void*)(inf)->buf, (inf)->buflen ); \ (inf)->cur = (inf)->buf + len; \ @@ -462,12 +576,16 @@ while( ( (inf)->cur - (inf)->buf ) + (addsize) + 1 >= (inf)->buflen ) \ static void infix(INFIX * in, bool first) { - if (in->curpol->type == VAL) + /* since this function recurses, it could be driven to stack overflow. 
*/ + check_stack_depth(); + + if (in->curpol->type == QI_VAL) { - char *op = in->op + in->curpol->distance; + QueryOperand *curpol = &in->curpol->operand; + char *op = in->op + curpol->distance; int clen; - RESIZEBUF(in, in->curpol->length * (pg_database_encoding_max_length() + 1) + 2 + 5); + RESIZEBUF(in, curpol->length * (pg_database_encoding_max_length() + 1) + 2 + 5); *(in->cur) = '\''; in->cur++; while (*op) @@ -485,26 +603,26 @@ infix(INFIX * in, bool first) } *(in->cur) = '\''; in->cur++; - if (in->curpol->weight) + if (curpol->weight) { *(in->cur) = ':'; in->cur++; - if (in->curpol->weight & (1 << 3)) + if (curpol->weight & (1 << 3)) { *(in->cur) = 'A'; in->cur++; } - if (in->curpol->weight & (1 << 2)) + if (curpol->weight & (1 << 2)) { *(in->cur) = 'B'; in->cur++; } - if (in->curpol->weight & (1 << 1)) + if (curpol->weight & (1 << 1)) { *(in->cur) = 'C'; in->cur++; } - if (in->curpol->weight & 1) + if (curpol->weight & 1) { *(in->cur) = 'D'; in->cur++; @@ -513,7 +631,7 @@ infix(INFIX * in, bool first) *(in->cur) = '\0'; in->curpol++; } - else if (in->curpol->val == (int4) '!') + else if (in->curpol->operator.oper == OP_NOT) { bool isopr = false; @@ -522,13 +640,15 @@ infix(INFIX * in, bool first) in->cur++; *(in->cur) = '\0'; in->curpol++; - if (in->curpol->type == OPR) + + if (in->curpol->type == QI_OPR) { isopr = true; RESIZEBUF(in, 2); sprintf(in->cur, "( "); in->cur = strchr(in->cur, '\0'); } + infix(in, isopr); if (isopr) { @@ -539,11 +659,11 @@ infix(INFIX * in, bool first) } else { - int4 op = in->curpol->val; + int8 op = in->curpol->operator.oper; INFIX nrm; in->curpol++; - if (op == (int4) '|' && !first) + if (op == OP_OR && !first) { RESIZEBUF(in, 2); sprintf(in->cur, "( "); @@ -564,11 +684,22 @@ infix(INFIX * in, bool first) /* print operator & right operand */ RESIZEBUF(in, 3 + (nrm.cur - nrm.buf)); - sprintf(in->cur, " %c %s", op, nrm.buf); + switch(op) + { + case OP_OR: + sprintf(in->cur, " | %s", nrm.buf); + break; + case OP_AND: + 
sprintf(in->cur, " & %s", nrm.buf); + break; + default: + /* OP_NOT is handled in above if-branch*/ + elog(ERROR, "unexpected operator type %d", op); + } in->cur = strchr(in->cur, '\0'); pfree(nrm.buf); - if (op == (int4) '|' && !first) + if (op == OP_OR && !first) { RESIZEBUF(in, 2); sprintf(in->cur, " )"); @@ -615,28 +746,33 @@ tsquerysend(PG_FUNCTION_ARGS) pq_sendint(&buf, query->size, sizeof(int32)); for (i = 0; i < query->size; i++) { - int tmp; - pq_sendint(&buf, item->type, sizeof(item->type)); - pq_sendint(&buf, item->weight, sizeof(item->weight)); - pq_sendint(&buf, item->left, sizeof(item->left)); - pq_sendint(&buf, item->val, sizeof(item->val)); - - /* - * We are sure that sizeof(WordEntry) == sizeof(int32), and about - * layout of QueryItem - */ - tmp = *(int32 *) (((char *) item) + HDRSIZEQI); - pq_sendint(&buf, tmp, sizeof(tmp)); + switch(item->type) + { + case QI_VAL: + pq_sendint(&buf, item->operand.weight, sizeof(item->operand.weight)); + pq_sendint(&buf, item->operand.valcrc, sizeof(item->operand.valcrc)); + pq_sendint(&buf, item->operand.length, sizeof(int16)); + /* istrue flag is just for temporary use in tsrank.c/Cover, + * so we don't need to transfer that */ + break; + case QI_OPR: + pq_sendint(&buf, item->operator.oper, sizeof(item->operator.oper)); + if (item->operator.oper != OP_NOT) + pq_sendint(&buf, item->operator.left, sizeof(item->operator.left)); + break; + default: + elog(ERROR, "unknown tsquery node type %d", item->type); + } item++; } item = GETQUERY(query); for (i = 0; i < query->size; i++) { - if (item->type == VAL) - pq_sendbytes(&buf, GETOPERAND(query) + item->distance, item->length); + if (item->type == QI_VAL) + pq_sendbytes(&buf, GETOPERAND(query) + item->operand.distance, item->operand.length); item++; } @@ -652,8 +788,7 @@ tsqueryrecv(PG_FUNCTION_ARGS) TSQuery query; int i, size, - tmp, - len = HDRSIZETQ; + len; QueryItem *item; int datalen = 0; char *ptr; @@ -661,7 +796,8 @@ tsqueryrecv(PG_FUNCTION_ARGS) size = 
pq_getmsgint(buf, sizeof(uint32)); if (size < 0 || size > (MaxAllocSize / sizeof(QueryItem))) elog(ERROR, "invalid size of tsquery"); - len += sizeof(QueryItem) * size; + + len = HDRSIZETQ + sizeof(QueryItem) * size; query = (TSQuery) palloc(len); query->size = size; @@ -670,32 +806,67 @@ tsqueryrecv(PG_FUNCTION_ARGS) for (i = 0; i < size; i++) { item->type = (int8) pq_getmsgint(buf, sizeof(int8)); - item->weight = (int8) pq_getmsgint(buf, sizeof(int8)); - item->left = (int16) pq_getmsgint(buf, sizeof(int16)); - item->val = (int32) pq_getmsgint(buf, sizeof(int32)); - tmp = pq_getmsgint(buf, sizeof(int32)); - memcpy((((char *) item) + HDRSIZEQI), &tmp, sizeof(int32)); - - /* - * Sanity checks - */ - if (item->type == VAL) - { - datalen += item->length + 1; /* \0 */ - } - else if (item->type == OPR) + + switch(item->type) { - if (item->val == '|' || item->val == '&') - { - if (item->left <= 0 || i + item->left >= size) - elog(ERROR, "invalid pointer to left operand"); - } + case QI_VAL: + item->operand.weight = (int8) pq_getmsgint(buf, sizeof(int8)); + item->operand.valcrc = (int32) pq_getmsgint(buf, sizeof(int32)); + item->operand.length = pq_getmsgint(buf, sizeof(int16)); + + /* + * Check that datalen doesn't grow too large. Without the + * check, a malicious client could induce a buffer overflow + * by sending a tsquery whose size exceeds 2GB. datalen + * would overflow, we would allocate a too small buffer below, + * and overflow the buffer. Because operand.length is a 20-bit + * field, adding one such value to datalen must exceed + * MaxAllocSize before wrapping over the 32-bit datalen field, + * so this check will protect from it. + */ + if (datalen > MAXSTRLEN) + elog(ERROR, "invalid tsquery; total operand length exceeded"); + + /* We can calculate distance from datalen, no need to send it + * through the wire. If we did, we would have to check that + * it's valid anyway. 
+ */ + item->operand.distance = datalen; + + datalen += item->operand.length + 1; /* \0 */ - if (i == size - 1) - elog(ERROR, "invalid pointer to right operand"); + break; + case QI_OPR: + item->operator.oper = (int8) pq_getmsgint(buf, sizeof(int8)); + if (item->operator.oper != OP_NOT && + item->operator.oper != OP_OR && + item->operator.oper != OP_AND) + elog(ERROR, "unknown operator type %d", (int) item->operator.oper); + if(item->operator.oper != OP_NOT) + { + item->operator.left = (int16) pq_getmsgint(buf, sizeof(int16)); + /* + * Sanity checks + */ + if (item->operator.left <= 0 || i + item->operator.left >= size) + elog(ERROR, "invalid pointer to left operand"); + + /* XXX: Though there's no way to construct a TSQuery that's + * not in polish notation, we don't enforce that for + * queries received from client in binary mode. Is there + * anything that relies on it? + * + * XXX: The tree could be malformed in other ways too, + * a node could have two parents, for example. + */ + } + + if (i == size - 1) + elog(ERROR, "invalid pointer to right operand"); + break; + default: + elog(ERROR, "unknown tsquery node type %d", item->type); } - else - elog(ERROR, "unknown tsquery node type"); item++; } @@ -706,13 +877,12 @@ tsqueryrecv(PG_FUNCTION_ARGS) ptr = GETOPERAND(query); for (i = 0; i < size; i++) { - if (item->type == VAL) + if (item->type == QI_VAL) { - item->distance = ptr - GETOPERAND(query); memcpy(ptr, - pq_getmsgbytes(buf, item->length), - item->length); - ptr += item->length; + pq_getmsgbytes(buf, item->operand.length), + item->operand.length); + ptr += item->operand.length; *ptr++ = '\0'; } item++; @@ -736,7 +906,7 @@ tsquerytree(PG_FUNCTION_ARGS) INFIX nrm; text *res; QueryItem *q; - int4 len; + int len; if (query->size == 0) { diff --git a/src/backend/utils/adt/tsquery_cleanup.c b/src/backend/utils/adt/tsquery_cleanup.c index 7991a4ad198c2d838aef08f5e641360d531df848..22e6f7c8198918aac94ef73273fc9f90ef2bea77 100644 --- 
a/src/backend/utils/adt/tsquery_cleanup.c +++ b/src/backend/utils/adt/tsquery_cleanup.c @@ -8,7 +8,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/utils/adt/tsquery_cleanup.c,v 1.1 2007/08/21 01:11:19 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/utils/adt/tsquery_cleanup.c,v 1.2 2007/09/07 15:09:56 teodor Exp $ * *------------------------------------------------------------------------- */ @@ -35,20 +35,23 @@ maketree(QueryItem * in) node->valnode = in; node->right = node->left = NULL; - if (in->type == OPR) + if (in->type == QI_OPR) { node->right = maketree(in + 1); - if (in->val != (int4) '!') - node->left = maketree(in + in->left); + if (in->operator.oper != OP_NOT) + node->left = maketree(in + in->operator.left); } return node; } +/* + * Internal state for plaintree and plainnode + */ typedef struct { QueryItem *ptr; - int4 len; - int4 cur; + int len; /* allocated size of ptr */ + int cur; /* number of elements in ptr */ } PLAINTREE; static void @@ -60,37 +63,37 @@ plainnode(PLAINTREE * state, NODE * node) state->ptr = (QueryItem *) repalloc((void *) state->ptr, state->len * sizeof(QueryItem)); } memcpy((void *) &(state->ptr[state->cur]), (void *) node->valnode, sizeof(QueryItem)); - if (node->valnode->type == VAL) + if (node->valnode->type == QI_VAL) state->cur++; - else if (node->valnode->val == (int4) '!') + else if (node->valnode->operator.oper == OP_NOT) { - state->ptr[state->cur].left = 1; + state->ptr[state->cur].operator.left = 1; state->cur++; plainnode(state, node->right); } else { - int4 cur = state->cur; + int cur = state->cur; state->cur++; plainnode(state, node->right); - state->ptr[cur].left = state->cur - cur; + state->ptr[cur].operator.left = state->cur - cur; plainnode(state, node->left); } pfree(node); } /* - * make plain view of tree from 'normal' view of tree + * make plain view of tree from a NODE-tree representation */ static QueryItem * -plaintree(NODE * root, int4 *len) +plaintree(NODE * root, int *len) { PLAINTREE pl; 
pl.cur = 0; pl.len = 16; - if (root && (root->valnode->type == VAL || root->valnode->type == OPR)) + if (root && (root->valnode->type == QI_VAL || root->valnode->type == QI_OPR)) { pl.ptr = (QueryItem *) palloc(pl.len * sizeof(QueryItem)); plainnode(&pl, root); @@ -122,17 +125,17 @@ freetree(NODE * node) static NODE * clean_NOT_intree(NODE * node) { - if (node->valnode->type == VAL) + if (node->valnode->type == QI_VAL) return node; - if (node->valnode->val == (int4) '!') + if (node->valnode->operator.oper == OP_NOT) { freetree(node); return NULL; } /* operator & or | */ - if (node->valnode->val == (int4) '|') + if (node->valnode->operator.oper == OP_OR) { if ((node->left = clean_NOT_intree(node->left)) == NULL || (node->right = clean_NOT_intree(node->right)) == NULL) @@ -144,6 +147,8 @@ clean_NOT_intree(NODE * node) else { NODE *res = node; + + Assert(node->valnode->operator.oper == OP_AND); node->left = clean_NOT_intree(node->left); node->right = clean_NOT_intree(node->right); @@ -168,7 +173,7 @@ clean_NOT_intree(NODE * node) } QueryItem * -clean_NOT(QueryItem * ptr, int4 *len) +clean_NOT(QueryItem * ptr, int *len) { NODE *root = maketree(ptr); @@ -180,10 +185,13 @@ clean_NOT(QueryItem * ptr, int4 *len) #undef V_UNKNOWN #endif -#define V_UNKNOWN 0 -#define V_TRUE 1 -#define V_FALSE 2 -#define V_STOP 3 +/* + * output values for result output parameter of clean_fakeval_intree + */ +#define V_UNKNOWN 0 /* the expression can't be evaluated statically */ +#define V_TRUE 1 /* the expression is always true (not implemented) */ +#define V_FALSE 2 /* the expression is always false (not implemented) */ +#define V_STOP 3 /* the expression is a stop word */ /* * Clean query tree from values which is always in @@ -195,17 +203,19 @@ clean_fakeval_intree(NODE * node, char *result) char lresult = V_UNKNOWN, rresult = V_UNKNOWN; - if (node->valnode->type == VAL) + if (node->valnode->type == QI_VAL) return node; - else if (node->valnode->type == VALSTOP) + else + if 
(node->valnode->type == QI_VALSTOP) { pfree(node); *result = V_STOP; return NULL; } + Assert(node->valnode->type == QI_OPR); - if (node->valnode->val == (int4) '!') + if (node->valnode->operator.oper == OP_NOT) { node->right = clean_fakeval_intree(node->right, &rresult); if (!node->right) @@ -221,6 +231,7 @@ clean_fakeval_intree(NODE * node, char *result) node->left = clean_fakeval_intree(node->left, &lresult); node->right = clean_fakeval_intree(node->right, &rresult); + if (lresult == V_STOP && rresult == V_STOP) { freetree(node); @@ -243,7 +254,7 @@ clean_fakeval_intree(NODE * node, char *result) } QueryItem * -clean_fakeval(QueryItem * ptr, int4 *len) +clean_fakeval(QueryItem * ptr, int *len) { NODE *root = maketree(ptr); char result = V_UNKNOWN; diff --git a/src/backend/utils/adt/tsquery_op.c b/src/backend/utils/adt/tsquery_op.c index fd97c2796df771580739328c910f3b53cb2342a1..cbf06f7adeb8cc8419a907c9bb642556fa2e4b8a 100644 --- a/src/backend/utils/adt/tsquery_op.c +++ b/src/backend/utils/adt/tsquery_op.c @@ -7,7 +7,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/utils/adt/tsquery_op.c,v 1.1 2007/08/21 01:11:19 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/utils/adt/tsquery_op.c,v 1.2 2007/09/07 15:09:56 teodor Exp $ * *------------------------------------------------------------------------- */ @@ -30,14 +30,15 @@ tsquery_numnode(PG_FUNCTION_ARGS) } static QTNode * -join_tsqueries(TSQuery a, TSQuery b) +join_tsqueries(TSQuery a, TSQuery b, int8 operator) { QTNode *res = (QTNode *) palloc0(sizeof(QTNode)); res->flags |= QTN_NEEDFREE; res->valnode = (QueryItem *) palloc0(sizeof(QueryItem)); - res->valnode->type = OPR; + res->valnode->type = QI_OPR; + res->valnode->operator.oper = operator; res->child = (QTNode **) palloc0(sizeof(QTNode *) * 2); res->child[0] = QT2QTN(GETQUERY(b), GETOPERAND(b)); @@ -66,9 +67,7 @@ tsquery_and(PG_FUNCTION_ARGS) PG_RETURN_POINTER(a); } - res = join_tsqueries(a, b); - - res->valnode->val = '&'; + res = join_tsqueries(a, b, 
OP_AND); query = QTN2QT(res); @@ -98,9 +97,7 @@ tsquery_or(PG_FUNCTION_ARGS) PG_RETURN_POINTER(a); } - res = join_tsqueries(a, b); - - res->valnode->val = '|'; + res = join_tsqueries(a, b, OP_OR); query = QTN2QT(res); @@ -126,8 +123,8 @@ tsquery_not(PG_FUNCTION_ARGS) res->flags |= QTN_NEEDFREE; res->valnode = (QueryItem *) palloc0(sizeof(QueryItem)); - res->valnode->type = OPR; - res->valnode->val = '!'; + res->valnode->type = QI_OPR; + res->valnode->operator.oper = OP_NOT; res->child = (QTNode **) palloc0(sizeof(QTNode *)); res->child[0] = QT2QTN(GETQUERY(a), GETOPERAND(a)); @@ -209,8 +206,8 @@ makeTSQuerySign(TSQuery a) for (i = 0; i < a->size; i++) { - if (ptr->type == VAL) - sign |= ((TSQuerySign) 1) << (ptr->val % TSQS_SIGLEN); + if (ptr->type == QI_VAL) + sign |= ((TSQuerySign) 1) << (ptr->operand.valcrc % TSQS_SIGLEN); ptr++; } @@ -253,10 +250,10 @@ tsq_mcontains(PG_FUNCTION_ARGS) for (i = 0; i < ex->size; i++) { iq = GETQUERY(query); - if (ie[i].type != VAL) + if (ie[i].type != QI_VAL) continue; for (j = 0; j < query->size; j++) - if (iq[j].type == VAL && ie[i].val == iq[j].val) + if (iq[j].type == QI_VAL && ie[i].operand.valcrc == iq[j].operand.valcrc) { j = query->size + 1; break; diff --git a/src/backend/utils/adt/tsquery_rewrite.c b/src/backend/utils/adt/tsquery_rewrite.c index f0d22c644ae702b59a0df740631631aa40307e61..db2fe6c53ef91681bc9d39b98c1249d80ef3e103 100644 --- a/src/backend/utils/adt/tsquery_rewrite.c +++ b/src/backend/utils/adt/tsquery_rewrite.c @@ -7,7 +7,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/utils/adt/tsquery_rewrite.c,v 1.1 2007/08/21 01:11:19 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/utils/adt/tsquery_rewrite.c,v 1.2 2007/09/07 15:09:56 teodor Exp $ * *------------------------------------------------------------------------- */ @@ -34,18 +34,26 @@ addone(int *counters, int last, int total) return 1; } +/* + * If node is equal to ex, replace it with subs. 
Replacement is actually done + * by returning either node or a copy of subs. + */ static QTNode * findeq(QTNode *node, QTNode *ex, QTNode *subs, bool *isfind) { - if ((node->sign & ex->sign) != ex->sign || node->valnode->type != ex->valnode->type || node->valnode->val != ex->valnode->val) + if ((node->sign & ex->sign) != ex->sign || + node->valnode->type != ex->valnode->type) return node; if (node->flags & QTN_NOCHANGE) return node; - - if (node->valnode->type == OPR) + + if (node->valnode->type == QI_OPR) { + if (node->valnode->operator.oper != ex->valnode->operator.oper) + return node; + if (node->nchild == ex->nchild) { if (QTNEq(node, ex)) @@ -63,6 +71,12 @@ findeq(QTNode *node, QTNode *ex, QTNode *subs, bool *isfind) } else if (node->nchild > ex->nchild) { + /* + * AND and OR are commutative, so we check if a subset of the + * children match. For example, if tnode is A | B | C, and + * ex is B | C, we have a match after we convert tnode to + * A | (B | C). + */ int *counters = (int *) palloc(sizeof(int) * node->nchild); int i; QTNode *tnode = (QTNode *) palloc(sizeof(QTNode)); @@ -131,19 +145,26 @@ findeq(QTNode *node, QTNode *ex, QTNode *subs, bool *isfind) pfree(counters); } } - else if (QTNEq(node, ex)) + else { - QTNFree(node); - if (subs) - { - node = QTNCopy(subs); - node->flags |= QTN_NOCHANGE; - } - else + Assert(node->valnode->type == QI_VAL); + + if (node->valnode->operand.valcrc != ex->valnode->operand.valcrc) + return node; + else if (QTNEq(node, ex)) { - node = NULL; + QTNFree(node); + if (subs) + { + node = QTNCopy(subs); + node->flags |= QTN_NOCHANGE; + } + else + { + node = NULL; + } + *isfind = true; } - *isfind = true; } return node; @@ -154,7 +175,7 @@ dofindsubquery(QTNode *root, QTNode *ex, QTNode *subs, bool *isfind) { root = findeq(root, ex, subs, isfind); - if (root && (root->flags & QTN_NOCHANGE) == 0 && root->valnode->type == OPR) + if (root && (root->flags & QTN_NOCHANGE) == 0 && root->valnode->type == QI_OPR) { int i; @@ -172,7
+193,7 @@ dropvoidsubtree(QTNode * root) if (!root) return NULL; - if (root->valnode->type == OPR) + if (root->valnode->type == QI_OPR) { int i, j = 0; @@ -188,7 +209,7 @@ dropvoidsubtree(QTNode * root) root->nchild = j; - if (root->valnode->val == (int4) '!' && root->nchild == 0) + if (root->valnode->operator.oper == OP_NOT && root->nchild == 0) { QTNFree(root); root = NULL; @@ -256,9 +277,9 @@ ts_rewrite_accum(PG_FUNCTION_ARGS) elog(ERROR, "array must be one-dimensional, not %d dimensions", ARR_NDIM(qa)); if (ArrayGetNItems(ARR_NDIM(qa), ARR_DIMS(qa)) != 3) - elog(ERROR, "array should have only three elements"); + elog(ERROR, "array must have three elements"); if (ARR_ELEMTYPE(qa) != TSQUERYOID) - elog(ERROR, "array should contain tsquery type"); + elog(ERROR, "array must contain tsquery elements"); deconstruct_array(qa, TSQUERYOID, -1, false, 'i', &elemsp, NULL, &nelemsp); @@ -499,6 +520,7 @@ tsquery_rewrite_query(PG_FUNCTION_ARGS) subs = QT2QTN(GETQUERY(subst), GETOPERAND(subst)); tree = findsubquery(tree, qex, subs, NULL); + QTNFree(qex); QTNFree(subs); diff --git a/src/backend/utils/adt/tsquery_util.c b/src/backend/utils/adt/tsquery_util.c index ae8cc318da93b340c16fa5ec441ade75b5e44c9d..e378661488bd8604958c9802b18b4c51333ad6f5 100644 --- a/src/backend/utils/adt/tsquery_util.c +++ b/src/backend/utils/adt/tsquery_util.c @@ -7,7 +7,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/utils/adt/tsquery_util.c,v 1.1 2007/08/21 01:11:19 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/utils/adt/tsquery_util.c,v 1.2 2007/09/07 15:09:56 teodor Exp $ * *------------------------------------------------------------------------- */ @@ -17,7 +17,6 @@ #include "tsearch/ts_type.h" #include "tsearch/ts_utils.h" - QTNode * QT2QTN(QueryItem * in, char *operand) { @@ -25,24 +24,24 @@ QT2QTN(QueryItem * in, char *operand) node->valnode = in; - if (in->type == OPR) + if (in->type == QI_OPR) { node->child = (QTNode **) palloc0(sizeof(QTNode *) * 2); node->child[0] = QT2QTN(in 
+ 1, operand); node->sign = node->child[0]->sign; - if (in->val == (int4) '!') + if (in->operator.oper == OP_NOT) node->nchild = 1; else { node->nchild = 2; - node->child[1] = QT2QTN(in + in->left, operand); + node->child[1] = QT2QTN(in + in->operator.left, operand); node->sign |= node->child[1]->sign; } } else if (operand) { - node->word = operand + in->distance; - node->sign = 1 << (in->val % 32); + node->word = operand + in->operand.distance; + node->sign = 1 << (in->operand.valcrc % 32); } return node; @@ -54,14 +53,14 @@ QTNFree(QTNode * in) if (!in) return; - if (in->valnode->type == VAL && in->word && (in->flags & QTN_WORDFREE) != 0) + if (in->valnode->type == QI_VAL && in->word && (in->flags & QTN_WORDFREE) != 0) pfree(in->word); if (in->child) { if (in->valnode) { - if (in->valnode->type == OPR && in->nchild > 0) + if (in->valnode->type == QI_OPR && in->nchild > 0) { int i; @@ -82,30 +81,45 @@ QTNodeCompare(QTNode * an, QTNode * bn) { if (an->valnode->type != bn->valnode->type) return (an->valnode->type > bn->valnode->type) ? -1 : 1; - else if (an->valnode->val != bn->valnode->val) - return (an->valnode->val > bn->valnode->val) ? -1 : 1; - else if (an->valnode->type == VAL) - { - if (an->valnode->length == bn->valnode->length) - return strncmp(an->word, bn->word, an->valnode->length); - else - return (an->valnode->length > bn->valnode->length) ? -1 : 1; - } - else if (an->nchild != bn->nchild) + + if (an->valnode->type == QI_OPR) { - return (an->nchild > bn->nchild) ? -1 : 1; + QueryOperator *ao = &an->valnode->operator; + QueryOperator *bo = &bn->valnode->operator; + + if(ao->oper != bo->oper) + return (ao->oper > bo->oper) ? -1 : 1; + + if (an->nchild != bn->nchild) + return (an->nchild > bn->nchild) ? 
-1 : 1; + + { + int i, + res; + + for (i = 0; i < an->nchild; i++) + if ((res = QTNodeCompare(an->child[i], bn->child[i])) != 0) + return res; + } + return 0; } else { - int i, - res; + QueryOperand *ao = &an->valnode->operand; + QueryOperand *bo = &bn->valnode->operand; - for (i = 0; i < an->nchild; i++) - if ((res = QTNodeCompare(an->child[i], bn->child[i])) != 0) - return res; - } + Assert(an->valnode->type == QI_VAL); + + if (ao->valcrc != bo->valcrc) + { + return (ao->valcrc > bo->valcrc) ? -1 : 1; + } - return 0; + if (ao->length == bo->length) + return strncmp(an->word, bn->word, ao->length); + else + return (ao->length > bo->length) ? -1 : 1; + } } static int @@ -119,7 +133,7 @@ QTNSort(QTNode * in) { int i; - if (in->valnode->type != OPR) + if (in->valnode->type != QI_OPR) return; for (i = 0; i < in->nchild; i++) @@ -139,12 +153,19 @@ QTNEq(QTNode * a, QTNode * b) return (QTNodeCompare(a, b) == 0) ? true : false; } +/* + * Remove unnecessary intermediate nodes. For example: + * + * OR OR + * a OR -> a b c + * b c + */ void QTNTernary(QTNode * in) { int i; - if (in->valnode->type != OPR) + if (in->valnode->type != QI_OPR) return; for (i = 0; i < in->nchild; i++) @@ -152,9 +173,10 @@ QTNTernary(QTNode * in) for (i = 0; i < in->nchild; i++) { - if (in->valnode->type == in->child[i]->valnode->type && in->valnode->val == in->child[i]->valnode->val) + QTNode *cc = in->child[i]; + + if (cc->valnode->type == QI_OPR && in->valnode->operator.oper == cc->valnode->operator.oper) { - QTNode *cc = in->child[i]; int oldnchild = in->nchild; in->nchild += cc->nchild - 1; @@ -167,17 +189,23 @@ QTNTernary(QTNode * in) memcpy(in->child + i, cc->child, cc->nchild * sizeof(QTNode *)); i += cc->nchild - 1; + if(cc->flags & QTN_NEEDFREE) + pfree(cc->valnode); pfree(cc); } } } +/* + * Convert a tree to binary tree by inserting intermediate nodes. 
+ * (Opposite of QTNTernary) + */ void QTNBinary(QTNode * in) { int i; - if (in->valnode->type != OPR) + if (in->valnode->type != QI_OPR) return; for (i = 0; i < in->nchild; i++) @@ -201,7 +229,7 @@ QTNBinary(QTNode * in) nn->sign = nn->child[0]->sign | nn->child[1]->sign; nn->valnode->type = in->valnode->type; - nn->valnode->val = in->valnode->val; + nn->valnode->operator.oper = in->valnode->operator.oper; in->child[0] = nn; in->child[1] = in->child[in->nchild - 1]; @@ -209,11 +237,15 @@ QTNBinary(QTNode * in) } } +/* + * Count the total length of operand string in tree, including '\0'- + * terminators. + */ static void -cntsize(QTNode * in, int4 *sumlen, int4 *nnode) +cntsize(QTNode * in, int *sumlen, int *nnode) { *nnode += 1; - if (in->valnode->type == OPR) + if (in->valnode->type == QI_OPR) { int i; @@ -222,7 +254,7 @@ cntsize(QTNode * in, int4 *sumlen, int4 *nnode) } else { - *sumlen += in->valnode->length + 1; + *sumlen += in->valnode->operand.length + 1; } } @@ -234,22 +266,26 @@ typedef struct } QTN2QTState; static void -fillQT(QTN2QTState * state, QTNode * in) +fillQT(QTN2QTState *state, QTNode *in) { - *(state->curitem) = *(in->valnode); - - if (in->valnode->type == VAL) + if (in->valnode->type == QI_VAL) { - memcpy(state->curoperand, in->word, in->valnode->length); - state->curitem->distance = state->curoperand - state->operand; - state->curoperand[in->valnode->length] = '\0'; - state->curoperand += in->valnode->length + 1; + memcpy(state->curitem, in->valnode, sizeof(QueryOperand)); + + memcpy(state->curoperand, in->word, in->valnode->operand.length); + state->curitem->operand.distance = state->curoperand - state->operand; + state->curoperand[in->valnode->operand.length] = '\0'; + state->curoperand += in->valnode->operand.length + 1; state->curitem++; } else { QueryItem *curitem = state->curitem; + Assert(in->valnode->type == QI_OPR); + + memcpy(state->curitem, in->valnode, sizeof(QueryOperator)); + Assert(in->nchild <= 2); state->curitem++; @@ -257,7 
+293,7 @@ fillQT(QTN2QTState * state, QTNode * in) if (in->nchild == 2) { - curitem->left = state->curitem - curitem; + curitem->operator.left = state->curitem - curitem; fillQT(state, in->child[1]); } } @@ -296,11 +332,11 @@ QTNCopy(QTNode *in) *(out->valnode) = *(in->valnode); out->flags |= QTN_NEEDFREE; - if (in->valnode->type == VAL) + if (in->valnode->type == QI_VAL) { - out->word = palloc(in->valnode->length + 1); - memcpy(out->word, in->word, in->valnode->length); - out->word[in->valnode->length] = '\0'; + out->word = palloc(in->valnode->operand.length + 1); + memcpy(out->word, in->word, in->valnode->operand.length); + out->word[in->valnode->operand.length] = '\0'; out->flags |= QTN_WORDFREE; } else diff --git a/src/backend/utils/adt/tsrank.c b/src/backend/utils/adt/tsrank.c index 8b2ab884c8cb6082787e217db6fe587b2d86c05e..d48e9b4a470be27d2843789fe27487c978e4e796 100644 --- a/src/backend/utils/adt/tsrank.c +++ b/src/backend/utils/adt/tsrank.c @@ -7,7 +7,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/utils/adt/tsrank.c,v 1.1 2007/08/21 01:11:19 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/utils/adt/tsrank.c,v 1.2 2007/09/07 15:09:56 teodor Exp $ * *------------------------------------------------------------------------- */ @@ -68,7 +68,7 @@ cnt_length(TSVector t) } static int4 -WordECompareQueryItem(char *eval, char *qval, WordEntry * ptr, QueryItem * item) +WordECompareQueryItem(char *eval, char *qval, WordEntry *ptr, QueryOperand *item) { if (ptr->len == item->length) return strncmp( @@ -80,7 +80,7 @@ WordECompareQueryItem(char *eval, char *qval, WordEntry * ptr, QueryItem * item) } static WordEntry * -find_wordentry(TSVector t, TSQuery q, QueryItem * item) +find_wordentry(TSVector t, TSQuery q, QueryOperand *item) { WordEntry *StopLow = ARRPTR(t); WordEntry *StopHigh = (WordEntry *) STRPTR(t); @@ -105,33 +105,48 @@ find_wordentry(TSVector t, TSQuery q, QueryItem * item) } +/* + * sort QueryOperands by (length, word) + */ static int 
-compareQueryItem(const void *a, const void *b, void *arg) +compareQueryOperand(const void *a, const void *b, void *arg) { char *operand = (char *) arg; + QueryOperand *qa = (*(QueryOperand **) a); + QueryOperand *qb = (*(QueryOperand **) b); - if ((*(QueryItem **) a)->length == (*(QueryItem **) b)->length) - return strncmp(operand + (*(QueryItem **) a)->distance, - operand + (*(QueryItem **) b)->distance, - (*(QueryItem **) b)->length); + if (qa->length == qb->length) + return strncmp(operand + qa->distance, + operand + qb->distance, + qb->length); - return ((*(QueryItem **) a)->length > (*(QueryItem **) b)->length) ? 1 : -1; + return (qa->length > qb->length) ? 1 : -1; } -static QueryItem ** -SortAndUniqItems(char *operand, QueryItem * item, int *size) +/* + * Returns a sorted, de-duplicated array of QueryOperands in a query. + * The returned QueryOperands are pointers to the original QueryOperands + * in the query. + * + * Length of the returned array is stored in *size + */ +static QueryOperand ** +SortAndUniqItems(TSQuery q, int *size) { - QueryItem **res, + char *operand = GETOPERAND(q); + QueryItem * item = GETQUERY(q); + QueryOperand **res, **ptr, **prevptr; - ptr = res = (QueryItem **) palloc(sizeof(QueryItem *) * *size); + ptr = res = (QueryOperand **) palloc(sizeof(QueryOperand *) * *size); + /* Collect all operands from the tree to res */ while ((*size)--) { - if (item->type == VAL) + if (item->type == QI_VAL) { - *ptr = item; + *ptr = (QueryOperand *) item; ptr++; } item++; @@ -141,14 +156,15 @@ SortAndUniqItems(char *operand, QueryItem * item, int *size) if (*size < 2) return res; - qsort_arg(res, *size, sizeof(QueryItem **), compareQueryItem, (void *) operand); + qsort_arg(res, *size, sizeof(QueryOperand **), compareQueryOperand, (void *) operand); ptr = res + 1; prevptr = res; + /* remove duplicates */ while (ptr - res < *size) { - if (compareQueryItem((void *) ptr, (void *) prevptr, (void *) operand) != 0) + if (compareQueryOperand((void *) ptr, 
(void *) prevptr, (void *) operand) != 0) { prevptr++; *prevptr = *ptr; @@ -180,10 +196,10 @@ calc_rank_and(float *w, TSVector t, TSQuery q) lenct, dist; float res = -1.0; - QueryItem **item; + QueryOperand **item; int size = q->size; - item = SortAndUniqItems(GETOPERAND(q), GETQUERY(q), &size); + item = SortAndUniqItems(q, &size); if (size < 2) { pfree(item); @@ -246,11 +262,11 @@ calc_rank_or(float *w, TSVector t, TSQuery q) j, i; float res = 0.0; - QueryItem **item; + QueryOperand **item; int size = q->size; *(uint16 *) POSNULL = lengthof(POSNULL) - 1; - item = SortAndUniqItems(GETOPERAND(q), GETQUERY(q), &size); + item = SortAndUniqItems(q, &size); for (i = 0; i < size; i++) { @@ -310,7 +326,8 @@ calc_rank(float *w, TSVector t, TSQuery q, int4 method) if (!t->size || !q->size) return 0.0; - res = (item->type != VAL && item->val == (int4) '&') ? + /* XXX: What about NOT? */ + res = (item->type == QI_OPR && item->operator.oper == OP_AND) ? calc_rank_and(w, t, q) : calc_rank_or(w, t, q); if (res < 0) @@ -453,7 +470,7 @@ compareDocR(const void *a, const void *b) } static bool -checkcondition_QueryItem(void *checkval, QueryItem * val) +checkcondition_QueryOperand(void *checkval, QueryOperand *val) { return (bool) (val->istrue); } @@ -467,8 +484,8 @@ reset_istrue_flag(TSQuery query) /* reset istrue flag */ for (i = 0; i < query->size; i++) { - if (item->type == VAL) - item->istrue = 0; + if (item->type == QI_VAL) + item->operand.istrue = 0; item++; } } @@ -484,7 +501,7 @@ typedef struct static bool -Cover(DocRepresentation * doc, int len, TSQuery query, Extention * ext) +Cover(DocRepresentation *doc, int len, TSQuery query, Extention *ext) { DocRepresentation *ptr; int lastpos = ext->pos; @@ -501,8 +518,11 @@ Cover(DocRepresentation * doc, int len, TSQuery query, Extention * ext) while (ptr - doc < len) { for (i = 0; i < ptr->nitem; i++) - ptr->item[i]->istrue = 1; - if (TS_execute(GETQUERY(query), NULL, false, checkcondition_QueryItem)) + { + if(ptr->item[i]->type 
== QI_VAL) + ptr->item[i]->operand.istrue = 1; + } + if (TS_execute(GETQUERY(query), NULL, false, checkcondition_QueryOperand)) { if (ptr->pos > ext->q) { @@ -527,8 +547,9 @@ Cover(DocRepresentation * doc, int len, TSQuery query, Extention * ext) while (ptr >= doc + ext->pos) { for (i = 0; i < ptr->nitem; i++) - ptr->item[i]->istrue = 1; - if (TS_execute(GETQUERY(query), NULL, true, checkcondition_QueryItem)) + if(ptr->item[i]->type == QI_VAL) /* XXX */ + ptr->item[i]->operand.istrue = 1; + if (TS_execute(GETQUERY(query), NULL, true, checkcondition_QueryOperand)) { if (ptr->pos < ext->p) { @@ -575,10 +596,17 @@ get_docrep(TSVector txt, TSQuery query, int *doclen) for (i = 0; i < query->size; i++) { - if (item[i].type != VAL || item[i].istrue) + QueryOperand *curoperand; + + if (item[i].type != QI_VAL) + continue; + + curoperand = &item[i].operand; + + if(item[i].operand.istrue) continue; - entry = find_wordentry(txt, query, &(item[i])); + entry = find_wordentry(txt, query, curoperand); if (!entry) continue; @@ -603,8 +631,6 @@ get_docrep(TSVector txt, TSQuery query, int *doclen) { if (j == 0) { - QueryItem *kptr, - *iptr = item + i; int k; doc[cur].needfree = false; @@ -613,14 +639,17 @@ get_docrep(TSVector txt, TSQuery query, int *doclen) for (k = 0; k < query->size; k++) { - kptr = item + k; + QueryOperand *kptr = &item[k].operand; + QueryOperand *iptr = &item[i].operand; + if (k == i || - (item[k].type == VAL && - compareQueryItem(&kptr, &iptr, operand) == 0)) + (item[k].type == QI_VAL && + compareQueryOperand(&kptr, &iptr, operand) == 0)) { + /* if k == i, we've already checked above that its type == QI_VAL */ doc[cur].item[doc[cur].nitem] = item + k; doc[cur].nitem++; - kptr->istrue = 1; + item[k].operand.istrue = 1; } } } @@ -640,8 +669,7 @@ get_docrep(TSVector txt, TSQuery query, int *doclen) if (cur > 0) { - if (cur > 1) - qsort((void *) doc, cur, sizeof(DocRepresentation), compareDocR); + qsort((void *) doc, cur, sizeof(DocRepresentation), compareDocR); 
return doc; } @@ -746,7 +774,7 @@ ts_rankcd_wttf(PG_FUNCTION_ARGS) { ArrayType *win = (ArrayType *) PG_DETOAST_DATUM(PG_GETARG_DATUM(0)); TSVector txt = PG_GETARG_TSVECTOR(1); - TSQuery query = PG_GETARG_TSQUERY_COPY(2); + TSQuery query = PG_GETARG_TSQUERY_COPY(2); /* copy because we modify the istrue-flag */ int method = PG_GETARG_INT32(3); float res; @@ -763,7 +791,7 @@ ts_rankcd_wtt(PG_FUNCTION_ARGS) { ArrayType *win = (ArrayType *) PG_DETOAST_DATUM(PG_GETARG_DATUM(0)); TSVector txt = PG_GETARG_TSVECTOR(1); - TSQuery query = PG_GETARG_TSQUERY_COPY(2); + TSQuery query = PG_GETARG_TSQUERY_COPY(2); /* copy because we modify the istrue-flag */ float res; res = calc_rank_cd(getWeights(win), txt, query, DEF_NORM_METHOD); @@ -778,7 +806,7 @@ Datum ts_rankcd_ttf(PG_FUNCTION_ARGS) { TSVector txt = PG_GETARG_TSVECTOR(0); - TSQuery query = PG_GETARG_TSQUERY_COPY(1); + TSQuery query = PG_GETARG_TSQUERY_COPY(1); /* copy because we modify the istrue-flag */ int method = PG_GETARG_INT32(2); float res; @@ -793,7 +821,7 @@ Datum ts_rankcd_tt(PG_FUNCTION_ARGS) { TSVector txt = PG_GETARG_TSVECTOR(0); - TSQuery query = PG_GETARG_TSQUERY_COPY(1); + TSQuery query = PG_GETARG_TSQUERY_COPY(1); /* copy because we modify the istrue-flag */ float res; res = calc_rank_cd(getWeights(NULL), txt, query, DEF_NORM_METHOD); diff --git a/src/backend/utils/adt/tsvector.c b/src/backend/utils/adt/tsvector.c index 8ab024650f72a6887bbe4a17453b4decd9115a16..2866e028da02b778c7592d909524b90413f2d612 100644 --- a/src/backend/utils/adt/tsvector.c +++ b/src/backend/utils/adt/tsvector.c @@ -7,7 +7,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/utils/adt/tsvector.c,v 1.2 2007/08/21 01:45:33 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/utils/adt/tsvector.c,v 1.3 2007/09/07 15:09:56 teodor Exp $ * *------------------------------------------------------------------------- */ @@ -20,22 +20,37 @@ #include "tsearch/ts_utils.h" #include "utils/memutils.h" +typedef struct +{ + WordEntry entry; /* should 
be first ! */ + WordEntryPos *pos; + int poslen; /* number of elements in pos */ +} WordEntryIN; static int comparePos(const void *a, const void *b) { - if (WEP_GETPOS(*(WordEntryPos *) a) == WEP_GETPOS(*(WordEntryPos *) b)) + int apos = WEP_GETPOS(*(WordEntryPos *) a); + int bpos = WEP_GETPOS(*(WordEntryPos *) b); + + if (apos == bpos) return 0; - return (WEP_GETPOS(*(WordEntryPos *) a) > WEP_GETPOS(*(WordEntryPos *) b)) ? 1 : -1; + return (apos > bpos) ? 1 : -1; } +/* + * Removes duplicate pos entries. If there's two entries with same pos + * but different weight, the higher weight is retained. + * + * Returns new length. + */ static int -uniquePos(WordEntryPos * a, int4 l) +uniquePos(WordEntryPos * a, int l) { WordEntryPos *ptr, *res; - if (l == 1) + if (l <= 1) return l; res = a; @@ -75,21 +90,23 @@ compareentry(const void *a, const void *b, void *arg) } static int -uniqueentry(WordEntryIN * a, int4 l, char *buf, int4 *outbuflen) +uniqueentry(WordEntryIN * a, int l, char *buf, int *outbuflen) { WordEntryIN *ptr, *res; - res = a; + Assert(l >= 1); + if (l == 1) { if (a->entry.haspos) { - *(uint16 *) (a->pos) = uniquePos(&(a->pos[1]), *(uint16 *) (a->pos)); - *outbuflen = SHORTALIGN(res->entry.len) + (*(uint16 *) (a->pos) + 1) * sizeof(WordEntryPos); + a->poslen = uniquePos(a->pos, a->poslen); + *outbuflen = SHORTALIGN(a->entry.len) + (a->poslen + 1) * sizeof(WordEntryPos); } return l; } + res = a; ptr = a + 1; qsort_arg((void *) a, l, sizeof(WordEntryIN), compareentry, (void *) buf); @@ -101,8 +118,8 @@ uniqueentry(WordEntryIN * a, int4 l, char *buf, int4 *outbuflen) { if (res->entry.haspos) { - *(uint16 *) (res->pos) = uniquePos(&(res->pos[1]), *(uint16 *) (res->pos)); - *outbuflen += *(uint16 *) (res->pos) * sizeof(WordEntryPos); + res->poslen = uniquePos(res->pos, res->poslen); + *outbuflen += res->poslen * sizeof(WordEntryPos); } *outbuflen += SHORTALIGN(res->entry.len); res++; @@ -112,12 +129,14 @@ uniqueentry(WordEntryIN * a, int4 l, char *buf, int4 
*outbuflen) { if (res->entry.haspos) { - int4 len = *(uint16 *) (ptr->pos) + 1 + *(uint16 *) (res->pos); + int newlen = ptr->poslen + res->poslen; + + /* Append ptr's positions to the end of res's */ - res->pos = (WordEntryPos *) repalloc(res->pos, len * sizeof(WordEntryPos)); - memcpy(&(res->pos[*(uint16 *) (res->pos) + 1]), - &(ptr->pos[1]), *(uint16 *) (ptr->pos) * sizeof(WordEntryPos)); - *(uint16 *) (res->pos) += *(uint16 *) (ptr->pos); + res->pos = (WordEntryPos *) repalloc(res->pos, newlen * sizeof(WordEntryPos)); + memcpy(&res->pos[res->poslen], + ptr->pos, ptr->poslen * sizeof(WordEntryPos)); + res->poslen = newlen; pfree(ptr->pos); } else @@ -130,8 +149,8 @@ uniqueentry(WordEntryIN * a, int4 l, char *buf, int4 *outbuflen) } if (res->entry.haspos) { - *(uint16 *) (res->pos) = uniquePos(&(res->pos[1]), *(uint16 *) (res->pos)); - *outbuflen += *(uint16 *) (res->pos) * sizeof(WordEntryPos); + res->poslen = uniquePos(res->pos, res->poslen); + *outbuflen += res->poslen * sizeof(WordEntryPos); } *outbuflen += SHORTALIGN(res->entry.len); @@ -144,248 +163,6 @@ WordEntryCMP(WordEntry * a, WordEntry * b, char *buf) return compareentry(a, b, buf); } -#define WAITWORD 1 -#define WAITENDWORD 2 -#define WAITNEXTCHAR 3 -#define WAITENDCMPLX 4 -#define WAITPOSINFO 5 -#define INPOSINFO 6 -#define WAITPOSDELIM 7 -#define WAITCHARCMPLX 8 - -#define RESIZEPRSBUF \ -do { \ - if ( state->curpos - state->word + pg_database_encoding_max_length() >= state->len ) \ - { \ - int4 clen = state->curpos - state->word; \ - state->len *= 2; \ - state->word = (char*)repalloc( (void*)state->word, state->len ); \ - state->curpos = state->word + clen; \ - } \ -} while (0) - -bool -gettoken_tsvector(TSVectorParseState *state) -{ - int4 oldstate = 0; - - state->curpos = state->word; - state->state = WAITWORD; - state->alen = 0; - - while (1) - { - if (state->state == WAITWORD) - { - if (*(state->prsbuf) == '\0') - return false; - else if (t_iseq(state->prsbuf, '\'')) - state->state = WAITENDCMPLX; - else if
(t_iseq(state->prsbuf, '\\')) - { - state->state = WAITNEXTCHAR; - oldstate = WAITENDWORD; - } - else if (state->oprisdelim && ISOPERATOR(state->prsbuf)) - ereport(ERROR, - (errcode(ERRCODE_SYNTAX_ERROR), - errmsg("syntax error in tsvector"))); - else if (!t_isspace(state->prsbuf)) - { - COPYCHAR(state->curpos, state->prsbuf); - state->curpos += pg_mblen(state->prsbuf); - state->state = WAITENDWORD; - } - } - else if (state->state == WAITNEXTCHAR) - { - if (*(state->prsbuf) == '\0') - ereport(ERROR, - (errcode(ERRCODE_SYNTAX_ERROR), - errmsg("there is no escaped character"))); - else - { - RESIZEPRSBUF; - COPYCHAR(state->curpos, state->prsbuf); - state->curpos += pg_mblen(state->prsbuf); - state->state = oldstate; - } - } - else if (state->state == WAITENDWORD) - { - if (t_iseq(state->prsbuf, '\\')) - { - state->state = WAITNEXTCHAR; - oldstate = WAITENDWORD; - } - else if (t_isspace(state->prsbuf) || *(state->prsbuf) == '\0' || - (state->oprisdelim && ISOPERATOR(state->prsbuf))) - { - RESIZEPRSBUF; - if (state->curpos == state->word) - ereport(ERROR, - (errcode(ERRCODE_SYNTAX_ERROR), - errmsg("syntax error in tsvector"))); - *(state->curpos) = '\0'; - return true; - } - else if (t_iseq(state->prsbuf, ':')) - { - if (state->curpos == state->word) - ereport(ERROR, - (errcode(ERRCODE_SYNTAX_ERROR), - errmsg("syntax error in tsvector"))); - *(state->curpos) = '\0'; - if (state->oprisdelim) - return true; - else - state->state = INPOSINFO; - } - else - { - RESIZEPRSBUF; - COPYCHAR(state->curpos, state->prsbuf); - state->curpos += pg_mblen(state->prsbuf); - } - } - else if (state->state == WAITENDCMPLX) - { - if (t_iseq(state->prsbuf, '\'')) - { - state->state = WAITCHARCMPLX; - } - else if (t_iseq(state->prsbuf, '\\')) - { - state->state = WAITNEXTCHAR; - oldstate = WAITENDCMPLX; - } - else if (*(state->prsbuf) == '\0') - ereport(ERROR, - (errcode(ERRCODE_SYNTAX_ERROR), - errmsg("syntax error in tsvector"))); - else - { - RESIZEPRSBUF; - COPYCHAR(state->curpos, 
state->prsbuf); - state->curpos += pg_mblen(state->prsbuf); - } - } - else if (state->state == WAITCHARCMPLX) - { - if (t_iseq(state->prsbuf, '\'')) - { - RESIZEPRSBUF; - COPYCHAR(state->curpos, state->prsbuf); - state->curpos += pg_mblen(state->prsbuf); - state->state = WAITENDCMPLX; - } - else - { - RESIZEPRSBUF; - *(state->curpos) = '\0'; - if (state->curpos == state->word) - ereport(ERROR, - (errcode(ERRCODE_SYNTAX_ERROR), - errmsg("syntax error in tsvector"))); - if (state->oprisdelim) - { - /* state->prsbuf+=pg_mblen(state->prsbuf); */ - return true; - } - else - state->state = WAITPOSINFO; - continue; /* recheck current character */ - } - } - else if (state->state == WAITPOSINFO) - { - if (t_iseq(state->prsbuf, ':')) - state->state = INPOSINFO; - else - return true; - } - else if (state->state == INPOSINFO) - { - if (t_isdigit(state->prsbuf)) - { - if (state->alen == 0) - { - state->alen = 4; - state->pos = (WordEntryPos *) palloc(sizeof(WordEntryPos) * state->alen); - *(uint16 *) (state->pos) = 0; - } - else if (*(uint16 *) (state->pos) + 1 >= state->alen) - { - state->alen *= 2; - state->pos = (WordEntryPos *) repalloc(state->pos, sizeof(WordEntryPos) * state->alen); - } - (*(uint16 *) (state->pos))++; - WEP_SETPOS(state->pos[*(uint16 *) (state->pos)], LIMITPOS(atoi(state->prsbuf))); - if (WEP_GETPOS(state->pos[*(uint16 *) (state->pos)]) == 0) - ereport(ERROR, - (errcode(ERRCODE_SYNTAX_ERROR), - errmsg("wrong position info in tsvector"))); - WEP_SETWEIGHT(state->pos[*(uint16 *) (state->pos)], 0); - state->state = WAITPOSDELIM; - } - else - ereport(ERROR, - (errcode(ERRCODE_SYNTAX_ERROR), - errmsg("syntax error in tsvector"))); - } - else if (state->state == WAITPOSDELIM) - { - if (t_iseq(state->prsbuf, ',')) - state->state = INPOSINFO; - else if (t_iseq(state->prsbuf, 'a') || t_iseq(state->prsbuf, 'A') || t_iseq(state->prsbuf, '*')) - { - if (WEP_GETWEIGHT(state->pos[*(uint16 *) (state->pos)])) - ereport(ERROR, - (errcode(ERRCODE_SYNTAX_ERROR), - 
errmsg("syntax error in tsvector"))); - WEP_SETWEIGHT(state->pos[*(uint16 *) (state->pos)], 3); - } - else if (t_iseq(state->prsbuf, 'b') || t_iseq(state->prsbuf, 'B')) - { - if (WEP_GETWEIGHT(state->pos[*(uint16 *) (state->pos)])) - ereport(ERROR, - (errcode(ERRCODE_SYNTAX_ERROR), - errmsg("syntax error in tsvector"))); - WEP_SETWEIGHT(state->pos[*(uint16 *) (state->pos)], 2); - } - else if (t_iseq(state->prsbuf, 'c') || t_iseq(state->prsbuf, 'C')) - { - if (WEP_GETWEIGHT(state->pos[*(uint16 *) (state->pos)])) - ereport(ERROR, - (errcode(ERRCODE_SYNTAX_ERROR), - errmsg("syntax error in tsvector"))); - WEP_SETWEIGHT(state->pos[*(uint16 *) (state->pos)], 1); - } - else if (t_iseq(state->prsbuf, 'd') || t_iseq(state->prsbuf, 'D')) - { - if (WEP_GETWEIGHT(state->pos[*(uint16 *) (state->pos)])) - ereport(ERROR, - (errcode(ERRCODE_SYNTAX_ERROR), - errmsg("syntax error in tsvector"))); - WEP_SETWEIGHT(state->pos[*(uint16 *) (state->pos)], 0); - } - else if (t_isspace(state->prsbuf) || - *(state->prsbuf) == '\0') - return true; - else if (!t_isdigit(state->prsbuf)) - ereport(ERROR, - (errcode(ERRCODE_SYNTAX_ERROR), - errmsg("syntax error in tsvector"))); - } - else /* internal error */ - elog(ERROR, "internal error in gettoken_tsvector"); - - /* get next char */ - state->prsbuf += pg_mblen(state->prsbuf); - } - - return false; -} Datum tsvectorin(PG_FUNCTION_ARGS) @@ -393,70 +170,82 @@ tsvectorin(PG_FUNCTION_ARGS) char *buf = PG_GETARG_CSTRING(0); TSVectorParseState state; WordEntryIN *arr; + int totallen; + int arrlen; /* allocated size of arr */ WordEntry *inarr; - int4 len = 0, - totallen = 64; + int len = 0; TSVector in; - char *tmpbuf, - *cur; - int4 i, - buflen = 256; + int i; + char *token; + int toklen; + WordEntryPos *pos; + int poslen; + + /* + * Tokens are appended to tmpbuf, cur is a pointer + * to the end of used space in tmpbuf. 
+ */ + char *tmpbuf; + char *cur; + int buflen = 256; /* allocated size of tmpbuf */ pg_verifymbstr(buf, strlen(buf), false); - state.prsbuf = buf; - state.len = 32; - state.word = (char *) palloc(state.len); - state.oprisdelim = false; - arr = (WordEntryIN *) palloc(sizeof(WordEntryIN) * totallen); + state = init_tsvector_parser(buf, false); + + arrlen = 64; + arr = (WordEntryIN *) palloc(sizeof(WordEntryIN) * arrlen); cur = tmpbuf = (char *) palloc(buflen); - while (gettoken_tsvector(&state)) + while (gettoken_tsvector(state, &token, &toklen, &pos, &poslen, NULL)) { - /* - * Realloc buffers if it's needed - */ - if (len >= totallen) - { - totallen *= 2; - arr = (WordEntryIN *) repalloc((void *) arr, sizeof(WordEntryIN) * totallen); - } - - while ((cur - tmpbuf) + (state.curpos - state.word) >= buflen) - { - int4 dist = cur - tmpbuf; - - buflen *= 2; - tmpbuf = (char *) repalloc((void *) tmpbuf, buflen); - cur = tmpbuf + dist; - } - if (state.curpos - state.word >= MAXSTRLEN) + if (toklen >= MAXSTRLEN) ereport(ERROR, (errcode(ERRCODE_SYNTAX_ERROR), errmsg("word is too long (%ld bytes, max %ld bytes)", - (long) (state.curpos - state.word), + (long) toklen, (long) MAXSTRLEN))); - arr[len].entry.len = state.curpos - state.word; + if (cur - tmpbuf > MAXSTRPOS) ereport(ERROR, (errcode(ERRCODE_SYNTAX_ERROR), errmsg("position value too large"))); + + /* + * Enlarge buffers if needed + */ + if (len >= arrlen) + { + arrlen *= 2; + arr = (WordEntryIN *) repalloc((void *) arr, sizeof(WordEntryIN) * arrlen); + } + while ((cur - tmpbuf) + toklen >= buflen) + { + int dist = cur - tmpbuf; + + buflen *= 2; + tmpbuf = (char *) repalloc((void *) tmpbuf, buflen); + cur = tmpbuf + dist; + } + arr[len].entry.len = toklen; arr[len].entry.pos = cur - tmpbuf; - memcpy((void *) cur, (void *) state.word, arr[len].entry.len); - cur += arr[len].entry.len; + memcpy((void *) cur, (void *) token, toklen); + cur += toklen; - if (state.alen) + if (poslen != 0) { arr[len].entry.haspos = 1; - 
arr[len].pos = state.pos; + arr[len].pos = pos; + arr[len].poslen = poslen; } else arr[len].entry.haspos = 0; len++; } - pfree(state.word); + + close_tsvector_parser(state); if (len > 0) len = uniqueentry(arr, len, tmpbuf, &buflen); @@ -476,8 +265,21 @@ tsvectorin(PG_FUNCTION_ARGS) cur += SHORTALIGN(arr[i].entry.len); if (arr[i].entry.haspos) { - memcpy(cur, arr[i].pos, (*(uint16 *) arr[i].pos + 1) * sizeof(WordEntryPos)); - cur += (*(uint16 *) arr[i].pos + 1) * sizeof(WordEntryPos); + uint16 tmplen; + + if(arr[i].poslen > 0xFFFF) + elog(ERROR, "positions array too long"); + + tmplen = (uint16) arr[i].poslen; + + /* Copy length to output struct */ + memcpy(cur, &tmplen, sizeof(uint16)); + cur += sizeof(uint16); + + /* Copy positions */ + memcpy(cur, arr[i].pos, (arr[i].poslen) * sizeof(WordEntryPos)); + cur += arr[i].poslen * sizeof(WordEntryPos); + pfree(arr[i].pos); } inarr[i] = arr[i].entry; @@ -604,26 +406,26 @@ tsvectorrecv(PG_FUNCTION_ARGS) { StringInfo buf = (StringInfo) PG_GETARG_POINTER(0); TSVector vec; - int i, - size, - len = DATAHDRSIZE; + int i; + uint32 size; WordEntry *weptr; int datalen = 0; + Size len; size = pq_getmsgint(buf, sizeof(uint32)); if (size < 0 || size > (MaxAllocSize / sizeof(WordEntry))) elog(ERROR, "invalid size of tsvector"); - len += sizeof(WordEntry) * size; + len = DATAHDRSIZE + sizeof(WordEntry) * size; - len *= 2; + len = len * 2; /* times two to make room for lexemes */ vec = (TSVector) palloc0(len); vec->size = size; weptr = ARRPTR(vec); for (i = 0; i < size; i++) { - int tmp; + int32 tmp; weptr = ARRPTR(vec) + i; @@ -654,7 +456,7 @@ tsvectorrecv(PG_FUNCTION_ARGS) npos; WordEntryPos *wepptr; - npos = (uint16) pq_getmsgint(buf, sizeof(int16)); + npos = (uint16) pq_getmsgint(buf, sizeof(uint16)); if (npos > MAXNUMPOS) elog(ERROR, "unexpected number of positions"); diff --git a/src/backend/utils/adt/tsvector_op.c b/src/backend/utils/adt/tsvector_op.c index 
8567172c64f6f3c7330fff535dd0ba5c1d893eae..d34ab1fcf0bd68872f76fb7043cf8b442f1c4921 100644 --- a/src/backend/utils/adt/tsvector_op.c +++ b/src/backend/utils/adt/tsvector_op.c @@ -7,7 +7,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/utils/adt/tsvector_op.c,v 1.2 2007/08/31 02:26:29 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/utils/adt/tsvector_op.c,v 1.3 2007/09/07 15:09:56 teodor Exp $ * *------------------------------------------------------------------------- */ @@ -66,6 +66,9 @@ typedef struct static Datum tsvector_update_trigger(PG_FUNCTION_ARGS, bool config_column); +/* + * Order: haspos, len, word, for all positions (pos, weight) + */ static int silly_cmp_tsvector(const TSVector a, const TSVector b) { @@ -464,7 +467,7 @@ tsvector_concat(PG_FUNCTION_ARGS) * compare 2 string values */ static int4 -ValCompare(CHKVAL * chkval, WordEntry * ptr, QueryItem * item) +ValCompare(CHKVAL * chkval, WordEntry * ptr, QueryOperand * item) { if (ptr->len == item->length) return strncmp( @@ -479,7 +482,7 @@ ValCompare(CHKVAL * chkval, WordEntry * ptr, QueryItem * item) * check weight info */ static bool -checkclass_str(CHKVAL * chkval, WordEntry * val, QueryItem * item) +checkclass_str(CHKVAL * chkval, WordEntry * val, QueryOperand * item) { WordEntryPos *ptr = (WordEntryPos *) (chkval->values + val->pos + SHORTALIGN(val->len) + sizeof(uint16)); uint16 len = *((uint16 *) (chkval->values + val->pos + SHORTALIGN(val->len))); @@ -497,10 +500,11 @@ checkclass_str(CHKVAL * chkval, WordEntry * val, QueryItem * item) * is there value 'val' in array or not ? 
*/ static bool -checkcondition_str(void *checkval, QueryItem * val) +checkcondition_str(void *checkval, QueryOperand * val) { - WordEntry *StopLow = ((CHKVAL *) checkval)->arrb; - WordEntry *StopHigh = ((CHKVAL *) checkval)->arre; + CHKVAL *chkval = (CHKVAL *) checkval; + WordEntry *StopLow = chkval->arrb; + WordEntry *StopHigh = chkval->arre; WordEntry *StopMiddle; int difference; @@ -509,10 +513,10 @@ checkcondition_str(void *checkval, QueryItem * val) while (StopLow < StopHigh) { StopMiddle = StopLow + (StopHigh - StopLow) / 2; - difference = ValCompare((CHKVAL *) checkval, StopMiddle, val); + difference = ValCompare(chkval, StopMiddle, val); if (difference == 0) return (val->weight && StopMiddle->haspos) ? - checkclass_str((CHKVAL *) checkval, StopMiddle, val) : true; + checkclass_str(chkval, StopMiddle, val) : true; else if (difference < 0) StopLow = StopMiddle + 1; else @@ -523,37 +527,48 @@ checkcondition_str(void *checkval, QueryItem * val) } /* - * check for boolean condition + * check for boolean condition. + * + * if calcnot is false, NOT expressions are always evaluated to be true. This is used in ranking. + * checkval can be used to pass information to the callback. TS_execute doesn't + * do anything with it. + * chkcond is a callback function used to evaluate each VAL node in the query. + * */ bool TS_execute(QueryItem * curitem, void *checkval, bool calcnot, - bool (*chkcond) (void *checkval, QueryItem * val)) + bool (*chkcond) (void *checkval, QueryOperand * val)) { /* since this function recurses, it could be driven to stack overflow */ check_stack_depth(); - if (curitem->type == VAL) - return chkcond(checkval, curitem); - else if (curitem->val == (int4) '!') - { - return (calcnot) ? 
- !TS_execute(curitem + 1, checkval, calcnot, chkcond) - : true; - } - else if (curitem->val == (int4) '&') + if (curitem->type == QI_VAL) + return chkcond(checkval, (QueryOperand *) curitem); + + switch(curitem->operator.oper) { - if (TS_execute(curitem + curitem->left, checkval, calcnot, chkcond)) - return TS_execute(curitem + 1, checkval, calcnot, chkcond); - else - return false; - } - else - { /* |-operator */ - if (TS_execute(curitem + curitem->left, checkval, calcnot, chkcond)) - return true; - else - return TS_execute(curitem + 1, checkval, calcnot, chkcond); + case OP_NOT: + if (calcnot) + return !TS_execute(curitem + 1, checkval, calcnot, chkcond); + else + return true; + case OP_AND: + if (TS_execute(curitem + curitem->operator.left, checkval, calcnot, chkcond)) + return TS_execute(curitem + 1, checkval, calcnot, chkcond); + else + return false; + + case OP_OR: + if (TS_execute(curitem + curitem->operator.left, checkval, calcnot, chkcond)) + return true; + else + return TS_execute(curitem + 1, checkval, calcnot, chkcond); + + default: + elog(ERROR, "unknown operator %d", curitem->operator.oper); } + + /* not reachable, but keep compiler quiet */ return false; } diff --git a/src/backend/utils/adt/tsvector_parser.c b/src/backend/utils/adt/tsvector_parser.c new file mode 100644 index 0000000000000000000000000000000000000000..26a271679d4cc95addaa9d5a5f75670b127ab790 --- /dev/null +++ b/src/backend/utils/adt/tsvector_parser.c @@ -0,0 +1,357 @@ +/*------------------------------------------------------------------------- + * + * tsvector_parser.c + * Parser for tsvector + * + * Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group + * + * + * IDENTIFICATION + * $PostgreSQL: pgsql/src/backend/utils/adt/tsvector_parser.c,v 1.1 2007/09/07 15:09:56 teodor Exp $ + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "libpq/pqformat.h" +#include "tsearch/ts_type.h" +#include 
"tsearch/ts_locale.h" +#include "tsearch/ts_utils.h" +#include "utils/memutils.h" + +struct TSVectorParseStateData +{ + char *prsbuf; + char *word; /* buffer to hold the current word */ + int len; /* size in bytes allocated for 'word' */ + bool oprisdelim; +}; + +/* + * Initializes parser for the input string. If oprisdelim is set, the + * following characters are treated as delimiters in addition to whitespace: + * ! | & ( ) + */ +TSVectorParseState +init_tsvector_parser(char *input, bool oprisdelim) +{ + TSVectorParseState state; + + state = (TSVectorParseState) palloc(sizeof(struct TSVectorParseStateData)); + state->prsbuf = input; + state->len = 32; + state->word = (char *) palloc(state->len); + state->oprisdelim = oprisdelim; + + return state; +} + +/* + * Reinitializes parser for parsing 'input', instead of previous input. + */ +void +reset_tsvector_parser(TSVectorParseState state, char *input) +{ + state->prsbuf = input; +} + +/* + * Shuts down a tsvector parser. + */ +void +close_tsvector_parser(TSVectorParseState state) +{ + pfree(state->word); + pfree(state); +} + +#define RESIZEPRSBUF \ +do { \ + if ( curpos - state->word + pg_database_encoding_max_length() >= state->len ) \ + { \ + int clen = curpos - state->word; \ + state->len *= 2; \ + state->word = (char*)repalloc( (void*)state->word, state->len ); \ + curpos = state->word + clen; \ + } \ +} while (0) + + +#define ISOPERATOR(x) ( pg_mblen(x)==1 && ( *(x)=='!' 
|| *(x)=='&' || *(x)=='|' || *(x)=='(' || *(x)==')' ) ) + +/* Fills the output parameters, and returns true */ +#define RETURN_TOKEN \ +do { \ + if (pos_ptr != NULL) \ + { \ + *pos_ptr = pos; \ + *poslen = npos; \ + } \ + else if (pos != NULL) \ + pfree(pos); \ + \ + if (strval != NULL) \ + *strval = state->word; \ + if (lenval != NULL) \ + *lenval = curpos - state->word; \ + if (endptr != NULL) \ + *endptr = state->prsbuf; \ + return true; \ +} while(0) + + +/* State codes used in gettoken_tsvector */ +#define WAITWORD 1 +#define WAITENDWORD 2 +#define WAITNEXTCHAR 3 +#define WAITENDCMPLX 4 +#define WAITPOSINFO 5 +#define INPOSINFO 6 +#define WAITPOSDELIM 7 +#define WAITCHARCMPLX 8 + +/* + * Get next token from string being parsed. Returns false if + * end of input string is reached, otherwise strval, lenval, pos_ptr + * and poslen output parameters are filled in: + * + * *strval token + * *lenval length of *strval + * *pos_ptr pointer to a palloc'd array of positions and weights + * associated with the token. If the caller is not interested + * in the information, NULL can be supplied. Otherwise + * the caller is responsible for pfreeing the array. + * *poslen number of elements in *pos_ptr + */ +bool +gettoken_tsvector(TSVectorParseState state, + char **strval, int *lenval, + WordEntryPos **pos_ptr, int *poslen, + char **endptr) +{ + int oldstate = 0; + char *curpos = state->word; + int statecode = WAITWORD; + + /* pos collects the comma-delimited list of positions that follows + * the actual token.
+ */ + WordEntryPos *pos = NULL; + int npos = 0; /* elements of pos used */ + int posalen = 0; /* allocated size of pos */ + + while (1) + { + if (statecode == WAITWORD) + { + if (*(state->prsbuf) == '\0') + return false; + else if (t_iseq(state->prsbuf, '\'')) + statecode = WAITENDCMPLX; + else if (t_iseq(state->prsbuf, '\\')) + { + statecode = WAITNEXTCHAR; + oldstate = WAITENDWORD; + } + else if (state->oprisdelim && ISOPERATOR(state->prsbuf)) + ereport(ERROR, + (errcode(ERRCODE_SYNTAX_ERROR), + errmsg("syntax error in tsvector"))); + else if (!t_isspace(state->prsbuf)) + { + COPYCHAR(curpos, state->prsbuf); + curpos += pg_mblen(state->prsbuf); + statecode = WAITENDWORD; + } + } + else if (statecode == WAITNEXTCHAR) + { + if (*(state->prsbuf) == '\0') + ereport(ERROR, + (errcode(ERRCODE_SYNTAX_ERROR), + errmsg("there is no escaped character"))); + else + { + RESIZEPRSBUF; + COPYCHAR(curpos, state->prsbuf); + curpos += pg_mblen(state->prsbuf); + Assert(oldstate != 0); + statecode = oldstate; + } + } + else if (statecode == WAITENDWORD) + { + if (t_iseq(state->prsbuf, '\\')) + { + statecode = WAITNEXTCHAR; + oldstate = WAITENDWORD; + } + else if (t_isspace(state->prsbuf) || *(state->prsbuf) == '\0' || + (state->oprisdelim && ISOPERATOR(state->prsbuf))) + { + RESIZEPRSBUF; + if (curpos == state->word) + ereport(ERROR, + (errcode(ERRCODE_SYNTAX_ERROR), + errmsg("syntax error in tsvector"))); + *(curpos) = '\0'; + RETURN_TOKEN; + } + else if (t_iseq(state->prsbuf, ':')) + { + if (curpos == state->word) + ereport(ERROR, + (errcode(ERRCODE_SYNTAX_ERROR), + errmsg("syntax error in tsvector"))); + *(curpos) = '\0'; + if (state->oprisdelim) + RETURN_TOKEN; + else + statecode = INPOSINFO; + } + else + { + RESIZEPRSBUF; + COPYCHAR(curpos, state->prsbuf); + curpos += pg_mblen(state->prsbuf); + } + } + else if (statecode == WAITENDCMPLX) + { + if (t_iseq(state->prsbuf, '\'')) + { + statecode = WAITCHARCMPLX; + } + else if (t_iseq(state->prsbuf, '\\')) + { + statecode = 
WAITNEXTCHAR; + oldstate = WAITENDCMPLX; + } + else if (*(state->prsbuf) == '\0') + ereport(ERROR, + (errcode(ERRCODE_SYNTAX_ERROR), + errmsg("syntax error in tsvector"))); + else + { + RESIZEPRSBUF; + COPYCHAR(curpos, state->prsbuf); + curpos += pg_mblen(state->prsbuf); + } + } + else if (statecode == WAITCHARCMPLX) + { + if (t_iseq(state->prsbuf, '\'')) + { + RESIZEPRSBUF; + COPYCHAR(curpos, state->prsbuf); + curpos += pg_mblen(state->prsbuf); + statecode = WAITENDCMPLX; + } + else + { + RESIZEPRSBUF; + *(curpos) = '\0'; + if (curpos == state->word) + ereport(ERROR, + (errcode(ERRCODE_SYNTAX_ERROR), + errmsg("syntax error in tsvector"))); + if (state->oprisdelim) + { + /* state->prsbuf+=pg_mblen(state->prsbuf); */ + RETURN_TOKEN; + } + else + statecode = WAITPOSINFO; + continue; /* recheck current character */ + } + } + else if (statecode == WAITPOSINFO) + { + if (t_iseq(state->prsbuf, ':')) + statecode = INPOSINFO; + else + RETURN_TOKEN; + } + else if (statecode == INPOSINFO) + { + if (t_isdigit(state->prsbuf)) + { + if (posalen == 0) + { + posalen = 4; + pos = (WordEntryPos *) palloc(sizeof(WordEntryPos) * posalen); + npos = 0; + } + else if (npos + 1 >= posalen) + { + posalen *= 2; + pos = (WordEntryPos *) repalloc(pos, sizeof(WordEntryPos) * posalen); + } + npos++; + WEP_SETPOS(pos[npos - 1], LIMITPOS(atoi(state->prsbuf))); + if (WEP_GETPOS(pos[npos - 1]) == 0) + ereport(ERROR, + (errcode(ERRCODE_SYNTAX_ERROR), + errmsg("wrong position info in tsvector"))); + WEP_SETWEIGHT(pos[npos - 1], 0); + statecode = WAITPOSDELIM; + } + else + ereport(ERROR, + (errcode(ERRCODE_SYNTAX_ERROR), + errmsg("syntax error in tsvector"))); + } + else if (statecode == WAITPOSDELIM) + { + if (t_iseq(state->prsbuf, ',')) + statecode = INPOSINFO; + else if (t_iseq(state->prsbuf, 'a') || t_iseq(state->prsbuf, 'A') || t_iseq(state->prsbuf, '*')) + { + if (WEP_GETWEIGHT(pos[npos - 1])) + ereport(ERROR, + (errcode(ERRCODE_SYNTAX_ERROR), + errmsg("syntax error in tsvector"))); + 
WEP_SETWEIGHT(pos[npos - 1], 3); + } + else if (t_iseq(state->prsbuf, 'b') || t_iseq(state->prsbuf, 'B')) + { + if (WEP_GETWEIGHT(pos[npos - 1])) + ereport(ERROR, + (errcode(ERRCODE_SYNTAX_ERROR), + errmsg("syntax error in tsvector"))); + WEP_SETWEIGHT(pos[npos - 1], 2); + } + else if (t_iseq(state->prsbuf, 'c') || t_iseq(state->prsbuf, 'C')) + { + if (WEP_GETWEIGHT(pos[npos - 1])) + ereport(ERROR, + (errcode(ERRCODE_SYNTAX_ERROR), + errmsg("syntax error in tsvector"))); + WEP_SETWEIGHT(pos[npos - 1], 1); + } + else if (t_iseq(state->prsbuf, 'd') || t_iseq(state->prsbuf, 'D')) + { + if (WEP_GETWEIGHT(pos[npos - 1])) + ereport(ERROR, + (errcode(ERRCODE_SYNTAX_ERROR), + errmsg("syntax error in tsvector"))); + WEP_SETWEIGHT(pos[npos - 1], 0); + } + else if (t_isspace(state->prsbuf) || + *(state->prsbuf) == '\0') + RETURN_TOKEN; + else if (!t_isdigit(state->prsbuf)) + ereport(ERROR, + (errcode(ERRCODE_SYNTAX_ERROR), + errmsg("syntax error in tsvector"))); + } + else /* internal error */ + elog(ERROR, "internal error in gettoken_tsvector"); + + /* get next char */ + state->prsbuf += pg_mblen(state->prsbuf); + } + + return false; +} diff --git a/src/include/tsearch/ts_public.h b/src/include/tsearch/ts_public.h index 148129aa8bc5b165959d8223c4b4f0d7d640d179..ab19de7924f05037e9e7a572d067b1070f2dfe9d 100644 --- a/src/include/tsearch/ts_public.h +++ b/src/include/tsearch/ts_public.h @@ -6,7 +6,7 @@ * * Copyright (c) 1998-2007, PostgreSQL Global Development Group * - * $PostgreSQL: pgsql/src/include/tsearch/ts_public.h,v 1.3 2007/08/25 00:03:59 tgl Exp $ + * $PostgreSQL: pgsql/src/include/tsearch/ts_public.h,v 1.4 2007/09/07 15:09:56 teodor Exp $ * *------------------------------------------------------------------------- */ @@ -42,7 +42,7 @@ typedef struct type:8, len:16; char *word; - QueryItem *item; + QueryOperand *item; } HeadlineWordEntry; typedef struct diff --git a/src/include/tsearch/ts_type.h b/src/include/tsearch/ts_type.h index 
ec22f96f59f91393f90aa06c1e986eeb63c80120..91d724ef1c67752e313e24fe22fbbbe16774e407 100644 --- a/src/include/tsearch/ts_type.h +++ b/src/include/tsearch/ts_type.h @@ -5,7 +5,7 @@ * * Copyright (c) 1998-2007, PostgreSQL Global Development Group * - * $PostgreSQL: pgsql/src/include/tsearch/ts_type.h,v 1.1 2007/08/21 01:11:29 tgl Exp $ + * $PostgreSQL: pgsql/src/include/tsearch/ts_type.h,v 1.2 2007/09/07 15:09:56 teodor Exp $ * *------------------------------------------------------------------------- */ @@ -13,6 +13,8 @@ #define _PG_TSTYPE_H_ #include "fmgr.h" +#include "utils/pg_crc.h" + /* * TSVector type. @@ -27,8 +29,8 @@ typedef struct pos:20; /* MAX 1Mb */ } WordEntry; -#define MAXSTRLEN ( 1<<11 ) -#define MAXSTRPOS ( 1<<20 ) +#define MAXSTRLEN ( (1<<11) - 1) +#define MAXSTRPOS ( (1<<20) - 1) /* * Equivalent to @@ -68,7 +70,7 @@ typedef uint16 WordEntryPos; typedef struct { int32 vl_len_; /* varlena header (do not touch directly!) */ - int4 size; + uint32 size; char data[1]; } TSVectorData; @@ -140,36 +142,65 @@ extern Datum ts_rankcd_wttf(PG_FUNCTION_ARGS); /* * TSQuery + * + * */ +typedef int8 QueryItemType; + +/* Valid values for QueryItemType: */ +#define QI_VAL 1 +#define QI_OPR 2 +#define QI_VALSTOP 3 /* This is only used in an intermediate stack representation in parse_tsquery. It's not a legal type elsewhere. */ + /* * QueryItem is one node in tsquery - operator or operand. */ - -typedef struct QueryItem +typedef struct { - int8 type; /* operand or kind of operator */ - int8 weight; /* weights of operand to search */ - int2 left; /* pointer to left operand Right operand is - * item + 1, left operand is placed - * item+item->left */ - int4 val; /* crc32 value of operand's value */ + QueryItemType type; /* operand or kind of operator (ts_tokentype) */ + int8 weight; /* weights of operand to search. It's a bitmask of allowed weights. 
+ * If it is 0, then any weight is allowed */ + int32 valcrc; /* XXX: pg_crc32 would be a more appropriate data type, + * but we use comparisons to signed integers in the code. + * They would need to be changed as well. */ + /* pointer to text value of operand, must correlate with WordEntry */ uint32 istrue:1, /* use for ranking in Cover */ length:11, distance:20; -} QueryItem; +} QueryOperand; + + +/* Legal values for QueryOperator.oper */ +#define OP_NOT 1 +#define OP_AND 2 +#define OP_OR 3 + +typedef struct +{ + QueryItemType type; + int8 oper; /* see above */ + int16 left; /* pointer to left operand. Right operand is + * item + 1, left operand is placed + * item+item->left */ +} QueryOperator; /* - * It's impossible to use offsetof(QueryItem, istrue) + * Note: TSQuery is 4-byte aligned, so make sure there are no fields + * inside QueryItem requiring 8-byte alignment, like int64. */ -#define HDRSIZEQI ( sizeof(int8) + sizeof(int8) + sizeof(int2) + sizeof(int4) ) +typedef union +{ + QueryItemType type; + QueryOperator operator; + QueryOperand operand; +} QueryItem; /* * Storage: - * (len)(size)(array of ITEM)(array of operand in text form) - * operands are always finished by '\0' + * (len)(size)(array of QueryItem)(operands as '\0'-terminated c-strings) */ typedef struct @@ -182,13 +213,17 @@ typedef struct typedef TSQueryData *TSQuery; #define HDRSIZETQ ( VARHDRSZ + sizeof(int4) ) -#define COMPUTESIZE(size,lenofoperand) ( HDRSIZETQ + (size) * sizeof(QueryItem) + (lenofoperand) ) -#define GETQUERY(x) ((QueryItem*)( (char*)(x)+HDRSIZETQ )) -#define GETOPERAND(x) ( (char*)GETQUERY(x) + ((TSQuery)(x))->size * sizeof(QueryItem) ) -#define OPERANDSSIZE(x) ( (x)->len - HDRSIZETQ - (x)->size * sizeof(QueryItem) ) -#define ISOPERATOR(x) ( pg_mblen(x)==1 && ( *(x)=='!' || *(x)=='&' || *(x)=='|' || *(x)=='(' || *(x)==')' ) ) +/* Computes the size of header and all QueryItems. size is the number of + *
size is the number of + * QueryItems, and lenofoperand is the total length of all operands + */ +#define COMPUTESIZE(size, lenofoperand) ( HDRSIZETQ + (size) * sizeof(QueryItem) + (lenofoperand) ) +/* Returns a pointer to the first QueryItem in a TSVector */ +#define GETQUERY(x) ((QueryItem*)( (char*)(x)+HDRSIZETQ )) + +/* Returns a pointer to the beginning of operands in a TSVector */ +#define GETOPERAND(x) ( (char*)GETQUERY(x) + ((TSQuery)(x))->size * sizeof(QueryItem) ) /* * fmgr interface macros diff --git a/src/include/tsearch/ts_utils.h b/src/include/tsearch/ts_utils.h index d2e5c8d8e4957d231897fe4e9cad33072ce43e80..31a76e50b6cf23bf8c00ac7a4096f8c1daf91826 100644 --- a/src/include/tsearch/ts_utils.h +++ b/src/include/tsearch/ts_utils.h @@ -5,7 +5,7 @@ * * Copyright (c) 1998-2007, PostgreSQL Global Development Group * - * $PostgreSQL: pgsql/src/include/tsearch/ts_utils.h,v 1.2 2007/08/25 00:03:59 tgl Exp $ + * $PostgreSQL: pgsql/src/include/tsearch/ts_utils.h,v 1.3 2007/09/07 15:09:56 teodor Exp $ * *------------------------------------------------------------------------- */ @@ -14,65 +14,41 @@ #include "tsearch/ts_type.h" #include "tsearch/ts_public.h" +#include "nodes/pg_list.h" /* * Common parse definitions for tsvector and tsquery */ -typedef struct -{ - WordEntry entry; /* should be first ! */ - WordEntryPos *pos; -} WordEntryIN; - -typedef struct -{ - char *prsbuf; - char *word; - char *curpos; - int4 len; - int4 state; - int4 alen; - WordEntryPos *pos; - bool oprisdelim; -} TSVectorParseState; - -extern bool gettoken_tsvector(TSVectorParseState *state); +/* tsvector parser support. 
*/ -struct ParseQueryNode; /* private in backend/utils/adt/tsquery.c */ +struct TSVectorParseStateData; +typedef struct TSVectorParseStateData *TSVectorParseState; -typedef struct -{ - char *buffer; /* entire string we are scanning */ - char *buf; /* current scan point */ - int4 state; - int4 count; +extern TSVectorParseState init_tsvector_parser(char *input, bool oprisdelim); +extern void reset_tsvector_parser(TSVectorParseState state, char *input); +extern bool gettoken_tsvector(TSVectorParseState state, + char **token, int *len, + WordEntryPos **pos, int *poslen, + char **endptr); +extern void close_tsvector_parser(TSVectorParseState state); - /* reverse polish notation in list (for temporary usage) */ - struct ParseQueryNode *str; +/* parse_tsquery */ - /* number in str */ - int4 num; +struct TSQueryParserStateData; /* private in backend/utils/adt/tsquery.c */ +typedef struct TSQueryParserStateData *TSQueryParserState; - /* text-form operand */ - int4 lenop; - int4 sumlen; - char *op; - char *curop; - - /* state for value's parser */ - TSVectorParseState valstate; - /* tscfg */ - Oid cfg_id; -} TSQueryParserState; +typedef void (*PushFunction)(void *opaque, TSQueryParserState state, char *, int, int2); extern TSQuery parse_tsquery(char *buf, - void (*pushval) (TSQueryParserState *, int, char *, int, int2), - Oid cfg_id, bool isplain); -extern void pushval_asis(TSQueryParserState * state, - int type, char *strval, int lenval, int2 weight); -extern void pushquery(TSQueryParserState * state, int4 type, int4 val, - int4 distance, int4 lenval, int2 weight); + PushFunction pushval, + void *opaque, bool isplain); + +/* Functions for use by PushFunction implementations */ +extern void pushValue(TSQueryParserState state, + char *strval, int lenval, int2 weight); +extern void pushStop(TSQueryParserState state); +extern void pushOperator(TSQueryParserState state, int8 operator); /* * parse plain text and lexize words @@ -84,6 +60,11 @@ typedef struct union { uint16 pos; + 
/* + * When apos array is used, apos[0] is the number of elements + * in the array (excluding apos[0]), and alen is the allocated + * size of the array. + */ uint16 *apos; } pos; char *word; @@ -111,23 +92,12 @@ extern void hlparsetext(Oid cfgId, HeadlineParsedText * prs, TSQuery query, char *buf, int4 buflen); extern text *generateHeadline(HeadlineParsedText * prs); -/* - * token/node types for parsing - */ -#define END 0 -#define ERR 1 -#define VAL 2 -#define OPR 3 -#define OPEN 4 -#define CLOSE 5 -#define VALSTOP 6 /* for stop words */ - /* * Common check function for tsvector @@ tsquery */ extern bool TS_execute(QueryItem * curitem, void *checkval, bool calcnot, - bool (*chkcond) (void *checkval, QueryItem * val)); + bool (*chkcond) (void *checkval, QueryOperand * val)); /* * Useful conversion macros