Skip to content
Snippets Groups Projects
Select Git revision
  • benchmark-tools
  • postgres-lambda
  • master default
  • REL9_4_25
  • REL9_5_20
  • REL9_6_16
  • REL_10_11
  • REL_11_6
  • REL_12_1
  • REL_12_0
  • REL_12_RC1
  • REL_12_BETA4
  • REL9_4_24
  • REL9_5_19
  • REL9_6_15
  • REL_10_10
  • REL_11_5
  • REL_12_BETA3
  • REL9_4_23
  • REL9_5_18
  • REL9_6_14
  • REL_10_9
  • REL_11_4
23 results

tsvector.c

Blame
  • tsvector.c 11.99 KiB
    /*-------------------------------------------------------------------------
     *
     * tsvector.c
     *	  I/O functions for tsvector
     *
     * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
     *
     *
     * IDENTIFICATION
     *	  $PostgreSQL: pgsql/src/backend/utils/adt/tsvector.c,v 1.16 2009/05/21 12:54:27 meskes Exp $
     *
     *-------------------------------------------------------------------------
     */
    
    #include "postgres.h"
    
    #include "libpq/pqformat.h"
    #include "tsearch/ts_type.h"
    #include "tsearch/ts_locale.h"
    #include "tsearch/ts_utils.h"
    #include "utils/memutils.h"
    
    typedef struct
    {
    	WordEntry	entry;			/* must be first! */
    	WordEntryPos *pos;
    	int			poslen;			/* number of elements in pos */
    } WordEntryIN;
    
    
    /* Compare two WordEntryPos values for qsort */
    static int
    comparePos(const void *a, const void *b)
    {
    	int			apos = WEP_GETPOS(*(const WordEntryPos *) a);
    	int			bpos = WEP_GETPOS(*(const WordEntryPos *) b);
    
    	if (apos == bpos)
    		return 0;
    	return (apos > bpos) ? 1 : -1;
    }
    
    /*
     * Removes duplicate pos entries. If there's two entries with same pos
     * but different weight, the higher weight is retained.
     *
     * Returns new length.
     */
    static int
    uniquePos(WordEntryPos *a, int l)
    {
    	WordEntryPos *ptr,
    			   *res;
    
    	if (l <= 1)
    		return l;
    
    	qsort((void *) a, l, sizeof(WordEntryPos), comparePos);
    
    	res = a;
    	ptr = a + 1;
    	while (ptr - a < l)
    	{
    		if (WEP_GETPOS(*ptr) != WEP_GETPOS(*res))
    		{
    			res++;
    			*res = *ptr;
    			if (res - a >= MAXNUMPOS - 1 ||
    				WEP_GETPOS(*res) == MAXENTRYPOS - 1)
    				break;
    		}
    		else if (WEP_GETWEIGHT(*ptr) > WEP_GETWEIGHT(*res))
    			WEP_SETWEIGHT(*res, WEP_GETWEIGHT(*ptr));
    		ptr++;
    	}
    
    	return res + 1 - a;
    }
    
    /* Compare two WordEntryIN values for qsort */
    static int
    compareentry(const void *va, const void *vb, void *arg)
    {
    	const WordEntryIN *a = (const WordEntryIN *) va;
    	const WordEntryIN *b = (const WordEntryIN *) vb;
    	char	   *BufferStr = (char *) arg;
    
    	return tsCompareString( &BufferStr[a->entry.pos], a->entry.len,
    							&BufferStr[b->entry.pos], b->entry.len,
    							false );
    }
    
    /*
     * Sort an array of WordEntryIN, remove duplicates.
     * *outbuflen receives the amount of space needed for strings and positions.
     */
    static int
    uniqueentry(WordEntryIN *a, int l, char *buf, int *outbuflen)
    {
    	int			buflen;
    	WordEntryIN *ptr,
    			   *res;
    
    	Assert(l >= 1);
    
    	if (l > 1)
    		qsort_arg((void *) a, l, sizeof(WordEntryIN), compareentry,
    				  (void *) buf);
    
    	buflen = 0;
    	res = a;
    	ptr = a + 1;
    	while (ptr - a < l)
    	{
    		if (!(ptr->entry.len == res->entry.len &&
    			  strncmp(&buf[ptr->entry.pos], &buf[res->entry.pos],
    					  res->entry.len) == 0))
    		{
    			/* done accumulating data into *res, count space needed */
    			buflen += res->entry.len;
    			if (res->entry.haspos)
    			{
    				res->poslen = uniquePos(res->pos, res->poslen);
    				buflen = SHORTALIGN(buflen);
    				buflen += res->poslen * sizeof(WordEntryPos) + sizeof(uint16);
    			}
    			res++;
    			memcpy(res, ptr, sizeof(WordEntryIN));
    		}
    		else if (ptr->entry.haspos)
    		{
    			if (res->entry.haspos)
    			{
    				/* append ptr's positions to res's positions */
    				int			newlen = ptr->poslen + res->poslen;
    
    				res->pos = (WordEntryPos *)
    					repalloc(res->pos, newlen * sizeof(WordEntryPos));
    				memcpy(&res->pos[res->poslen], ptr->pos,
    					   ptr->poslen * sizeof(WordEntryPos));
    				res->poslen = newlen;
    				pfree(ptr->pos);
    			}
    			else
    			{
    				/* just give ptr's positions to pos */
    				res->entry.haspos = 1;
    				res->pos = ptr->pos;
    				res->poslen = ptr->poslen;
    			}
    		}
    		ptr++;
    	}
    
    	/* count space needed for last item */
    	buflen += res->entry.len;
    	if (res->entry.haspos)
    	{
    		res->poslen = uniquePos(res->pos, res->poslen);
    		buflen = SHORTALIGN(buflen);
    		buflen += res->poslen * sizeof(WordEntryPos) + sizeof(uint16);
    	}
    
    	*outbuflen = buflen;
    	return res + 1 - a;
    }
    
    static int
    WordEntryCMP(WordEntry *a, WordEntry *b, char *buf)
    {
    	return compareentry(a, b, buf);
    }
    
    
    Datum
    tsvectorin(PG_FUNCTION_ARGS)
    {
    	char	   *buf = PG_GETARG_CSTRING(0);
    	TSVectorParseState state;
    	WordEntryIN *arr;
    	int			totallen;
    	int			arrlen;			/* allocated size of arr */
    	WordEntry  *inarr;
    	int			len = 0;
    	TSVector	in;
    	int			i;
    	char	   *token;
    	int			toklen;
    	WordEntryPos *pos;
    	int			poslen;
    	char	   *strbuf;
    	int			stroff;
    
    	/*
    	 * Tokens are appended to tmpbuf, cur is a pointer to the end of used
    	 * space in tmpbuf.
    	 */
    	char	   *tmpbuf;
    	char	   *cur;
    	int			buflen = 256;	/* allocated size of tmpbuf */
    
    	pg_verifymbstr(buf, strlen(buf), false);
    
    	state = init_tsvector_parser(buf, false, false);
    
    	arrlen = 64;
    	arr = (WordEntryIN *) palloc(sizeof(WordEntryIN) * arrlen);
    	cur = tmpbuf = (char *) palloc(buflen);
    
    	while (gettoken_tsvector(state, &token, &toklen, &pos, &poslen, NULL))
    	{
    		if (toklen >= MAXSTRLEN)
    			ereport(ERROR,
    					(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
    					 errmsg("word is too long (%ld bytes, max %ld bytes)",
    							(long) toklen,
    							(long) (MAXSTRLEN - 1))));
    
    		if (cur - tmpbuf > MAXSTRPOS)
    			ereport(ERROR,
    					(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
    				 	 errmsg("string is too long for tsvector (%ld bytes, max %ld bytes)",
    							(long) (cur - tmpbuf), (long) MAXSTRPOS)));
    
    		/*
    		 * Enlarge buffers if needed
    		 */
    		if (len >= arrlen)
    		{
    			arrlen *= 2;
    			arr = (WordEntryIN *)
    				repalloc((void *) arr, sizeof(WordEntryIN) * arrlen);
    		}
    		while ((cur - tmpbuf) + toklen >= buflen)
    		{
    			int			dist = cur - tmpbuf;
    
    			buflen *= 2;
    			tmpbuf = (char *) repalloc((void *) tmpbuf, buflen);
    			cur = tmpbuf + dist;
    		}
    		arr[len].entry.len = toklen;
    		arr[len].entry.pos = cur - tmpbuf;
    		memcpy((void *) cur, (void *) token, toklen);
    		cur += toklen;
    
    		if (poslen != 0)
    		{
    			arr[len].entry.haspos = 1;
    			arr[len].pos = pos;
    			arr[len].poslen = poslen;
    		}
    		else
    		{
    			arr[len].entry.haspos = 0;
    			arr[len].pos = NULL;
    			arr[len].poslen = 0;
    		}
    		len++;
    	}
    
    	close_tsvector_parser(state);
    
    	if (len > 0)
    		len = uniqueentry(arr, len, tmpbuf, &buflen);
    	else
    		buflen = 0;
    
    	if (buflen > MAXSTRPOS)
    		ereport(ERROR,
    				(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
    				 errmsg("string is too long for tsvector (%d bytes, max %d bytes)", buflen, MAXSTRPOS)));
    
    	totallen = CALCDATASIZE(len, buflen);
    	in = (TSVector) palloc0(totallen);
    	SET_VARSIZE(in, totallen);
    	in->size = len;
    	inarr = ARRPTR(in);
    	strbuf = STRPTR(in);
    	stroff = 0;
    	for (i = 0; i < len; i++)
    	{
    		memcpy(strbuf + stroff, &tmpbuf[arr[i].entry.pos], arr[i].entry.len);
    		arr[i].entry.pos = stroff;
    		stroff += arr[i].entry.len;
    		if (arr[i].entry.haspos)
    		{
    			if (arr[i].poslen > 0xFFFF)
    				elog(ERROR, "positions array too long");
    
    			/* Copy number of positions */
    			stroff = SHORTALIGN(stroff);
    			*(uint16 *) (strbuf + stroff) = (uint16) arr[i].poslen;
    			stroff += sizeof(uint16);
    
    			/* Copy positions */
    			memcpy(strbuf + stroff, arr[i].pos, arr[i].poslen * sizeof(WordEntryPos));
    			stroff += arr[i].poslen * sizeof(WordEntryPos);
    
    			pfree(arr[i].pos);
    		}
    		inarr[i] = arr[i].entry;
    	}
    
    	Assert((strbuf + stroff - (char *) in) == totallen);
    
    	PG_RETURN_TSVECTOR(in);
    }
    
    Datum
    tsvectorout(PG_FUNCTION_ARGS)
    {
    	TSVector	out = PG_GETARG_TSVECTOR(0);
    	char	   *outbuf;
    	int4		i,
    				lenbuf = 0,
    				pp;
    	WordEntry  *ptr = ARRPTR(out);
    	char	   *curbegin,
    			   *curin,
    			   *curout;
    
    	lenbuf = out->size * 2 /* '' */ + out->size - 1 /* space */ + 2 /* \0 */ ;
    	for (i = 0; i < out->size; i++)
    	{
    		lenbuf += ptr[i].len * 2 * pg_database_encoding_max_length() /* for escape */ ;
    		if (ptr[i].haspos)
    			lenbuf += 1 /* : */ + 7 /* int2 + , + weight */ * POSDATALEN(out, &(ptr[i]));
    	}
    
    	curout = outbuf = (char *) palloc(lenbuf);
    	for (i = 0; i < out->size; i++)
    	{
    		curbegin = curin = STRPTR(out) + ptr->pos;
    		if (i != 0)
    			*curout++ = ' ';
    		*curout++ = '\'';
    		while (curin - curbegin < ptr->len)
    		{
    			int			len = pg_mblen(curin);
    
    			if (t_iseq(curin, '\''))
    				*curout++ = '\'';
    			else if (t_iseq(curin, '\\'))
    				*curout++ = '\\';
    
    			while (len--)
    				*curout++ = *curin++;
    		}
    
    		*curout++ = '\'';
    		if ((pp = POSDATALEN(out, ptr)) != 0)
    		{
    			WordEntryPos *wptr;
    
    			*curout++ = ':';
    			wptr = POSDATAPTR(out, ptr);
    			while (pp)
    			{
    				curout += sprintf(curout, "%d", WEP_GETPOS(*wptr));
    				switch (WEP_GETWEIGHT(*wptr))
    				{
    					case 3:
    						*curout++ = 'A';
    						break;
    					case 2:
    						*curout++ = 'B';
    						break;
    					case 1:
    						*curout++ = 'C';
    						break;
    					case 0:
    					default:
    						break;
    				}
    
    				if (pp > 1)
    					*curout++ = ',';
    				pp--;
    				wptr++;
    			}
    		}
    		ptr++;
    	}
    
    	*curout = '\0';
    	PG_FREE_IF_COPY(out, 0);
    	PG_RETURN_CSTRING(outbuf);
    }
    
    /*
     * Binary Input / Output functions. The binary format is as follows:
     *
     * uint32	number of lexemes
     *
     * for each lexeme:
     *		lexeme text in client encoding, null-terminated
     *		uint16	number of positions
     *		for each position:
     *			uint16 WordEntryPos
     */
    
    Datum
    tsvectorsend(PG_FUNCTION_ARGS)
    {
    	TSVector	vec = PG_GETARG_TSVECTOR(0);
    	StringInfoData buf;
    	int			i,
    				j;
    	WordEntry  *weptr = ARRPTR(vec);
    
    	pq_begintypsend(&buf);
    
    	pq_sendint(&buf, vec->size, sizeof(int32));
    	for (i = 0; i < vec->size; i++)
    	{
    		uint16		npos;
    
    		/*
    		 * the strings in the TSVector array are not null-terminated, so we
    		 * have to send the null-terminator separately
    		 */
    		pq_sendtext(&buf, STRPTR(vec) + weptr->pos, weptr->len);
    		pq_sendbyte(&buf, '\0');
    
    		npos = POSDATALEN(vec, weptr);
    		pq_sendint(&buf, npos, sizeof(uint16));
    
    		if (npos > 0)
    		{
    			WordEntryPos *wepptr = POSDATAPTR(vec, weptr);
    
    			for (j = 0; j < npos; j++)
    				pq_sendint(&buf, wepptr[j], sizeof(WordEntryPos));
    		}
    		weptr++;
    	}
    
    	PG_RETURN_BYTEA_P(pq_endtypsend(&buf));
    }
    
    Datum
    tsvectorrecv(PG_FUNCTION_ARGS)
    {
    	StringInfo	buf = (StringInfo) PG_GETARG_POINTER(0);
    	TSVector	vec;
    	int			i;
    	int32		nentries;
    	int			datalen;		/* number of bytes used in the variable size
    								 * area after fixed size TSVector header and
    								 * WordEntries */
    	Size		hdrlen;
    	Size		len;			/* allocated size of vec */
    
    	nentries = pq_getmsgint(buf, sizeof(int32));
    	if (nentries < 0 || nentries > (MaxAllocSize / sizeof(WordEntry)))
    		elog(ERROR, "invalid size of tsvector");
    
    	hdrlen = DATAHDRSIZE + sizeof(WordEntry) * nentries;
    
    	len = hdrlen * 2;			/* times two to make room for lexemes */
    	vec = (TSVector) palloc0(len);
    	vec->size = nentries;
    
    	datalen = 0;
    	for (i = 0; i < nentries; i++)
    	{
    		const char *lexeme;
    		uint16		npos;
    		size_t		lex_len;
    
    		lexeme = pq_getmsgstring(buf);
    		npos = (uint16) pq_getmsgint(buf, sizeof(uint16));
    
    		/* sanity checks */
    
    		lex_len = strlen(lexeme);
    		if (lex_len > MAXSTRLEN)
    			elog(ERROR, "invalid tsvector: lexeme too long");
    
    		if (datalen > MAXSTRPOS)
    			elog(ERROR, "invalid tsvector: maximum total lexeme length exceeded");
    
    		if (npos > MAXNUMPOS)
    			elog(ERROR, "unexpected number of tsvector positions");
    
    		/*
    		 * Looks valid. Fill the WordEntry struct, and copy lexeme.
    		 *
    		 * But make sure the buffer is large enough first.
    		 */
    		while (hdrlen + SHORTALIGN(datalen + lex_len) +
    			   (npos + 1) * sizeof(WordEntryPos) >= len)
    		{
    			len *= 2;
    			vec = (TSVector) repalloc(vec, len);
    		}
    
    		vec->entries[i].haspos = (npos > 0) ? 1 : 0;
    		vec->entries[i].len = lex_len;
    		vec->entries[i].pos = datalen;
    
    		memcpy(STRPTR(vec) + datalen, lexeme, lex_len);
    
    		datalen += lex_len;
    
    		if (i > 0 && WordEntryCMP(&vec->entries[i],
    								  &vec->entries[i - 1],
    								  STRPTR(vec)) <= 0)
    			elog(ERROR, "lexemes are misordered");
    
    		/* Receive positions */
    		if (npos > 0)
    		{
    			uint16		j;
    			WordEntryPos *wepptr;
    
    			/*
    			 * Pad to 2-byte alignment if necessary. Though we used palloc0
    			 * for the initial allocation, subsequent repalloc'd memory areas
    			 * are not initialized to zero.
    			 */
    			if (datalen != SHORTALIGN(datalen))
    			{
    				*(STRPTR(vec) + datalen) = '\0';
    				datalen = SHORTALIGN(datalen);
    			}
    
    			memcpy(STRPTR(vec) + datalen, &npos, sizeof(uint16));
    
    			wepptr = POSDATAPTR(vec, &vec->entries[i]);
    			for (j = 0; j < npos; j++)
    			{
    				wepptr[j] = (WordEntryPos) pq_getmsgint(buf, sizeof(WordEntryPos));
    				if (j > 0 && WEP_GETPOS(wepptr[j]) <= WEP_GETPOS(wepptr[j - 1]))
    					elog(ERROR, "position information is misordered");
    			}
    
    			datalen += (npos + 1) * sizeof(WordEntry);
    		}
    	}
    
    	SET_VARSIZE(vec, hdrlen + datalen);
    
    	PG_RETURN_TSVECTOR(vec);
    }