From d4fd7d85f3a0b1cb33079ef0475d4d59dbc5362d Mon Sep 17 00:00:00 2001
From: Tom Lane <tgl@sss.pgh.pa.us>
Date: Sat, 31 Jan 2004 00:45:21 +0000
Subject: [PATCH] Fix text_position to not scan past end of source string in
 multibyte case, per report from Korea PostgreSQL Users' Group.  Also do some
 cosmetic cleanup in nearby code.

---
 src/backend/utils/adt/varlena.c | 207 +++++++++++++++++---------------
 1 file changed, 113 insertions(+), 94 deletions(-)

diff --git a/src/backend/utils/adt/varlena.c b/src/backend/utils/adt/varlena.c
index f4d21571ff3..e3337fe7d80 100644
--- a/src/backend/utils/adt/varlena.c
+++ b/src/backend/utils/adt/varlena.c
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/utils/adt/varlena.c,v 1.109 2003/12/19 04:56:41 tgl Exp $
+ *	  $PostgreSQL: pgsql/src/backend/utils/adt/varlena.c,v 1.110 2004/01/31 00:45:21 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -46,7 +46,7 @@ typedef struct varlena unknown;
 #define TEXTLEN(textp) \
 	text_length(PointerGetDatum(textp))
 #define TEXTPOS(buf_text, from_sub_text) \
-	text_position(PointerGetDatum(buf_text), PointerGetDatum(from_sub_text), 1)
+	text_position(buf_text, from_sub_text, 1)
 #define TEXTDUP(textp) \
 	DatumGetTextPCopy(PointerGetDatum(textp))
 #define LEFT(buf_text, from_sub_text) \
@@ -55,12 +55,12 @@ typedef struct varlena unknown;
 					TEXTPOS(buf_text, from_sub_text) - 1, false)
 #define RIGHT(buf_text, from_sub_text, from_sub_text_len) \
 	text_substring(PointerGetDatum(buf_text), \
-					TEXTPOS(buf_text, from_sub_text) + from_sub_text_len, \
+					TEXTPOS(buf_text, from_sub_text) + (from_sub_text_len), \
 					-1, true)
 
 static int	text_cmp(text *arg1, text *arg2);
 static int32 text_length(Datum str);
-static int32 text_position(Datum str, Datum search_str, int matchnum);
+static int32 text_position(text *t1, text *t2, int matchnum);
 static text *text_substring(Datum str,
 			   int32 start,
 			   int32 length,
@@ -403,14 +403,20 @@ unknownsend(PG_FUNCTION_ARGS)
 Datum
 textlen(PG_FUNCTION_ARGS)
 {
-	PG_RETURN_INT32(text_length(PG_GETARG_DATUM(0)));
+	Datum		str = PG_GETARG_DATUM(0);
+
+	/* try to avoid decompressing argument */
+	PG_RETURN_INT32(text_length(str));
 }
 
 /*
  * text_length -
  *	Does the real work for textlen()
+ *
  *	This is broken out so it can be called directly by other string processing
- *	functions.
+ *	functions.  Note that the argument is passed as a Datum, to indicate that
+ *	it may still be in compressed form.  We can avoid decompressing it at all
+ *	in some cases.
  */
 static int32
 text_length(Datum str)
@@ -418,20 +424,13 @@ text_length(Datum str)
 	/* fastpath when max encoding length is one */
 	if (pg_database_encoding_max_length() == 1)
 		PG_RETURN_INT32(toast_raw_datum_size(str) - VARHDRSZ);
-
-	if (pg_database_encoding_max_length() > 1)
+	else
 	{
 		text	   *t = DatumGetTextP(str);
 
 		PG_RETURN_INT32(pg_mbstrlen_with_len(VARDATA(t),
 											 VARSIZE(t) - VARHDRSZ));
 	}
-
-	/* should never get here */
-	elog(ERROR, "invalid backend encoding: encoding max length < 1");
-
-	/* not reached: suppress compiler warning */
-	return 0;
 }
 
 /*
@@ -442,7 +441,10 @@ text_length(Datum str)
 Datum
 textoctetlen(PG_FUNCTION_ARGS)
 {
-	PG_RETURN_INT32(toast_raw_datum_size(PG_GETARG_DATUM(0)) - VARHDRSZ);
+	Datum		str = PG_GETARG_DATUM(0);
+
+	/* We need not detoast the input at all */
+	PG_RETURN_INT32(toast_raw_datum_size(str) - VARHDRSZ);
 }
 
 /*
@@ -504,9 +506,6 @@ textcat(PG_FUNCTION_ARGS)
  *	adjusting the length to be consistent with the "negative start" per SQL92.
  * If the length is less than zero, return the remaining string.
  *
- * Note that the arguments operate on octet length,
- *	so not aware of multibyte character sets.
- *
  * Added multibyte support.
  * - Tatsuo Ishii 1998-4-21
  * Changed behavior if starting position is less than one to conform to SQL92 behavior.
@@ -545,8 +544,11 @@ text_substr_no_len(PG_FUNCTION_ARGS)
 /*
  * text_substring -
  *	Does the real work for text_substr() and text_substr_no_len()
+ *
  *	This is broken out so it can be called directly by other string processing
- *	functions.
+ *	functions.  Note that the argument is passed as a Datum, to indicate that
+ *	it may still be in compressed/toasted form.  We can avoid detoasting all
+ *	of it in some cases.
  */
 static text *
 text_substring(Datum str, int32 start, int32 length, bool length_not_specified)
@@ -717,7 +719,7 @@ text_substring(Datum str, int32 start, int32 length, bool length_not_specified)
 		elog(ERROR, "invalid backend encoding: encoding max length < 1");
 
 	/* not reached: suppress compiler warning */
-	return PG_STR_GET_TEXT("");
+	return NULL;
 }
 
 /*
@@ -730,51 +732,61 @@ text_substring(Datum str, int32 start, int32 length, bool length_not_specified)
 Datum
 textpos(PG_FUNCTION_ARGS)
 {
-	PG_RETURN_INT32(text_position(PG_GETARG_DATUM(0), PG_GETARG_DATUM(1), 1));
+	text	   *str = PG_GETARG_TEXT_P(0);
+	text	   *search_str = PG_GETARG_TEXT_P(1);
+
+	PG_RETURN_INT32(text_position(str, search_str, 1));
 }
 
 /*
  * text_position -
  *	Does the real work for textpos()
+ *
+ * Inputs:
+ *		t1 - string to be searched
+ *		t2 - pattern to match within t1
+ *		matchnum - number of the match to be found (1 is the first match)
+ * Result:
+ *		Character index of the first matched char, starting from 1,
+ *		or 0 if no match.
+ *
  *	This is broken out so it can be called directly by other string processing
  *	functions.
  */
 static int32
-text_position(Datum str, Datum search_str, int matchnum)
+text_position(text *t1, text *t2, int matchnum)
 {
-	int			eml = pg_database_encoding_max_length();
-	text	   *t1 = DatumGetTextP(str);
-	text	   *t2 = DatumGetTextP(search_str);
 	int			match = 0,
 				pos = 0,
-				p = 0,
+				p,
 				px,
 				len1,
 				len2;
 
-	if (matchnum == 0)
+	if (matchnum <= 0)
 		return 0;				/* result for 0th match */
 
 	if (VARSIZE(t2) <= VARHDRSZ)
-		PG_RETURN_INT32(1);		/* result for empty pattern */
+		return 1;				/* result for empty pattern */
 
 	len1 = (VARSIZE(t1) - VARHDRSZ);
 	len2 = (VARSIZE(t2) - VARHDRSZ);
 
-	/* no use in searching str past point where search_str will fit */
-	px = (len1 - len2);
-
-	if (eml == 1)				/* simple case - single byte encoding */
+	if (pg_database_encoding_max_length() == 1)
 	{
+		/* simple case - single byte encoding */
 		char	   *p1,
 				   *p2;
 
 		p1 = VARDATA(t1);
 		p2 = VARDATA(t2);
 
+		/* no use in searching t1 past point where t2 will fit */
+		px = (len1 - len2);
+
 		for (p = 0; p <= px; p++)
 		{
-			if ((*p2 == *p1) && (strncmp(p1, p2, len2) == 0))
+			if ((*p1 == *p2) && (strncmp(p1, p2, len2) == 0))
 			{
 				if (++match == matchnum)
 				{
@@ -785,8 +797,9 @@ text_position(Datum str, Datum search_str, int matchnum)
 			p1++;
 		}
 	}
-	else if (eml > 1)			/* not as simple - multibyte encoding */
+	else
 	{
+		/* not as simple - multibyte encoding */
 		pg_wchar   *p1,
 				   *p2,
 				   *ps1,
@@ -799,9 +812,12 @@ text_position(Datum str, Datum search_str, int matchnum)
 		(void) pg_mb2wchar_with_len((unsigned char *) VARDATA(t2), p2, len2);
 		len2 = pg_wchar_strlen(p2);
 
+		/* no use in searching t1 past point where t2 will fit */
+		px = (len1 - len2);
+
 		for (p = 0; p <= px; p++)
 		{
-			if ((*p2 == *p1) && (pg_wchar_strncmp(p1, p2, len2) == 0))
+			if ((*p1 == *p2) && (pg_wchar_strncmp(p1, p2, len2) == 0))
 			{
 				if (++match == matchnum)
 				{
@@ -815,10 +831,8 @@ text_position(Datum str, Datum search_str, int matchnum)
 		pfree(ps1);
 		pfree(ps2);
 	}
-	else
-		elog(ERROR, "invalid backend encoding: encoding max length < 1");
 
-	PG_RETURN_INT32(pos);
+	return pos;
 }
 
 /* varstr_cmp()
@@ -1199,7 +1213,10 @@ bttext_pattern_cmp(PG_FUNCTION_ARGS)
 Datum
 byteaoctetlen(PG_FUNCTION_ARGS)
 {
-	PG_RETURN_INT32(toast_raw_datum_size(PG_GETARG_DATUM(0)) - VARHDRSZ);
+	Datum		str = PG_GETARG_DATUM(0);
+
+	/* We need not detoast the input at all */
+	PG_RETURN_INT32(toast_raw_datum_size(str) - VARHDRSZ);
 }
 
 /*
@@ -1925,17 +1942,17 @@ byteacmp(PG_FUNCTION_ARGS)
 Datum
 replace_text(PG_FUNCTION_ARGS)
 {
+	text	   *src_text = PG_GETARG_TEXT_P(0);
+	text	   *from_sub_text = PG_GETARG_TEXT_P(1);
+	text	   *to_sub_text = PG_GETARG_TEXT_P(2);
+	int			src_text_len = TEXTLEN(src_text);
+	int			from_sub_text_len = TEXTLEN(from_sub_text);
+	char	   *to_sub_str = PG_TEXT_GET_STR(to_sub_text);
 	text	   *left_text;
 	text	   *right_text;
 	text	   *buf_text;
 	text	   *ret_text;
 	int			curr_posn;
-	text	   *src_text = PG_GETARG_TEXT_P(0);
-	int			src_text_len = TEXTLEN(src_text);
-	text	   *from_sub_text = PG_GETARG_TEXT_P(1);
-	int			from_sub_text_len = TEXTLEN(from_sub_text);
-	text	   *to_sub_text = PG_GETARG_TEXT_P(2);
-	char	   *to_sub_str = PG_TEXT_GET_STR(to_sub_text);
 	StringInfo	str = makeStringInfo();
 
 	if (src_text_len == 0 || from_sub_text_len == 0)
@@ -1978,14 +1995,20 @@ Datum
 split_text(PG_FUNCTION_ARGS)
 {
 	text	   *inputstring = PG_GETARG_TEXT_P(0);
-	int			inputstring_len = TEXTLEN(inputstring);
 	text	   *fldsep = PG_GETARG_TEXT_P(1);
-	int			fldsep_len = TEXTLEN(fldsep);
 	int			fldnum = PG_GETARG_INT32(2);
-	int			start_posn = 0;
-	int			end_posn = 0;
+	int			inputstring_len = TEXTLEN(inputstring);
+	int			fldsep_len = TEXTLEN(fldsep);
+	int			start_posn;
+	int			end_posn;
 	text	   *result_text;
 
+	/* field number is 1 based */
+	if (fldnum < 1)
+		ereport(ERROR,
+				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+				 errmsg("field position must be greater than zero")));
+
 	/* return empty string for empty input string */
 	if (inputstring_len < 1)
 		PG_RETURN_TEXT_P(PG_STR_GET_TEXT(""));
@@ -1993,52 +2016,45 @@ split_text(PG_FUNCTION_ARGS)
 	/* empty field separator */
 	if (fldsep_len < 1)
 	{
-		if (fldnum == 1)		/* first field - just return the input
-								 * string */
+		/* if first field, return input string, else empty string */
+		if (fldnum == 1)
 			PG_RETURN_TEXT_P(inputstring);
 		else
-/* otherwise return an empty string */
 			PG_RETURN_TEXT_P(PG_STR_GET_TEXT(""));
 	}
 
-	/* field number is 1 based */
-	if (fldnum < 1)
-		ereport(ERROR,
-				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
-				 errmsg("field position must be greater than zero")));
-
-	start_posn = text_position(PointerGetDatum(inputstring),
-							   PointerGetDatum(fldsep),
-							   fldnum - 1);
-	end_posn = text_position(PointerGetDatum(inputstring),
-							 PointerGetDatum(fldsep),
-							 fldnum);
+	start_posn = text_position(inputstring, fldsep, fldnum - 1);
+	end_posn = text_position(inputstring, fldsep, fldnum);
 
 	if ((start_posn == 0) && (end_posn == 0))	/* fldsep not found */
 	{
-		if (fldnum == 1)		/* first field - just return the input
-								 * string */
+		/* if first field, return input string, else empty string */
+		if (fldnum == 1)
 			PG_RETURN_TEXT_P(inputstring);
 		else
-/* otherwise return an empty string */
 			PG_RETURN_TEXT_P(PG_STR_GET_TEXT(""));
 	}
-	else if ((start_posn != 0) && (end_posn == 0))
+	else if (start_posn == 0)
 	{
-		/* last field requested */
-		result_text = text_substring(PointerGetDatum(inputstring), start_posn + fldsep_len, -1, true);
+		/* first field requested */
+		result_text = LEFT(inputstring, fldsep);
 		PG_RETURN_TEXT_P(result_text);
 	}
-	else if ((start_posn == 0) && (end_posn != 0))
+	else if (end_posn == 0)
 	{
-		/* first field requested */
-		result_text = LEFT(inputstring, fldsep);
+		/* last field requested */
+		result_text = text_substring(PointerGetDatum(inputstring),
+									 start_posn + fldsep_len,
+									 -1, true);
 		PG_RETURN_TEXT_P(result_text);
 	}
 	else
 	{
-		/* prior to last field requested */
-		result_text = text_substring(PointerGetDatum(inputstring), start_posn + fldsep_len, end_posn - start_posn - fldsep_len, false);
+		/* interior field requested */
+		result_text = text_substring(PointerGetDatum(inputstring),
+									 start_posn + fldsep_len,
+									 end_posn - start_posn - fldsep_len,
+									 false);
 		PG_RETURN_TEXT_P(result_text);
 	}
 }
@@ -2053,15 +2069,14 @@ Datum
 text_to_array(PG_FUNCTION_ARGS)
 {
 	text	   *inputstring = PG_GETARG_TEXT_P(0);
-	int			inputstring_len = TEXTLEN(inputstring);
 	text	   *fldsep = PG_GETARG_TEXT_P(1);
+	int			inputstring_len = TEXTLEN(inputstring);
 	int			fldsep_len = TEXTLEN(fldsep);
 	int			fldnum;
-	int			start_posn = 0;
-	int			end_posn = 0;
-	text	   *result_text = NULL;
+	int			start_posn;
+	int			end_posn;
+	text	   *result_text;
 	ArrayBuildState *astate = NULL;
-	MemoryContext oldcontext = CurrentMemoryContext;
 
 	/* return NULL for empty input string */
 	if (inputstring_len < 1)
@@ -2083,9 +2098,7 @@ text_to_array(PG_FUNCTION_ARGS)
 		bool		disnull = false;
 
 		start_posn = end_posn;
-		end_posn = text_position(PointerGetDatum(inputstring),
-								 PointerGetDatum(fldsep),
-								 fldnum);
+		end_posn = text_position(inputstring, fldsep, fldnum);
 
 		if ((start_posn == 0) && (end_posn == 0))		/* fldsep not found */
 		{
@@ -2101,30 +2114,36 @@ text_to_array(PG_FUNCTION_ARGS)
 			else
 			{
 				/* otherwise create array and exit */
-				PG_RETURN_ARRAYTYPE_P(makeArrayResult(astate, oldcontext));
+				PG_RETURN_ARRAYTYPE_P(makeArrayResult(astate,
+													  CurrentMemoryContext));
 			}
 		}
-		else if ((start_posn != 0) && (end_posn == 0))
-		{
-			/* last field requested */
-			result_text = text_substring(PointerGetDatum(inputstring), start_posn + fldsep_len, -1, true);
-		}
-		else if ((start_posn == 0) && (end_posn != 0))
+		else if (start_posn == 0)
 		{
 			/* first field requested */
 			result_text = LEFT(inputstring, fldsep);
 		}
+		else if (end_posn == 0)
+		{
+			/* last field requested */
+			result_text = text_substring(PointerGetDatum(inputstring),
+										 start_posn + fldsep_len,
+										 -1, true);
+		}
 		else
 		{
-			/* prior to last field requested */
-			result_text = text_substring(PointerGetDatum(inputstring), start_posn + fldsep_len, end_posn - start_posn - fldsep_len, false);
+			/* interior field requested */
+			result_text = text_substring(PointerGetDatum(inputstring),
+										 start_posn + fldsep_len,
+										 end_posn - start_posn - fldsep_len,
+										 false);
 		}
 
 		/* stash away current value */
 		dvalue = PointerGetDatum(result_text);
 		astate = accumArrayResult(astate, dvalue,
-								  disnull, TEXTOID, oldcontext);
-
+								  disnull, TEXTOID,
+								  CurrentMemoryContext);
 	}
 
 	/* never reached -- keep compiler quiet */
-- 
GitLab