From 799ac992014374c23a1fc437f4fd9aa413be4920 Mon Sep 17 00:00:00 2001
From: Tom Lane <tgl@sss.pgh.pa.us>
Date: Sun, 27 Sep 2009 03:27:24 +0000
Subject: [PATCH] Sync psql's scanner with recent changes in backend scanner's
 flex rules. Marko Kreen, Tom Lane

---
 src/backend/parser/scan.l | 10 ++++-----
 src/bin/psql/psqlscan.l   | 44 ++++++++++++++++++++++++++++++++++++++-
 2 files changed, 47 insertions(+), 7 deletions(-)

diff --git a/src/backend/parser/scan.l b/src/backend/parser/scan.l
index fdc95135509..150202e77ce 100644
--- a/src/backend/parser/scan.l
+++ b/src/backend/parser/scan.l
@@ -24,7 +24,7 @@
  * Portions Copyright (c) 1994, Regents of the University of California
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/parser/scan.l,v 1.161 2009/09/25 21:13:06 petere Exp $
+ *	  $PostgreSQL: pgsql/src/backend/parser/scan.l,v 1.162 2009/09/27 03:27:23 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -571,18 +571,16 @@ other			.
 
 					BEGIN(xe);
 				}
-<xeu>.			|
-<xeu>\n			|
+<xeu>.			{ yyerror("invalid Unicode surrogate pair"); }
+<xeu>\n			{ yyerror("invalid Unicode surrogate pair"); }
 <xeu><<EOF>>	{ yyerror("invalid Unicode surrogate pair"); }
-
 <xe,xeu>{xeunicodefail}	{
 						ereport(ERROR,
 								(errcode(ERRCODE_INVALID_ESCAPE_SEQUENCE),
 								 errmsg("invalid Unicode escape"),
 								 errhint("Unicode escapes must be \\uXXXX or \\UXXXXXXXX."),
 								 lexer_errposition()));
-					}
-
+				}
 <xe>{xeescape}  {
 					if (yytext[1] == '\'')
 					{
diff --git a/src/bin/psql/psqlscan.l b/src/bin/psql/psqlscan.l
index 235fe9599c6..894800aaf58 100644
--- a/src/bin/psql/psqlscan.l
+++ b/src/bin/psql/psqlscan.l
@@ -33,7 +33,7 @@
  * Portions Copyright (c) 1994, Regents of the University of California
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/bin/psql/psqlscan.l,v 1.28 2009/01/01 17:23:55 momjian Exp $
+ *	  $PostgreSQL: pgsql/src/bin/psql/psqlscan.l,v 1.29 2009/09/27 03:27:24 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -117,6 +117,7 @@ static void push_new_buffer(const char *newstr);
 static YY_BUFFER_STATE prepare_buffer(const char *txt, int len,
 									  char **txtcopy);
 static void emit(const char *txt, int len);
+static bool is_utf16_surrogate_first(uint32 c);
 
 #define ECHO emit(yytext, yyleng)
 
@@ -158,6 +159,7 @@ static void emit(const char *txt, int len);
  *  <xdolq> $foo$ quoted strings
  *  <xui> quoted identifier with Unicode escapes
  *  <xus> quoted string with Unicode escapes
+ *  <xeu> Unicode surrogate pair in extended quoted string
  */
 
 %x xb
@@ -169,6 +171,7 @@ static void emit(const char *txt, int len);
 %x xdolq
 %x xui
 %x xus
+%x xeu
 /* Additional exclusive states for psql only: lex backslash commands */
 %x xslashcmd
 %x xslasharg
@@ -192,6 +195,9 @@ static void emit(const char *txt, int len);
  * did not end with a newline.
  *
  * XXX perhaps \f (formfeed) should be treated as a newline as well?
+ *
+ * XXX if you change the set of whitespace characters, fix scanner_isspace()
+ * to agree, and see also the plpgsql lexer.
  */
 
 space			[ \t\n\r\f]
@@ -253,6 +259,8 @@ xeinside		[^\\']+
 xeescape		[\\][^0-7]
 xeoctesc		[\\][0-7]{1,3}
 xehexesc		[\\]x[0-9A-Fa-f]{1,2}
+xeunicode		[\\](u[0-9A-Fa-f]{4}|U[0-9A-Fa-f]{8})
+xeunicodefail	[\\](u[0-9A-Fa-f]{0,3}|U[0-9A-Fa-f]{0,7})
 
 /* Extended quote
  * xqdouble implements embedded quote, ''''
@@ -334,6 +342,10 @@ identifier		{ident_start}{ident_cont}*
 
 typecast		"::"
 
+/* these two token types are used by PL/pgsql, though not in core SQL */
+dot_dot			\.\.
+colon_equals	":="
+
 /*
  * "self" is the set of chars that should be returned as single-character
  * tokens.  "op_chars" is the set of chars that can make up "Op" tokens,
@@ -511,6 +523,22 @@ other			.
 <xe>{xeinside}  {
 					ECHO;
 				}
+<xe>{xeunicode} {	/* complete 4- or 8-digit Unicode escape in an extended string */
+					uint32 c = strtoul(yytext+2, NULL, 16);	/* skip 2-char prefix, parse hex digits */
+
+					if (is_utf16_surrogate_first(c))
+						BEGIN(xeu);	/* first half of a surrogate pair seen */
+					ECHO;
+				}
+<xeu>{xeunicode} {	/* next complete escape after a first surrogate */
+					BEGIN(xe);	/* back to <xe>; the escape's value is not checked here */
+					ECHO;
+				}
+<xeu>.			{ ECHO; /* no error raised here; state stays <xeu> */ }
+<xeu>\n			{ ECHO; /* likewise for newline */ }
+<xe,xeu>{xeunicodefail}	{	/* escape with too few hex digits: no error, no state change */
+					ECHO;
+				}
 <xe>{xeescape}  {
 					ECHO;
 				}
@@ -605,6 +633,14 @@ other			.
 					ECHO;
 				}
 
+{dot_dot}		{	/* two-character ".." token */
+					ECHO;	/* hand the matched text to emit() */
+				}
+
+{colon_equals}	{	/* two-character ":=" token */
+					ECHO;	/* hand the matched text to emit() */
+				}
+
 	/*
 	 * These rules are specific to psql --- they implement parenthesis
 	 * counting and detection of command-ending semicolon.  These must
@@ -1690,3 +1726,9 @@ emit(const char *txt, int len)
 		}
 	}
 }
+
+static bool
+is_utf16_surrogate_first(uint32 c)
+{
+	return (c >= 0xD800 && c <= 0xDBFF);	/* UTF-16 first (high) surrogate range */
+}
-- 
GitLab