Handle carriage returns and line feeds in COPY CSV mode.

Andrew Dunstan

Handle carriage returns and line feeds in COPY CSV mode.
5fdd9418 · Bruce Momjian · 06a61d66 · 5fdd9418
Commit 5fdd9418 authored Mar 12, 2005 by Bruce Momjian
--- a/src/backend/commands/copy.c
+++ b/src/backend/commands/copy.c
@@ -8,7 +8,7 @@
 *
 *
 * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/commands/copy.c,v 1.236 2004/12/31 21:59:41 pgsql Exp $
+ *	  $PostgreSQL: pgsql/src/backend/commands/copy.c,v 1.237 2005/03/12 05:41:34 momjian Exp $
 *
 *-------------------------------------------------------------------------
 */
@@ -98,7 +98,6 @@ static bool fe_eof;				/* true if detected end of copy data */
 static EolType eol_type;		/* EOL type of input */
 static int	client_encoding;	/* remote side's character encoding */
 static int	server_encoding;	/* local encoding */
-static bool embedded_line_warning;

 /* these are just for error messages, see copy_in_error_callback */
 static bool copy_binary;		/* is it a binary copy? */
@@ -139,7 +138,7 @@ static void CopyTo(Relation rel, List *attnumlist, bool binary, bool oids,
 static void CopyFrom(Relation rel, List *attnumlist, bool binary, bool oids,
 char *delim, char *null_print, bool csv_mode, char *quote, char *escape,
 		 List *force_notnull_atts);
-static bool CopyReadLine(void);
+static bool CopyReadLine(char * quote, char * escape);
 static char *CopyReadAttribute(const char *delim, const char *null_print,
 				  CopyReadResult *result, bool *isnull);
 static char *CopyReadAttributeCSV(const char *delim, const char *null_print,
@@ -1191,7 +1190,6 @@ CopyTo(Relation rel, List *attnumlist, bool binary, bool oids,
 	attr = tupDesc->attrs;
 	num_phys_attrs = tupDesc->natts;
 	attr_count = list_length(attnumlist);
-	embedded_line_warning = false;

 	/*
 	 * Get info about the columns we need to process.
@@ -1718,7 +1716,8 @@ CopyFrom(Relation rel, List *attnumlist, bool binary, bool oids,
 			ListCell   *cur;

 			/* Actually read the line into memory here */
-			done = CopyReadLine();
+			done = csv_mode ? 
+				CopyReadLine(quote, escape) : CopyReadLine(NULL, NULL);

 			/*
 			 * EOF at start of line means we're done.  If we see EOF after
@@ -2006,7 +2005,7 @@ CopyFrom(Relation rel, List *attnumlist, bool binary, bool oids,
 * by newline.
 */
 static bool
-CopyReadLine(void)
+CopyReadLine(char * quote, char * escape)
 {
 	bool		result;
 	bool		change_encoding = (client_encoding != server_encoding);
@@ -2015,6 +2014,19 @@ CopyReadLine(void)
 	int			j;
 	unsigned char s[2];
 	char	   *cvt;
+	bool        in_quote = false, last_was_esc = false, csv_mode = false;
+	char        quotec = '\0', escapec = '\0';
+
+	if (quote)
+	{
+		csv_mode = true;
+		quotec = quote[0];
+		escapec = escape[0];
+		/* ignore special escape processing if it's the same as quotec */
+		if (quotec == escapec)
+			escapec = '\0';
+	}
+

 	s[1] = 0;

@@ -2031,11 +2043,20 @@ CopyReadLine(void)

 	/*
 	 * In this loop we only care for detecting newlines (\r and/or \n) and
-	 * the end-of-copy marker (\.).  For backwards compatibility we allow
+	 * the end-of-copy marker (\.).  
+	 *
+	 * In Text mode, for backwards compatibility we allow
 	 * backslashes to escape newline characters.  Backslashes other than
 	 * the end marker get put into the line_buf, since CopyReadAttribute
-	 * does its own escape processing.	These four characters, and only
-	 * these four, are assumed the same in frontend and backend encodings.
+	 * does its own escape processing.	
+	 *
+	 * In CSV mode, CR and NL inside q quoted field are just part of the
+	 * data value and are put in line_buf. We keep just enough state
+	 * to know if we are currently in a quoted field or not.
+	 *
+	 * These four characters, and only these four, are assumed the same in 
+	 * frontend and backend encodings.
+	 *
 	 * We do not assume that second and later bytes of a frontend
 	 * multibyte character couldn't look like ASCII characters.
 	 */
@@ -2047,13 +2068,49 @@ CopyReadLine(void)
 			result = true;
 			break;
 		}
-		if (c == '\r')
+
+		if (csv_mode)
+		{
+			/*  
+			 * Dealing with quotes and escapes here is mildly tricky. If the
+			 * quote char is also the escape char, there's no problem - we  
+			 * just use the char as a toggle. If they are different, we need
+			 * to ensure that we only take account of an escape inside a quoted
+			 * field and immediately preceding a quote char, and not the
+			 * second in a escape-escape sequence.
+			 */ 
+
+			if (in_quote && c == escapec)
+				last_was_esc = ! last_was_esc;
+			if (c == quotec && ! last_was_esc)
+				in_quote = ! in_quote;
+			if (c != escapec)
+				last_was_esc = false;
+
+			/*
+			 * updating the line count for embedded CR and/or LF chars is 
+			 * necessarily a little fragile - this test is probably about 
+			 * the best we can do.
+			 */ 
+			if (in_quote && c == (eol_type == EOL_CR ? '\r' : '\n')) 
+				copy_lineno++; 
+		}
+
+		if (!in_quote && c == '\r')
 		{
 			if (eol_type == EOL_NL)
+			{
+				if (! csv_mode)
 					ereport(ERROR,
 							(errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
 							 errmsg("literal carriage return found in data"),
 							 errhint("Use \"\\r\" to represent carriage return.")));
+				else
+					ereport(ERROR,
+							(errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
+							 errmsg("unquoted carriage return found in CSV data"),
+							 errhint("Use quoted CSV field to represent carriage return.")));
+			}
 			/* Check for \r\n on first line, _and_ handle \r\n. */
 			if (eol_type == EOL_UNKNOWN || eol_type == EOL_CRNL)
 			{
@@ -2068,10 +2125,19 @@ CopyReadLine(void)
 				{
 					/* found \r, but no \n */
 					if (eol_type == EOL_CRNL)
+					{
+						if (!csv_mode)
 							ereport(ERROR,
 									(errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
 									 errmsg("literal carriage return found in data"),
 									 errhint("Use \"\\r\" to represent carriage return.")));
+						else
+							ereport(ERROR,
+									(errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
+									 errmsg("unquoted carriage return found in data"),
+									 errhint("Use quoted CSV field to represent carriage return.")));
+
+					}

 					/*
 					 * if we got here, it is the first line and we didn't
@@ -2083,26 +2149,47 @@ CopyReadLine(void)
 			}
 			break;
 		}
-		if (c == '\n')
+		if (!in_quote && c == '\n')
 		{
 			if (eol_type == EOL_CR || eol_type == EOL_CRNL)
+			{
+				if (!csv_mode)
 					ereport(ERROR,
 							(errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
 							 errmsg("literal newline found in data"),
 							 errhint("Use \"\\n\" to represent newline.")));
+				else
+					ereport(ERROR,
+							(errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
+							 errmsg("unquoted newline found in data"),
+							 errhint("Use quoted CSV field to represent newline.")));
+					
+			}
 			eol_type = EOL_NL;
 			break;
 		}
-		if (c == '\\')
+
+		if ((line_buf.len == 0 || !csv_mode) && c == '\\')
 		{
-			c = CopyGetChar();
-			if (c == EOF)
+			int c2;
+			
+			if (csv_mode)
+				c2 = CopyPeekChar();
+			else
+				c2 = c = CopyGetChar();
+
+			if (c2 == EOF)
 			{
 				result = true;
+				if (csv_mode)
+					CopyDonePeek(c2, true);
 				break;
 			}
-			if (c == '.')
+			if (c2 == '.')
 			{
+				if (csv_mode)
+					CopyDonePeek(c2, true); /* allow keep calling GetChar() */
+
 				if (eol_type == EOL_CRNL)
 				{
 					c = CopyGetChar();
@@ -2140,6 +2227,10 @@ CopyReadLine(void)
 				result = true;	/* report EOF */
 				break;
 			}
+			
+			if (csv_mode)
+				CopyDonePeek(c2, false); /* not a dot, so put it back */ 
+			else
 				/* not EOF mark, so emit \ and following char literally */
 				appendStringInfoCharMacro(&line_buf, '\\');
 		}
@@ -2369,34 +2460,6 @@ CopyReadAttributeCSV(const char *delim, const char *null_print, char *quote,

 	for (;;)
 	{
-		/* handle multiline quoted fields */
-		if (in_quote && line_buf.cursor >= line_buf.len)
-		{
-			bool		done;
-
-			switch (eol_type)
-			{
-				case EOL_NL:
-					appendStringInfoString(&attribute_buf, "\n");
-					break;
-				case EOL_CR:
-					appendStringInfoString(&attribute_buf, "\r");
-					break;
-				case EOL_CRNL:
-					appendStringInfoString(&attribute_buf, "\r\n");
-					break;
-				case EOL_UNKNOWN:
-					/* shouldn't happen - just keep going */
-					break;
-			}
-
-			copy_lineno++;
-			done = CopyReadLine();
-			if (done && line_buf.len == 0)
-				break;
-			start_cursor = line_buf.cursor;
-		}
-
 		end_cursor = line_buf.cursor;
 		if (line_buf.cursor >= line_buf.len)
 			break;
@@ -2629,25 +2692,6 @@ CopyAttributeOutCSV(char *server_string, char *delim, char *quote,
 		 !use_quote && (c = *test_string) != '\0';
 		 test_string += mblen)
 	{
-		/*
-		 * We don't know here what the surrounding line end characters
-		 * might be. It might not even be under postgres' control. So
-		 * we simple warn on ANY embedded line ending character.
-		 *
-		 * This warning will disappear when we make line parsing field-aware,
-		 * so that we can reliably read in embedded line ending characters
-		 * regardless of the file's line-end context.
-		 *
-		 */
-
-		if (!embedded_line_warning  && (c == '\n' || c == '\r') )
-		{
-			embedded_line_warning = true;
-			elog(WARNING,
-				 "CSV fields with embedded linefeed or carriage return "
-				 "characters might not be able to be reimported");
-		}
-
 		if (c == delimc || c == quotec || c == '\n' || c == '\r')
 			use_quote = true;
 		if (!same_encoding)