diff --git a/src/backend/commands/copy.c b/src/backend/commands/copy.c index 9f99bdd9e482868a29f5d92a572b8c40990689e4..ed815098aba7577ce2b29286bf108f8e40bc3a12 100644 --- a/src/backend/commands/copy.c +++ b/src/backend/commands/copy.c @@ -8,7 +8,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/commands/copy.c,v 1.236 2004/12/31 21:59:41 pgsql Exp $ + * $PostgreSQL: pgsql/src/backend/commands/copy.c,v 1.237 2005/03/12 05:41:34 momjian Exp $ * *------------------------------------------------------------------------- */ @@ -98,7 +98,6 @@ static bool fe_eof; /* true if detected end of copy data */ static EolType eol_type; /* EOL type of input */ static int client_encoding; /* remote side's character encoding */ static int server_encoding; /* local encoding */ -static bool embedded_line_warning; /* these are just for error messages, see copy_in_error_callback */ static bool copy_binary; /* is it a binary copy? */ @@ -139,7 +138,7 @@ static void CopyTo(Relation rel, List *attnumlist, bool binary, bool oids, static void CopyFrom(Relation rel, List *attnumlist, bool binary, bool oids, char *delim, char *null_print, bool csv_mode, char *quote, char *escape, List *force_notnull_atts); -static bool CopyReadLine(void); +static bool CopyReadLine(char * quote, char * escape); static char *CopyReadAttribute(const char *delim, const char *null_print, CopyReadResult *result, bool *isnull); static char *CopyReadAttributeCSV(const char *delim, const char *null_print, @@ -1191,7 +1190,6 @@ CopyTo(Relation rel, List *attnumlist, bool binary, bool oids, attr = tupDesc->attrs; num_phys_attrs = tupDesc->natts; attr_count = list_length(attnumlist); - embedded_line_warning = false; /* * Get info about the columns we need to process. @@ -1718,7 +1716,8 @@ CopyFrom(Relation rel, List *attnumlist, bool binary, bool oids, ListCell *cur; /* Actually read the line into memory here */ - done = CopyReadLine(); + done = csv_mode ? 
+ CopyReadLine(quote, escape) : CopyReadLine(NULL, NULL); /* * EOF at start of line means we're done. If we see EOF after @@ -2006,7 +2005,7 @@ CopyFrom(Relation rel, List *attnumlist, bool binary, bool oids, * by newline. */ static bool -CopyReadLine(void) +CopyReadLine(char * quote, char * escape) { bool result; bool change_encoding = (client_encoding != server_encoding); @@ -2015,6 +2014,19 @@ CopyReadLine(void) int j; unsigned char s[2]; char *cvt; + bool in_quote = false, last_was_esc = false, csv_mode = false; + char quotec = '\0', escapec = '\0'; + + if (quote) + { + csv_mode = true; + quotec = quote[0]; + escapec = escape[0]; + /* ignore special escape processing if it's the same as quotec */ + if (quotec == escapec) + escapec = '\0'; + } + s[1] = 0; @@ -2031,11 +2043,20 @@ CopyReadLine(void) /* * In this loop we only care for detecting newlines (\r and/or \n) and - * the end-of-copy marker (\.). For backwards compatibility we allow + * the end-of-copy marker (\.). + * + * In Text mode, for backwards compatibility we allow * backslashes to escape newline characters. Backslashes other than * the end marker get put into the line_buf, since CopyReadAttribute - * does its own escape processing. These four characters, and only - * these four, are assumed the same in frontend and backend encodings. + * does its own escape processing. + * + * In CSV mode, CR and NL inside a quoted field are just part of the + * data value and are put in line_buf. We keep just enough state + * to know if we are currently in a quoted field or not. + * + * These four characters, and only these four, are assumed the same in + * frontend and backend encodings. + * * We do not assume that second and later bytes of a frontend * multibyte character couldn't look like ASCII characters. */ @@ -2047,13 +2068,49 @@ CopyReadLine(void) result = true; break; } - if (c == '\r') + + if (csv_mode) + { + /* + * Dealing with quotes and escapes here is mildly tricky. 
If the + * quote char is also the escape char, there's no problem - we + * just use the char as a toggle. If they are different, we need + * to ensure that we only take account of an escape inside a quoted + * field and immediately preceding a quote char, and not the + * second in an escape-escape sequence. + */ + + if (in_quote && c == escapec) + last_was_esc = ! last_was_esc; + if (c == quotec && ! last_was_esc) + in_quote = ! in_quote; + if (c != escapec) + last_was_esc = false; + + /* + * updating the line count for embedded CR and/or LF chars is + * necessarily a little fragile - this test is probably about + * the best we can do. + */ + if (in_quote && c == (eol_type == EOL_CR ? '\r' : '\n')) + copy_lineno++; + } + + if (!in_quote && c == '\r') { if (eol_type == EOL_NL) - ereport(ERROR, - (errcode(ERRCODE_BAD_COPY_FILE_FORMAT), - errmsg("literal carriage return found in data"), - errhint("Use \"\\r\" to represent carriage return."))); + { + if (! csv_mode) + ereport(ERROR, + (errcode(ERRCODE_BAD_COPY_FILE_FORMAT), + errmsg("literal carriage return found in data"), + errhint("Use \"\\r\" to represent carriage return."))); + else + ereport(ERROR, + (errcode(ERRCODE_BAD_COPY_FILE_FORMAT), + errmsg("unquoted carriage return found in CSV data"), + errhint("Use quoted CSV field to represent carriage return."))); + } /* Check for \r\n on first line, _and_ handle \r\n. 
*/ if (eol_type == EOL_UNKNOWN || eol_type == EOL_CRNL) { @@ -2068,10 +2125,19 @@ CopyReadLine(void) { /* found \r, but no \n */ if (eol_type == EOL_CRNL) - ereport(ERROR, - (errcode(ERRCODE_BAD_COPY_FILE_FORMAT), - errmsg("literal carriage return found in data"), - errhint("Use \"\\r\" to represent carriage return."))); + { + if (!csv_mode) + ereport(ERROR, + (errcode(ERRCODE_BAD_COPY_FILE_FORMAT), + errmsg("literal carriage return found in data"), + errhint("Use \"\\r\" to represent carriage return."))); + else + ereport(ERROR, + (errcode(ERRCODE_BAD_COPY_FILE_FORMAT), + errmsg("unquoted carriage return found in data"), + errhint("Use quoted CSV field to represent carriage return."))); + + } /* * if we got here, it is the first line and we didn't @@ -2083,26 +2149,47 @@ CopyReadLine(void) } break; } - if (c == '\n') + if (!in_quote && c == '\n') { if (eol_type == EOL_CR || eol_type == EOL_CRNL) - ereport(ERROR, - (errcode(ERRCODE_BAD_COPY_FILE_FORMAT), - errmsg("literal newline found in data"), - errhint("Use \"\\n\" to represent newline."))); + { + if (!csv_mode) + ereport(ERROR, + (errcode(ERRCODE_BAD_COPY_FILE_FORMAT), + errmsg("literal newline found in data"), + errhint("Use \"\\n\" to represent newline."))); + else + ereport(ERROR, + (errcode(ERRCODE_BAD_COPY_FILE_FORMAT), + errmsg("unquoted newline found in data"), + errhint("Use quoted CSV field to represent newline."))); + + } eol_type = EOL_NL; break; } - if (c == '\\') + + if ((line_buf.len == 0 || !csv_mode) && c == '\\') { - c = CopyGetChar(); - if (c == EOF) + int c2; + + if (csv_mode) + c2 = CopyPeekChar(); + else + c2 = c = CopyGetChar(); + + if (c2 == EOF) { result = true; + if (csv_mode) + CopyDonePeek(c2, true); break; } - if (c == '.') + if (c2 == '.') { + if (csv_mode) + CopyDonePeek(c2, true); /* allow keep calling GetChar() */ + if (eol_type == EOL_CRNL) { c = CopyGetChar(); @@ -2140,8 +2227,12 @@ CopyReadLine(void) result = true; /* report EOF */ break; } - /* not EOF mark, so emit \ and 
following char literally */ - appendStringInfoCharMacro(&line_buf, '\\'); + + if (csv_mode) + CopyDonePeek(c2, false); /* not a dot, so put it back */ + else + /* not EOF mark, so emit \ and following char literally */ + appendStringInfoCharMacro(&line_buf, '\\'); } appendStringInfoCharMacro(&line_buf, c); @@ -2369,34 +2460,6 @@ CopyReadAttributeCSV(const char *delim, const char *null_print, char *quote, for (;;) { - /* handle multiline quoted fields */ - if (in_quote && line_buf.cursor >= line_buf.len) - { - bool done; - - switch (eol_type) - { - case EOL_NL: - appendStringInfoString(&attribute_buf, "\n"); - break; - case EOL_CR: - appendStringInfoString(&attribute_buf, "\r"); - break; - case EOL_CRNL: - appendStringInfoString(&attribute_buf, "\r\n"); - break; - case EOL_UNKNOWN: - /* shouldn't happen - just keep going */ - break; - } - - copy_lineno++; - done = CopyReadLine(); - if (done && line_buf.len == 0) - break; - start_cursor = line_buf.cursor; - } - end_cursor = line_buf.cursor; if (line_buf.cursor >= line_buf.len) break; @@ -2629,25 +2692,6 @@ CopyAttributeOutCSV(char *server_string, char *delim, char *quote, !use_quote && (c = *test_string) != '\0'; test_string += mblen) { - /* - * We don't know here what the surrounding line end characters - * might be. It might not even be under postgres' control. So - * we simple warn on ANY embedded line ending character. - * - * This warning will disappear when we make line parsing field-aware, - * so that we can reliably read in embedded line ending characters - * regardless of the file's line-end context. - * - */ - - if (!embedded_line_warning && (c == '\n' || c == '\r') ) - { - embedded_line_warning = true; - elog(WARNING, - "CSV fields with embedded linefeed or carriage return " - "characters might not be able to be reimported"); - } - if (c == delimc || c == quotec || c == '\n' || c == '\r') use_quote = true; if (!same_encoding)