Skip to content
Snippets Groups Projects
Commit 5fdd9418 authored by Bruce Momjian's avatar Bruce Momjian
Browse files

Handle carriage returns and line feeds in COPY CSV mode.

Andrew Dunstan
parent 06a61d66
No related branches found
No related tags found
No related merge requests found
......@@ -8,7 +8,7 @@
*
*
* IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/commands/copy.c,v 1.236 2004/12/31 21:59:41 pgsql Exp $
* $PostgreSQL: pgsql/src/backend/commands/copy.c,v 1.237 2005/03/12 05:41:34 momjian Exp $
*
*-------------------------------------------------------------------------
*/
......@@ -98,7 +98,6 @@ static bool fe_eof; /* true if detected end of copy data */
static EolType eol_type; /* EOL type of input */
static int client_encoding; /* remote side's character encoding */
static int server_encoding; /* local encoding */
static bool embedded_line_warning;
/* these are just for error messages, see copy_in_error_callback */
static bool copy_binary; /* is it a binary copy? */
......@@ -139,7 +138,7 @@ static void CopyTo(Relation rel, List *attnumlist, bool binary, bool oids,
static void CopyFrom(Relation rel, List *attnumlist, bool binary, bool oids,
char *delim, char *null_print, bool csv_mode, char *quote, char *escape,
List *force_notnull_atts);
static bool CopyReadLine(void);
static bool CopyReadLine(char * quote, char * escape);
static char *CopyReadAttribute(const char *delim, const char *null_print,
CopyReadResult *result, bool *isnull);
static char *CopyReadAttributeCSV(const char *delim, const char *null_print,
......@@ -1191,7 +1190,6 @@ CopyTo(Relation rel, List *attnumlist, bool binary, bool oids,
attr = tupDesc->attrs;
num_phys_attrs = tupDesc->natts;
attr_count = list_length(attnumlist);
embedded_line_warning = false;
/*
* Get info about the columns we need to process.
......@@ -1718,7 +1716,8 @@ CopyFrom(Relation rel, List *attnumlist, bool binary, bool oids,
ListCell *cur;
/* Actually read the line into memory here */
done = CopyReadLine();
done = csv_mode ?
CopyReadLine(quote, escape) : CopyReadLine(NULL, NULL);
/*
* EOF at start of line means we're done. If we see EOF after
......@@ -2006,7 +2005,7 @@ CopyFrom(Relation rel, List *attnumlist, bool binary, bool oids,
* by newline.
*/
static bool
CopyReadLine(void)
CopyReadLine(char * quote, char * escape)
{
bool result;
bool change_encoding = (client_encoding != server_encoding);
......@@ -2015,6 +2014,19 @@ CopyReadLine(void)
int j;
unsigned char s[2];
char *cvt;
bool in_quote = false, last_was_esc = false, csv_mode = false;
char quotec = '\0', escapec = '\0';
if (quote)
{
csv_mode = true;
quotec = quote[0];
escapec = escape[0];
/* ignore special escape processing if it's the same as quotec */
if (quotec == escapec)
escapec = '\0';
}
s[1] = 0;
......@@ -2031,11 +2043,20 @@ CopyReadLine(void)
/*
* In this loop we only care for detecting newlines (\r and/or \n) and
* the end-of-copy marker (\.). For backwards compatibility we allow
* the end-of-copy marker (\.).
*
* In Text mode, for backwards compatibility we allow
* backslashes to escape newline characters. Backslashes other than
* the end marker get put into the line_buf, since CopyReadAttribute
* does its own escape processing. These four characters, and only
* these four, are assumed the same in frontend and backend encodings.
* does its own escape processing.
*
* In CSV mode, CR and NL inside a quoted field are just part of the
* data value and are put in line_buf. We keep just enough state
* to know if we are currently in a quoted field or not.
*
* These four characters, and only these four, are assumed the same in
* frontend and backend encodings.
*
* We do not assume that second and later bytes of a frontend
* multibyte character couldn't look like ASCII characters.
*/
......@@ -2047,13 +2068,49 @@ CopyReadLine(void)
result = true;
break;
}
if (c == '\r')
if (csv_mode)
{
/*
* Dealing with quotes and escapes here is mildly tricky. If the
* quote char is also the escape char, there's no problem - we
* just use the char as a toggle. If they are different, we need
* to ensure that we only take account of an escape inside a quoted
* field and immediately preceding a quote char, and not the
* second in an escape-escape sequence.
*/
if (in_quote && c == escapec)
last_was_esc = ! last_was_esc;
if (c == quotec && ! last_was_esc)
in_quote = ! in_quote;
if (c != escapec)
last_was_esc = false;
/*
* updating the line count for embedded CR and/or LF chars is
* necessarily a little fragile - this test is probably about
* the best we can do.
*/
if (in_quote && c == (eol_type == EOL_CR ? '\r' : '\n'))
copy_lineno++;
}
if (!in_quote && c == '\r')
{
if (eol_type == EOL_NL)
{
if (! csv_mode)
ereport(ERROR,
(errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
errmsg("literal carriage return found in data"),
errhint("Use \"\\r\" to represent carriage return.")));
else
ereport(ERROR,
(errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
errmsg("unquoted carriage return found in CSV data"),
errhint("Use quoted CSV field to represent carriage return.")));
}
/* Check for \r\n on first line, _and_ handle \r\n. */
if (eol_type == EOL_UNKNOWN || eol_type == EOL_CRNL)
{
......@@ -2068,10 +2125,19 @@ CopyReadLine(void)
{
/* found \r, but no \n */
if (eol_type == EOL_CRNL)
{
if (!csv_mode)
ereport(ERROR,
(errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
errmsg("literal carriage return found in data"),
errhint("Use \"\\r\" to represent carriage return.")));
else
ereport(ERROR,
(errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
errmsg("unquoted carriage return found in data"),
errhint("Use quoted CSV field to represent carriage return.")));
}
/*
* if we got here, it is the first line and we didn't
......@@ -2083,26 +2149,47 @@ CopyReadLine(void)
}
break;
}
if (c == '\n')
if (!in_quote && c == '\n')
{
if (eol_type == EOL_CR || eol_type == EOL_CRNL)
{
if (!csv_mode)
ereport(ERROR,
(errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
errmsg("literal newline found in data"),
errhint("Use \"\\n\" to represent newline.")));
else
ereport(ERROR,
(errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
errmsg("unquoted newline found in data"),
errhint("Use quoted CSV field to represent newline.")));
}
eol_type = EOL_NL;
break;
}
if (c == '\\')
if ((line_buf.len == 0 || !csv_mode) && c == '\\')
{
c = CopyGetChar();
if (c == EOF)
int c2;
if (csv_mode)
c2 = CopyPeekChar();
else
c2 = c = CopyGetChar();
if (c2 == EOF)
{
result = true;
if (csv_mode)
CopyDonePeek(c2, true);
break;
}
if (c == '.')
if (c2 == '.')
{
if (csv_mode)
CopyDonePeek(c2, true); /* allow keep calling GetChar() */
if (eol_type == EOL_CRNL)
{
c = CopyGetChar();
......@@ -2140,6 +2227,10 @@ CopyReadLine(void)
result = true; /* report EOF */
break;
}
if (csv_mode)
CopyDonePeek(c2, false); /* not a dot, so put it back */
else
/* not EOF mark, so emit \ and following char literally */
appendStringInfoCharMacro(&line_buf, '\\');
}
......@@ -2369,34 +2460,6 @@ CopyReadAttributeCSV(const char *delim, const char *null_print, char *quote,
for (;;)
{
/* handle multiline quoted fields */
if (in_quote && line_buf.cursor >= line_buf.len)
{
bool done;
switch (eol_type)
{
case EOL_NL:
appendStringInfoString(&attribute_buf, "\n");
break;
case EOL_CR:
appendStringInfoString(&attribute_buf, "\r");
break;
case EOL_CRNL:
appendStringInfoString(&attribute_buf, "\r\n");
break;
case EOL_UNKNOWN:
/* shouldn't happen - just keep going */
break;
}
copy_lineno++;
done = CopyReadLine();
if (done && line_buf.len == 0)
break;
start_cursor = line_buf.cursor;
}
end_cursor = line_buf.cursor;
if (line_buf.cursor >= line_buf.len)
break;
......@@ -2629,25 +2692,6 @@ CopyAttributeOutCSV(char *server_string, char *delim, char *quote,
!use_quote && (c = *test_string) != '\0';
test_string += mblen)
{
/*
* We don't know here what the surrounding line end characters
* might be. It might not even be under postgres' control. So
* we simply warn on ANY embedded line ending character.
*
* This warning will disappear when we make line parsing field-aware,
* so that we can reliably read in embedded line ending characters
* regardless of the file's line-end context.
*
*/
if (!embedded_line_warning && (c == '\n' || c == '\r') )
{
embedded_line_warning = true;
elog(WARNING,
"CSV fields with embedded linefeed or carriage return "
"characters might not be able to be reimported");
}
if (c == delimc || c == quotec || c == '\n' || c == '\r')
use_quote = true;
if (!same_encoding)
......
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment