From 00cdd83521cfdaaff0f566ebeadecc2cad4d51cf Mon Sep 17 00:00:00 2001
From: Tom Lane <tgl@sss.pgh.pa.us>
Date: Sat, 21 Nov 2015 20:21:31 -0500
Subject: [PATCH] Adopt the GNU convention for handling tar-archive members
 exceeding 8GB.

The POSIX standard for tar headers requires archive member sizes to be
printed in octal with at most 11 digits, limiting the representable file
size to 8GB.  However, GNU tar and apparently most other modern tars
support a convention in which oversized values can be stored in base-256,
allowing any practical file to be a tar member.  Adopt this convention
to remove two limitations:
* pg_dump with -Ft output format failed if the contents of any one table
exceeded 8GB.
* pg_basebackup failed if the data directory contained any file exceeding
8GB.  (This would be a fatal problem for installations configured with a
table segment size of 8GB or more, and it has also been seen to fail when
large core dump files exist in the data directory.)

File sizes under 8GB are still printed in octal, so that no compatibility
issues are created except in cases that would have failed entirely before.

In addition, this patch fixes several bugs in the same area:

* In 9.3 and later, we'd defined tarCreateHeader's file-size argument as
size_t, which meant that on 32-bit machines it would write a corrupt tar
header for file sizes between 4GB and 8GB, even though no error was raised.
This broke both "pg_dump -Ft" and pg_basebackup for such cases.

* pg_restore from a tar archive would fail on tables of size between 4GB
and 8GB, on machines where either "size_t" or "unsigned long" is 32 bits.
This happened even with an archive file not affected by the previous bug.

* pg_basebackup would fail if there were files of size between 4GB and 8GB,
even on 64-bit machines.

* In 9.3 and later, "pg_basebackup -Ft" failed entirely, for any file size,
on 64-bit big-endian machines.

In view of these potential data-loss bugs, back-patch to all supported
branches, even though removal of the documented 8GB limit might otherwise
be considered a new feature rather than a bug fix.
---
 doc/src/sgml/ref/pg_dump.sgml         |  19 +---
 src/backend/replication/basebackup.c  |  18 +---
 src/bin/pg_basebackup/pg_basebackup.c |  22 ++---
 src/bin/pg_dump/pg_backup_tar.c       |  50 ++++------
 src/include/pgtar.h                   |   4 +-
 src/port/tar.c                        | 128 ++++++++++++++++++--------
 6 files changed, 125 insertions(+), 116 deletions(-)

diff --git a/doc/src/sgml/ref/pg_dump.sgml b/doc/src/sgml/ref/pg_dump.sgml
index 9d84f8b4cc3..25b3d9632dd 100644
--- a/doc/src/sgml/ref/pg_dump.sgml
+++ b/doc/src/sgml/ref/pg_dump.sgml
@@ -272,12 +272,12 @@ PostgreSQL documentation
          <listitem>
           <para>
            Output a <command>tar</command>-format archive suitable for input
-           into <application>pg_restore</application>. The tar-format is
-           compatible with the directory-format; extracting a tar-format
+           into <application>pg_restore</application>. The tar format is
+           compatible with the directory format: extracting a tar-format
            archive produces a valid directory-format archive.
-           However, the tar-format does not support compression and has a
-           limit of 8 GB on the size of individual tables. Also, the relative
-           order of table data items cannot be changed during restore.
+           However, the tar format does not support compression. Also, when
+           using tar format the relative order of table data items cannot be
+           changed during restore.
           </para>
          </listitem>
         </varlistentry>
@@ -1140,15 +1140,6 @@ CREATE DATABASE foo WITH TEMPLATE template0;
    catalogs might be left in the wrong state.
   </para>
 
-  <para>
-   Members of tar archives are limited to a size less than 8 GB.
-   (This is an inherent limitation of the tar file format.)  Therefore
-   this format cannot be used if the textual representation of any one table
-   exceeds that size.  The total size of a tar archive and any of the
-   other output formats is not limited, except possibly by the
-   operating system.
-  </para>
-
   <para>
    The dump file produced by <application>pg_dump</application>
    does not contain the statistics used by the optimizer to make
diff --git a/src/backend/replication/basebackup.c b/src/backend/replication/basebackup.c
index 1af011ee6e0..6120c8f6db3 100644
--- a/src/backend/replication/basebackup.c
+++ b/src/backend/replication/basebackup.c
@@ -698,7 +698,7 @@ SendBackupHeader(List *tablespaces)
 		}
 		else
 		{
-			Size	len;
+			Size		len;
 
 			len = strlen(ti->oid);
 			pq_sendint(&buf, len, 4);
@@ -1131,13 +1131,6 @@ sendDir(char *path, int basepathlen, bool sizeonly, List *tablespaces,
  */
 
 
-/*
- * Maximum file size for a tar member: The limit inherent in the
- * format is 2^33-1 bytes (nearly 8 GB).  But we don't want to exceed
- * what we can represent in pgoff_t.
- */
-#define MAX_TAR_MEMBER_FILELEN (((int64) 1 << Min(33, sizeof(pgoff_t)*8 - 1)) - 1)
-
 /*
  * Given the member, write the TAR header & send the file.
  *
@@ -1166,15 +1159,6 @@ sendFile(char *readfilename, char *tarfilename, struct stat * statbuf,
 				 errmsg("could not open file \"%s\": %m", readfilename)));
 	}
 
-	/*
-	 * Some compilers will throw a warning knowing this test can never be true
-	 * because pgoff_t can't exceed the compared maximum on their platform.
-	 */
-	if (statbuf->st_size > MAX_TAR_MEMBER_FILELEN)
-		ereport(ERROR,
-				(errmsg("archive member \"%s\" too large for tar format",
-						tarfilename)));
-
 	_tarWriteHeader(tarfilename, NULL, statbuf);
 
 	while ((cnt = fread(buf, 1, Min(sizeof(buf), statbuf->st_size - len), fp)) > 0)
diff --git a/src/bin/pg_basebackup/pg_basebackup.c b/src/bin/pg_basebackup/pg_basebackup.c
index 80de8820ff7..8c4dffea936 100644
--- a/src/bin/pg_basebackup/pg_basebackup.c
+++ b/src/bin/pg_basebackup/pg_basebackup.c
@@ -781,7 +781,7 @@ ReceiveTarFile(PGconn *conn, PGresult *res, int rownum)
 	bool		in_tarhdr = true;
 	bool		skip_file = false;
 	size_t		tarhdrsz = 0;
-	size_t		filesz = 0;
+	pgoff_t		filesz = 0;
 
 #ifdef HAVE_LIBZ
 	gzFile		ztarfile = NULL;
@@ -1046,7 +1046,7 @@ ReceiveTarFile(PGconn *conn, PGresult *res, int rownum)
 
 						skip_file = (strcmp(&tarhdr[0], "recovery.conf") == 0);
 
-						sscanf(&tarhdr[124], "%11o", (unsigned int *) &filesz);
+						filesz = read_tar_number(&tarhdr[124], 12);
 
 						padding = ((filesz + 511) & ~511) - filesz;
 						filesz += padding;
@@ -1139,7 +1139,7 @@ ReceiveAndUnpackTarFile(PGconn *conn, PGresult *res, int rownum)
 	char		current_path[MAXPGPATH];
 	char		filename[MAXPGPATH];
 	const char *mapped_tblspc_path;
-	int			current_len_left;
+	pgoff_t		current_len_left = 0;
 	int			current_padding = 0;
 	bool		basetablespace;
 	char	   *copybuf = NULL;
@@ -1208,20 +1208,10 @@ ReceiveAndUnpackTarFile(PGconn *conn, PGresult *res, int rownum)
 			}
 			totaldone += 512;
 
-			if (sscanf(copybuf + 124, "%11o", &current_len_left) != 1)
-			{
-				fprintf(stderr, _("%s: could not parse file size\n"),
-						progname);
-				disconnect_and_exit(1);
-			}
+			current_len_left = read_tar_number(&copybuf[124], 12);
 
 			/* Set permissions on the file */
-			if (sscanf(&copybuf[100], "%07o ", &filemode) != 1)
-			{
-				fprintf(stderr, _("%s: could not parse file mode\n"),
-						progname);
-				disconnect_and_exit(1);
-			}
+			filemode = read_tar_number(&copybuf[100], 8);
 
 			/*
 			 * All files are padded up to 512 bytes
@@ -2180,7 +2170,7 @@ main(int argc, char **argv)
 	if (replication_slot && !streamwal)
 	{
 		fprintf(stderr,
-				_("%s: replication slots can only be used with WAL streaming\n"),
+			_("%s: replication slots can only be used with WAL streaming\n"),
 				progname);
 		fprintf(stderr, _("Try \"%s --help\" for more information.\n"),
 				progname);
diff --git a/src/bin/pg_dump/pg_backup_tar.c b/src/bin/pg_dump/pg_backup_tar.c
index 532eacc066e..c40dfe5726a 100644
--- a/src/bin/pg_dump/pg_backup_tar.c
+++ b/src/bin/pg_dump/pg_backup_tar.c
@@ -78,13 +78,6 @@ typedef struct
 	ArchiveHandle *AH;
 } TAR_MEMBER;
 
-/*
- * Maximum file size for a tar member: The limit inherent in the
- * format is 2^33-1 bytes (nearly 8 GB).  But we don't want to exceed
- * what we can represent in pgoff_t.
- */
-#define MAX_TAR_MEMBER_FILELEN (((int64) 1 << Min(33, sizeof(pgoff_t)*8 - 1)) - 1)
-
 typedef struct
 {
 	int			hasSeek;
@@ -1049,7 +1042,7 @@ isValidTarHeader(char *header)
 	int			sum;
 	int			chk = tarChecksum(header);
 
-	sscanf(&header[148], "%8o", &sum);
+	sum = read_tar_number(&header[148], 8);
 
 	if (sum != chk)
 		return false;
@@ -1091,13 +1084,6 @@ _tarAddFile(ArchiveHandle *AH, TAR_MEMBER *th)
 					  strerror(errno));
 	fseeko(tmp, 0, SEEK_SET);
 
-	/*
-	 * Some compilers will throw a warning knowing this test can never be true
-	 * because pgoff_t can't exceed the compared maximum on their platform.
-	 */
-	if (th->fileLen > MAX_TAR_MEMBER_FILELEN)
-		exit_horribly(modulename, "archive member too large for tar format\n");
-
 	_tarWriteHeader(th);
 
 	while ((cnt = fread(buf, 1, sizeof(buf), tmp)) > 0)
@@ -1222,11 +1208,10 @@ _tarGetHeader(ArchiveHandle *AH, TAR_MEMBER *th)
 {
 	lclContext *ctx = (lclContext *) AH->formatData;
 	char		h[512];
-	char		tag[100];
+	char		tag[100 + 1];
 	int			sum,
 				chk;
-	size_t		len;
-	unsigned long ullen;
+	pgoff_t		len;
 	pgoff_t		hPos;
 	bool		gotBlock = false;
 
@@ -1249,7 +1234,7 @@ _tarGetHeader(ArchiveHandle *AH, TAR_MEMBER *th)
 
 		/* Calc checksum */
 		chk = tarChecksum(h);
-		sscanf(&h[148], "%8o", &sum);
+		sum = read_tar_number(&h[148], 8);
 
 		/*
 		 * If the checksum failed, see if it is a null block. If so, silently
@@ -1272,27 +1257,31 @@ _tarGetHeader(ArchiveHandle *AH, TAR_MEMBER *th)
 		}
 	}
 
-	sscanf(&h[0], "%99s", tag);
-	sscanf(&h[124], "%12lo", &ullen);
-	len = (size_t) ullen;
+	/* Name field is 100 bytes, might not be null-terminated */
+	strlcpy(tag, &h[0], 100 + 1);
+
+	len = read_tar_number(&h[124], 12);
 
 	{
-		char		buf[100];
+		char		posbuf[32];
+		char		lenbuf[32];
 
-		snprintf(buf, sizeof(buf), INT64_FORMAT, (int64) hPos);
-		ahlog(AH, 3, "TOC Entry %s at %s (length %lu, checksum %d)\n",
-			  tag, buf, (unsigned long) len, sum);
+		snprintf(posbuf, sizeof(posbuf), UINT64_FORMAT, (uint64) hPos);
+		snprintf(lenbuf, sizeof(lenbuf), UINT64_FORMAT, (uint64) len);
+		ahlog(AH, 3, "TOC Entry %s at %s (length %s, checksum %d)\n",
+			  tag, posbuf, lenbuf, sum);
 	}
 
 	if (chk != sum)
 	{
-		char		buf[100];
+		char		posbuf[32];
 
-		snprintf(buf, sizeof(buf), INT64_FORMAT, (int64) ftello(ctx->tarFH));
+		snprintf(posbuf, sizeof(posbuf), UINT64_FORMAT,
+				 (uint64) ftello(ctx->tarFH));
 		exit_horribly(modulename,
 					  "corrupt tar header found in %s "
 					  "(expected %d, computed %d) file position %s\n",
-					  tag, sum, chk, buf);
+					  tag, sum, chk, posbuf);
 	}
 
 	th->targetFile = pg_strdup(tag);
@@ -1307,7 +1296,8 @@ _tarWriteHeader(TAR_MEMBER *th)
 {
 	char		h[512];
 
-	tarCreateHeader(h, th->targetFile, NULL, th->fileLen, 0600, 04000, 02000, time(NULL));
+	tarCreateHeader(h, th->targetFile, NULL, th->fileLen,
+					0600, 04000, 02000, time(NULL));
 
 	/* Now write the completed header. */
 	if (fwrite(h, 1, 512, th->tarFH) != 512)
diff --git a/src/include/pgtar.h b/src/include/pgtar.h
index 906db7cebcb..9c94a58b52e 100644
--- a/src/include/pgtar.h
+++ b/src/include/pgtar.h
@@ -19,5 +19,7 @@ enum tarError
 	TAR_SYMLINK_TOO_LONG
 };
 
-extern enum tarError tarCreateHeader(char *h, const char *filename, const char *linktarget, size_t size, mode_t mode, uid_t uid, gid_t gid, time_t mtime);
+extern enum tarError tarCreateHeader(char *h, const char *filename, const char *linktarget,
+			  pgoff_t size, mode_t mode, uid_t uid, gid_t gid, time_t mtime);
+extern uint64 read_tar_number(const char *s, int len);
 extern int	tarChecksum(char *header);
diff --git a/src/port/tar.c b/src/port/tar.c
index 72fd4e13aca..52a2113a47e 100644
--- a/src/port/tar.c
+++ b/src/port/tar.c
@@ -3,21 +3,80 @@
 #include <sys/stat.h>
 
 /*
- * Utility routine to print possibly larger than 32 bit integers in a
- * portable fashion.  Filled with zeros.
+ * Print a numeric field in a tar header.  The field starts at *s and is of
+ * length len; val is the value to be written.
+ *
+ * Per POSIX, the way to write a number is in octal with leading zeroes and
+ * one trailing space (or NUL, but we use space) at the end of the specified
+ * field width.
+ *
+ * However, the given value may not fit in the available space in octal form.
+ * If that's true, we use the GNU extension of writing \200 followed by the
+ * number in base-256 form (i.e., stored in binary MSB-first).  (Note: here we
+ * support only non-negative numbers, so we don't worry about the GNU rules
+ * for handling negative numbers.)
  */
 static void
-print_val(char *s, uint64 val, unsigned int base, size_t len)
+print_tar_number(char *s, int len, uint64 val)
 {
-	int			i;
-
-	for (i = len; i > 0; i--)
+	if (val < (((uint64) 1) << ((len - 1) * 3)))
+	{
+		/* Use octal with trailing space */
+		s[--len] = ' ';
+		while (len)
+		{
+			s[--len] = (val & 7) + '0';
+			val >>= 3;
+		}
+	}
+	else
 	{
-		int			digit = val % base;
+		/* Use base-256 with leading \200 */
+		s[0] = '\200';
+		while (len > 1)
+		{
+			s[--len] = (val & 255);
+			val >>= 8;
+		}
+	}
+}
 
-		s[i - 1] = '0' + digit;
-		val = val / base;
+
+/*
+ * Read a numeric field in a tar header.  The field starts at *s and is of
+ * length len.
+ *
+ * The POSIX-approved format for a number is octal, ending with a space or
+ * NUL.  However, for values that don't fit, we recognize the GNU extension
+ * of \200 followed by the number in base-256 form (i.e., stored in binary
+ * MSB-first).  (Note: here we support only non-negative numbers, so we don't
+ * worry about the GNU rules for handling negative numbers.)
+ */
+uint64
+read_tar_number(const char *s, int len)
+{
+	uint64		result = 0;
+
+	if (*s == '\200')
+	{
+		/* base-256 */
+		while (--len)
+		{
+			result <<= 8;
+			result |= (unsigned char) (*++s);
+		}
 	}
+	else
+	{
+		/* octal */
+		while (len-- && *s >= '0' && *s <= '7')
+		{
+			result <<= 3;
+			result |= (*s - '0');
+			s++;
+		}
+	}
+	return result;
 }
 
 
@@ -46,12 +105,12 @@ tarChecksum(char *header)
 
 /*
  * Fill in the buffer pointed to by h with a tar format header. This buffer
- * must always have space for 512 characters, which is a requirement by
+ * must always have space for 512 characters, which is a requirement of
  * the tar format.
  */
 enum tarError
 tarCreateHeader(char *h, const char *filename, const char *linktarget,
-				size_t size, mode_t mode, uid_t uid, gid_t gid, time_t mtime)
+				pgoff_t size, mode_t mode, uid_t uid, gid_t gid, time_t mtime)
 {
 	if (strlen(filename) > 99)
 		return TAR_NAME_TOO_LONG;
@@ -59,12 +118,6 @@ tarCreateHeader(char *h, const char *filename, const char *linktarget,
 	if (linktarget && strlen(linktarget) > 99)
 		return TAR_SYMLINK_TOO_LONG;
 
-	/*
-	 * Note: most of the fields in a tar header are not supposed to be
-	 * null-terminated.  We use sprintf, which will write a null after the
-	 * required bytes; that null goes into the first byte of the next field.
-	 * This is okay as long as we fill the fields in order.
-	 */
 	memset(h, 0, 512);			/* assume tar header size */
 
 	/* Name 100 */
@@ -84,46 +137,49 @@ tarCreateHeader(char *h, const char *filename, const char *linktarget,
 	}
 
 	/* Mode 8 - this doesn't include the file type bits (S_IFMT)  */
-	sprintf(&h[100], "%07o ", (int) (mode & 07777));
+	print_tar_number(&h[100], 8, (mode & 07777));
 
 	/* User ID 8 */
-	sprintf(&h[108], "%07o ", (int) uid);
+	print_tar_number(&h[108], 8, uid);
 
 	/* Group 8 */
-	sprintf(&h[116], "%07o ", (int) gid);
+	print_tar_number(&h[116], 8, gid);
 
-	/* File size 12 - 11 digits, 1 space; use print_val for 64 bit support */
+	/* File size 12 */
 	if (linktarget != NULL || S_ISDIR(mode))
 		/* Symbolic link or directory has size zero */
-		print_val(&h[124], 0, 8, 11);
+		print_tar_number(&h[124], 12, 0);
 	else
-		print_val(&h[124], size, 8, 11);
-	sprintf(&h[135], " ");
+		print_tar_number(&h[124], 12, size);
 
 	/* Mod Time 12 */
-	sprintf(&h[136], "%011o ", (int) mtime);
+	print_tar_number(&h[136], 12, mtime);
 
 	/* Checksum 8 cannot be calculated until we've filled all other fields */
 
 	if (linktarget != NULL)
 	{
 		/* Type - Symbolic link */
-		sprintf(&h[156], "2");
+		h[156] = '2';
 		/* Link Name 100 */
 		strlcpy(&h[157], linktarget, 100);
 	}
 	else if (S_ISDIR(mode))
+	{
 		/* Type - directory */
-		sprintf(&h[156], "5");
+		h[156] = '5';
+	}
 	else
+	{
 		/* Type - regular file */
-		sprintf(&h[156], "0");
+		h[156] = '0';
+	}
 
 	/* Magic 6 */
-	sprintf(&h[257], "ustar");
+	strcpy(&h[257], "ustar");
 
 	/* Version 2 */
-	sprintf(&h[263], "00");
+	memcpy(&h[263], "00", 2);
 
 	/* User 32 */
 	/* XXX: Do we need to care about setting correct username? */
@@ -134,19 +190,15 @@ tarCreateHeader(char *h, const char *filename, const char *linktarget,
 	strlcpy(&h[297], "postgres", 32);
 
 	/* Major Dev 8 */
-	sprintf(&h[329], "%07o ", 0);
+	print_tar_number(&h[329], 8, 0);
 
 	/* Minor Dev 8 */
-	sprintf(&h[337], "%07o ", 0);
+	print_tar_number(&h[337], 8, 0);
 
 	/* Prefix 155 - not used, leave as nulls */
 
-	/*
-	 * We mustn't overwrite the next field while inserting the checksum.
-	 * Fortunately, the checksum can't exceed 6 octal digits, so we just write
-	 * 6 digits, a space, and a null, which is legal per POSIX.
-	 */
-	sprintf(&h[148], "%06o ", tarChecksum(h));
+	/* Finally, compute and insert the checksum */
+	print_tar_number(&h[148], 8, tarChecksum(h));
 
 	return TAR_OK;
 }
-- 
GitLab