From f0828b2fc3d021ef8d64337a3593eb44bd3b6114 Mon Sep 17 00:00:00 2001
From: Tom Lane <tgl@sss.pgh.pa.us>
Date: Mon, 10 Mar 2008 20:06:27 +0000
Subject: [PATCH] Provide a build-time option to store large relations as
 single files, rather than dividing them into 1GB segments as has been our
 longtime practice.  This requires working support for large files in the
 operating system; at least for the time being, it won't be the default.

Zdenek Kotala
---
 configure                           | 446 ++++++++++++++++++++++++++++
 configure.in                        |  15 +-
 doc/src/sgml/installation.sgml      |  16 +-
 doc/src/sgml/storage.sgml           |   8 +-
 src/backend/storage/file/buffile.c  |  41 ++-
 src/backend/storage/file/fd.c       |  53 ++--
 src/backend/storage/smgr/md.c       |  76 ++---
 src/backend/utils/sort/tuplestore.c |   8 +-
 src/include/pg_config.h.in          |   6 +
 src/include/pg_config_manual.h      |  14 +-
 src/include/storage/buffile.h       |   6 +-
 src/include/storage/fd.h            |   6 +-
 12 files changed, 595 insertions(+), 100 deletions(-)

diff --git a/configure b/configure
index 99b0722ca1b..476ce76c8ad 100755
--- a/configure
+++ b/configure
@@ -1357,6 +1357,7 @@ Optional Features:
   --enable-debug          build with debugging symbols (-g)
   --enable-profiling      build with profiling enabled
   --enable-dtrace         build with DTrace support
+  --disable-segmented-files disable data file segmentation (requires largefile support)
   --enable-depend         turn on automatic dependency tracking
   --enable-cassert        enable assertion checks (for debugging)
   --enable-thread-safety  make client libraries thread-safe
@@ -2541,6 +2542,36 @@ fi
 
 
 
+#
+# Data file segmentation
+#
+
+pgac_args="$pgac_args enable_segmented_files"
+
+# Check whether --enable-segmented-files was given.
+if test "${enable_segmented_files+set}" = set; then
+  enableval=$enable_segmented_files;
+  case $enableval in
+    yes)
+      :
+      ;;
+    no)
+      :
+      ;;
+    *)
+      { { echo "$as_me:$LINENO: error: no argument expected for --enable-segmented-files option" >&5
+echo "$as_me: error: no argument expected for --enable-segmented-files option" >&2;}
+   { (exit 1); exit 1; }; }
+      ;;
+  esac
+
+else
+  enable_segmented_files=yes
+
+fi
+
+
+
 #
 # C compiler
 #
@@ -23642,6 +23673,421 @@ fi
 
 fi
 
+# Check for largefile support (must be after AC_SYS_LARGEFILE)
+{ echo "$as_me:$LINENO: checking for off_t" >&5
+echo $ECHO_N "checking for off_t... $ECHO_C" >&6; }
+if test "${ac_cv_type_off_t+set}" = set; then
+  echo $ECHO_N "(cached) $ECHO_C" >&6
+else
+  cat >conftest.$ac_ext <<_ACEOF
+/* confdefs.h.  */
+_ACEOF
+cat confdefs.h >>conftest.$ac_ext
+cat >>conftest.$ac_ext <<_ACEOF
+/* end confdefs.h.  */
+$ac_includes_default
+typedef off_t ac__type_new_;
+int
+main ()
+{
+if ((ac__type_new_ *) 0)
+  return 0;
+if (sizeof (ac__type_new_))
+  return 0;
+  ;
+  return 0;
+}
+_ACEOF
+rm -f conftest.$ac_objext
+if { (ac_try="$ac_compile"
+case "(($ac_try" in
+  *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
+  *) ac_try_echo=$ac_try;;
+esac
+eval "echo \"\$as_me:$LINENO: $ac_try_echo\"") >&5
+  (eval "$ac_compile") 2>conftest.er1
+  ac_status=$?
+  grep -v '^ *+' conftest.er1 >conftest.err
+  rm -f conftest.er1
+  cat conftest.err >&5
+  echo "$as_me:$LINENO: \$? = $ac_status" >&5
+  (exit $ac_status); } && {
+	 test -z "$ac_c_werror_flag" ||
+	 test ! -s conftest.err
+       } && test -s conftest.$ac_objext; then
+  ac_cv_type_off_t=yes
+else
+  echo "$as_me: failed program was:" >&5
+sed 's/^/| /' conftest.$ac_ext >&5
+
+	ac_cv_type_off_t=no
+fi
+
+rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
+fi
+{ echo "$as_me:$LINENO: result: $ac_cv_type_off_t" >&5
+echo "${ECHO_T}$ac_cv_type_off_t" >&6; }
+
+# The cast to long int works around a bug in the HP C Compiler
+# version HP92453-01 B.11.11.23709.GP, which incorrectly rejects
+# declarations like `int a3[[(sizeof (unsigned char)) >= 0]];'.
+# This bug is HP SR number 8606223364.
+{ echo "$as_me:$LINENO: checking size of off_t" >&5
+echo $ECHO_N "checking size of off_t... $ECHO_C" >&6; }
+if test "${ac_cv_sizeof_off_t+set}" = set; then
+  echo $ECHO_N "(cached) $ECHO_C" >&6
+else
+  if test "$cross_compiling" = yes; then
+  # Depending upon the size, compute the lo and hi bounds.
+cat >conftest.$ac_ext <<_ACEOF
+/* confdefs.h.  */
+_ACEOF
+cat confdefs.h >>conftest.$ac_ext
+cat >>conftest.$ac_ext <<_ACEOF
+/* end confdefs.h.  */
+$ac_includes_default
+   typedef off_t ac__type_sizeof_;
+int
+main ()
+{
+static int test_array [1 - 2 * !(((long int) (sizeof (ac__type_sizeof_))) >= 0)];
+test_array [0] = 0
+
+  ;
+  return 0;
+}
+_ACEOF
+rm -f conftest.$ac_objext
+if { (ac_try="$ac_compile"
+case "(($ac_try" in
+  *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
+  *) ac_try_echo=$ac_try;;
+esac
+eval "echo \"\$as_me:$LINENO: $ac_try_echo\"") >&5
+  (eval "$ac_compile") 2>conftest.er1
+  ac_status=$?
+  grep -v '^ *+' conftest.er1 >conftest.err
+  rm -f conftest.er1
+  cat conftest.err >&5
+  echo "$as_me:$LINENO: \$? = $ac_status" >&5
+  (exit $ac_status); } && {
+	 test -z "$ac_c_werror_flag" ||
+	 test ! -s conftest.err
+       } && test -s conftest.$ac_objext; then
+  ac_lo=0 ac_mid=0
+  while :; do
+    cat >conftest.$ac_ext <<_ACEOF
+/* confdefs.h.  */
+_ACEOF
+cat confdefs.h >>conftest.$ac_ext
+cat >>conftest.$ac_ext <<_ACEOF
+/* end confdefs.h.  */
+$ac_includes_default
+   typedef off_t ac__type_sizeof_;
+int
+main ()
+{
+static int test_array [1 - 2 * !(((long int) (sizeof (ac__type_sizeof_))) <= $ac_mid)];
+test_array [0] = 0
+
+  ;
+  return 0;
+}
+_ACEOF
+rm -f conftest.$ac_objext
+if { (ac_try="$ac_compile"
+case "(($ac_try" in
+  *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
+  *) ac_try_echo=$ac_try;;
+esac
+eval "echo \"\$as_me:$LINENO: $ac_try_echo\"") >&5
+  (eval "$ac_compile") 2>conftest.er1
+  ac_status=$?
+  grep -v '^ *+' conftest.er1 >conftest.err
+  rm -f conftest.er1
+  cat conftest.err >&5
+  echo "$as_me:$LINENO: \$? = $ac_status" >&5
+  (exit $ac_status); } && {
+	 test -z "$ac_c_werror_flag" ||
+	 test ! -s conftest.err
+       } && test -s conftest.$ac_objext; then
+  ac_hi=$ac_mid; break
+else
+  echo "$as_me: failed program was:" >&5
+sed 's/^/| /' conftest.$ac_ext >&5
+
+	ac_lo=`expr $ac_mid + 1`
+			if test $ac_lo -le $ac_mid; then
+			  ac_lo= ac_hi=
+			  break
+			fi
+			ac_mid=`expr 2 '*' $ac_mid + 1`
+fi
+
+rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
+  done
+else
+  echo "$as_me: failed program was:" >&5
+sed 's/^/| /' conftest.$ac_ext >&5
+
+	cat >conftest.$ac_ext <<_ACEOF
+/* confdefs.h.  */
+_ACEOF
+cat confdefs.h >>conftest.$ac_ext
+cat >>conftest.$ac_ext <<_ACEOF
+/* end confdefs.h.  */
+$ac_includes_default
+   typedef off_t ac__type_sizeof_;
+int
+main ()
+{
+static int test_array [1 - 2 * !(((long int) (sizeof (ac__type_sizeof_))) < 0)];
+test_array [0] = 0
+
+  ;
+  return 0;
+}
+_ACEOF
+rm -f conftest.$ac_objext
+if { (ac_try="$ac_compile"
+case "(($ac_try" in
+  *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
+  *) ac_try_echo=$ac_try;;
+esac
+eval "echo \"\$as_me:$LINENO: $ac_try_echo\"") >&5
+  (eval "$ac_compile") 2>conftest.er1
+  ac_status=$?
+  grep -v '^ *+' conftest.er1 >conftest.err
+  rm -f conftest.er1
+  cat conftest.err >&5
+  echo "$as_me:$LINENO: \$? = $ac_status" >&5
+  (exit $ac_status); } && {
+	 test -z "$ac_c_werror_flag" ||
+	 test ! -s conftest.err
+       } && test -s conftest.$ac_objext; then
+  ac_hi=-1 ac_mid=-1
+  while :; do
+    cat >conftest.$ac_ext <<_ACEOF
+/* confdefs.h.  */
+_ACEOF
+cat confdefs.h >>conftest.$ac_ext
+cat >>conftest.$ac_ext <<_ACEOF
+/* end confdefs.h.  */
+$ac_includes_default
+   typedef off_t ac__type_sizeof_;
+int
+main ()
+{
+static int test_array [1 - 2 * !(((long int) (sizeof (ac__type_sizeof_))) >= $ac_mid)];
+test_array [0] = 0
+
+  ;
+  return 0;
+}
+_ACEOF
+rm -f conftest.$ac_objext
+if { (ac_try="$ac_compile"
+case "(($ac_try" in
+  *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
+  *) ac_try_echo=$ac_try;;
+esac
+eval "echo \"\$as_me:$LINENO: $ac_try_echo\"") >&5
+  (eval "$ac_compile") 2>conftest.er1
+  ac_status=$?
+  grep -v '^ *+' conftest.er1 >conftest.err
+  rm -f conftest.er1
+  cat conftest.err >&5
+  echo "$as_me:$LINENO: \$? = $ac_status" >&5
+  (exit $ac_status); } && {
+	 test -z "$ac_c_werror_flag" ||
+	 test ! -s conftest.err
+       } && test -s conftest.$ac_objext; then
+  ac_lo=$ac_mid; break
+else
+  echo "$as_me: failed program was:" >&5
+sed 's/^/| /' conftest.$ac_ext >&5
+
+	ac_hi=`expr '(' $ac_mid ')' - 1`
+			if test $ac_mid -le $ac_hi; then
+			  ac_lo= ac_hi=
+			  break
+			fi
+			ac_mid=`expr 2 '*' $ac_mid`
+fi
+
+rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
+  done
+else
+  echo "$as_me: failed program was:" >&5
+sed 's/^/| /' conftest.$ac_ext >&5
+
+	ac_lo= ac_hi=
+fi
+
+rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
+fi
+
+rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
+# Binary search between lo and hi bounds.
+while test "x$ac_lo" != "x$ac_hi"; do
+  ac_mid=`expr '(' $ac_hi - $ac_lo ')' / 2 + $ac_lo`
+  cat >conftest.$ac_ext <<_ACEOF
+/* confdefs.h.  */
+_ACEOF
+cat confdefs.h >>conftest.$ac_ext
+cat >>conftest.$ac_ext <<_ACEOF
+/* end confdefs.h.  */
+$ac_includes_default
+   typedef off_t ac__type_sizeof_;
+int
+main ()
+{
+static int test_array [1 - 2 * !(((long int) (sizeof (ac__type_sizeof_))) <= $ac_mid)];
+test_array [0] = 0
+
+  ;
+  return 0;
+}
+_ACEOF
+rm -f conftest.$ac_objext
+if { (ac_try="$ac_compile"
+case "(($ac_try" in
+  *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
+  *) ac_try_echo=$ac_try;;
+esac
+eval "echo \"\$as_me:$LINENO: $ac_try_echo\"") >&5
+  (eval "$ac_compile") 2>conftest.er1
+  ac_status=$?
+  grep -v '^ *+' conftest.er1 >conftest.err
+  rm -f conftest.er1
+  cat conftest.err >&5
+  echo "$as_me:$LINENO: \$? = $ac_status" >&5
+  (exit $ac_status); } && {
+	 test -z "$ac_c_werror_flag" ||
+	 test ! -s conftest.err
+       } && test -s conftest.$ac_objext; then
+  ac_hi=$ac_mid
+else
+  echo "$as_me: failed program was:" >&5
+sed 's/^/| /' conftest.$ac_ext >&5
+
+	ac_lo=`expr '(' $ac_mid ')' + 1`
+fi
+
+rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
+done
+case $ac_lo in
+?*) ac_cv_sizeof_off_t=$ac_lo;;
+'') if test "$ac_cv_type_off_t" = yes; then
+     { { echo "$as_me:$LINENO: error: cannot compute sizeof (off_t)
+See \`config.log' for more details." >&5
+echo "$as_me: error: cannot compute sizeof (off_t)
+See \`config.log' for more details." >&2;}
+   { (exit 77); exit 77; }; }
+   else
+     ac_cv_sizeof_off_t=0
+   fi ;;
+esac
+else
+  cat >conftest.$ac_ext <<_ACEOF
+/* confdefs.h.  */
+_ACEOF
+cat confdefs.h >>conftest.$ac_ext
+cat >>conftest.$ac_ext <<_ACEOF
+/* end confdefs.h.  */
+$ac_includes_default
+   typedef off_t ac__type_sizeof_;
+static long int longval () { return (long int) (sizeof (ac__type_sizeof_)); }
+static unsigned long int ulongval () { return (long int) (sizeof (ac__type_sizeof_)); }
+#include <stdio.h>
+#include <stdlib.h>
+int
+main ()
+{
+
+  FILE *f = fopen ("conftest.val", "w");
+  if (! f)
+    return 1;
+  if (((long int) (sizeof (ac__type_sizeof_))) < 0)
+    {
+      long int i = longval ();
+      if (i != ((long int) (sizeof (ac__type_sizeof_))))
+	return 1;
+      fprintf (f, "%ld\n", i);
+    }
+  else
+    {
+      unsigned long int i = ulongval ();
+      if (i != ((long int) (sizeof (ac__type_sizeof_))))
+	return 1;
+      fprintf (f, "%lu\n", i);
+    }
+  return ferror (f) || fclose (f) != 0;
+
+  ;
+  return 0;
+}
+_ACEOF
+rm -f conftest$ac_exeext
+if { (ac_try="$ac_link"
+case "(($ac_try" in
+  *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
+  *) ac_try_echo=$ac_try;;
+esac
+eval "echo \"\$as_me:$LINENO: $ac_try_echo\"") >&5
+  (eval "$ac_link") 2>&5
+  ac_status=$?
+  echo "$as_me:$LINENO: \$? = $ac_status" >&5
+  (exit $ac_status); } && { ac_try='./conftest$ac_exeext'
+  { (case "(($ac_try" in
+  *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
+  *) ac_try_echo=$ac_try;;
+esac
+eval "echo \"\$as_me:$LINENO: $ac_try_echo\"") >&5
+  (eval "$ac_try") 2>&5
+  ac_status=$?
+  echo "$as_me:$LINENO: \$? = $ac_status" >&5
+  (exit $ac_status); }; }; then
+  ac_cv_sizeof_off_t=`cat conftest.val`
+else
+  echo "$as_me: program exited with status $ac_status" >&5
+echo "$as_me: failed program was:" >&5
+sed 's/^/| /' conftest.$ac_ext >&5
+
+( exit $ac_status )
+if test "$ac_cv_type_off_t" = yes; then
+     { { echo "$as_me:$LINENO: error: cannot compute sizeof (off_t)
+See \`config.log' for more details." >&5
+echo "$as_me: error: cannot compute sizeof (off_t)
+See \`config.log' for more details." >&2;}
+   { (exit 77); exit 77; }; }
+   else
+     ac_cv_sizeof_off_t=0
+   fi
+fi
+rm -f core *.core core.conftest.* gmon.out bb.out conftest$ac_exeext conftest.$ac_objext conftest.$ac_ext
+fi
+rm -f conftest.val
+fi
+{ echo "$as_me:$LINENO: result: $ac_cv_sizeof_off_t" >&5
+echo "${ECHO_T}$ac_cv_sizeof_off_t" >&6; }
+
+
+
+cat >>confdefs.h <<_ACEOF
+#define SIZEOF_OFF_T $ac_cv_sizeof_off_t
+_ACEOF
+
+
+
+if test "$ac_cv_sizeof_off_t" -lt 8 -o "$enable_segmented_files" = "yes"; then
+
+cat >>confdefs.h <<\_ACEOF
+#define USE_SEGMENTED_FILES 1
+_ACEOF
+
+fi
+
 # SunOS doesn't handle negative byte comparisons properly with +/- return
 { echo "$as_me:$LINENO: checking for working memcmp" >&5
 echo $ECHO_N "checking for working memcmp... $ECHO_C" >&6; }
diff --git a/configure.in b/configure.in
index 2bdc3719846..020009785c6 100644
--- a/configure.in
+++ b/configure.in
@@ -1,5 +1,5 @@
 dnl Process this file with autoconf to produce a configure script.
-dnl $PostgreSQL: pgsql/configure.in,v 1.552 2008/02/24 05:21:54 tgl Exp $
+dnl $PostgreSQL: pgsql/configure.in,v 1.553 2008/03/10 20:06:27 tgl Exp $
 dnl
 dnl Developers, please strive to achieve this order:
 dnl
@@ -217,6 +217,12 @@ fi
 AC_SUBST(DTRACEFLAGS)])
 AC_SUBST(enable_dtrace)
 
+#
+# Data file segmentation
+#
+PGAC_ARG_BOOL(enable, segmented-files, yes,
+              [  --disable-segmented-files disable data file segmentation (requires largefile support)])
+
 #
 # C compiler
 #
@@ -1411,6 +1417,13 @@ if test $ac_cv_func_fseeko = yes; then
 AC_SYS_LARGEFILE
 fi
 
+# Check for largefile support (must be after AC_SYS_LARGEFILE)
+AC_CHECK_SIZEOF([off_t])
+
+if test "$ac_cv_sizeof_off_t" -lt 8 -o "$enable_segmented_files" = "yes"; then 
+  AC_DEFINE([USE_SEGMENTED_FILES], 1, [Define to split data files into 1GB segments.]) 
+fi
+
 # SunOS doesn't handle negative byte comparisons properly with +/- return
 AC_FUNC_MEMCMP
 
diff --git a/doc/src/sgml/installation.sgml b/doc/src/sgml/installation.sgml
index a9990023466..95a3f10be67 100644
--- a/doc/src/sgml/installation.sgml
+++ b/doc/src/sgml/installation.sgml
@@ -1,4 +1,4 @@
-<!-- $PostgreSQL: pgsql/doc/src/sgml/installation.sgml,v 1.303 2008/03/06 21:37:33 momjian Exp $ -->
+<!-- $PostgreSQL: pgsql/doc/src/sgml/installation.sgml,v 1.304 2008/03/10 20:06:27 tgl Exp $ -->
 
 <chapter id="installation">
  <title><![%standalone-include[<productname>PostgreSQL</>]]>
@@ -1025,6 +1025,20 @@ su - postgres
        </listitem>
       </varlistentry>
 
+      <varlistentry>
+       <term><option>--disable-segmented-files</option></term>
+       <listitem>
+        <para>
+         Store each large table or index as a single operating-system file,
+         rather than dividing it into 1GB segments as is the default.  This
+         option is ignored unless the operating system has <quote>largefile</>
+         support (which most do, nowadays).  It can help reduce the number
+         of file descriptors consumed when working with very large tables
+         and indexes.
+        </para>
+       </listitem>
+      </varlistentry>
+
       <varlistentry>
        <term><option>--disable-spinlocks</option></term>
        <listitem>
diff --git a/doc/src/sgml/storage.sgml b/doc/src/sgml/storage.sgml
index fe9ae611bf1..7ba0c1e343f 100644
--- a/doc/src/sgml/storage.sgml
+++ b/doc/src/sgml/storage.sgml
@@ -1,4 +1,4 @@
-<!-- $PostgreSQL: pgsql/doc/src/sgml/storage.sgml,v 1.21 2007/11/23 00:24:12 ishii Exp $ -->
+<!-- $PostgreSQL: pgsql/doc/src/sgml/storage.sgml,v 1.22 2008/03/10 20:06:27 tgl Exp $ -->
 
 <chapter id="storage">
 
@@ -138,10 +138,14 @@ Avoid assuming that filenode and table OID are the same.
 </caution>
 
 <para>
-When a table or index exceeds 1 GB, it is divided into gigabyte-sized
+When a table or index exceeds 1 GB, it is normally divided into gigabyte-sized
 <firstterm>segments</>.  The first segment's file name is the same as the
 filenode; subsequent segments are named filenode.1, filenode.2, etc.
 This arrangement avoids problems on platforms that have file size limitations.
+(But if the platform does not have such a limitation, and
+<option>--disable-segmented-files</option> was specified when
+<productname>PostgreSQL</> was built, then each table or index is stored
+as a single file, without segmentation.)
 The contents of tables and indexes are discussed further in
 <xref linkend="storage-page-layout">.
 </para>
diff --git a/src/backend/storage/file/buffile.c b/src/backend/storage/file/buffile.c
index 8d79e9574b8..94e5c67911a 100644
--- a/src/backend/storage/file/buffile.c
+++ b/src/backend/storage/file/buffile.c
@@ -7,7 +7,7 @@
  * Portions Copyright (c) 1994, Regents of the University of California
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/storage/file/buffile.c,v 1.29 2008/01/01 19:45:51 momjian Exp $
+ *	  $PostgreSQL: pgsql/src/backend/storage/file/buffile.c,v 1.30 2008/03/10 20:06:27 tgl Exp $
  *
  * NOTES:
  *
@@ -38,13 +38,12 @@
 #include "storage/buffile.h"
 
 /*
- * The maximum safe file size is presumed to be RELSEG_SIZE * BLCKSZ.
- * Note we adhere to this limit whether or not LET_OS_MANAGE_FILESIZE
- * is defined, although md.c ignores it when that symbol is defined.
- * The reason for doing this is that we'd like large temporary BufFiles
- * to be spread across multiple tablespaces when available.
+ * We break BufFiles into gigabyte-sized segments, whether or not
+ * USE_SEGMENTED_FILES is defined.  The reason is that we'd like large
+ * temporary BufFiles to be spread across multiple tablespaces when available.
  */
-#define MAX_PHYSICAL_FILESIZE  (RELSEG_SIZE * BLCKSZ)
+#define MAX_PHYSICAL_FILESIZE	0x40000000
+#define BUFFILE_SEG_SIZE		(MAX_PHYSICAL_FILESIZE / BLCKSZ)
 
 /*
  * This data structure represents a buffered file that consists of one or
@@ -56,7 +55,7 @@ struct BufFile
 	int			numFiles;		/* number of physical files in set */
 	/* all files except the last have length exactly MAX_PHYSICAL_FILESIZE */
 	File	   *files;			/* palloc'd array with numFiles entries */
-	long	   *offsets;		/* palloc'd array with numFiles entries */
+	off_t	   *offsets;		/* palloc'd array with numFiles entries */
 
 	/*
 	 * offsets[i] is the current seek position of files[i].  We use this to
@@ -72,7 +71,7 @@ struct BufFile
 	 * Position as seen by user of BufFile is (curFile, curOffset + pos).
 	 */
 	int			curFile;		/* file index (0..n) part of current pos */
-	int			curOffset;		/* offset part of current pos */
+	off_t		curOffset;		/* offset part of current pos */
 	int			pos;			/* next read/write position in buffer */
 	int			nbytes;			/* total # of valid bytes in buffer */
 	char		buffer[BLCKSZ];
@@ -97,7 +96,7 @@ makeBufFile(File firstfile)
 	file->numFiles = 1;
 	file->files = (File *) palloc(sizeof(File));
 	file->files[0] = firstfile;
-	file->offsets = (long *) palloc(sizeof(long));
+	file->offsets = (off_t *) palloc(sizeof(off_t));
 	file->offsets[0] = 0L;
 	file->isTemp = false;
 	file->isInterXact = false;
@@ -124,8 +123,8 @@ extendBufFile(BufFile *file)
 
 	file->files = (File *) repalloc(file->files,
 									(file->numFiles + 1) * sizeof(File));
-	file->offsets = (long *) repalloc(file->offsets,
-									  (file->numFiles + 1) * sizeof(long));
+	file->offsets = (off_t *) repalloc(file->offsets,
+									  (file->numFiles + 1) * sizeof(off_t));
 	file->files[file->numFiles] = pfile;
 	file->offsets[file->numFiles] = 0L;
 	file->numFiles++;
@@ -279,9 +278,9 @@ BufFileDumpBuffer(BufFile *file)
 		bytestowrite = file->nbytes - wpos;
 		if (file->isTemp)
 		{
-			long		availbytes = MAX_PHYSICAL_FILESIZE - file->curOffset;
+			off_t		availbytes = MAX_PHYSICAL_FILESIZE - file->curOffset;
 
-			if ((long) bytestowrite > availbytes)
+			if ((off_t) bytestowrite > availbytes)
 				bytestowrite = (int) availbytes;
 		}
 
@@ -451,10 +450,10 @@ BufFileFlush(BufFile *file)
  * impossible seek is attempted.
  */
 int
-BufFileSeek(BufFile *file, int fileno, long offset, int whence)
+BufFileSeek(BufFile *file, int fileno, off_t offset, int whence)
 {
 	int			newFile;
-	long		newOffset;
+	off_t		newOffset;
 
 	switch (whence)
 	{
@@ -469,7 +468,7 @@ BufFileSeek(BufFile *file, int fileno, long offset, int whence)
 			/*
 			 * Relative seek considers only the signed offset, ignoring
 			 * fileno. Note that large offsets (> 1 gig) risk overflow in this
-			 * add...
+			 * add, unless we have 64-bit off_t.
 			 */
 			newFile = file->curFile;
 			newOffset = (file->curOffset + file->pos) + offset;
@@ -537,7 +536,7 @@ BufFileSeek(BufFile *file, int fileno, long offset, int whence)
 }
 
 void
-BufFileTell(BufFile *file, int *fileno, long *offset)
+BufFileTell(BufFile *file, int *fileno, off_t *offset)
 {
 	*fileno = file->curFile;
 	*offset = file->curOffset + file->pos;
@@ -558,8 +557,8 @@ int
 BufFileSeekBlock(BufFile *file, long blknum)
 {
 	return BufFileSeek(file,
-					   (int) (blknum / RELSEG_SIZE),
-					   (blknum % RELSEG_SIZE) * BLCKSZ,
+					   (int) (blknum / BUFFILE_SEG_SIZE),
+					   (off_t) (blknum % BUFFILE_SEG_SIZE) * BLCKSZ,
 					   SEEK_SET);
 }
 
@@ -575,7 +574,7 @@ BufFileTellBlock(BufFile *file)
 	long		blknum;
 
 	blknum = (file->curOffset + file->pos) / BLCKSZ;
-	blknum += file->curFile * RELSEG_SIZE;
+	blknum += file->curFile * BUFFILE_SEG_SIZE;
 	return blknum;
 }
 
diff --git a/src/backend/storage/file/fd.c b/src/backend/storage/file/fd.c
index 2a0108fcee0..edce52155f6 100644
--- a/src/backend/storage/file/fd.c
+++ b/src/backend/storage/file/fd.c
@@ -7,7 +7,7 @@
  * Portions Copyright (c) 1994, Regents of the University of California
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/storage/file/fd.c,v 1.143 2008/01/01 19:45:51 momjian Exp $
+ *	  $PostgreSQL: pgsql/src/backend/storage/file/fd.c,v 1.144 2008/03/10 20:06:27 tgl Exp $
  *
  * NOTES:
  *
@@ -115,7 +115,7 @@ static int	max_safe_fds = 32;	/* default if not changed */
 
 #define FileIsNotOpen(file) (VfdCache[file].fd == VFD_CLOSED)
 
-#define FileUnknownPos (-1L)
+#define FileUnknownPos ((off_t) -1)
 
 /* these are the assigned bits in fdstate below: */
 #define FD_TEMPORARY		(1 << 0)	/* T = delete when closed */
@@ -123,13 +123,13 @@ static int	max_safe_fds = 32;	/* default if not changed */
 
 typedef struct vfd
 {
-	signed short fd;			/* current FD, or VFD_CLOSED if none */
+	int			fd;				/* current FD, or VFD_CLOSED if none */
 	unsigned short fdstate;		/* bitflags for VFD's state */
-	SubTransactionId create_subid;		/* for TEMPORARY fds, creating subxact */
+	SubTransactionId create_subid;	/* for TEMPORARY fds, creating subxact */
 	File		nextFree;		/* link to next free VFD, if in freelist */
 	File		lruMoreRecently;	/* doubly linked recency-of-use list */
 	File		lruLessRecently;
-	long		seekPos;		/* current logical file position */
+	off_t		seekPos;		/* current logical file position */
 	char	   *fileName;		/* name of file, or NULL for unused VFD */
 	/* NB: fileName is malloc'd, and must be free'd when closing the VFD */
 	int			fileFlags;		/* open(2) flags for (re)opening the file */
@@ -544,8 +544,8 @@ LruDelete(File file)
 	Delete(file);
 
 	/* save the seek position */
-	vfdP->seekPos = (long) lseek(vfdP->fd, 0L, SEEK_CUR);
-	Assert(vfdP->seekPos != -1L);
+	vfdP->seekPos = lseek(vfdP->fd, (off_t) 0, SEEK_CUR);
+	Assert(vfdP->seekPos != (off_t) -1);
 
 	/* close the file */
 	if (close(vfdP->fd))
@@ -616,12 +616,12 @@ LruInsert(File file)
 		}
 
 		/* seek to the right position */
-		if (vfdP->seekPos != 0L)
+		if (vfdP->seekPos != (off_t) 0)
 		{
-			long		returnValue;
+			off_t		returnValue;
 
-			returnValue = (long) lseek(vfdP->fd, vfdP->seekPos, SEEK_SET);
-			Assert(returnValue != -1L);
+			returnValue = lseek(vfdP->fd, vfdP->seekPos, SEEK_SET);
+			Assert(returnValue != (off_t) -1);
 		}
 	}
 
@@ -1027,9 +1027,10 @@ FileRead(File file, char *buffer, int amount)
 
 	Assert(FileIsValid(file));
 
-	DO_DB(elog(LOG, "FileRead: %d (%s) %ld %d %p",
+	DO_DB(elog(LOG, "FileRead: %d (%s) " INT64_FORMAT " %d %p",
 			   file, VfdCache[file].fileName,
-			   VfdCache[file].seekPos, amount, buffer));
+			   (int64) VfdCache[file].seekPos,
+			   amount, buffer));
 
 	returnCode = FileAccess(file);
 	if (returnCode < 0)
@@ -1081,9 +1082,10 @@ FileWrite(File file, char *buffer, int amount)
 
 	Assert(FileIsValid(file));
 
-	DO_DB(elog(LOG, "FileWrite: %d (%s) %ld %d %p",
+	DO_DB(elog(LOG, "FileWrite: %d (%s) " INT64_FORMAT " %d %p",
 			   file, VfdCache[file].fileName,
-			   VfdCache[file].seekPos, amount, buffer));
+			   (int64) VfdCache[file].seekPos,
+			   amount, buffer));
 
 	returnCode = FileAccess(file);
 	if (returnCode < 0)
@@ -1146,16 +1148,17 @@ FileSync(File file)
 	return pg_fsync(VfdCache[file].fd);
 }
 
-long
-FileSeek(File file, long offset, int whence)
+off_t
+FileSeek(File file, off_t offset, int whence)
 {
 	int			returnCode;
 
 	Assert(FileIsValid(file));
 
-	DO_DB(elog(LOG, "FileSeek: %d (%s) %ld %ld %d",
+	DO_DB(elog(LOG, "FileSeek: %d (%s) " INT64_FORMAT " " INT64_FORMAT " %d",
 			   file, VfdCache[file].fileName,
-			   VfdCache[file].seekPos, offset, whence));
+			   (int64) VfdCache[file].seekPos,
+			   (int64) offset, whence));
 
 	if (FileIsNotOpen(file))
 	{
@@ -1163,7 +1166,8 @@ FileSeek(File file, long offset, int whence)
 		{
 			case SEEK_SET:
 				if (offset < 0)
-					elog(ERROR, "invalid seek offset: %ld", offset);
+					elog(ERROR, "invalid seek offset: " INT64_FORMAT,
+						 (int64) offset);
 				VfdCache[file].seekPos = offset;
 				break;
 			case SEEK_CUR:
@@ -1187,7 +1191,8 @@ FileSeek(File file, long offset, int whence)
 		{
 			case SEEK_SET:
 				if (offset < 0)
-					elog(ERROR, "invalid seek offset: %ld", offset);
+					elog(ERROR, "invalid seek offset: " INT64_FORMAT,
+						 (int64) offset);
 				if (VfdCache[file].seekPos != offset)
 					VfdCache[file].seekPos = lseek(VfdCache[file].fd,
 												   offset, whence);
@@ -1213,7 +1218,7 @@ FileSeek(File file, long offset, int whence)
  * XXX not actually used but here for completeness
  */
 #ifdef NOT_USED
-long
+off_t
 FileTell(File file)
 {
 	Assert(FileIsValid(file));
@@ -1224,7 +1229,7 @@ FileTell(File file)
 #endif
 
 int
-FileTruncate(File file, long offset)
+FileTruncate(File file, off_t offset)
 {
 	int			returnCode;
 
@@ -1237,7 +1242,7 @@ FileTruncate(File file, long offset)
 	if (returnCode < 0)
 		return returnCode;
 
-	returnCode = ftruncate(VfdCache[file].fd, (size_t) offset);
+	returnCode = ftruncate(VfdCache[file].fd, offset);
 	return returnCode;
 }
 
diff --git a/src/backend/storage/smgr/md.c b/src/backend/storage/smgr/md.c
index 543574be400..6ea4a00b017 100644
--- a/src/backend/storage/smgr/md.c
+++ b/src/backend/storage/smgr/md.c
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/storage/smgr/md.c,v 1.135 2008/01/01 19:45:52 momjian Exp $
+ *	  $PostgreSQL: pgsql/src/backend/storage/smgr/md.c,v 1.136 2008/03/10 20:06:27 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -89,16 +89,16 @@
  *
  *	All MdfdVec objects are palloc'd in the MdCxt memory context.
  *
- *	Defining LET_OS_MANAGE_FILESIZE disables the segmentation logic,
- *	for use on machines that support large files.  Beware that that
- *	code has not been tested in a long time and is probably bit-rotted.
+ *	On platforms that support large files, configuring with
+ *	--disable-segmented-files leaves USE_SEGMENTED_FILES undefined, which
+ *	disables the segmentation logic; each relation is then a single file.
  */
 
 typedef struct _MdfdVec
 {
 	File		mdfd_vfd;		/* fd number in fd.c's pool */
 	BlockNumber mdfd_segno;		/* segment number, from 0 */
-#ifndef LET_OS_MANAGE_FILESIZE	/* for large relations */
+#ifdef USE_SEGMENTED_FILES
 	struct _MdfdVec *mdfd_chain;	/* next segment, or NULL */
 #endif
 } MdfdVec;
@@ -162,7 +162,7 @@ static void register_dirty_segment(SMgrRelation reln, MdfdVec *seg);
 static void register_unlink(RelFileNode rnode);
 static MdfdVec *_fdvec_alloc(void);
 
-#ifndef LET_OS_MANAGE_FILESIZE
+#ifdef USE_SEGMENTED_FILES
 static MdfdVec *_mdfd_openseg(SMgrRelation reln, BlockNumber segno,
 			  int oflags);
 #endif
@@ -258,7 +258,7 @@ mdcreate(SMgrRelation reln, bool isRedo)
 
 	reln->md_fd->mdfd_vfd = fd;
 	reln->md_fd->mdfd_segno = 0;
-#ifndef LET_OS_MANAGE_FILESIZE
+#ifdef USE_SEGMENTED_FILES
 	reln->md_fd->mdfd_chain = NULL;
 #endif
 }
@@ -344,7 +344,7 @@ mdunlink(RelFileNode rnode, bool isRedo)
 							rnode.relNode)));
 	}
 
-#ifndef LET_OS_MANAGE_FILESIZE
+#ifdef USE_SEGMENTED_FILES
 	/* Delete the additional segments, if any */
 	else
 	{
@@ -395,7 +395,7 @@ mdunlink(RelFileNode rnode, bool isRedo)
 void
 mdextend(SMgrRelation reln, BlockNumber blocknum, char *buffer, bool isTemp)
 {
-	long		seekpos;
+	off_t		seekpos;
 	int			nbytes;
 	MdfdVec    *v;
 
@@ -420,11 +420,11 @@ mdextend(SMgrRelation reln, BlockNumber blocknum, char *buffer, bool isTemp)
 
 	v = _mdfd_getseg(reln, blocknum, isTemp, EXTENSION_CREATE);
 
-#ifndef LET_OS_MANAGE_FILESIZE
-	seekpos = (long) (BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE)));
-	Assert(seekpos < BLCKSZ * RELSEG_SIZE);
+#ifdef USE_SEGMENTED_FILES
+	seekpos = (off_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE));
+	Assert(seekpos < (off_t) BLCKSZ * RELSEG_SIZE);
 #else
-	seekpos = (long) (BLCKSZ * (blocknum));
+	seekpos = (off_t) BLCKSZ * blocknum;
 #endif
 
 	/*
@@ -469,7 +469,7 @@ mdextend(SMgrRelation reln, BlockNumber blocknum, char *buffer, bool isTemp)
 	if (!isTemp)
 		register_dirty_segment(reln, v);
 
-#ifndef LET_OS_MANAGE_FILESIZE
+#ifdef USE_SEGMENTED_FILES
 	Assert(_mdnblocks(reln, v) <= ((BlockNumber) RELSEG_SIZE));
 #endif
 }
@@ -530,7 +530,7 @@ mdopen(SMgrRelation reln, ExtensionBehavior behavior)
 
 	mdfd->mdfd_vfd = fd;
 	mdfd->mdfd_segno = 0;
-#ifndef LET_OS_MANAGE_FILESIZE
+#ifdef USE_SEGMENTED_FILES
 	mdfd->mdfd_chain = NULL;
 	Assert(_mdnblocks(reln, mdfd) <= ((BlockNumber) RELSEG_SIZE));
 #endif
@@ -552,7 +552,7 @@ mdclose(SMgrRelation reln)
 
 	reln->md_fd = NULL;			/* prevent dangling pointer after error */
 
-#ifndef LET_OS_MANAGE_FILESIZE
+#ifdef USE_SEGMENTED_FILES
 	while (v != NULL)
 	{
 		MdfdVec    *ov = v;
@@ -577,17 +577,17 @@ mdclose(SMgrRelation reln)
 void
 mdread(SMgrRelation reln, BlockNumber blocknum, char *buffer)
 {
-	long		seekpos;
+	off_t		seekpos;
 	int			nbytes;
 	MdfdVec    *v;
 
 	v = _mdfd_getseg(reln, blocknum, false, EXTENSION_FAIL);
 
-#ifndef LET_OS_MANAGE_FILESIZE
-	seekpos = (long) (BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE)));
-	Assert(seekpos < BLCKSZ * RELSEG_SIZE);
+#ifdef USE_SEGMENTED_FILES
+	seekpos = (off_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE));
+	Assert(seekpos < (off_t) BLCKSZ * RELSEG_SIZE);
 #else
-	seekpos = (long) (BLCKSZ * (blocknum));
+	seekpos = (off_t) BLCKSZ * blocknum;
 #endif
 
 	if (FileSeek(v->mdfd_vfd, seekpos, SEEK_SET) != seekpos)
@@ -642,7 +642,7 @@ mdread(SMgrRelation reln, BlockNumber blocknum, char *buffer)
 void
 mdwrite(SMgrRelation reln, BlockNumber blocknum, char *buffer, bool isTemp)
 {
-	long		seekpos;
+	off_t		seekpos;
 	int			nbytes;
 	MdfdVec    *v;
 
@@ -653,11 +653,11 @@ mdwrite(SMgrRelation reln, BlockNumber blocknum, char *buffer, bool isTemp)
 
 	v = _mdfd_getseg(reln, blocknum, isTemp, EXTENSION_FAIL);
 
-#ifndef LET_OS_MANAGE_FILESIZE
-	seekpos = (long) (BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE)));
-	Assert(seekpos < BLCKSZ * RELSEG_SIZE);
+#ifdef USE_SEGMENTED_FILES
+	seekpos = (off_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE));
+	Assert(seekpos < (off_t) BLCKSZ * RELSEG_SIZE);
 #else
-	seekpos = (long) (BLCKSZ * (blocknum));
+	seekpos = (off_t) BLCKSZ * blocknum;
 #endif
 
 	if (FileSeek(v->mdfd_vfd, seekpos, SEEK_SET) != seekpos)
@@ -708,7 +708,7 @@ mdnblocks(SMgrRelation reln)
 {
 	MdfdVec    *v = mdopen(reln, EXTENSION_FAIL);
 
-#ifndef LET_OS_MANAGE_FILESIZE
+#ifdef USE_SEGMENTED_FILES
 	BlockNumber nblocks;
 	BlockNumber segno = 0;
 
@@ -778,7 +778,7 @@ mdtruncate(SMgrRelation reln, BlockNumber nblocks, bool isTemp)
 	MdfdVec    *v;
 	BlockNumber curnblk;
 
-#ifndef LET_OS_MANAGE_FILESIZE
+#ifdef USE_SEGMENTED_FILES
 	BlockNumber priorblocks;
 #endif
 
@@ -804,7 +804,7 @@ mdtruncate(SMgrRelation reln, BlockNumber nblocks, bool isTemp)
 
 	v = mdopen(reln, EXTENSION_FAIL);
 
-#ifndef LET_OS_MANAGE_FILESIZE
+#ifdef USE_SEGMENTED_FILES
 	priorblocks = 0;
 	while (v != NULL)
 	{
@@ -843,7 +843,7 @@ mdtruncate(SMgrRelation reln, BlockNumber nblocks, bool isTemp)
 			 */
 			BlockNumber lastsegblocks = nblocks - priorblocks;
 
-			if (FileTruncate(v->mdfd_vfd, lastsegblocks * BLCKSZ) < 0)
+			if (FileTruncate(v->mdfd_vfd, (off_t) lastsegblocks * BLCKSZ) < 0)
 				ereport(ERROR,
 						(errcode_for_file_access(),
 						 errmsg("could not truncate relation %u/%u/%u to %u blocks: %m",
@@ -867,7 +867,8 @@ mdtruncate(SMgrRelation reln, BlockNumber nblocks, bool isTemp)
 		priorblocks += RELSEG_SIZE;
 	}
 #else
-	if (FileTruncate(v->mdfd_vfd, nblocks * BLCKSZ) < 0)
+	/* For unsegmented files, it's a lot easier */
+	if (FileTruncate(v->mdfd_vfd, (off_t) nblocks * BLCKSZ) < 0)
 		ereport(ERROR,
 				(errcode_for_file_access(),
 			  errmsg("could not truncate relation %u/%u/%u to %u blocks: %m",
@@ -900,7 +901,7 @@ mdimmedsync(SMgrRelation reln)
 
 	v = mdopen(reln, EXTENSION_FAIL);
 
-#ifndef LET_OS_MANAGE_FILESIZE
+#ifdef USE_SEGMENTED_FILES
 	while (v != NULL)
 	{
 		if (FileSync(v->mdfd_vfd) < 0)
@@ -917,8 +918,7 @@ mdimmedsync(SMgrRelation reln)
 	if (FileSync(v->mdfd_vfd) < 0)
 		ereport(ERROR,
 				(errcode_for_file_access(),
-				 errmsg("could not fsync segment %u of relation %u/%u/%u: %m",
-						v->mdfd_segno,
+				 errmsg("could not fsync relation %u/%u/%u: %m",
 						reln->smgr_rnode.spcNode,
 						reln->smgr_rnode.dbNode,
 						reln->smgr_rnode.relNode)));
@@ -1453,7 +1453,7 @@ _fdvec_alloc(void)
 	return (MdfdVec *) MemoryContextAlloc(MdCxt, sizeof(MdfdVec));
 }
 
-#ifndef LET_OS_MANAGE_FILESIZE
+#ifdef USE_SEGMENTED_FILES
 
 /*
  * Open the specified segment of the relation,
@@ -1499,7 +1499,7 @@ _mdfd_openseg(SMgrRelation reln, BlockNumber segno, int oflags)
 	/* all done */
 	return v;
 }
-#endif   /* LET_OS_MANAGE_FILESIZE */
+#endif   /* USE_SEGMENTED_FILES */
 
 /*
  *	_mdfd_getseg() -- Find the segment of the relation holding the
@@ -1515,7 +1515,7 @@ _mdfd_getseg(SMgrRelation reln, BlockNumber blkno, bool isTemp,
 {
 	MdfdVec    *v = mdopen(reln, behavior);
 
-#ifndef LET_OS_MANAGE_FILESIZE
+#ifdef USE_SEGMENTED_FILES
 	BlockNumber targetseg;
 	BlockNumber nextsegno;
 
@@ -1588,7 +1588,7 @@ _mdfd_getseg(SMgrRelation reln, BlockNumber blkno, bool isTemp,
 static BlockNumber
 _mdnblocks(SMgrRelation reln, MdfdVec *seg)
 {
-	long		len;
+	off_t		len;
 
 	len = FileSeek(seg->mdfd_vfd, 0L, SEEK_END);
 	if (len < 0)
diff --git a/src/backend/utils/sort/tuplestore.c b/src/backend/utils/sort/tuplestore.c
index e297579674a..d6c192993e2 100644
--- a/src/backend/utils/sort/tuplestore.c
+++ b/src/backend/utils/sort/tuplestore.c
@@ -38,7 +38,7 @@
  * Portions Copyright (c) 1994, Regents of the University of California
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/utils/sort/tuplestore.c,v 1.36 2008/01/01 19:45:55 momjian Exp $
+ *	  $PostgreSQL: pgsql/src/backend/utils/sort/tuplestore.c,v 1.37 2008/03/10 20:06:27 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -134,14 +134,14 @@ struct Tuplestorestate
 	bool		eof_reached;	/* read reached EOF (always valid) */
 	int			current;		/* next array index (valid if INMEM) */
 	int			readpos_file;	/* file# (valid if WRITEFILE and not eof) */
-	long		readpos_offset; /* offset (valid if WRITEFILE and not eof) */
+	off_t		readpos_offset; /* offset (valid if WRITEFILE and not eof) */
 	int			writepos_file;	/* file# (valid if READFILE) */
-	long		writepos_offset;	/* offset (valid if READFILE) */
+	off_t		writepos_offset; /* offset (valid if READFILE) */
 
 	/* markpos_xxx holds marked position for mark and restore */
 	int			markpos_current;	/* saved "current" */
 	int			markpos_file;	/* saved "readpos_file" */
-	long		markpos_offset; /* saved "readpos_offset" */
+	off_t		markpos_offset; /* saved "readpos_offset" */
 };
 
 #define COPYTUP(state,tup)	((*(state)->copytup) (state, tup))
diff --git a/src/include/pg_config.h.in b/src/include/pg_config.h.in
index b48e261cbf5..24b7c0dc86d 100644
--- a/src/include/pg_config.h.in
+++ b/src/include/pg_config.h.in
@@ -637,6 +637,9 @@
    your system. */
 #undef PTHREAD_CREATE_JOINABLE
 
+/* The size of `off_t', as computed by sizeof. */
+#undef SIZEOF_OFF_T
+
 /* The size of `size_t', as computed by sizeof. */
 #undef SIZEOF_SIZE_T
 
@@ -685,6 +688,9 @@
 /* Use replacement snprintf() functions. */
 #undef USE_REPL_SNPRINTF
 
+/* Define to split data files into 1GB segments. */
+#undef USE_SEGMENTED_FILES
+
 /* Define to build with (Open)SSL support. (--with-openssl) */
 #undef USE_SSL
 
diff --git a/src/include/pg_config_manual.h b/src/include/pg_config_manual.h
index 0a1e8233d3e..c0d546761a8 100644
--- a/src/include/pg_config_manual.h
+++ b/src/include/pg_config_manual.h
@@ -6,7 +6,7 @@
  * for developers.	If you edit any of these, be sure to do a *full*
  * rebuild (and an initdb if noted).
  *
- * $PostgreSQL: pgsql/src/include/pg_config_manual.h,v 1.28 2008/02/29 20:58:33 alvherre Exp $
+ * $PostgreSQL: pgsql/src/include/pg_config_manual.h,v 1.29 2008/03/10 20:06:27 tgl Exp $
  *------------------------------------------------------------------------
  */
 
@@ -27,8 +27,9 @@
 
 /*
  * RELSEG_SIZE is the maximum number of blocks allowed in one disk
- * file.  Thus, the maximum size of a single file is RELSEG_SIZE *
- * BLCKSZ; relations bigger than that are divided into multiple files.
+ * file when USE_SEGMENTED_FILES is defined.  Thus, the maximum size
+ * of a single file is RELSEG_SIZE * BLCKSZ; relations bigger than that
+ * are divided into multiple files.
  *
  * RELSEG_SIZE * BLCKSZ must be less than your OS' limit on file size.
  * This is often 2 GB or 4GB in a 32-bit operating system, unless you
@@ -39,9 +40,16 @@
  * in the direction of a small limit.  (Besides, a power-of-2 value
  * saves a few cycles in md.c.)
  *
+ * When not using segmented files, RELSEG_SIZE is set to zero so that
+ * this behavior can be distinguished in pg_control.
+ *
  * Changing RELSEG_SIZE requires an initdb.
  */
+#ifdef USE_SEGMENTED_FILES
 #define RELSEG_SIZE (0x40000000 / BLCKSZ)
+#else
+#define RELSEG_SIZE 0
+#endif
 
 /*
  * Size of a WAL file block.  This need have no particular relation to BLCKSZ.
diff --git a/src/include/storage/buffile.h b/src/include/storage/buffile.h
index 3313e43ea0d..e50ec2f8344 100644
--- a/src/include/storage/buffile.h
+++ b/src/include/storage/buffile.h
@@ -18,7 +18,7 @@
  * Portions Copyright (c) 1996-2008, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $PostgreSQL: pgsql/src/include/storage/buffile.h,v 1.23 2008/01/01 19:45:58 momjian Exp $
+ * $PostgreSQL: pgsql/src/include/storage/buffile.h,v 1.24 2008/03/10 20:06:27 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -38,8 +38,8 @@ extern BufFile *BufFileCreateTemp(bool interXact);
 extern void BufFileClose(BufFile *file);
 extern size_t BufFileRead(BufFile *file, void *ptr, size_t size);
 extern size_t BufFileWrite(BufFile *file, void *ptr, size_t size);
-extern int	BufFileSeek(BufFile *file, int fileno, long offset, int whence);
-extern void BufFileTell(BufFile *file, int *fileno, long *offset);
+extern int	BufFileSeek(BufFile *file, int fileno, off_t offset, int whence);
+extern void BufFileTell(BufFile *file, int *fileno, off_t *offset);
 extern int	BufFileSeekBlock(BufFile *file, long blknum);
 
 #endif   /* BUFFILE_H */
diff --git a/src/include/storage/fd.h b/src/include/storage/fd.h
index f5862bf82b3..05c2a625255 100644
--- a/src/include/storage/fd.h
+++ b/src/include/storage/fd.h
@@ -7,7 +7,7 @@
  * Portions Copyright (c) 1996-2008, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $PostgreSQL: pgsql/src/include/storage/fd.h,v 1.61 2008/01/01 19:45:58 momjian Exp $
+ * $PostgreSQL: pgsql/src/include/storage/fd.h,v 1.62 2008/03/10 20:06:27 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -65,8 +65,8 @@ extern void FileClose(File file);
 extern int	FileRead(File file, char *buffer, int amount);
 extern int	FileWrite(File file, char *buffer, int amount);
 extern int	FileSync(File file);
-extern long FileSeek(File file, long offset, int whence);
-extern int	FileTruncate(File file, long offset);
+extern off_t FileSeek(File file, off_t offset, int whence);
+extern int	FileTruncate(File file, off_t offset);
 
 /* Operations that allow use of regular stdio --- USE WITH CAUTION */
 extern FILE *AllocateFile(const char *name, const char *mode);
-- 
GitLab