From 15c121b3ed7eb2f290e19533e41ccca734d23574 Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki.linnakangas@iki.fi>
Date: Tue, 30 Sep 2008 10:52:14 +0000
Subject: [PATCH] Rewrite the FSM. Instead of relying on a fixed-size shared
 memory segment, the free space information is stored in a dedicated FSM
 relation fork, with each relation (except for hash indexes; they don't use
 FSM).

This eliminates the max_fsm_relations and max_fsm_pages GUC options; remove any
trace of them from the backend, initdb, and documentation.

Rewrite contrib/pg_freespacemap to match the new FSM implementation. Also
introduce a new variant of the get_raw_page(text, int4, int4) function in
contrib/pageinspect that lets you return pages from any relation fork, and
a new fsm_page_contents() function to inspect the new FSM pages.
---
 contrib/pageinspect/Makefile                  |    4 +-
 contrib/pageinspect/fsmfuncs.c                |   61 +
 contrib/pageinspect/pageinspect.sql.in        |   17 +-
 contrib/pageinspect/rawpage.c                 |   12 +-
 .../pg_freespacemap/pg_freespacemap.sql.in    |   48 +-
 doc/src/sgml/acronyms.sgml                    |    4 +-
 doc/src/sgml/config.sgml                      |   76 +-
 doc/src/sgml/pageinspect.sgml                 |   49 +-
 doc/src/sgml/pgfreespacemap.sgml              |  283 +--
 doc/src/sgml/ref/vacuum.sgml                  |    8 +-
 doc/src/sgml/release.sgml                     |    5 +-
 doc/src/sgml/runtime.sgml                     |   12 +-
 doc/src/sgml/storage.sgml                     |   51 +-
 src/backend/access/gin/gininsert.c            |    6 +-
 src/backend/access/gin/ginutil.c              |    5 +-
 src/backend/access/gin/ginvacuum.c            |   35 +-
 src/backend/access/gist/gist.c                |    6 +-
 src/backend/access/gist/gistutil.c            |    5 +-
 src/backend/access/gist/gistvacuum.c          |   37 +-
 src/backend/access/heap/heapam.c              |    6 +-
 src/backend/access/heap/hio.c                 |    7 +-
 src/backend/access/nbtree/nbtpage.c           |    5 +-
 src/backend/access/nbtree/nbtree.c            |   80 +-
 src/backend/access/nbtree/nbtsort.c           |    4 +-
 src/backend/access/transam/rmgr.c             |    5 +-
 src/backend/access/transam/xlogutils.c        |    3 +-
 src/backend/bootstrap/bootstrap.c             |    4 +-
 src/backend/catalog/heap.c                    |   24 +-
 src/backend/catalog/index.c                   |   19 +-
 src/backend/commands/dbcommands.c             |   11 +-
 src/backend/commands/vacuum.c                 |   47 +-
 src/backend/commands/vacuumlazy.c             |  283 +--
 src/backend/postmaster/bgwriter.c             |    4 +-
 src/backend/storage/freespace/Makefile        |    4 +-
 src/backend/storage/freespace/README          |  195 ++
 src/backend/storage/freespace/freespace.c     | 2128 +++++------------
 src/backend/storage/freespace/fsmpage.c       |  352 +++
 src/backend/storage/freespace/indexfsm.c      |   92 +
 src/backend/storage/ipc/ipci.c                |    9 +-
 src/backend/storage/smgr/smgr.c               |   24 +-
 src/backend/tcop/postgres.c                   |   10 +-
 src/backend/utils/cache/relcache.c            |   10 +-
 src/backend/utils/misc/guc.c                  |   22 +-
 src/backend/utils/misc/postgresql.conf.sample |    7 -
 src/bin/initdb/initdb.c                       |   29 +-
 src/include/access/rmgr.h                     |    3 +-
 src/include/storage/freespace.h               |  150 +-
 src/include/storage/fsm_internals.h           |   73 +
 src/include/storage/indexfsm.h                |   27 +
 src/include/storage/lwlock.h                  |    8 +-
 src/include/storage/relfilenode.h             |    9 +-
 src/include/utils/guc_tables.h                |    3 +-
 src/include/utils/rel.h                       |    5 +-
 53 files changed, 1755 insertions(+), 2631 deletions(-)
 create mode 100644 contrib/pageinspect/fsmfuncs.c
 create mode 100644 src/backend/storage/freespace/README
 create mode 100644 src/backend/storage/freespace/fsmpage.c
 create mode 100644 src/backend/storage/freespace/indexfsm.c
 create mode 100644 src/include/storage/fsm_internals.h
 create mode 100644 src/include/storage/indexfsm.h

diff --git a/contrib/pageinspect/Makefile b/contrib/pageinspect/Makefile
index 63da705215d..3a6b729c174 100644
--- a/contrib/pageinspect/Makefile
+++ b/contrib/pageinspect/Makefile
@@ -2,12 +2,12 @@
 #
 # pageinspect Makefile
 #
-# $PostgreSQL: pgsql/contrib/pageinspect/Makefile,v 1.3 2007/11/10 23:59:51 momjian Exp $
+# $PostgreSQL: pgsql/contrib/pageinspect/Makefile,v 1.4 2008/09/30 10:52:09 heikki Exp $
 #
 #-------------------------------------------------------------------------
 
 MODULE_big	= pageinspect
-OBJS		= rawpage.o heapfuncs.o btreefuncs.o
+OBJS		= rawpage.o heapfuncs.o btreefuncs.o fsmfuncs.o
 DATA_built	= pageinspect.sql
 DATA      	= uninstall_pageinspect.sql
 
diff --git a/contrib/pageinspect/fsmfuncs.c b/contrib/pageinspect/fsmfuncs.c
new file mode 100644
index 00000000000..fb522e5ff5f
--- /dev/null
+++ b/contrib/pageinspect/fsmfuncs.c
@@ -0,0 +1,74 @@
+/*-------------------------------------------------------------------------
+ *
+ * fsmfuncs.c
+ *	  Functions to investigate FSM pages
+ *
+ * These functions are restricted to superusers for the fear of introducing
+ * security holes if the input checking isn't as water-tight as it should.
+ * You'd need to be superuser to obtain a raw page image anyway, so
+ * there's hardly any use case for using these without superuser-rights
+ * anyway.
+ *
+ * Copyright (c) 2007-2008, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ *	  $PostgreSQL: pgsql/contrib/pageinspect/fsmfuncs.c,v 1.1 2008/09/30 10:52:09 heikki Exp $
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+#include "lib/stringinfo.h"
+#include "storage/fsm_internals.h"
+#include "utils/builtins.h"
+#include "miscadmin.h"
+#include "funcapi.h"
+
+Datum fsm_page_contents(PG_FUNCTION_ARGS);
+
+/*
+ * Dumps the contents of a FSM page.
+ *
+ * Takes a raw page image as bytea and returns text with one "slot: value"
+ * line per nonzero node, followed by the page's fp_next_slot.  Raises an
+ * error if the caller is not a superuser or the image is shorter than BLCKSZ.
+ */
+PG_FUNCTION_INFO_V1(fsm_page_contents);
+
+Datum
+fsm_page_contents(PG_FUNCTION_ARGS)
+{
+	bytea	   *raw_page = PG_GETARG_BYTEA_P(0);
+	int			raw_page_size;
+	StringInfoData sinfo;
+	FSMPage		fsmpage;
+	int			i;
+
+	if (!superuser())
+		ereport(ERROR,
+				(errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
+				 (errmsg("must be superuser to use raw page functions"))));
+
+	/*
+	 * Everything below reads from the image without further bounds checks,
+	 * so refuse input that is shorter than a full block (BLCKSZ).
+	 */
+	raw_page_size = VARSIZE(raw_page) - VARHDRSZ;
+	if (raw_page_size < BLCKSZ)
+		ereport(ERROR,
+				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+				 errmsg("input page too small (%d bytes)", raw_page_size)));
+
+	fsmpage = (FSMPage) PageGetContents(VARDATA(raw_page));
+
+	initStringInfo(&sinfo);
+
+	for (i = 0; i < NodesPerPage; i++)
+	{
+		if (fsmpage->fp_nodes[i] != 0)
+			appendStringInfo(&sinfo, "%d: %d\n", i, fsmpage->fp_nodes[i]);
+	}
+	appendStringInfo(&sinfo, "fp_next_slot: %d\n", fsmpage->fp_next_slot);
+
+	PG_RETURN_TEXT_P(cstring_to_text(sinfo.data));
+}
diff --git a/contrib/pageinspect/pageinspect.sql.in b/contrib/pageinspect/pageinspect.sql.in
index 1af59f70f46..49fea9eb51f 100644
--- a/contrib/pageinspect/pageinspect.sql.in
+++ b/contrib/pageinspect/pageinspect.sql.in
@@ -1,4 +1,4 @@
-/* $PostgreSQL: pgsql/contrib/pageinspect/pageinspect.sql.in,v 1.4 2007/11/13 04:24:28 momjian Exp $ */
+/* $PostgreSQL: pgsql/contrib/pageinspect/pageinspect.sql.in,v 1.5 2008/09/30 10:52:09 heikki Exp $ */
 
 -- Adjust this setting to control where the objects get created.
 SET search_path = public;
@@ -6,11 +6,16 @@ SET search_path = public;
 --
 -- get_raw_page()
 --
-CREATE OR REPLACE FUNCTION get_raw_page(text, int4)
+CREATE OR REPLACE FUNCTION get_raw_page(text, int4, int4)
 RETURNS bytea
 AS 'MODULE_PATHNAME', 'get_raw_page'
 LANGUAGE C STRICT;
 
+CREATE OR REPLACE FUNCTION get_raw_page(text, int4) 
+RETURNS bytea
+AS $$ SELECT get_raw_page($1, 0, $2); $$
+LANGUAGE SQL STRICT;
+
 --
 -- page_header()
 --
@@ -92,3 +97,11 @@ CREATE OR REPLACE FUNCTION bt_page_items(IN relname text, IN blkno int4,
 RETURNS SETOF record
 AS 'MODULE_PATHNAME', 'bt_page_items'
 LANGUAGE C STRICT;
+
+--
+-- fsm_page_contents()
+--
+CREATE OR REPLACE FUNCTION fsm_page_contents(IN page bytea)
+RETURNS text
+AS 'MODULE_PATHNAME', 'fsm_page_contents'
+LANGUAGE C STRICT;
diff --git a/contrib/pageinspect/rawpage.c b/contrib/pageinspect/rawpage.c
index 0bc6bdc0174..51c6ee179f4 100644
--- a/contrib/pageinspect/rawpage.c
+++ b/contrib/pageinspect/rawpage.c
@@ -8,7 +8,7 @@
  * Copyright (c) 2007-2008, PostgreSQL Global Development Group
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/contrib/pageinspect/rawpage.c,v 1.6 2008/05/12 00:00:43 alvherre Exp $
+ *	  $PostgreSQL: pgsql/contrib/pageinspect/rawpage.c,v 1.7 2008/09/30 10:52:09 heikki Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -41,7 +41,8 @@ Datum
 get_raw_page(PG_FUNCTION_ARGS)
 {
 	text	   *relname = PG_GETARG_TEXT_P(0);
-	uint32		blkno = PG_GETARG_UINT32(1);
+	uint32		forknum = PG_GETARG_UINT32(1);
+	uint32		blkno = PG_GETARG_UINT32(2);
 
 	Relation	rel;
 	RangeVar   *relrv;
@@ -54,6 +55,11 @@ get_raw_page(PG_FUNCTION_ARGS)
 				(errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
 				 (errmsg("must be superuser to use raw functions"))));
 
+	if (forknum > MAX_FORKNUM)
+		ereport(ERROR,
+				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+				 errmsg("invalid fork number")));
+
 	relrv = makeRangeVarFromNameList(textToQualifiedNameList(relname));
 	rel = relation_openrv(relrv, AccessShareLock);
 
@@ -80,7 +86,7 @@ get_raw_page(PG_FUNCTION_ARGS)
 
 	/* Take a verbatim copy of the page */
 
-	buf = ReadBuffer(rel, blkno);
+	buf = ReadBufferWithFork(rel, forknum, blkno);
 	LockBuffer(buf, BUFFER_LOCK_SHARE);
 
 	memcpy(raw_page_data, BufferGetPage(buf), BLCKSZ);
diff --git a/contrib/pg_freespacemap/pg_freespacemap.sql.in b/contrib/pg_freespacemap/pg_freespacemap.sql.in
index e950d9a1128..0ab5e1d1eae 100644
--- a/contrib/pg_freespacemap/pg_freespacemap.sql.in
+++ b/contrib/pg_freespacemap/pg_freespacemap.sql.in
@@ -1,44 +1,26 @@
-/* $PostgreSQL: pgsql/contrib/pg_freespacemap/pg_freespacemap.sql.in,v 1.8 2007/11/13 04:24:28 momjian Exp $ */
+/* $PostgreSQL: pgsql/contrib/pg_freespacemap/pg_freespacemap.sql.in,v 1.9 2008/09/30 10:52:09 heikki Exp $ */
 
 -- Adjust this setting to control where the objects get created.
 SET search_path = public;
 
 
--- Register the functions.
-CREATE OR REPLACE FUNCTION pg_freespacemap_pages()
-RETURNS SETOF RECORD
-AS 'MODULE_PATHNAME', 'pg_freespacemap_pages'
+-- Register the C function.
+CREATE OR REPLACE FUNCTION pg_freespace(regclass, int4)
+RETURNS int2
+AS 'MODULE_PATHNAME', 'pg_freespace'
 LANGUAGE C;
 
-CREATE OR REPLACE FUNCTION pg_freespacemap_relations()
+-- pg_freespace shows the recorded space avail at each block in a relation
+CREATE OR REPLACE FUNCTION
+  pg_freespace(rel regclass, blkno OUT int4, avail OUT int2)
 RETURNS SETOF RECORD
-AS 'MODULE_PATHNAME', 'pg_freespacemap_relations'
-LANGUAGE C;
+AS $$
+  SELECT blkno::int4, pg_freespace($1, blkno::int4) AS avail
+  FROM generate_series(0, pg_relation_size($1) / current_setting('block_size')::bigint - 1) AS blkno;
+$$
+LANGUAGE SQL;
 
 
--- Create views for convenient access.
-CREATE VIEW pg_freespacemap_pages AS
-	SELECT P.* FROM pg_freespacemap_pages() AS P
-	(reltablespace oid,
-	 reldatabase oid,
-	 relfilenode oid,
-	 relblocknumber bigint,
-	 bytes integer);
- 
-CREATE VIEW pg_freespacemap_relations AS
-	SELECT P.* FROM pg_freespacemap_relations() AS P
-	(reltablespace oid,
-	 reldatabase oid,
-	 relfilenode oid,
-	 avgrequest integer,
-	 interestingpages integer,
-	 storedpages integer,
-	 nextpage integer);
-
- 
 -- Don't want these to be available to public.
-REVOKE ALL ON FUNCTION pg_freespacemap_pages() FROM PUBLIC;
-REVOKE ALL ON pg_freespacemap_pages FROM PUBLIC;
-
-REVOKE ALL ON FUNCTION pg_freespacemap_relations() FROM PUBLIC;
-REVOKE ALL ON pg_freespacemap_relations FROM PUBLIC;
+REVOKE ALL ON FUNCTION pg_freespace(regclass, int4) FROM PUBLIC;
+REVOKE ALL ON FUNCTION pg_freespace(regclass) FROM PUBLIC;
diff --git a/doc/src/sgml/acronyms.sgml b/doc/src/sgml/acronyms.sgml
index c7c5f865d95..82d70de730c 100644
--- a/doc/src/sgml/acronyms.sgml
+++ b/doc/src/sgml/acronyms.sgml
@@ -1,4 +1,4 @@
-<!-- $PostgreSQL: pgsql/doc/src/sgml/acronyms.sgml,v 1.5 2008/03/18 16:05:07 mha Exp $ -->
+<!-- $PostgreSQL: pgsql/doc/src/sgml/acronyms.sgml,v 1.6 2008/09/30 10:52:09 heikki Exp $ -->
 
 <appendix id="acronyms">
  <title>Acronyms</title>
@@ -216,7 +216,7 @@
     <term><acronym>FSM</acronym></term>
     <listitem>
      <para>
-      <link linkend="runtime-config-resource-fsm">Free Space Map</link>
+      <link linkend="storage-fsm">Free Space Map</link>
      </para>
     </listitem>
    </varlistentry>
diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml
index 9d33918a3ec..dfb976c4731 100644
--- a/doc/src/sgml/config.sgml
+++ b/doc/src/sgml/config.sgml
@@ -1,4 +1,4 @@
-<!-- $PostgreSQL: pgsql/doc/src/sgml/config.sgml,v 1.190 2008/08/25 19:03:37 tgl Exp $ -->
+<!-- $PostgreSQL: pgsql/doc/src/sgml/config.sgml,v 1.191 2008/09/30 10:52:09 heikki Exp $ -->
 
 <chapter Id="runtime-config">
   <title>Server Configuration</title>
@@ -896,80 +896,6 @@ SET ENABLE_SEQSCAN TO OFF;
      </varlistentry>
 
      </variablelist>
-     </sect2>
-     <sect2 id="runtime-config-resource-fsm">
-     <title>Free Space Map</title>
-
-     <indexterm>
-      <primary>free space map</primary>
-     </indexterm>
-
-     <para>
-      These parameters control the size of the shared <firstterm>free space
-      map</> (<acronym>FSM</>), which tracks the locations of unused space in the database.
-      An undersized free space map can cause the database to consume
-      increasing amounts of disk space over time, because free space that
-      is not in the map cannot be re-used; instead <productname>PostgreSQL</>
-      will request more disk space from the operating system when it needs
-      to store new data.
-      The last few lines displayed by a database-wide <command>VACUUM VERBOSE</>
-      command can help in determining if the current settings are adequate.
-      A <literal>NOTICE</> message is also printed during such an operation
-      if the current settings are too low.
-     </para>
-
-     <para>
-      Increasing these parameters might cause <productname>PostgreSQL</>
-      to request more <systemitem class="osname">System V</> shared
-      memory than your operating system's default configuration
-      allows. See <xref linkend="sysvipc"> for information on how to
-      adjust those parameters, if necessary.
-     </para>
-
-     <variablelist>
-     <varlistentry id="guc-max-fsm-pages" xreflabel="max_fsm_pages">
-      <term><varname>max_fsm_pages</varname> (<type>integer</type>)</term>
-      <indexterm>
-       <primary><varname>max_fsm_pages</> configuration parameter</primary>
-      </indexterm>
-      <listitem>
-       <para>
-        Sets the maximum number of disk pages for which free space will
-        be tracked in the shared free-space map.  Six bytes of shared memory
-        are consumed for each page slot.  This setting must be at least
-        16 * <varname>max_fsm_relations</varname>.  The default is chosen
-        by <application>initdb</> depending on the amount of available memory,
-        and can range from 20k to 200k pages.
-        This parameter can only be set at server start.
-       </para>
-      </listitem>
-     </varlistentry>
-
-     <varlistentry id="guc-max-fsm-relations" xreflabel="max_fsm_relations">
-      <term><varname>max_fsm_relations</varname> (<type>integer</type>)</term>
-      <indexterm>
-       <primary><varname>max_fsm_relations</> configuration parameter</primary>
-      </indexterm>
-      <listitem>
-       <para>
-        Sets the maximum number of relations (tables and indexes) for which
-        free space will be tracked in the shared free-space map.  Roughly
-        seventy bytes of shared memory are consumed for each slot.
-        The default is one thousand relations.
-        This parameter can only be set at server start.
-       </para>
-      </listitem>
-     </varlistentry>
-
-     </variablelist>
-
-     <note>
-      <para>
-       See the <xref linkend="sql-vacuum" endterm="sql-vacuum-title">
-       command for information on setting this parameter.
-      </para>
-     </note>
-
      </sect2>
 
      <sect2 id="runtime-config-resource-kernel">
diff --git a/doc/src/sgml/pageinspect.sgml b/doc/src/sgml/pageinspect.sgml
index e398733d013..94249399e10 100644
--- a/doc/src/sgml/pageinspect.sgml
+++ b/doc/src/sgml/pageinspect.sgml
@@ -1,4 +1,4 @@
-<!-- $PostgreSQL: pgsql/doc/src/sgml/pageinspect.sgml,v 1.3 2007/12/10 05:32:51 tgl Exp $ -->
+<!-- $PostgreSQL: pgsql/doc/src/sgml/pageinspect.sgml,v 1.4 2008/09/30 10:52:09 heikki Exp $ -->
 
 <sect1 id="pageinspect">
  <title>pageinspect</title>
@@ -19,7 +19,7 @@
   <variablelist>
    <varlistentry>
     <term>
-     <function>get_raw_page(text, int) returns bytea</function>
+     <function>get_raw_page(relname text, forknum int, blkno int) returns bytea</function>
     </term>
 
     <listitem>
@@ -27,13 +27,28 @@
       <function>get_raw_page</function> reads the specified block of the named
       table and returns a copy as a <type>bytea</> value.  This allows a
       single time-consistent copy of the block to be obtained.
+      <literal>forknum</literal> should be 0 for the main data fork, or 1 for
+      the FSM.
      </para>
     </listitem>
    </varlistentry>
 
    <varlistentry>
     <term>
-     <function>page_header(bytea) returns record</function>
+     <function>get_raw_page(relname text, blkno int) returns bytea</function>
+    </term>
+
+    <listitem>
+     <para>
+      A shorthand for the above, for reading from the main fork. Equivalent to
+      <literal>get_raw_page(relname, 0, blkno)</literal>.
+     </para>
+    </listitem>
+   </varlistentry>
+
+   <varlistentry>
+    <term>
+     <function>page_header(page bytea) returns record</function>
     </term>
 
     <listitem>
@@ -63,7 +78,7 @@ test=# SELECT * FROM page_header(get_raw_page('pg_class', 0));
 
    <varlistentry>
     <term>
-     <function>heap_page_items(bytea) returns setof record</function>
+     <function>heap_page_items(page bytea) returns setof record</function>
     </term>
 
     <listitem>
@@ -90,7 +105,7 @@ test=# SELECT * FROM heap_page_items(get_raw_page('pg_class', 0));
 
    <varlistentry>
     <term>
-     <function>bt_metap(text) returns record</function>
+     <function>bt_metap(relname text) returns record</function>
     </term>
 
     <listitem>
@@ -113,7 +128,7 @@ fastlevel | 0
 
    <varlistentry>
     <term>
-     <function>bt_page_stats(text, int) returns record</function>
+     <function>bt_page_stats(relname text, blkno int) returns record</function>
     </term>
 
     <listitem>
@@ -141,7 +156,7 @@ btpo_flags    | 3
 
    <varlistentry>
     <term>
-     <function>bt_page_items(text, int) returns setof record</function>
+     <function>bt_page_items(relname text, blkno int) returns setof record</function>
     </term>
 
     <listitem>
@@ -164,6 +179,26 @@ test=# SELECT * FROM bt_page_items('pg_cast_oid_index', 1);
      </programlisting>
     </listitem>
    </varlistentry>
+
+   <varlistentry>
+    <term>
+     <function>fsm_page_contents(page bytea) returns text</function>
+    </term>
+
+    <listitem>
+     <para>
+      <function>fsm_page_contents</function> shows the internal node structure
+      of an FSM page. The output is a multi-line string, with one line per
+      node in the binary tree within the page. Only those nodes that are not
+      zero are printed. The so-called "next" pointer, which points to the
+      next slot to be returned from the page, is also printed.
+     </para>
+     <para>
+      See <filename>src/backend/storage/freespace/README</> for more
+      information on the structure of an FSM page.
+     </para>
+    </listitem>
+   </varlistentry>
   </variablelist>
  </sect2>
 
diff --git a/doc/src/sgml/pgfreespacemap.sgml b/doc/src/sgml/pgfreespacemap.sgml
index bc821ead6ba..3d749a953d3 100644
--- a/doc/src/sgml/pgfreespacemap.sgml
+++ b/doc/src/sgml/pgfreespacemap.sgml
@@ -1,4 +1,4 @@
-<!-- $PostgreSQL: pgsql/doc/src/sgml/pgfreespacemap.sgml,v 2.2 2007/12/10 05:32:51 tgl Exp $ -->
+<!-- $PostgreSQL: pgsql/doc/src/sgml/pgfreespacemap.sgml,v 2.3 2008/09/30 10:52:09 heikki Exp $ -->
 
 <sect1 id="pgfreespacemap">
  <title>pg_freespacemap</title>
@@ -9,183 +9,66 @@
 
  <para>
   The <filename>pg_freespacemap</> module provides a means for examining the
-  free space map (FSM). It provides two C functions:
-  <function>pg_freespacemap_relations</function> and
-  <function>pg_freespacemap_pages</function> that each return a set of
-  records, plus two views <structname>pg_freespacemap_relations</structname>
-  and <structname>pg_freespacemap_pages</structname> that wrap the functions
-  for convenient use.
+  free space map (FSM). It provides a function called
+  <function>pg_freespace</function>, or two overloaded functions, to be
+  precise. The functions show the value recorded in the free space map for
+  a given page, or for all pages in the relation.
  </para>
 
  <para>
-  By default public access is revoked from the functions and views, just in
-  case there are security issues lurking.
+  By default public access is revoked from the functions, just in case
+  there are security issues lurking.
  </para>
 
  <sect2>
-  <title>The <filename>pg_freespacemap</> views</title>
+  <title>Functions</title>
+
+  <variablelist>
+   <varlistentry>
+    <term>
+     <function>pg_freespace(rel regclass IN, blkno int4 IN) returns int2</function>
+    </term>
+
+    <listitem>
+     <para>
+      Returns the amount of free space on the page of the relation
+      specified by <literal>blkno</>, according to the
+      <acronym>FSM</acronym>.
+     </para>
+    </listitem>
+   </varlistentry>
+
+
+   <varlistentry>
+    <term>
+     <function>pg_freespace(rel regclass IN, blkno OUT int4, avail OUT int2)</function>
+    </term>
+
+    <listitem>
+     <para>
+      Displays the amount of free space on each page of the relation,
+      according to the FSM. A set of <literal>(blkno int4, avail int2)</>
+      tuples is returned, one tuple for each page in the relation.
+     </para>
+    </listitem>
+   </varlistentry>
+  </variablelist>
 
   <para>
-   The definitions of the columns exposed by the views are:
-  </para>
-
-  <table>
-   <title><structname>pg_freespacemap_relations</> Columns</title>
-
-   <tgroup cols="4">
-    <thead>
-     <row>
-      <entry>Name</entry>
-      <entry>Type</entry>
-      <entry>References</entry>
-      <entry>Description</entry>
-     </row>
-    </thead>
-    <tbody>
-
-     <row>
-      <entry><structfield>reltablespace</structfield></entry>
-      <entry><type>oid</type></entry>
-      <entry><literal>pg_tablespace.oid</literal></entry>
-      <entry>Tablespace OID of the relation</entry>
-     </row>
-     <row>
-      <entry><structfield>reldatabase</structfield></entry>
-      <entry><type>oid</type></entry>
-      <entry><literal>pg_database.oid</literal></entry>
-      <entry>Database OID of the relation</entry>
-     </row>
-     <row>
-      <entry><structfield>relfilenode</structfield></entry>
-      <entry><type>oid</type></entry>
-      <entry><literal>pg_class.relfilenode</literal></entry>
-      <entry>Relfilenode of the relation</entry>
-     </row>
-     <row>
-      <entry><structfield>avgrequest</structfield></entry>
-      <entry><type>integer</type></entry>
-      <entry></entry>
-      <entry>Moving average of free space requests (NULL for indexes)</entry>
-     </row>
-     <row>
-      <entry><structfield>interestingpages</structfield></entry>
-      <entry><type>integer</type></entry>
-      <entry></entry>
-      <entry>Count of pages last reported as containing useful free space</entry>
-     </row>
-     <row>
-      <entry><structfield>storedpages</structfield></entry>
-      <entry><type>integer</type></entry>
-      <entry></entry>
-      <entry>Count of pages actually stored in free space map</entry>
-     </row>
-     <row>
-      <entry><structfield>nextpage</structfield></entry>
-      <entry><type>integer</type></entry>
-      <entry></entry>
-      <entry>Page index (from 0) to start next search at</entry>
-     </row>
-
-    </tbody>
-   </tgroup>
-  </table>
-
-  <table>
-   <title><structname>pg_freespacemap_pages</> Columns</title>
-
-   <tgroup cols="4">
-    <thead>
-     <row>
-      <entry>Name</entry>
-      <entry>Type</entry>
-      <entry>References</entry>
-      <entry>Description</entry>
-     </row>
-    </thead>
-    <tbody>
-
-     <row>
-      <entry><structfield>reltablespace</structfield></entry>
-      <entry><type>oid</type></entry>
-      <entry><literal>pg_tablespace.oid</literal></entry>
-      <entry>Tablespace OID of the relation</entry>
-     </row>
-     <row>
-      <entry><structfield>reldatabase</structfield></entry>
-      <entry><type>oid</type></entry>
-      <entry><literal>pg_database.oid</literal></entry>
-      <entry>Database OID of the relation</entry>
-     </row>
-     <row>
-      <entry><structfield>relfilenode</structfield></entry>
-      <entry><type>oid</type></entry>
-      <entry><literal>pg_class.relfilenode</literal></entry>
-      <entry>Relfilenode of the relation</entry>
-     </row>
-     <row>
-      <entry><structfield>relblocknumber</structfield></entry>
-      <entry><type>bigint</type></entry>
-      <entry></entry>
-      <entry>Page number within the relation</entry>
-     </row>
-     <row>
-      <entry><structfield>bytes</structfield></entry>
-      <entry><type>integer</type></entry>
-      <entry></entry>
-      <entry>Free bytes in the page, or NULL for an index page (see below)</entry>
-     </row>
-
-    </tbody>
-   </tgroup>
-  </table>
-
-  <para>
-   For <structname>pg_freespacemap_relations</structname>, there is one row
-   for each relation in the free space map.
-   <structfield>storedpages</structfield> is the number of pages actually
-   stored in the map, while <structfield>interestingpages</structfield> is the
-   number of pages the last <command>VACUUM</> thought had useful amounts of
-   free space.
-  </para>
-
-  <para>
-   If <structfield>storedpages</structfield> is consistently less than
-   <structfield>interestingpages</> then it'd be a good idea to increase
-   <varname>max_fsm_pages</varname>.  Also, if the number of rows in
-   <structname>pg_freespacemap_relations</structname> is close to
-   <varname>max_fsm_relations</varname>, then you should consider increasing
-   <varname>max_fsm_relations</varname>.
-  </para>
-
-  <para>
-   For <structname>pg_freespacemap_pages</structname>, there is one row for
-   each page in the free space map. The number of rows for a relation will
-   match the <structfield>storedpages</structfield> column in
-   <structname>pg_freespacemap_relations</structname>.
+   The values stored in the free space map are not exact. They're rounded
+   to a precision of 1/256th of BLCKSZ (32 bytes with default BLCKSZ), and
+   they're not kept fully up-to-date as tuples are inserted and updated.
   </para>
 
   <para>
    For indexes, what is tracked is entirely-unused pages, rather than free
-   space within pages.  Therefore, the average request size and free bytes
-   within a page are not meaningful, and are shown as NULL.
+   space within pages.  Therefore, the values are only meaningful as an
+   indication of whether a page is full or empty.
   </para>
 
   <para>
-   Because the map is shared by all the databases, there will normally be
-   entries for relations not belonging to the current database.  This means
-   that there may not be matching join rows in <structname>pg_class</> for
-   some rows, or that there could even be incorrect joins.  If you are
-   trying to join against <structname>pg_class</>, it's a good idea to
-   restrict the join to rows having <structfield>reldatabase</> equal to
-   the current database's OID or zero.
-  </para>
-
-  <para>
-   When either of the views is accessed, internal free space map locks are
-   taken for long enough to copy all the state data that the view will display.
-   This ensures that the views produce a consistent set of results, while not
-   blocking normal activity longer than necessary.  Nonetheless there
-   could be some impact on database performance if they are read often.
+   NOTE: The interface was changed in version 8.4, to reflect the new FSM
+   implementation introduced in the same version.
   </para>
  </sect2>
 
@@ -193,45 +76,37 @@
   <title>Sample output</title>
 
   <programlisting>
-regression=# SELECT c.relname, r.avgrequest, r.interestingpages, r.storedpages
-             FROM pg_freespacemap_relations r INNER JOIN pg_class c
-             ON r.relfilenode = c.relfilenode AND
-                r.reldatabase IN (0, (SELECT oid FROM pg_database
-                                      WHERE datname = current_database()))
-             ORDER BY r.storedpages DESC LIMIT 10;
-             relname             | avgrequest | interestingpages | storedpages
----------------------------------+------------+------------------+-------------
- onek                            |        256 |              109 |         109
- pg_attribute                    |        167 |               93 |          93
- pg_class                        |        191 |               49 |          49
- pg_attribute_relid_attnam_index |            |               48 |          48
- onek2                           |        256 |               37 |          37
- pg_depend                       |         95 |               26 |          26
- pg_type                         |        199 |               16 |          16
- pg_rewrite                      |       1011 |               13 |          13
- pg_class_relname_nsp_index      |            |               10 |          10
- pg_proc                         |        302 |                8 |           8
-(10 rows)
-
-regression=# SELECT c.relname, p.relblocknumber, p.bytes
-             FROM pg_freespacemap_pages p INNER JOIN pg_class c
-             ON p.relfilenode = c.relfilenode AND
-                p.reldatabase IN (0, (SELECT oid FROM pg_database
-                                      WHERE datname = current_database()))
-             ORDER BY c.relname LIMIT 10;
-   relname    | relblocknumber | bytes
---------------+----------------+-------
- a_star       |              0 |  8040
- abstime_tbl  |              0 |  7908
- aggtest      |              0 |  8008
- altinhoid    |              0 |  8128
- altstartwith |              0 |  8128
- arrtest      |              0 |  7172
- b_star       |              0 |  7976
- box_tbl      |              0 |  7912
- bt_f8_heap   |             54 |  7728
- bt_i4_heap   |             49 |  8008
-(10 rows)
+postgres=# SELECT * FROM pg_freespace('foo');
+ blkno | avail 
+-------+-------
+     0 |     0
+     1 |     0
+     2 |     0
+     3 |    32
+     4 |   704
+     5 |   704
+     6 |   704
+     7 |  1216
+     8 |   704
+     9 |   704
+    10 |   704
+    11 |   704
+    12 |   704
+    13 |   704
+    14 |   704
+    15 |   704
+    16 |   704
+    17 |   704
+    18 |   704
+    19 |  3648
+(20 rows)
+
+postgres=# SELECT * FROM pg_freespace('foo', 7);
+ pg_freespace 
+--------------
+         1216
+(1 row)
+
   </programlisting>
  </sect2>
 
@@ -239,7 +114,9 @@ regression=# SELECT c.relname, p.relblocknumber, p.bytes
   <title>Author</title>
 
   <para>
-   Mark Kirkwood <email>markir@paradise.net.nz</email>
+   Original version by Mark Kirkwood <email>markir@paradise.net.nz</email>.
+   Rewritten in version 8.4 to suit new FSM implementation by Heikki
+   Linnakangas <email>heikki@enterprisedb.com</email>
   </para>
  </sect2>
 
diff --git a/doc/src/sgml/ref/vacuum.sgml b/doc/src/sgml/ref/vacuum.sgml
index 082473c069b..0568fd4eeb8 100644
--- a/doc/src/sgml/ref/vacuum.sgml
+++ b/doc/src/sgml/ref/vacuum.sgml
@@ -1,5 +1,5 @@
 <!--
-$PostgreSQL: pgsql/doc/src/sgml/ref/vacuum.sgml,v 1.51 2008/02/03 16:24:08 tgl Exp $
+$PostgreSQL: pgsql/doc/src/sgml/ref/vacuum.sgml,v 1.52 2008/09/30 10:52:10 heikki Exp $
 PostgreSQL documentation
 -->
 
@@ -96,11 +96,7 @@ VACUUM [ FULL ] [ FREEZE ] [ VERBOSE ] ANALYZE [ <replaceable class="PARAMETER">
     <term><literal>VERBOSE</literal></term>
     <listitem>
      <para>
-      Prints a detailed vacuum activity report for each table. Can be used
-      to help determine appropriate settings for
-      <xref linkend="guc-max-fsm-pages">,
-      <xref linkend="guc-max-fsm-relations">, and
-      <xref linkend="guc-default-statistics-target">.
+      Prints a detailed vacuum activity report for each table.
      </para>
     </listitem>
    </varlistentry>
diff --git a/doc/src/sgml/release.sgml b/doc/src/sgml/release.sgml
index 6f3daac2b8c..eea942d1a2e 100644
--- a/doc/src/sgml/release.sgml
+++ b/doc/src/sgml/release.sgml
@@ -1,4 +1,4 @@
-<!-- $PostgreSQL: pgsql/doc/src/sgml/release.sgml,v 1.585 2008/09/17 20:57:35 tgl Exp $ -->
+<!-- $PostgreSQL: pgsql/doc/src/sgml/release.sgml,v 1.586 2008/09/30 10:52:09 heikki Exp $ -->
 <!--
 
 Typical markup:
@@ -6004,8 +6004,7 @@ current_date &lt; 2017-11-17
       <para>
        Increase default values for <link
        linkend="guc-shared-buffers"><varname>shared_buffers</></link>
-       and <link
-       linkend="guc-max-fsm-pages"><varname>max_fsm_pages</></link>
+       and <varname>max_fsm_pages</>
        (Andrew)
       </para>
      </listitem>
diff --git a/doc/src/sgml/runtime.sgml b/doc/src/sgml/runtime.sgml
index adde49e1a39..6884e66d7ad 100644
--- a/doc/src/sgml/runtime.sgml
+++ b/doc/src/sgml/runtime.sgml
@@ -1,4 +1,4 @@
-<!-- $PostgreSQL: pgsql/doc/src/sgml/runtime.sgml,v 1.417 2008/09/23 09:20:34 heikki Exp $ -->
+<!-- $PostgreSQL: pgsql/doc/src/sgml/runtime.sgml,v 1.418 2008/09/30 10:52:10 heikki Exp $ -->
 
 <chapter Id="runtime">
  <title>Operating System Environment</title>
@@ -1117,16 +1117,6 @@ set semsys:seminfo_semmsl=32
        <entry>8200 (assuming 8 kB <symbol>XLOG_BLCKSZ</>)</entry>
       </row>
 
-      <row>
-       <entry><xref linkend="guc-max-fsm-relations"></>
-       <entry>70</>
-      </row>
-
-      <row>
-       <entry><xref linkend="guc-max-fsm-pages"></>
-       <entry>6</>
-      </row>
-
       <row>
        <entry>Fixed space requirements</>
        <entry>770 kB</entry>
diff --git a/doc/src/sgml/storage.sgml b/doc/src/sgml/storage.sgml
index e564fd2be9a..51f8a2fe165 100644
--- a/doc/src/sgml/storage.sgml
+++ b/doc/src/sgml/storage.sgml
@@ -1,4 +1,4 @@
-<!-- $PostgreSQL: pgsql/doc/src/sgml/storage.sgml,v 1.24 2008/08/05 12:09:30 mha Exp $ -->
+<!-- $PostgreSQL: pgsql/doc/src/sgml/storage.sgml,v 1.25 2008/09/30 10:52:10 heikki Exp $ -->
 
 <chapter id="storage">
 
@@ -130,7 +130,12 @@ there.
 <para>
 Each table and index is stored in a separate file, named after the table
 or index's <firstterm>filenode</> number, which can be found in
-<structname>pg_class</>.<structfield>relfilenode</>.
+<structname>pg_class</>.<structfield>relfilenode</>. In addition to the
+main file (a.k.a. the main fork), a <firstterm>free space map</> (see
+<xref linkend="storage-fsm">) that stores information about free space
+available in the relation, is stored in a file named after the filenode
+number, with the _1 suffix. For example, if the table's filenode number
+is 12345, the FSM file is named <filename>12345_1</>.
 </para>
 
 <caution>
@@ -367,6 +372,48 @@ comparison table, in which all the HTML pages were cut down to 7 kB to fit.
 
 </sect1>
 
+<sect1 id="storage-fsm">
+
+<title>Free Space Map</title>
+
+    <indexterm>
+     <primary>Free Space Map</primary>
+    </indexterm>
+    <indexterm><primary>FSM</><see>Free Space Map</></indexterm>
+
+<para>
+A Free Space Map is stored with every heap and index relation, except for
+hash indexes, to keep track of available space in the relation. It's stored
+alongside the main relation data, in a separate FSM relation fork, named after
+the relfilenode of the relation, but with a <literal>_1</> suffix. For example,
+if the relfilenode of a relation is 12345, the FSM is stored in a file called
+<filename>12345_1</>, in the same directory as the main relation file.
+</para>
+
+<para>
+The Free Space Map is organized as a tree of <acronym>FSM</> pages. The
+bottom-level <acronym>FSM</> pages store the free space available on every
+heap (or index) page, using one byte to represent each heap page. The upper
+levels aggregate information from the lower levels.
+</para>
+
+<para>
+Within each <acronym>FSM</> page is a binary tree, stored in an array with
+one byte per node. Each leaf node represents a heap page, or a lower level
+<acronym>FSM</> page. In each non-leaf node, the higher of its children's
+values is stored. The maximum value in the leaf nodes is therefore stored
+at the root.
+</para>
+
+<para>
+See <filename>src/backend/storage/freespace/README</> for more details on
+how the <acronym>FSM</> is structured, and how it's updated and searched.
+The <xref linkend="pgfreespacemap"> contrib module can be used to view the
+information stored in free space maps.
+</para>
+
+</sect1>
+
 <sect1 id="storage-page-layout">
 
 <title>Database Page Layout</title>
diff --git a/src/backend/access/gin/gininsert.c b/src/backend/access/gin/gininsert.c
index ac35069d7f5..64099cd1e50 100644
--- a/src/backend/access/gin/gininsert.c
+++ b/src/backend/access/gin/gininsert.c
@@ -8,7 +8,7 @@
  * Portions Copyright (c) 1994, Regents of the University of California
  *
  * IDENTIFICATION
- *			$PostgreSQL: pgsql/src/backend/access/gin/gininsert.c,v 1.14 2008/07/11 21:06:29 tgl Exp $
+ *			$PostgreSQL: pgsql/src/backend/access/gin/gininsert.c,v 1.15 2008/09/30 10:52:10 heikki Exp $
  *-------------------------------------------------------------------------
  */
 
@@ -19,6 +19,7 @@
 #include "catalog/index.h"
 #include "miscadmin.h"
 #include "storage/bufmgr.h"
+#include "storage/indexfsm.h"
 #include "utils/memutils.h"
 
 
@@ -283,6 +284,9 @@ ginbuild(PG_FUNCTION_ARGS)
 		elog(ERROR, "index \"%s\" already contains data",
 			 RelationGetRelationName(index));
 
+	/* Initialize FSM */
+	InitIndexFreeSpaceMap(index);
+
 	initGinState(&buildstate.ginstate, index);
 
 	/* initialize the root page */
diff --git a/src/backend/access/gin/ginutil.c b/src/backend/access/gin/ginutil.c
index 86b2650c753..587add92e97 100644
--- a/src/backend/access/gin/ginutil.c
+++ b/src/backend/access/gin/ginutil.c
@@ -8,7 +8,7 @@
  * Portions Copyright (c) 1994, Regents of the University of California
  *
  * IDENTIFICATION
- *			$PostgreSQL: pgsql/src/backend/access/gin/ginutil.c,v 1.16 2008/07/11 21:06:29 tgl Exp $
+ *			$PostgreSQL: pgsql/src/backend/access/gin/ginutil.c,v 1.17 2008/09/30 10:52:10 heikki Exp $
  *-------------------------------------------------------------------------
  */
 
@@ -19,6 +19,7 @@
 #include "catalog/pg_type.h" 
 #include "storage/bufmgr.h"
 #include "storage/freespace.h"
+#include "storage/indexfsm.h"
 #include "storage/lmgr.h"
 
 void
@@ -151,7 +152,7 @@ GinNewBuffer(Relation index)
 	/* First, try to get a page from FSM */
 	for (;;)
 	{
-		BlockNumber blkno = GetFreeIndexPage(&index->rd_node);
+		BlockNumber blkno = GetFreeIndexPage(index);
 
 		if (blkno == InvalidBlockNumber)
 			break;
diff --git a/src/backend/access/gin/ginvacuum.c b/src/backend/access/gin/ginvacuum.c
index 249f612dd10..c3e6f4e6f2d 100644
--- a/src/backend/access/gin/ginvacuum.c
+++ b/src/backend/access/gin/ginvacuum.c
@@ -8,7 +8,7 @@
  * Portions Copyright (c) 1994, Regents of the University of California
  *
  * IDENTIFICATION
- *			$PostgreSQL: pgsql/src/backend/access/gin/ginvacuum.c,v 1.21 2008/07/11 21:06:29 tgl Exp $
+ *			$PostgreSQL: pgsql/src/backend/access/gin/ginvacuum.c,v 1.22 2008/09/30 10:52:10 heikki Exp $
  *-------------------------------------------------------------------------
  */
 
@@ -20,6 +20,7 @@
 #include "miscadmin.h"
 #include "storage/bufmgr.h"
 #include "storage/freespace.h"
+#include "storage/indexfsm.h"
 #include "storage/lmgr.h"
 
 typedef struct
@@ -678,10 +679,7 @@ ginvacuumcleanup(PG_FUNCTION_ARGS)
 	bool		needLock;
 	BlockNumber npages,
 				blkno;
-	BlockNumber totFreePages,
-				nFreePages,
-			   *freePages,
-				maxFreePages;
+	BlockNumber totFreePages;
 	BlockNumber lastBlock = GIN_ROOT_BLKNO,
 				lastFilledBlock = GIN_ROOT_BLKNO;
 
@@ -711,12 +709,7 @@ ginvacuumcleanup(PG_FUNCTION_ARGS)
 	if (needLock)
 		UnlockRelationForExtension(index, ExclusiveLock);
 
-	maxFreePages = npages;
-	if (maxFreePages > MaxFSMPages)
-		maxFreePages = MaxFSMPages;
-
-	totFreePages = nFreePages = 0;
-	freePages = (BlockNumber *) palloc(sizeof(BlockNumber) * maxFreePages);
+	totFreePages =  0;
 
 	for (blkno = GIN_ROOT_BLKNO + 1; blkno < npages; blkno++)
 	{
@@ -731,8 +724,7 @@ ginvacuumcleanup(PG_FUNCTION_ARGS)
 
 		if (GinPageIsDeleted(page))
 		{
-			if (nFreePages < maxFreePages)
-				freePages[nFreePages++] = blkno;
+			RecordFreeIndexPage(index, blkno);
 			totFreePages++;
 		}
 		else
@@ -742,25 +734,16 @@ ginvacuumcleanup(PG_FUNCTION_ARGS)
 	}
 	lastBlock = npages - 1;
 
-	if (info->vacuum_full && nFreePages > 0)
+	if (info->vacuum_full && lastBlock > lastFilledBlock)
 	{
 		/* try to truncate index */
-		int			i;
-
-		for (i = 0; i < nFreePages; i++)
-			if (freePages[i] >= lastFilledBlock)
-			{
-				totFreePages = nFreePages = i;
-				break;
-			}
-
-		if (lastBlock > lastFilledBlock)
-			RelationTruncate(index, lastFilledBlock + 1);
+		FreeSpaceMapTruncateRel(index, lastFilledBlock + 1);
+		RelationTruncate(index, lastFilledBlock + 1);
 
 		stats->pages_removed = lastBlock - lastFilledBlock;
+		totFreePages = totFreePages - stats->pages_removed;
 	}
 
-	RecordIndexFreeSpace(&index->rd_node, totFreePages, nFreePages, freePages);
 	stats->pages_free = totFreePages;
 
 	if (needLock)
diff --git a/src/backend/access/gist/gist.c b/src/backend/access/gist/gist.c
index 7dd981a490b..2f75c3fa2a3 100644
--- a/src/backend/access/gist/gist.c
+++ b/src/backend/access/gist/gist.c
@@ -8,7 +8,7 @@
  * Portions Copyright (c) 1994, Regents of the University of California
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/access/gist/gist.c,v 1.151 2008/06/12 09:12:29 heikki Exp $
+ *	  $PostgreSQL: pgsql/src/backend/access/gist/gist.c,v 1.152 2008/09/30 10:52:10 heikki Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -19,6 +19,7 @@
 #include "catalog/index.h"
 #include "miscadmin.h"
 #include "storage/bufmgr.h"
+#include "storage/indexfsm.h"
 #include "utils/memutils.h"
 
 const XLogRecPtr XLogRecPtrForTemp = {1, 1};
@@ -102,6 +103,9 @@ gistbuild(PG_FUNCTION_ARGS)
 		elog(ERROR, "index \"%s\" already contains data",
 			 RelationGetRelationName(index));
 
+	/* Initialize FSM */
+	InitIndexFreeSpaceMap(index);
+
 	/* no locking is needed */
 	initGISTstate(&buildstate.giststate, index);
 
diff --git a/src/backend/access/gist/gistutil.c b/src/backend/access/gist/gistutil.c
index 248ec259567..22f50c19ac4 100644
--- a/src/backend/access/gist/gistutil.c
+++ b/src/backend/access/gist/gistutil.c
@@ -8,7 +8,7 @@
  * Portions Copyright (c) 1994, Regents of the University of California
  *
  * IDENTIFICATION
- *			$PostgreSQL: pgsql/src/backend/access/gist/gistutil.c,v 1.30 2008/07/13 20:45:46 tgl Exp $
+ *			$PostgreSQL: pgsql/src/backend/access/gist/gistutil.c,v 1.31 2008/09/30 10:52:10 heikki Exp $
  *-------------------------------------------------------------------------
  */
 #include "postgres.h"
@@ -16,6 +16,7 @@
 #include "access/gist_private.h"
 #include "access/reloptions.h"
 #include "storage/freespace.h"
+#include "storage/indexfsm.h"
 #include "storage/lmgr.h"
 #include "storage/bufmgr.h"
 #include "utils/rel.h"
@@ -617,7 +618,7 @@ gistNewBuffer(Relation r)
 	/* First, try to get a page from FSM */
 	for (;;)
 	{
-		BlockNumber blkno = GetFreeIndexPage(&r->rd_node);
+		BlockNumber blkno = GetFreeIndexPage(r);
 
 		if (blkno == InvalidBlockNumber)
 			break;				/* nothing left in FSM */
diff --git a/src/backend/access/gist/gistvacuum.c b/src/backend/access/gist/gistvacuum.c
index d929962af2a..b545922ccc8 100644
--- a/src/backend/access/gist/gistvacuum.c
+++ b/src/backend/access/gist/gistvacuum.c
@@ -8,7 +8,7 @@
  * Portions Copyright (c) 1994, Regents of the University of California
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/access/gist/gistvacuum.c,v 1.36 2008/06/12 09:12:30 heikki Exp $
+ *	  $PostgreSQL: pgsql/src/backend/access/gist/gistvacuum.c,v 1.37 2008/09/30 10:52:10 heikki Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -20,6 +20,7 @@
 #include "miscadmin.h"
 #include "storage/bufmgr.h"
 #include "storage/freespace.h"
+#include "storage/indexfsm.h"
 #include "storage/lmgr.h"
 #include "utils/memutils.h"
 
@@ -518,10 +519,7 @@ gistvacuumcleanup(PG_FUNCTION_ARGS)
 	Relation	rel = info->index;
 	BlockNumber npages,
 				blkno;
-	BlockNumber totFreePages,
-				nFreePages,
-			   *freePages,
-				maxFreePages;
+	BlockNumber	totFreePages;
 	BlockNumber lastBlock = GIST_ROOT_BLKNO,
 				lastFilledBlock = GIST_ROOT_BLKNO;
 	bool		needLock;
@@ -589,13 +587,7 @@ gistvacuumcleanup(PG_FUNCTION_ARGS)
 	if (needLock)
 		UnlockRelationForExtension(rel, ExclusiveLock);
 
-	maxFreePages = npages;
-	if (maxFreePages > MaxFSMPages)
-		maxFreePages = MaxFSMPages;
-
-	totFreePages = nFreePages = 0;
-	freePages = (BlockNumber *) palloc(sizeof(BlockNumber) * maxFreePages);
-
+	totFreePages = 0;
 	for (blkno = GIST_ROOT_BLKNO + 1; blkno < npages; blkno++)
 	{
 		Buffer		buffer;
@@ -609,9 +601,8 @@ gistvacuumcleanup(PG_FUNCTION_ARGS)
 
 		if (PageIsNew(page) || GistPageIsDeleted(page))
 		{
-			if (nFreePages < maxFreePages)
-				freePages[nFreePages++] = blkno;
 			totFreePages++;
+			RecordFreeIndexPage(rel, blkno);
 		}
 		else
 			lastFilledBlock = blkno;
@@ -619,25 +610,15 @@ gistvacuumcleanup(PG_FUNCTION_ARGS)
 	}
 	lastBlock = npages - 1;
 
-	if (info->vacuum_full && nFreePages > 0)
+	if (info->vacuum_full && lastFilledBlock < lastBlock)
 	{							/* try to truncate index */
-		int			i;
+		FreeSpaceMapTruncateRel(rel, lastFilledBlock + 1);
+		RelationTruncate(rel, lastFilledBlock + 1);
 
-		for (i = 0; i < nFreePages; i++)
-			if (freePages[i] >= lastFilledBlock)
-			{
-				totFreePages = nFreePages = i;
-				break;
-			}
-
-		if (lastBlock > lastFilledBlock)
-			RelationTruncate(rel, lastFilledBlock + 1);
 		stats->std.pages_removed = lastBlock - lastFilledBlock;
+		totFreePages = totFreePages - stats->std.pages_removed;
 	}
 
-	RecordIndexFreeSpace(&rel->rd_node, totFreePages, nFreePages, freePages);
-	pfree(freePages);
-
 	/* return statistics */
 	stats->std.pages_free = totFreePages;
 	if (needLock)
diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c
index eb9f8701ae1..0fd61fe9cec 100644
--- a/src/backend/access/heap/heapam.c
+++ b/src/backend/access/heap/heapam.c
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/access/heap/heapam.c,v 1.263 2008/09/11 14:01:09 alvherre Exp $
+ *	  $PostgreSQL: pgsql/src/backend/access/heap/heapam.c,v 1.264 2008/09/30 10:52:10 heikki Exp $
  *
  *
  * INTERFACE ROUTINES
@@ -4721,6 +4721,9 @@ heap_sync(Relation rel)
 	/* FlushRelationBuffers will have opened rd_smgr */
 	smgrimmedsync(rel->rd_smgr, MAIN_FORKNUM);
 
+	/* sync FSM as well */
+	smgrimmedsync(rel->rd_smgr, FSM_FORKNUM);
+
 	/* toast heap, if any */
 	if (OidIsValid(rel->rd_rel->reltoastrelid))
 	{
@@ -4729,6 +4732,7 @@ heap_sync(Relation rel)
 		toastrel = heap_open(rel->rd_rel->reltoastrelid, AccessShareLock);
 		FlushRelationBuffers(toastrel);
 		smgrimmedsync(toastrel->rd_smgr, MAIN_FORKNUM);
+		smgrimmedsync(toastrel->rd_smgr, FSM_FORKNUM);
 		heap_close(toastrel, AccessShareLock);
 	}
 }
diff --git a/src/backend/access/heap/hio.c b/src/backend/access/heap/hio.c
index 6db80590de6..3723977fe09 100644
--- a/src/backend/access/heap/hio.c
+++ b/src/backend/access/heap/hio.c
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/access/heap/hio.c,v 1.72 2008/07/13 20:45:47 tgl Exp $
+ *	  $PostgreSQL: pgsql/src/backend/access/heap/hio.c,v 1.73 2008/09/30 10:52:10 heikki Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -163,8 +163,7 @@ RelationGetBufferForTuple(Relation relation, Size len,
 		 * We have no cached target page, so ask the FSM for an initial
 		 * target.
 		 */
-		targetBlock = GetPageWithFreeSpace(&relation->rd_node,
-										   len + saveFreeSpace);
+		targetBlock = GetPageWithFreeSpace(relation, len + saveFreeSpace);
 
 		/*
 		 * If the FSM knows nothing of the rel, try the last page before we
@@ -250,7 +249,7 @@ RelationGetBufferForTuple(Relation relation, Size len,
 		 * Update FSM as to condition of this page, and ask for another page
 		 * to try.
 		 */
-		targetBlock = RecordAndGetPageWithFreeSpace(&relation->rd_node,
+		targetBlock = RecordAndGetPageWithFreeSpace(relation,
 													targetBlock,
 													pageFreeSpace,
 													len + saveFreeSpace);
diff --git a/src/backend/access/nbtree/nbtpage.c b/src/backend/access/nbtree/nbtpage.c
index 2cc5ebe8443..8ac9f538fc7 100644
--- a/src/backend/access/nbtree/nbtpage.c
+++ b/src/backend/access/nbtree/nbtpage.c
@@ -9,7 +9,7 @@
  *
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/access/nbtree/nbtpage.c,v 1.110 2008/07/13 20:45:47 tgl Exp $
+ *	  $PostgreSQL: pgsql/src/backend/access/nbtree/nbtpage.c,v 1.111 2008/09/30 10:52:10 heikki Exp $
  *
  *	NOTES
  *	   Postgres btree pages look like ordinary relation pages.	The opaque
@@ -27,6 +27,7 @@
 #include "miscadmin.h"
 #include "storage/bufmgr.h"
 #include "storage/freespace.h"
+#include "storage/indexfsm.h"
 #include "storage/lmgr.h"
 #include "utils/inval.h"
 #include "utils/snapmgr.h"
@@ -501,7 +502,7 @@ _bt_getbuf(Relation rel, BlockNumber blkno, int access)
 		 */
 		for (;;)
 		{
-			blkno = GetFreeIndexPage(&rel->rd_node);
+			blkno = GetFreeIndexPage(rel);
 			if (blkno == InvalidBlockNumber)
 				break;
 			buf = ReadBuffer(rel, blkno);
diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c
index 64a719f8279..abb6bd5c5d4 100644
--- a/src/backend/access/nbtree/nbtree.c
+++ b/src/backend/access/nbtree/nbtree.c
@@ -12,7 +12,7 @@
  * Portions Copyright (c) 1994, Regents of the University of California
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/access/nbtree/nbtree.c,v 1.161 2008/06/19 00:46:03 alvherre Exp $
+ *	  $PostgreSQL: pgsql/src/backend/access/nbtree/nbtree.c,v 1.162 2008/09/30 10:52:10 heikki Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -26,6 +26,7 @@
 #include "miscadmin.h"
 #include "storage/bufmgr.h"
 #include "storage/freespace.h"
+#include "storage/indexfsm.h"
 #include "storage/ipc.h"
 #include "storage/lmgr.h"
 #include "utils/memutils.h"
@@ -56,9 +57,7 @@ typedef struct
 	IndexBulkDeleteCallback callback;
 	void	   *callback_state;
 	BTCycleId	cycleid;
-	BlockNumber *freePages;
-	int			nFreePages;		/* number of entries in freePages[] */
-	int			maxFreePages;	/* allocated size of freePages[] */
+	BlockNumber lastUsedPage;
 	BlockNumber totFreePages;	/* true total # of free pages */
 	MemoryContext pagedelcontext;
 } BTVacState;
@@ -110,6 +109,9 @@ btbuild(PG_FUNCTION_ARGS)
 		elog(ERROR, "index \"%s\" already contains data",
 			 RelationGetRelationName(index));
 
+	/* Initialize FSM */
+	InitIndexFreeSpaceMap(index);
+
 	buildstate.spool = _bt_spoolinit(index, indexInfo->ii_Unique, false);
 
 	/*
@@ -623,9 +625,7 @@ btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
 	vstate.callback = callback;
 	vstate.callback_state = callback_state;
 	vstate.cycleid = cycleid;
-	vstate.freePages = NULL;	/* temporarily */
-	vstate.nFreePages = 0;
-	vstate.maxFreePages = 0;
+	vstate.lastUsedPage = BTREE_METAPAGE;
 	vstate.totFreePages = 0;
 
 	/* Create a temporary memory context to run _bt_pagedel in */
@@ -670,17 +670,6 @@ btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
 		if (needLock)
 			UnlockRelationForExtension(rel, ExclusiveLock);
 
-		/* Allocate freePages after we read num_pages the first time */
-		if (vstate.freePages == NULL)
-		{
-			/* No point in remembering more than MaxFSMPages pages */
-			vstate.maxFreePages = MaxFSMPages;
-			if ((BlockNumber) vstate.maxFreePages > num_pages)
-				vstate.maxFreePages = (int) num_pages;
-			vstate.freePages = (BlockNumber *)
-				palloc(vstate.maxFreePages * sizeof(BlockNumber));
-		}
-
 		/* Quit if we've scanned the whole relation */
 		if (blkno >= num_pages)
 			break;
@@ -697,42 +686,22 @@ btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
 	 * acquiring exclusive lock on the index and then rechecking all the
 	 * pages; doesn't seem worth it.
 	 */
-	if (info->vacuum_full && vstate.nFreePages > 0)
+	if (info->vacuum_full && vstate.lastUsedPage < num_pages - 1)
 	{
-		BlockNumber new_pages = num_pages;
-
-		while (vstate.nFreePages > 0 &&
-			   vstate.freePages[vstate.nFreePages - 1] == new_pages - 1)
-		{
-			new_pages--;
-			stats->pages_deleted--;
-			vstate.nFreePages--;
-			vstate.totFreePages = vstate.nFreePages;	/* can't be more */
-		}
-		if (new_pages != num_pages)
-		{
-			/*
-			 * Okay to truncate.
-			 */
-			RelationTruncate(rel, new_pages);
+		BlockNumber new_pages = vstate.lastUsedPage + 1;
 
-			/* update statistics */
-			stats->pages_removed += num_pages - new_pages;
+		/*
+		 * Okay to truncate.
+		 */
+		FreeSpaceMapTruncateRel(rel, new_pages);
+		RelationTruncate(rel, new_pages);
 
-			num_pages = new_pages;
-		}
+		/* update statistics */
+		stats->pages_removed += num_pages - new_pages;
+		vstate.totFreePages -= (num_pages - new_pages);
+		num_pages = new_pages;
 	}
 
-	/*
-	 * Update the shared Free Space Map with the info we now have about free
-	 * pages in the index, discarding any old info the map may have. We do not
-	 * need to sort the page numbers; they're in order already.
-	 */
-	RecordIndexFreeSpace(&rel->rd_node, vstate.totFreePages,
-						 vstate.nFreePages, vstate.freePages);
-
-	pfree(vstate.freePages);
-
 	MemoryContextDelete(vstate.pagedelcontext);
 
 	/* update statistics */
@@ -788,8 +757,7 @@ restart:
 	/*
 	 * If we are recursing, the only case we want to do anything with is a
 	 * live leaf page having the current vacuum cycle ID.  Any other state
-	 * implies we already saw the page (eg, deleted it as being empty). In
-	 * particular, we don't want to risk adding it to freePages twice.
+	 * implies we already saw the page (eg, deleted it as being empty).
 	 */
 	if (blkno != orig_blkno)
 	{
@@ -803,12 +771,15 @@ restart:
 		}
 	}
 
+	/* If the page is in use, update lastUsedPage */
+	if (!_bt_page_recyclable(page) && vstate->lastUsedPage < blkno)
+		vstate->lastUsedPage = blkno;
+
 	/* Page is valid, see what to do with it */
 	if (_bt_page_recyclable(page))
 	{
 		/* Okay to recycle this page */
-		if (vstate->nFreePages < vstate->maxFreePages)
-			vstate->freePages[vstate->nFreePages++] = blkno;
+		RecordFreeIndexPage(rel, blkno);
 		vstate->totFreePages++;
 		stats->pages_deleted++;
 	}
@@ -944,8 +915,7 @@ restart:
 		 */
 		if (ndel && info->vacuum_full)
 		{
-			if (vstate->nFreePages < vstate->maxFreePages)
-				vstate->freePages[vstate->nFreePages++] = blkno;
+			RecordFreeIndexPage(rel, blkno);
 			vstate->totFreePages++;
 		}
 
diff --git a/src/backend/access/nbtree/nbtsort.c b/src/backend/access/nbtree/nbtsort.c
index 7dcfa10eeec..eb1653e2f37 100644
--- a/src/backend/access/nbtree/nbtsort.c
+++ b/src/backend/access/nbtree/nbtsort.c
@@ -52,12 +52,14 @@
  * we log the completed index pages to WAL if and only if WAL archiving is
  * active.
  *
+ * This code isn't concerned about the FSM at all. The caller is responsible
+ * for initializing that.
  *
  * Portions Copyright (c) 1996-2008, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/access/nbtree/nbtsort.c,v 1.117 2008/08/11 11:05:10 heikki Exp $
+ *	  $PostgreSQL: pgsql/src/backend/access/nbtree/nbtsort.c,v 1.118 2008/09/30 10:52:10 heikki Exp $
  *
  *-------------------------------------------------------------------------
  */
diff --git a/src/backend/access/transam/rmgr.c b/src/backend/access/transam/rmgr.c
index 08de22eaa4a..7c62ec38543 100644
--- a/src/backend/access/transam/rmgr.c
+++ b/src/backend/access/transam/rmgr.c
@@ -3,7 +3,7 @@
  *
  * Resource managers definition
  *
- * $PostgreSQL: pgsql/src/backend/access/transam/rmgr.c,v 1.25 2006/11/05 22:42:07 tgl Exp $
+ * $PostgreSQL: pgsql/src/backend/access/transam/rmgr.c,v 1.26 2008/09/30 10:52:11 heikki Exp $
  */
 #include "postgres.h"
 
@@ -19,6 +19,7 @@
 #include "commands/dbcommands.h"
 #include "commands/sequence.h"
 #include "commands/tablespace.h"
+#include "storage/freespace.h"
 #include "storage/smgr.h"
 
 
@@ -30,7 +31,7 @@ const RmgrData RmgrTable[RM_MAX_ID + 1] = {
 	{"Database", dbase_redo, dbase_desc, NULL, NULL, NULL},
 	{"Tablespace", tblspc_redo, tblspc_desc, NULL, NULL, NULL},
 	{"MultiXact", multixact_redo, multixact_desc, NULL, NULL, NULL},
-	{"Reserved 7", NULL, NULL, NULL, NULL, NULL},
+	{"FreeSpaceMap", fsm_redo, fsm_desc, NULL, NULL, NULL},
 	{"Reserved 8", NULL, NULL, NULL, NULL, NULL},
 	{"Heap2", heap2_redo, heap2_desc, NULL, NULL, NULL},
 	{"Heap", heap_redo, heap_desc, NULL, NULL, NULL},
diff --git a/src/backend/access/transam/xlogutils.c b/src/backend/access/transam/xlogutils.c
index 59124e349e4..9abcce65483 100644
--- a/src/backend/access/transam/xlogutils.c
+++ b/src/backend/access/transam/xlogutils.c
@@ -11,7 +11,7 @@
  * Portions Copyright (c) 1996-2008, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $PostgreSQL: pgsql/src/backend/access/transam/xlogutils.c,v 1.58 2008/08/11 11:05:10 heikki Exp $
+ * $PostgreSQL: pgsql/src/backend/access/transam/xlogutils.c,v 1.59 2008/09/30 10:52:11 heikki Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -359,6 +359,7 @@ CreateFakeRelcacheEntry(RelFileNode rnode)
 	rel->rd_lockInfo.lockRelId.relId = rnode.relNode;
 
 	rel->rd_targblock = InvalidBlockNumber;
+	rel->rd_fsm_nblocks_cache = InvalidBlockNumber;
 	rel->rd_smgr = NULL;
 
 	return rel;
diff --git a/src/backend/bootstrap/bootstrap.c b/src/backend/bootstrap/bootstrap.c
index 0689fb1f1aa..04194acd3f2 100644
--- a/src/backend/bootstrap/bootstrap.c
+++ b/src/backend/bootstrap/bootstrap.c
@@ -8,7 +8,7 @@
  * Portions Copyright (c) 1994, Regents of the University of California
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/bootstrap/bootstrap.c,v 1.245 2008/09/01 20:42:43 tgl Exp $
+ *	  $PostgreSQL: pgsql/src/backend/bootstrap/bootstrap.c,v 1.246 2008/09/30 10:52:11 heikki Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -33,7 +33,6 @@
 #include "postmaster/bgwriter.h"
 #include "postmaster/walwriter.h"
 #include "storage/bufmgr.h"
-#include "storage/freespace.h"
 #include "storage/ipc.h"
 #include "storage/proc.h"
 #include "tcop/tcopprot.h"
@@ -419,7 +418,6 @@ AuxiliaryProcessMain(int argc, char *argv[])
 		case StartupProcess:
 			bootstrap_signals();
 			StartupXLOG();
-			LoadFreeSpaceMap();
 			BuildFlatFiles(false);
 			proc_exit(0);		/* startup done */
 
diff --git a/src/backend/catalog/heap.c b/src/backend/catalog/heap.c
index 5b26b91b697..50a2a98bbb2 100644
--- a/src/backend/catalog/heap.c
+++ b/src/backend/catalog/heap.c
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/catalog/heap.c,v 1.339 2008/08/28 23:09:45 tgl Exp $
+ *	  $PostgreSQL: pgsql/src/backend/catalog/heap.c,v 1.340 2008/09/30 10:52:12 heikki Exp $
  *
  *
  * INTERFACE ROUTINES
@@ -56,6 +56,7 @@
 #include "parser/parse_expr.h"
 #include "parser/parse_relation.h"
 #include "storage/bufmgr.h"
+#include "storage/freespace.h"
 #include "storage/smgr.h"
 #include "utils/builtins.h"
 #include "utils/fmgroids.h"
@@ -294,14 +295,22 @@ heap_create(const char *relname,
 	/*
 	 * Have the storage manager create the relation's disk file, if needed.
 	 *
-	 * We only create storage for the main fork here. The caller is
-	 * responsible for creating any additional forks if needed.
+	 * We create storage for the main fork here, and also for the FSM for a
+	 * heap or toast relation. The caller is responsible for creating any
+	 * additional forks if needed.
 	 */
 	if (create_storage)
 	{
 		Assert(rel->rd_smgr == NULL);
 		RelationOpenSmgr(rel);
 		smgrcreate(rel->rd_smgr, MAIN_FORKNUM, rel->rd_istemp, false);
+
+		/*
+		 * For a real heap, create FSM fork as well. Indexams are
+		 * responsible for creating any extra forks themselves.
+		 */
+		if (relkind == RELKIND_RELATION || relkind == RELKIND_TOASTVALUE)
+			smgrcreate(rel->rd_smgr, FSM_FORKNUM, rel->rd_istemp, false);
 	}
 
 	return rel;
@@ -2256,7 +2265,11 @@ RelationTruncateIndexes(Relation heapRelation)
 		/* Fetch info needed for index_build */
 		indexInfo = BuildIndexInfo(currentIndex);
 
-		/* Now truncate the actual file (and discard buffers) */
+		/*
+		 * Now truncate the actual file (and discard buffers). The indexam
+		 * is responsible for truncating the FSM in index_build(), if
+		 * applicable.
+		 */
 		RelationTruncate(currentIndex, 0);
 
 		/* Initialize the index and rebuild */
@@ -2310,7 +2323,8 @@ heap_truncate(List *relids)
 	{
 		Relation	rel = lfirst(cell);
 
-		/* Truncate the actual file (and discard buffers) */
+		/* Truncate the FSM and actual file (and discard buffers) */
+		FreeSpaceMapTruncateRel(rel, 0);
 		RelationTruncate(rel, 0);
 
 		/* If this relation has indexes, truncate the indexes too */
diff --git a/src/backend/catalog/index.c b/src/backend/catalog/index.c
index 301e7d1f2d5..e8063476add 100644
--- a/src/backend/catalog/index.c
+++ b/src/backend/catalog/index.c
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/catalog/index.c,v 1.304 2008/09/15 18:43:41 tgl Exp $
+ *	  $PostgreSQL: pgsql/src/backend/catalog/index.c,v 1.305 2008/09/30 10:52:12 heikki Exp $
  *
  *
  * INTERFACE ROUTINES
@@ -920,7 +920,7 @@ index_drop(Oid indexId)
 	RelationOpenSmgr(userIndexRelation);
 	for (forknum = 0; forknum <= MAX_FORKNUM; forknum++)
 		if (smgrexists(userIndexRelation->rd_smgr, forknum))
-			smgrscheduleunlink(userIndexRelation->rd_smgr, forknum, 
+			smgrscheduleunlink(userIndexRelation->rd_smgr, forknum,
 							   userIndexRelation->rd_istemp);
 	RelationCloseSmgr(userIndexRelation);
 
@@ -1322,7 +1322,7 @@ setNewRelfilenode(Relation relation, TransactionId freezeXid)
 	/*
 	 * ... and create storage for corresponding forks in the new relfilenode.
 	 *
-	 * NOTE: any conflict in relfilenode value will be caught here 
+	 * NOTE: any conflict in relfilenode value will be caught here
 	 */
 	newrnode = relation->rd_node;
 	newrnode.relNode = newrelfilenode;
@@ -1331,6 +1331,14 @@ setNewRelfilenode(Relation relation, TransactionId freezeXid)
 	/* Create the main fork, like heap_create() does */
 	smgrcreate(srel, MAIN_FORKNUM, relation->rd_istemp, false);
 
+	/*
+	 * For a heap, create FSM fork as well. Indexams are responsible for
+	 * creating any extra forks themselves.
+	 */
+	if (relation->rd_rel->relkind == RELKIND_RELATION ||
+		relation->rd_rel->relkind == RELKIND_TOASTVALUE)
+		smgrcreate(srel, FSM_FORKNUM, relation->rd_istemp, false);
+
 	/* schedule unlinking old files */
 	for (i = 0; i <= MAX_FORKNUM; i++)
 	{
@@ -2310,7 +2318,10 @@ reindex_index(Oid indexId)
 
 		if (inplace)
 		{
-			/* Truncate the actual file (and discard buffers) */
+			/*
+			 * Truncate the actual file (and discard buffers). The indexam
+			 * is responsible for truncating the FSM, if applicable.
+			 */
 			RelationTruncate(iRel, 0);
 		}
 		else
diff --git a/src/backend/commands/dbcommands.c b/src/backend/commands/dbcommands.c
index 29dc0733a73..37c2f45c727 100644
--- a/src/backend/commands/dbcommands.c
+++ b/src/backend/commands/dbcommands.c
@@ -13,7 +13,7 @@
  *
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/commands/dbcommands.c,v 1.212 2008/09/23 10:58:03 heikki Exp $
+ *	  $PostgreSQL: pgsql/src/backend/commands/dbcommands.c,v 1.213 2008/09/30 10:52:12 heikki Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -43,7 +43,6 @@
 #include "postmaster/bgwriter.h"
 #include "storage/bufmgr.h"
 #include "storage/lmgr.h"
-#include "storage/freespace.h"
 #include "storage/ipc.h"
 #include "storage/procarray.h"
 #include "storage/smgr.h"
@@ -796,11 +795,6 @@ dropdb(const char *dbname, bool missing_ok)
 	 */
 	DropDatabaseBuffers(db_id);
 
-	/*
-	 * Also, clean out any entries in the shared free space map.
-	 */
-	FreeSpaceMapForgetDatabase(db_id);
-
 	/*
 	 * Tell the stats collector to forget it immediately, too.
 	 */
@@ -1640,9 +1634,6 @@ dbase_redo(XLogRecPtr lsn, XLogRecord *record)
 		/* Drop pages for this database that are in the shared buffer cache */
 		DropDatabaseBuffers(xlrec->db_id);
 
-		/* Also, clean out any entries in the shared free space map */
-		FreeSpaceMapForgetDatabase(xlrec->db_id);
-
 		/* Also, clean out any fsync requests that might be pending in md.c */
 		ForgetDatabaseFsyncRequests(xlrec->db_id);
 
diff --git a/src/backend/commands/vacuum.c b/src/backend/commands/vacuum.c
index af7b6646d28..925a8d8abd3 100644
--- a/src/backend/commands/vacuum.c
+++ b/src/backend/commands/vacuum.c
@@ -13,7 +13,7 @@
  *
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/commands/vacuum.c,v 1.377 2008/09/11 14:01:09 alvherre Exp $
+ *	  $PostgreSQL: pgsql/src/backend/commands/vacuum.c,v 1.378 2008/09/30 10:52:12 heikki Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -505,14 +505,6 @@ vacuum(VacuumStmt *vacstmt, Oid relid, bool do_toast,
 		 * (autovacuum.c does this for itself.)
 		 */
 		vac_update_datfrozenxid();
-
-		/*
-		 * If it was a database-wide VACUUM, print FSM usage statistics (we
-		 * don't make you be superuser to see these).  We suppress this in
-		 * autovacuum, too.
-		 */
-		if (all_rels)
-			PrintFreeSpaceMapStatistics(elevel);
 	}
 
 	/*
@@ -1272,8 +1264,9 @@ full_vacuum_rel(Relation onerel, VacuumStmt *vacstmt)
 		}
 	}
 
-	/* update shared free space map with final free space info */
+	/* update the free space map with final free space info, and vacuum it */
 	vac_update_fsm(onerel, &fraged_pages, vacrelstats->rel_pages);
+	FreeSpaceMapVacuum(onerel);
 
 	/* update statistics in pg_class */
 	vac_update_relstats(RelationGetRelid(onerel), vacrelstats->rel_pages,
@@ -2849,6 +2842,7 @@ repair_frag(VRelStats *vacrelstats, Relation onerel,
 	/* Truncate relation, if needed */
 	if (blkno < nblocks)
 	{
+		FreeSpaceMapTruncateRel(onerel, blkno);
 		RelationTruncate(onerel, blkno);
 		vacrelstats->rel_pages = blkno; /* set new number of blocks */
 	}
@@ -3243,6 +3237,7 @@ vacuum_heap(VRelStats *vacrelstats, Relation onerel, VacPageList vacuum_pages)
 				(errmsg("\"%s\": truncated %u to %u pages",
 						RelationGetRelationName(onerel),
 						vacrelstats->rel_pages, relblocks)));
+		FreeSpaceMapTruncateRel(onerel, relblocks);
 		RelationTruncate(onerel, relblocks);
 		vacrelstats->rel_pages = relblocks;		/* set new number of blocks */
 	}
@@ -3475,8 +3470,8 @@ tid_reaped(ItemPointer itemptr, void *state)
 }
 
 /*
- * Update the shared Free Space Map with the info we now have about
- * free space in the relation, discarding any old info the map may have.
+ * Update the Free Space Map with the info we now have about free space in
+ * the relation.
  */
 static void
 vac_update_fsm(Relation onerel, VacPageList fraged_pages,
@@ -3484,26 +3479,8 @@ vac_update_fsm(Relation onerel, VacPageList fraged_pages,
 {
 	int			nPages = fraged_pages->num_pages;
 	VacPage    *pagedesc = fraged_pages->pagedesc;
-	Size		threshold;
-	FSMPageData *pageSpaces;
-	int			outPages;
 	int			i;
 
-	/*
-	 * We only report pages with free space at least equal to the average
-	 * request size --- this avoids cluttering FSM with uselessly-small bits
-	 * of space.  Although FSM would discard pages with little free space
-	 * anyway, it's important to do this prefiltering because (a) it reduces
-	 * the time spent holding the FSM lock in RecordRelationFreeSpace, and (b)
-	 * FSM uses the number of pages reported as a statistic for guiding space
-	 * management.	If we didn't threshold our reports the same way
-	 * vacuumlazy.c does, we'd be skewing that statistic.
-	 */
-	threshold = GetAvgFSMRequestSize(&onerel->rd_node);
-
-	pageSpaces = (FSMPageData *) palloc(nPages * sizeof(FSMPageData));
-	outPages = 0;
-
 	for (i = 0; i < nPages; i++)
 	{
 		/*
@@ -3514,17 +3491,9 @@ vac_update_fsm(Relation onerel, VacPageList fraged_pages,
 		if (pagedesc[i]->blkno >= rel_pages)
 			break;
 
-		if (pagedesc[i]->free >= threshold)
-		{
-			FSMPageSetPageNum(&pageSpaces[outPages], pagedesc[i]->blkno);
-			FSMPageSetSpace(&pageSpaces[outPages], pagedesc[i]->free);
-			outPages++;
-		}
+		RecordPageWithFreeSpace(onerel, pagedesc[i]->blkno, pagedesc[i]->free);
 	}
 
-	RecordRelationFreeSpace(&onerel->rd_node, outPages, outPages, pageSpaces);
-
-	pfree(pageSpaces);
 }
 
 /* Copy a VacPage structure */
diff --git a/src/backend/commands/vacuumlazy.c b/src/backend/commands/vacuumlazy.c
index 6ebf25933b1..fbaeb8d602a 100644
--- a/src/backend/commands/vacuumlazy.c
+++ b/src/backend/commands/vacuumlazy.c
@@ -18,15 +18,6 @@
  * index cleanup and page compaction, then resume the heap scan with an empty
  * TID array.
  *
- * We can limit the storage for page free space to MaxFSMPages entries,
- * since that's the most the free space map will be willing to remember
- * anyway.	If the relation has fewer than that many pages with free space,
- * life is easy: just build an array of per-page info.	If it has more,
- * we store the free space info as a heap ordered by amount of free space,
- * so that we can discard the pages with least free space to ensure we never
- * have more than MaxFSMPages entries in all.  The surviving page entries
- * are passed to the free space map at conclusion of the scan.
- *
  * If we're processing a table with no indexes, we can just vacuum each page
  * as we go; there's no need to save up multiple tuples to minimize the number
  * of index scans performed.  So we don't use maintenance_work_mem memory for
@@ -38,7 +29,7 @@
  *
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/commands/vacuumlazy.c,v 1.107 2008/05/12 00:00:48 alvherre Exp $
+ *	  $PostgreSQL: pgsql/src/backend/commands/vacuumlazy.c,v 1.108 2008/09/30 10:52:12 heikki Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -90,19 +81,11 @@ typedef struct LVRelStats
 	BlockNumber pages_removed;
 	double		tuples_deleted;
 	BlockNumber nonempty_pages; /* actually, last nonempty page + 1 */
-	Size		threshold;		/* minimum interesting free space */
 	/* List of TIDs of tuples we intend to delete */
 	/* NB: this list is ordered by TID address */
 	int			num_dead_tuples;	/* current # of entries */
 	int			max_dead_tuples;	/* # slots allocated in array */
 	ItemPointer dead_tuples;	/* array of ItemPointerData */
-	/* Array or heap of per-page info about free space */
-	/* We use a simple array until it fills up, then convert to heap */
-	bool		fs_is_heap;		/* are we using heap organization? */
-	int			num_free_pages; /* current # of entries */
-	int			max_free_pages; /* # slots allocated in array */
-	FSMPageData *free_pages;	/* array or heap of blkno/avail */
-	BlockNumber tot_free_pages; /* total pages with >= threshold space */
 	int			num_index_scans;
 } LVRelStats;
 
@@ -134,12 +117,8 @@ static BlockNumber count_nondeletable_pages(Relation onerel,
 static void lazy_space_alloc(LVRelStats *vacrelstats, BlockNumber relblocks);
 static void lazy_record_dead_tuple(LVRelStats *vacrelstats,
 					   ItemPointer itemptr);
-static void lazy_record_free_space(LVRelStats *vacrelstats,
-					   BlockNumber page, Size avail);
 static bool lazy_tid_reaped(ItemPointer itemptr, void *state);
-static void lazy_update_fsm(Relation onerel, LVRelStats *vacrelstats);
 static int	vac_cmp_itemptr(const void *left, const void *right);
-static int	vac_cmp_page_spaces(const void *left, const void *right);
 
 
 /*
@@ -180,10 +159,6 @@ lazy_vacuum_rel(Relation onerel, VacuumStmt *vacstmt,
 
 	vacrelstats = (LVRelStats *) palloc0(sizeof(LVRelStats));
 
-	/* Set threshold for interesting free space = average request size */
-	/* XXX should we scale it up or down?  Adjust vacuum.c too, if so */
-	vacrelstats->threshold = GetAvgFSMRequestSize(&onerel->rd_node);
-
 	vacrelstats->num_index_scans = 0;
 
 	/* Open all indexes of the relation */
@@ -207,18 +182,8 @@ lazy_vacuum_rel(Relation onerel, VacuumStmt *vacstmt,
 		possibly_freeable >= vacrelstats->rel_pages / REL_TRUNCATE_FRACTION)
 		lazy_truncate_heap(onerel, vacrelstats);
 
-	/* Update shared free space map with final free space info */
-	lazy_update_fsm(onerel, vacrelstats);
-
-	if (vacrelstats->tot_free_pages > MaxFSMPages)
-		ereport(WARNING,
-				(errmsg("relation \"%s.%s\" contains more than \"max_fsm_pages\" pages with useful free space",
-						get_namespace_name(RelationGetNamespace(onerel)),
-						RelationGetRelationName(onerel)),
-				 /* Only suggest VACUUM FULL if > 20% free */
-				 (vacrelstats->tot_free_pages > vacrelstats->rel_pages * 0.20) ?
-				 errhint("Consider using VACUUM FULL on this relation or increasing the configuration parameter \"max_fsm_pages\".") :
-				 errhint("Consider increasing the configuration parameter \"max_fsm_pages\".")));
+	/* Vacuum the Free Space Map */
+	FreeSpaceMapVacuum(onerel);
 
 	/* Update statistics in pg_class */
 	vac_update_relstats(RelationGetRelid(onerel),
@@ -313,6 +278,7 @@ lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats,
 		int			prev_dead_count;
 		OffsetNumber frozen[MaxOffsetNumber];
 		int			nfrozen;
+		Size		freespace;
 
 		vacuum_delay_point();
 
@@ -375,20 +341,21 @@ lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats,
 						relname, blkno)));
 				PageInit(page, BufferGetPageSize(buf), 0);
 				empty_pages++;
-				lazy_record_free_space(vacrelstats, blkno,
-									   PageGetHeapFreeSpace(page));
 			}
+			freespace = PageGetHeapFreeSpace(page);
 			MarkBufferDirty(buf);
 			UnlockReleaseBuffer(buf);
+
+			RecordPageWithFreeSpace(onerel, blkno, freespace);
 			continue;
 		}
 
 		if (PageIsEmpty(page))
 		{
 			empty_pages++;
-			lazy_record_free_space(vacrelstats, blkno,
-								   PageGetHeapFreeSpace(page));
+			freespace = PageGetHeapFreeSpace(page);
 			UnlockReleaseBuffer(buf);
+			RecordPageWithFreeSpace(onerel, blkno, freespace);
 			continue;
 		}
 
@@ -556,6 +523,14 @@ lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats,
 			vacuumed_pages++;
 		}
 
+		freespace = PageGetHeapFreeSpace(page);
+
+		/* Remember the location of the last page with nonremovable tuples */
+		if (hastup)
+			vacrelstats->nonempty_pages = blkno + 1;
+
+		UnlockReleaseBuffer(buf);
+
 		/*
 		 * If we remembered any tuples for deletion, then the page will be
 		 * visited again by lazy_vacuum_heap, which will compute and record
@@ -564,16 +539,7 @@ lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats,
 		 * taken if there are no indexes.)
 		 */
 		if (vacrelstats->num_dead_tuples == prev_dead_count)
-		{
-			lazy_record_free_space(vacrelstats, blkno,
-								   PageGetHeapFreeSpace(page));
-		}
-
-		/* Remember the location of the last page with nonremovable tuples */
-		if (hastup)
-			vacrelstats->nonempty_pages = blkno + 1;
-
-		UnlockReleaseBuffer(buf);
+			RecordPageWithFreeSpace(onerel, blkno, freespace);
 	}
 
 	/* save stats for use later */
@@ -611,12 +577,10 @@ lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats,
 					tups_vacuumed, num_tuples, nblocks),
 			 errdetail("%.0f dead row versions cannot be removed yet.\n"
 					   "There were %.0f unused item pointers.\n"
-					   "%u pages contain useful free space.\n"
 					   "%u pages are entirely empty.\n"
 					   "%s.",
 					   nkeep,
 					   nunused,
-					   vacrelstats->tot_free_pages,
 					   empty_pages,
 					   pg_rusage_show(&ru0))));
 }
@@ -649,6 +613,7 @@ lazy_vacuum_heap(Relation onerel, LVRelStats *vacrelstats)
 		BlockNumber tblk;
 		Buffer		buf;
 		Page		page;
+		Size		freespace;
 
 		vacuum_delay_point();
 
@@ -656,11 +621,13 @@ lazy_vacuum_heap(Relation onerel, LVRelStats *vacrelstats)
 		buf = ReadBufferWithStrategy(onerel, tblk, vac_strategy);
 		LockBufferForCleanup(buf);
 		tupindex = lazy_vacuum_page(onerel, tblk, buf, tupindex, vacrelstats);
+
 		/* Now that we've compacted the page, record its available space */
 		page = BufferGetPage(buf);
-		lazy_record_free_space(vacrelstats, tblk,
-							   PageGetHeapFreeSpace(page));
+		freespace = PageGetHeapFreeSpace(page);
+
 		UnlockReleaseBuffer(buf);
+		RecordPageWithFreeSpace(onerel, tblk, freespace);
 		npages++;
 	}
 
@@ -816,10 +783,6 @@ lazy_truncate_heap(Relation onerel, LVRelStats *vacrelstats)
 {
 	BlockNumber old_rel_pages = vacrelstats->rel_pages;
 	BlockNumber new_rel_pages;
-	FSMPageData *pageSpaces;
-	int			n;
-	int			i,
-				j;
 	PGRUsage	ru0;
 
 	pg_rusage_init(&ru0);
@@ -865,6 +828,7 @@ lazy_truncate_heap(Relation onerel, LVRelStats *vacrelstats)
 	/*
 	 * Okay to truncate.
 	 */
+	FreeSpaceMapTruncateRel(onerel, new_rel_pages);
 	RelationTruncate(onerel, new_rel_pages);
 
 	/*
@@ -875,34 +839,6 @@ lazy_truncate_heap(Relation onerel, LVRelStats *vacrelstats)
 	 * the table again.
 	 */
 
-	/*
-	 * Drop free-space info for removed blocks; these must not get entered
-	 * into the FSM!
-	 */
-	pageSpaces = vacrelstats->free_pages;
-	n = vacrelstats->num_free_pages;
-	j = 0;
-	for (i = 0; i < n; i++)
-	{
-		if (FSMPageGetPageNum(&pageSpaces[i]) < new_rel_pages)
-		{
-			pageSpaces[j] = pageSpaces[i];
-			j++;
-		}
-	}
-	vacrelstats->num_free_pages = j;
-
-	/*
-	 * If tot_free_pages was more than num_free_pages, we can't tell for sure
-	 * what its correct value is now, because we don't know which of the
-	 * forgotten pages are getting truncated.  Conservatively set it equal to
-	 * num_free_pages.
-	 */
-	vacrelstats->tot_free_pages = j;
-
-	/* We destroyed the heap ordering, so mark array unordered */
-	vacrelstats->fs_is_heap = false;
-
 	/* update statistics */
 	vacrelstats->rel_pages = new_rel_pages;
 	vacrelstats->pages_removed = old_rel_pages - new_rel_pages;
@@ -1005,7 +941,6 @@ static void
 lazy_space_alloc(LVRelStats *vacrelstats, BlockNumber relblocks)
 {
 	long		maxtuples;
-	int			maxpages;
 
 	if (vacrelstats->hasindex)
 	{
@@ -1029,19 +964,6 @@ lazy_space_alloc(LVRelStats *vacrelstats, BlockNumber relblocks)
 	vacrelstats->max_dead_tuples = (int) maxtuples;
 	vacrelstats->dead_tuples = (ItemPointer)
 		palloc(maxtuples * sizeof(ItemPointerData));
-
-	maxpages = MaxFSMPages;
-	maxpages = Min(maxpages, MaxAllocSize / sizeof(FSMPageData));
-	/* No need to allocate more pages than the relation has blocks */
-	if (relblocks < (BlockNumber) maxpages)
-		maxpages = (int) relblocks;
-
-	vacrelstats->fs_is_heap = false;
-	vacrelstats->num_free_pages = 0;
-	vacrelstats->max_free_pages = maxpages;
-	vacrelstats->free_pages = (FSMPageData *)
-		palloc(maxpages * sizeof(FSMPageData));
-	vacrelstats->tot_free_pages = 0;
 }
 
 /*
@@ -1063,127 +985,6 @@ lazy_record_dead_tuple(LVRelStats *vacrelstats,
 	}
 }
 
-/*
- * lazy_record_free_space - remember free space on one page
- */
-static void
-lazy_record_free_space(LVRelStats *vacrelstats,
-					   BlockNumber page,
-					   Size avail)
-{
-	FSMPageData *pageSpaces;
-	int			n;
-
-	/*
-	 * A page with less than stats->threshold free space will be forgotten
-	 * immediately, and never passed to the free space map.  Removing the
-	 * uselessly small entries early saves cycles, and in particular reduces
-	 * the amount of time we spend holding the FSM lock when we finally call
-	 * RecordRelationFreeSpace.  Since the FSM will probably drop pages with
-	 * little free space anyway, there's no point in making this really small.
-	 *
-	 * XXX Is it worth trying to measure average tuple size, and using that to
-	 * adjust the threshold?  Would be worthwhile if FSM has no stats yet for
-	 * this relation.  But changing the threshold as we scan the rel might
-	 * lead to bizarre behavior, too.  Also, it's probably better if vacuum.c
-	 * has the same thresholding behavior as we do here.
-	 */
-	if (avail < vacrelstats->threshold)
-		return;
-
-	/* Count all pages over threshold, even if not enough space in array */
-	vacrelstats->tot_free_pages++;
-
-	/* Copy pointers to local variables for notational simplicity */
-	pageSpaces = vacrelstats->free_pages;
-	n = vacrelstats->max_free_pages;
-
-	/* If we haven't filled the array yet, just keep adding entries */
-	if (vacrelstats->num_free_pages < n)
-	{
-		FSMPageSetPageNum(&pageSpaces[vacrelstats->num_free_pages], page);
-		FSMPageSetSpace(&pageSpaces[vacrelstats->num_free_pages], avail);
-		vacrelstats->num_free_pages++;
-		return;
-	}
-
-	/*----------
-	 * The rest of this routine works with "heap" organization of the
-	 * free space arrays, wherein we maintain the heap property
-	 *			avail[(j-1) div 2] <= avail[j]	for 0 < j < n.
-	 * In particular, the zero'th element always has the smallest available
-	 * space and can be discarded to make room for a new page with more space.
-	 * See Knuth's discussion of heap-based priority queues, sec 5.2.3;
-	 * but note he uses 1-origin array subscripts, not 0-origin.
-	 *----------
-	 */
-
-	/* If we haven't yet converted the array to heap organization, do it */
-	if (!vacrelstats->fs_is_heap)
-	{
-		/*
-		 * Scan backwards through the array, "sift-up" each value into its
-		 * correct position.  We can start the scan at n/2-1 since each entry
-		 * above that position has no children to worry about.
-		 */
-		int			l = n / 2;
-
-		while (--l >= 0)
-		{
-			BlockNumber R = FSMPageGetPageNum(&pageSpaces[l]);
-			Size		K = FSMPageGetSpace(&pageSpaces[l]);
-			int			i;		/* i is where the "hole" is */
-
-			i = l;
-			for (;;)
-			{
-				int			j = 2 * i + 1;
-
-				if (j >= n)
-					break;
-				if (j + 1 < n && FSMPageGetSpace(&pageSpaces[j]) > FSMPageGetSpace(&pageSpaces[j + 1]))
-					j++;
-				if (K <= FSMPageGetSpace(&pageSpaces[j]))
-					break;
-				pageSpaces[i] = pageSpaces[j];
-				i = j;
-			}
-			FSMPageSetPageNum(&pageSpaces[i], R);
-			FSMPageSetSpace(&pageSpaces[i], K);
-		}
-
-		vacrelstats->fs_is_heap = true;
-	}
-
-	/* If new page has more than zero'th entry, insert it into heap */
-	if (avail > FSMPageGetSpace(&pageSpaces[0]))
-	{
-		/*
-		 * Notionally, we replace the zero'th entry with the new data, and
-		 * then sift-up to maintain the heap property.	Physically, the new
-		 * data doesn't get stored into the arrays until we find the right
-		 * location for it.
-		 */
-		int			i = 0;		/* i is where the "hole" is */
-
-		for (;;)
-		{
-			int			j = 2 * i + 1;
-
-			if (j >= n)
-				break;
-			if (j + 1 < n && FSMPageGetSpace(&pageSpaces[j]) > FSMPageGetSpace(&pageSpaces[j + 1]))
-				j++;
-			if (avail <= FSMPageGetSpace(&pageSpaces[j]))
-				break;
-			pageSpaces[i] = pageSpaces[j];
-			i = j;
-		}
-		FSMPageSetPageNum(&pageSpaces[i], page);
-		FSMPageSetSpace(&pageSpaces[i], avail);
-	}
-}
-
 /*
  *	lazy_tid_reaped() -- is a particular tid deletable?
  *
@@ -1206,27 +1007,6 @@ lazy_tid_reaped(ItemPointer itemptr, void *state)
 	return (res != NULL);
 }
 
-/*
- * Update the shared Free Space Map with the info we now have about
- * free space in the relation, discarding any old info the map may have.
- */
-static void
-lazy_update_fsm(Relation onerel, LVRelStats *vacrelstats)
-{
-	FSMPageData *pageSpaces = vacrelstats->free_pages;
-	int			nPages = vacrelstats->num_free_pages;
-
-	/*
-	 * Sort data into order, as required by RecordRelationFreeSpace.
-	 */
-	if (nPages > 1)
-		qsort(pageSpaces, nPages, sizeof(FSMPageData),
-			  vac_cmp_page_spaces);
-
-	RecordRelationFreeSpace(&onerel->rd_node, vacrelstats->tot_free_pages,
-							nPages, pageSpaces);
-}
-
 /*
  * Comparator routines for use with qsort() and bsearch().
  */
@@ -1256,18 +1036,3 @@ vac_cmp_itemptr(const void *left, const void *right)
 
 	return 0;
 }
-
-static int
-vac_cmp_page_spaces(const void *left, const void *right)
-{
-	FSMPageData *linfo = (FSMPageData *) left;
-	FSMPageData *rinfo = (FSMPageData *) right;
-	BlockNumber	lblkno = FSMPageGetPageNum(linfo);
-	BlockNumber	rblkno = FSMPageGetPageNum(rinfo);
-
-	if (lblkno < rblkno)
-		return -1;
-	else if (lblkno > rblkno)
-		return 1;
-	return 0;
-}
diff --git a/src/backend/postmaster/bgwriter.c b/src/backend/postmaster/bgwriter.c
index 823c5243797..bf805e977eb 100644
--- a/src/backend/postmaster/bgwriter.c
+++ b/src/backend/postmaster/bgwriter.c
@@ -37,7 +37,7 @@
  *
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/postmaster/bgwriter.c,v 1.51 2008/08/11 11:05:11 heikki Exp $
+ *	  $PostgreSQL: pgsql/src/backend/postmaster/bgwriter.c,v 1.52 2008/09/30 10:52:13 heikki Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -55,7 +55,6 @@
 #include "postmaster/bgwriter.h"
 #include "storage/bufmgr.h"
 #include "storage/fd.h"
-#include "storage/freespace.h"
 #include "storage/ipc.h"
 #include "storage/lwlock.h"
 #include "storage/pmsignal.h"
@@ -398,7 +397,6 @@ BackgroundWriterMain(void)
 			ExitOnAnyError = true;
 			/* Close down the database */
 			ShutdownXLOG(0, 0);
-			DumpFreeSpaceMap(0, 0);
 			/* Normal exit from the bgwriter is here */
 			proc_exit(0);		/* done */
 		}
diff --git a/src/backend/storage/freespace/Makefile b/src/backend/storage/freespace/Makefile
index 553131d8f06..bc9cae622ce 100644
--- a/src/backend/storage/freespace/Makefile
+++ b/src/backend/storage/freespace/Makefile
@@ -4,7 +4,7 @@
 #    Makefile for storage/freespace
 #
 # IDENTIFICATION
-#    $PostgreSQL: pgsql/src/backend/storage/freespace/Makefile,v 1.4 2008/02/19 10:30:08 petere Exp $
+#    $PostgreSQL: pgsql/src/backend/storage/freespace/Makefile,v 1.5 2008/09/30 10:52:13 heikki Exp $
 #
 #-------------------------------------------------------------------------
 
@@ -12,6 +12,6 @@ subdir = src/backend/storage/freespace
 top_builddir = ../../../..
 include $(top_builddir)/src/Makefile.global
 
-OBJS = freespace.o
+OBJS = freespace.o fsmpage.o indexfsm.o
 
 include $(top_srcdir)/src/backend/common.mk
diff --git a/src/backend/storage/freespace/README b/src/backend/storage/freespace/README
new file mode 100644
index 00000000000..b5a37f9cb55
--- /dev/null
+++ b/src/backend/storage/freespace/README
@@ -0,0 +1,195 @@
+$PostgreSQL: pgsql/src/backend/storage/freespace/README,v 1.1 2008/09/30 10:52:13 heikki Exp $
+
+Free Space Map
+--------------
+
+The purpose of the free space map is to quickly locate a page with enough
+free space to hold a tuple to be stored; or to determine that no such page
+exists and the relation must be extended by one page.  As of PostgreSQL 8.4
+each relation has its own, extensible free space map stored in a separate
+"fork" of its relation.  This eliminates the disadvantages of the former
+fixed-size FSM.
+
+It is important to keep the map small so that it can be searched rapidly.
+Therefore, we don't attempt to record the exact free space on a page.
+We allocate one map byte to each page, allowing us to record free space
+at a granularity of 1/256th of a page.  Another way to say it is that
+the stored value is the free space divided by BLCKSZ/256 (rounding down).
+We assume that the free space must always be less than BLCKSZ, since
+all pages have some overhead; so the maximum map value is 255.
+
+To assist in fast searching, the map isn't simply an array of per-page
+entries, but has a tree structure above those entries.  There is a tree
+structure of pages, and a tree structure within each page, as described
+below.
+
+FSM page structure
+------------------
+
+Within each FSM page, we use a binary tree structure where leaf nodes store
+the amount of free space on heap pages (or lower level FSM pages, see
+"Higher-level structure" below), with one leaf node per heap page. A non-leaf
+node stores the max amount of free space on any of its children.
+
+For example:
+
+    4
+ 4     2
+3 4   0 2    <- This level represents heap pages
+
+We need two basic operations: search and update.
+
+To search for a page with X amount of free space, traverse down the tree
+along a path where n >= X, until you hit the bottom. If both children of a
+node satisfy the condition, you can pick either one arbitrarily.
+
+To update the amount of free space on a page to X, first update the leaf node
+corresponding to the heap page, then "bubble up" the change to upper nodes,
+by walking up to each parent and recomputing its value as the max of its
+two children.  Repeat until reaching the root or a parent whose value
+doesn't change.
+
+This data structure has a couple of nice properties:
+- to discover that there is no page with X bytes of free space, you only
+  need to look at the root node
+- by varying which child to traverse to in the search algorithm, when you have
+  a choice, we can implement various strategies, like preferring pages closer
+  to a given page, or spreading the load across the table.
+
+Higher-level routines that use FSM pages access them through the fsm_set_avail()
+and fsm_search_avail() functions. The interface to those functions hides the
+page's internal tree structure, treating the FSM page as a black box that has
+a certain number of "slots" for storing free space information.  (However,
+the higher routines have to be aware of the tree structure of the whole map.)
+
+The binary tree is stored on each FSM page as an array. Because the page
+header takes some space on a page, the binary tree isn't perfect. That is,
+a few right-most leaf nodes are missing, and there are some useless non-leaf
+nodes at the right. So the tree looks something like this:
+
+       0
+   1       2
+ 3   4   5   6
+7 8 9 A B
+
+where the numbers denote each node's position in the array.  Note that the
+tree is guaranteed complete above the leaf level; only some leaf nodes are
+missing.  This is reflected in the number of usable "slots" per page not
+being an exact power of 2.
+
+A FSM page also has a next slot pointer, fp_next_slot, that determines where
+to start the next search for free space within that page.  The reason for that
+is to spread out the pages that are returned by FSM searches.  When several
+backends are concurrently inserting into a relation, contention can be avoided
+by having them insert into different pages.  But it is also desirable to fill
+up pages in sequential order, to get the benefit of OS prefetching and batched
+writes.  The FSM is responsible for making that happen, and the next slot
+pointer helps provide the desired behavior. 
+
+Higher-level structure
+----------------------
+
+To scale up the data structure described above beyond a single page, we
+maintain a similar tree-structure across pages. Leaf nodes in higher level
+pages correspond to lower level FSM pages. The root node within each page
+has the same value as the corresponding leaf node on its parent page.
+
+The root page is always stored at physical block 0.
+
+For example, assuming each FSM page can hold information about 4 pages (in
+reality, it holds (BLCKSZ - headers) / 2, or ~4000 with default BLCKSZ),
+we get a disk layout like this:
+
+ 0     <-- page 0 at level 2 (root page)
+  0     <-- page 0 at level 1
+   0     <-- page 0 at level 0
+   1     <-- page 1 at level 0
+   2     <-- ...
+   3
+  1     <-- page 1 at level 1
+   4
+   5
+   6
+   7
+  2
+   8
+   9
+   10
+   11
+  3
+   12
+   13
+   14
+   15
+
+where the numbers are page numbers *at that level*, starting from 0.
+
+To find the physical block # corresponding to leaf page n, we need to
+count the number of leaf and upper-level pages preceding page n.
+This turns out to be
+
+y = n + (n / F + 1) + (n / F^2 + 1) + ... + 1
+
+where F is the fanout (4 in the above example). The first term n is the number
+of preceding leaf pages, the second term is the number of pages at level 1,
+and so forth.
+
+To keep things simple, the tree is always constant height. To cover the
+maximum relation size of 2^32-1 blocks, three levels is enough with the default
+BLCKSZ (4000^3 > 2^32).
+
+Addressing
+----------
+
+The higher-level routines operate on "logical" addresses, consisting of
+- level,
+- logical page number, and
+- slot (if applicable)
+
+Bottom level FSM pages have level of 0, the level above that 1, and root 2.
+As in the diagram above, logical page number is the page number at that level,
+starting from 0.
+
+Locking
+-------
+
+When traversing down to search for free space, only one page is locked at a
+time: the parent page is released before locking the child. If the child page
+is concurrently modified, and there no longer is free space on the child page
+when you land on it, you need to start from scratch (after correcting the
+parent page, so that you don't get into an infinite loop).
+
+We use shared buffer locks when searching, but exclusive buffer lock when
+updating a page.  However, the next slot search pointer is updated during
+searches even though we have only a shared lock.  fp_next_slot is just a hint
+and we can easily reset it if it gets corrupted; so it seems better to accept
+some risk of that type than to pay the overhead of exclusive locking.
+
+Recovery
+--------
+
+The FSM is not explicitly WAL-logged. Instead, we rely on a bunch of
+self-correcting measures to repair possible corruption.
+
+First of all, whenever a value is set on an FSM page, the root node of the
+page is compared against the new value after bubbling up the change is
+finished. It should be greater than or equal to the value just set, or we
+have a corrupted page, with a parent somewhere with too small a value.
+Secondly, we check for corrupted pages while we search, traversing down
+the tree. That check will notice if a parent node is set to too high a value.
+In both cases, the upper nodes on the page are immediately rebuilt, fixing
+the corruption.
+
+Vacuum updates all the bottom level pages with correct amount of free space
+on the heap pages, fixing any outdated values there. After the heap and
+index passes are done, FreeSpaceMapVacuum is called, and the FSM tree is
+scanned in depth-first order. This fixes any discrepancies between upper
+and lower level FSM pages.
+
+TODO
+----
+
+- fastroot to avoid traversing upper nodes with just 1 child
+- use a different system for tables that fit into one FSM page, with a
+  mechanism to switch to the real thing as it grows.
+
diff --git a/src/backend/storage/freespace/freespace.c b/src/backend/storage/freespace/freespace.c
index 9373675b8cc..1602ec0cc93 100644
--- a/src/backend/storage/freespace/freespace.c
+++ b/src/backend/storage/freespace/freespace.c
@@ -8,245 +8,123 @@
  * Portions Copyright (c) 1994, Regents of the University of California
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/storage/freespace/freespace.c,v 1.60 2008/03/10 02:04:09 tgl Exp $
+ *	  $PostgreSQL: pgsql/src/backend/storage/freespace/freespace.c,v 1.61 2008/09/30 10:52:13 heikki Exp $
  *
  *
  * NOTES:
  *
- * The only really interesting aspect of this code is the heuristics for
- * deciding how much information we can afford to keep about each relation,
- * given that we have a limited amount of workspace in shared memory.
- * These currently work as follows:
- *
- * The number of distinct relations tracked is limited by a configuration
- * variable (MaxFSMRelations).	When this would be exceeded, we discard the
- * least recently used relation.  A doubly-linked list with move-to-front
- * behavior keeps track of which relation is least recently used.
- *
- * For each known relation, we track the average request size given to
- * GetPageWithFreeSpace() as well as the most recent number of pages reported
- * to RecordRelationFreeSpace().  The average request size is not directly
- * used in this module, but we expect VACUUM to use it to filter out
- * uninteresting amounts of space before calling RecordRelationFreeSpace().
- * The sum of the RRFS page counts is thus the total number of "interesting"
- * pages that we would like to track; this is called DesiredFSMPages.
- *
- * The number of pages actually tracked is limited by a configuration variable
- * (MaxFSMPages).  When this is less than DesiredFSMPages, each relation
- * gets to keep a fraction MaxFSMPages/DesiredFSMPages of its free pages.
- * We discard pages with less free space to reach this target.
- *
- * Actually, our space allocation is done in "chunks" of CHUNKPAGES pages,
- * with each relation guaranteed at least one chunk.  This reduces thrashing
- * of the storage allocations when there are small changes in the RRFS page
- * counts from one VACUUM to the next.	(XXX it might also be worthwhile to
- * impose some kind of moving-average smoothing on the RRFS page counts?)
- *
- * So the actual arithmetic is: for each relation compute myRequest as the
- * number of chunks needed to hold its RRFS page count (not counting the
- * first, guaranteed chunk); compute sumRequests as the sum of these values
- * over all relations; then for each relation figure its target allocation
- * as
- *			1 + round(spareChunks * myRequest / sumRequests)
- * where spareChunks = totalChunks - numRels is the number of chunks we have
- * a choice what to do with.  We round off these numbers because truncating
- * all of them would waste significant space.  But because of roundoff, it's
- * possible for the last few relations to get less space than they should;
- * the target allocation must be checked against remaining available space.
+ *  Free Space Map keeps track of the amount of free space on pages, and
+ *  allows quickly searching for a page with enough free space. The FSM is
+ *  stored in a dedicated relation fork of all heap relations, and those
+ *  index access methods that need it (see also indexfsm.c). See README for
+ *  more information.
  *
  *-------------------------------------------------------------------------
  */
 #include "postgres.h"
 
-#include <limits.h>
-#include <math.h>
-#include <unistd.h>
-
-#include "storage/fd.h"
+#include "access/htup.h"
+#include "access/xlogutils.h"
+#include "storage/bufpage.h"
+#include "storage/bufmgr.h"
 #include "storage/freespace.h"
+#include "storage/fsm_internals.h"
+#include "storage/lmgr.h"
 #include "storage/lwlock.h"
-#include "storage/shmem.h"
+#include "storage/smgr.h"
+#include "utils/rel.h"
+#include "utils/inval.h"
+#include "miscadmin.h"
 
-
-/*----------
- * During database shutdown, we store the contents of FSM into a disk file,
- * which is re-read during startup.  This way we don't have a startup
- * transient condition where FSM isn't really functioning.
+/*
+ * We use just one byte to store the amount of free space on a page, so we
+ * divide the amount of free space a page can have into 256 different
+ * categories. The highest category, 255, represents a page with at least
+ * MaxFSMRequestSize bytes of free space, and the second highest category
+ * represents the range from 254 * FSM_CAT_STEP, inclusive, to
+ * MaxFSMRequestSize, exclusive.
  *
- * The file format is:
- *		label			"FSM\0"
- *		endian			constant 0x01020304 for detecting endianness problems
- *		version#
- *		numRels
- *	-- for each rel, in *reverse* usage order:
- *		relfilenode
- *		isIndex
- *		avgRequest
- *		interestingPages
- *		storedPages
- *		arena data		array of storedPages FSMPageData or IndexFSMPageData
- *----------
+ * MaxFSMRequestSize depends on the architecture and BLCKSZ, but assuming
+ * default 8k BLCKSZ, and that MaxFSMRequestSize is 8164 bytes, the categories
+ * look like this:
+ * 
+ *
+ * Range     Category
+ * 0    - 31   0
+ * 32   - 63   1
+ * ...    ...  ...
+ * 8096 - 8127 253
+ * 8128 - 8163 254
+ * 8164 - 8192 255
+ *
+ * The reason that MaxFSMRequestSize is special is that if MaxFSMRequestSize
+ * isn't equal to a range boundary, a page with exactly MaxFSMRequestSize
+ * bytes of free space wouldn't satisfy a request for MaxFSMRequestSize
+ * bytes. If there isn't more than MaxFSMRequestSize bytes of free space on a
+ * completely empty page, that would mean that we could never satisfy a
+ * request of exactly MaxFSMRequestSize bytes.
  */
-
-/* Name of FSM cache file (relative to $PGDATA) */
-#define FSM_CACHE_FILENAME	"global/pg_fsm.cache"
-
-/* Fixed values in header */
-#define FSM_CACHE_LABEL		"FSM"
-#define FSM_CACHE_ENDIAN	0x01020304
-#define FSM_CACHE_VERSION	20030305
-
-/* File header layout */
-typedef struct FsmCacheFileHeader
-{
-	char		label[4];
-	uint32		endian;
-	uint32		version;
-	int32		numRels;
-} FsmCacheFileHeader;
-
-/* Per-relation header */
-typedef struct FsmCacheRelHeader
-{
-	RelFileNode key;			/* hash key (must be first) */
-	bool		isIndex;		/* if true, we store only page numbers */
-	uint32		avgRequest;		/* moving average of space requests */
-	BlockNumber interestingPages;		/* # of pages with useful free space */
-	int32		storedPages;	/* # of pages stored in arena */
-} FsmCacheRelHeader;
-
-int			MaxFSMRelations;	/* these are set by guc.c */
-int			MaxFSMPages;
-
-static FSMHeader *FreeSpaceMap; /* points to FSMHeader in shared memory */
-static HTAB *FreeSpaceMapRelHash;		/* points to (what used to be)
-										 * FSMHeader->relHash */
-
-
-static void CheckFreeSpaceMapStatistics(int elevel, int numRels,
-							double needed);
-static FSMRelation *lookup_fsm_rel(RelFileNode *rel);
-static FSMRelation *create_fsm_rel(RelFileNode *rel);
-static void delete_fsm_rel(FSMRelation *fsmrel);
-static int realloc_fsm_rel(FSMRelation *fsmrel, BlockNumber interestingPages,
-				bool isIndex);
-static void link_fsm_rel_usage(FSMRelation *fsmrel);
-static void unlink_fsm_rel_usage(FSMRelation *fsmrel);
-static void link_fsm_rel_storage(FSMRelation *fsmrel);
-static void unlink_fsm_rel_storage(FSMRelation *fsmrel);
-static BlockNumber find_free_space(FSMRelation *fsmrel, Size spaceNeeded);
-static BlockNumber find_index_free_space(FSMRelation *fsmrel);
-static void fsm_record_free_space(FSMRelation *fsmrel, BlockNumber page,
-					  Size spaceAvail);
-static bool lookup_fsm_page_entry(FSMRelation *fsmrel, BlockNumber page,
-					  int *outPageIndex);
-static void compact_fsm_storage(void);
-static void push_fsm_rels_after(FSMRelation *afterRel);
-static void pack_incoming_pages(FSMPageData *newLocation, int newPages,
-					FSMPageData *pageSpaces, int nPages);
-static void pack_existing_pages(FSMPageData *newLocation, int newPages,
-					FSMPageData *oldLocation, int oldPages);
-static int	fsm_calc_request(FSMRelation *fsmrel);
-static int	fsm_calc_request_unclamped(FSMRelation *fsmrel);
-static int	fsm_calc_target_allocation(int myRequest);
-static int	fsm_current_chunks(FSMRelation *fsmrel);
-static int	fsm_current_allocation(FSMRelation *fsmrel);
-
+#define FSM_CATEGORIES	256
+#define FSM_CAT_STEP	(BLCKSZ / FSM_CATEGORIES)
+#define MaxFSMRequestSize	MaxHeapTupleSize
 
 /*
- * Exported routines
+ * Depth of the on-disk tree. We need to be able to address 2^32-1 blocks,
+ * and 1626 is the smallest number that satisfies X^3 >= 2^32-1. Likewise,
+ * 256 is the smallest number that satisfies X^4 >= 2^32-1. In practice,
+ * this means that 4096 bytes is the smallest BLCKSZ that we can get away
+ * with a 3-level tree, and 512 is the smallest we support.
  */
+#define FSM_TREE_DEPTH	((SlotsPerFSMPage >= 1626) ? 3 : 4)
 
+#define FSM_ROOT_LEVEL	(FSM_TREE_DEPTH - 1)
+#define FSM_BOTTOM_LEVEL 0
 
 /*
- * InitFreeSpaceMap -- Initialize the freespace module.
- *
- * This must be called once during shared memory initialization.
- * It builds the empty free space map table.  FreeSpaceLock must also be
- * initialized at some point, but is not touched here --- we assume there is
- * no need for locking, since only the calling process can be accessing shared
- * memory as yet.
+ * The internal FSM routines work on a logical addressing scheme. Each
+ * level of the tree can be thought of as a separately addressable file.
  */
-void
-InitFreeSpaceMap(void)
+typedef struct
 {
-	HASHCTL		info;
-	int			nchunks;
-	bool		found;
-
-	/* Create table header */
-	FreeSpaceMap = (FSMHeader *) ShmemInitStruct("Free Space Map Header",
-												 sizeof(FSMHeader),
-												 &found);
-	if (FreeSpaceMap == NULL)
-		ereport(FATAL,
-				(errcode(ERRCODE_OUT_OF_MEMORY),
-				 errmsg("insufficient shared memory for free space map")));
-	if (!found)
-		MemSet(FreeSpaceMap, 0, sizeof(FSMHeader));
-
-	/* Create hashtable for FSMRelations */
-	info.keysize = sizeof(RelFileNode);
-	info.entrysize = sizeof(FSMRelation);
-	info.hash = tag_hash;
-
-	FreeSpaceMapRelHash = ShmemInitHash("Free Space Map Hash",
-										MaxFSMRelations + 1,
-										MaxFSMRelations + 1,
-										&info,
-										(HASH_ELEM | HASH_FUNCTION));
-
-	if (!FreeSpaceMapRelHash)
-		ereport(FATAL,
-				(errcode(ERRCODE_OUT_OF_MEMORY),
-				 errmsg("insufficient shared memory for free space map")));
-
-	if (found)
-		return;
-
-
-	/* Allocate page-storage arena */
-	nchunks = (MaxFSMPages - 1) / CHUNKPAGES + 1;
-	/* This check ensures spareChunks will be greater than zero */
-	if (nchunks <= MaxFSMRelations)
-		ereport(FATAL,
-				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
-				 errmsg("max_fsm_pages must exceed max_fsm_relations * %d",
-						CHUNKPAGES)));
-
-	FreeSpaceMap->arena = (char *) ShmemAlloc((Size) nchunks * CHUNKBYTES);
-	if (FreeSpaceMap->arena == NULL)
-		ereport(FATAL,
-				(errcode(ERRCODE_OUT_OF_MEMORY),
-				 errmsg("insufficient shared memory for free space map")));
-
-	FreeSpaceMap->totalChunks = nchunks;
-	FreeSpaceMap->usedChunks = 0;
-	FreeSpaceMap->sumRequests = 0;
-}
+	int level;		/* level */
+	int logpageno;	/* page number within the level */
+} FSMAddress;
 
-/*
- * Estimate amount of shmem space needed for FSM.
- */
-Size
-FreeSpaceShmemSize(void)
+/* Address of the root page. */
+static const FSMAddress FSM_ROOT_ADDRESS = { FSM_ROOT_LEVEL, 0 };
+
+/* XLOG record types */
+#define XLOG_FSM_TRUNCATE     0x00    /* truncate */
+
+typedef struct
 {
-	Size		size;
-	int			nchunks;
+	RelFileNode node;			/* truncated relation */
+	BlockNumber nheapblocks;	/* new number of blocks in the heap */
+} xl_fsm_truncate;
 
-	/* table header */
-	size = MAXALIGN(sizeof(FSMHeader));
+/* functions to navigate the tree */
+static FSMAddress fsm_get_child(FSMAddress parent, uint16 slot);
+static FSMAddress fsm_get_parent(FSMAddress child, uint16 *slot);
+static FSMAddress fsm_get_location(BlockNumber heapblk, uint16 *slot);
+static BlockNumber fsm_get_heap_blk(FSMAddress addr, uint16 slot);
+static BlockNumber fsm_logical_to_physical(FSMAddress addr);
 
-	/* hash table, including the FSMRelation objects */
-	size = add_size(size, hash_estimate_size(MaxFSMRelations + 1,
-											 sizeof(FSMRelation)));
+static Buffer fsm_readbuf(Relation rel, FSMAddress addr, bool extend);
+static void fsm_extend(Relation rel, BlockNumber nfsmblocks);
 
-	/* page-storage arena */
-	nchunks = (MaxFSMPages - 1) / CHUNKPAGES + 1;
-	size = add_size(size, mul_size(nchunks, CHUNKBYTES));
+/* functions to convert amount of free space to a FSM category */
+static uint8 fsm_space_avail_to_cat(Size avail);
+static uint8 fsm_space_needed_to_cat(Size needed);
+static Size  fsm_space_cat_to_avail(uint8 cat);
+
+/* workhorse functions for various operations */
+static int fsm_set_and_search(Relation rel, FSMAddress addr, uint16 slot,
+							  uint8 newValue, uint8 minValue);
+static BlockNumber fsm_search(Relation rel, uint8 min_cat);
+static uint8 fsm_vacuum_page(Relation rel, FSMAddress addr, bool *eof);
 
-	return size;
-}
+
+/******** Public API ********/
 
 /*
  * GetPageWithFreeSpace - try to find a page in the given relation with
@@ -262,1608 +140,668 @@ FreeSpaceShmemSize(void)
  * extend the relation.
  */
 BlockNumber
-GetPageWithFreeSpace(RelFileNode *rel, Size spaceNeeded)
+GetPageWithFreeSpace(Relation rel, Size spaceNeeded)
 {
-	FSMRelation *fsmrel;
-	BlockNumber freepage;
-
-	LWLockAcquire(FreeSpaceLock, LW_EXCLUSIVE);
-
-	/*
-	 * We always add a rel to the hashtable when it is inquired about.
-	 */
-	fsmrel = create_fsm_rel(rel);
-
-	/*
-	 * Update the moving average of space requests.  This code implements an
-	 * exponential moving average with an equivalent period of about 63
-	 * requests.  Ignore silly requests, however, to ensure that the average
-	 * stays sane.
-	 */
-	if (spaceNeeded > 0 && spaceNeeded < BLCKSZ)
-	{
-		int			cur_avg = (int) fsmrel->avgRequest;
-
-		cur_avg += ((int) spaceNeeded - cur_avg) / 32;
-		fsmrel->avgRequest = (Size) cur_avg;
-	}
-	freepage = find_free_space(fsmrel, spaceNeeded);
-	LWLockRelease(FreeSpaceLock);
-	return freepage;
+	uint8 min_cat = fsm_space_needed_to_cat(spaceNeeded);
+	return fsm_search(rel, min_cat);
 }
 
 /*
  * RecordAndGetPageWithFreeSpace - update info about a page and try again.
  *
- * We provide this combo form, instead of a separate Record operation,
- * to save one lock and hash table lookup cycle.
+ * We provide this combo form to save some locking overhead, compared to
+ * separate RecordPageWithFreeSpace + GetPageWithFreeSpace calls. There's
+ * also some effort to return a page close to the old page; if there's a
+ * page with enough free space on the same FSM page where the old page
+ * is located, it is preferred.
  */
 BlockNumber
-RecordAndGetPageWithFreeSpace(RelFileNode *rel,
-							  BlockNumber oldPage,
-							  Size oldSpaceAvail,
-							  Size spaceNeeded)
+RecordAndGetPageWithFreeSpace(Relation rel, BlockNumber oldPage,
+							  Size oldSpaceAvail, Size spaceNeeded)
 {
-	FSMRelation *fsmrel;
-	BlockNumber freepage;
+	int			old_cat = fsm_space_avail_to_cat(oldSpaceAvail);
+	int			search_cat = fsm_space_needed_to_cat(spaceNeeded);
+	FSMAddress	addr;
+	uint16		slot;
+	int			search_slot;
 
-	/* Sanity check: ensure spaceAvail will fit into OffsetNumber */
-	AssertArg(oldSpaceAvail < BLCKSZ);
+	/* Get the location of the FSM byte representing the heap block */
+	addr = fsm_get_location(oldPage, &slot);
 
-	LWLockAcquire(FreeSpaceLock, LW_EXCLUSIVE);
+	search_slot = fsm_set_and_search(rel, addr, slot, old_cat, search_cat);
 
 	/*
-	 * We always add a rel to the hashtable when it is inquired about.
+	 * If fsm_set_and_search found a suitable new block, return that.
+	 * Otherwise, search as usual.
 	 */
-	fsmrel = create_fsm_rel(rel);
-
-	/* Do the Record */
-	fsm_record_free_space(fsmrel, oldPage, oldSpaceAvail);
-
-	/*
-	 * Update the moving average of space requests, same as in
-	 * GetPageWithFreeSpace.
-	 */
-	if (spaceNeeded > 0 && spaceNeeded < BLCKSZ)
-	{
-		int			cur_avg = (int) fsmrel->avgRequest;
-
-		cur_avg += ((int) spaceNeeded - cur_avg) / 32;
-		fsmrel->avgRequest = (Size) cur_avg;
-	}
-	/* Do the Get */
-	freepage = find_free_space(fsmrel, spaceNeeded);
-	LWLockRelease(FreeSpaceLock);
-	return freepage;
-}
-
-/*
- * GetAvgFSMRequestSize - get average FSM request size for a relation.
- *
- * If the relation is not known to FSM, return a default value.
- */
-Size
-GetAvgFSMRequestSize(RelFileNode *rel)
-{
-	Size		result;
-	FSMRelation *fsmrel;
-
-	LWLockAcquire(FreeSpaceLock, LW_EXCLUSIVE);
-	fsmrel = lookup_fsm_rel(rel);
-	if (fsmrel)
-		result = fsmrel->avgRequest;
+	if (search_slot != -1)
+		return fsm_get_heap_blk(addr, search_slot);
 	else
-		result = INITIAL_AVERAGE;
-	LWLockRelease(FreeSpaceLock);
-	return result;
+		return fsm_search(rel, search_cat);
 }
 
 /*
- * RecordRelationFreeSpace - record available-space info about a relation.
- *
- * Any pre-existing info about the relation is assumed obsolete and discarded.
- *
- * interestingPages is the total number of pages in the relation that have
- * at least threshold free space; nPages is the number actually reported in
- * pageSpaces[] (may be less --- in particular, callers typically clamp their
- * space usage to MaxFSMPages).
+ * RecordPageWithFreeSpace - update info about a page.
  *
- * The given pageSpaces[] array must be sorted in order by blkno.  Note that
- * the FSM is at liberty to discard some or all of the data.
+ * Note that if the new spaceAvail value is higher than the old value stored
+ * in the FSM, the space might not become visible to searchers until the next
+ * FreeSpaceMapVacuum call, which updates the upper level pages.
  */
 void
-RecordRelationFreeSpace(RelFileNode *rel,
-						BlockNumber interestingPages,
-						int nPages,
-						FSMPageData *pageSpaces)
+RecordPageWithFreeSpace(Relation rel, BlockNumber heapBlk, Size spaceAvail)
 {
-	FSMRelation *fsmrel;
+	int			new_cat = fsm_space_avail_to_cat(spaceAvail);
+	FSMAddress	addr;
+	uint16		slot;
 
-	/* Limit nPages to something sane */
-	if (nPages < 0)
-		nPages = 0;
-	else if (nPages > MaxFSMPages)
-		nPages = MaxFSMPages;
+	/* Get the location of the FSM byte representing the heap block */
+	addr = fsm_get_location(heapBlk, &slot);
 
-	LWLockAcquire(FreeSpaceLock, LW_EXCLUSIVE);
-
-	/*
-	 * Note we don't record info about a relation unless there's already an
-	 * FSM entry for it, implying someone has done GetPageWithFreeSpace for
-	 * it.	Inactive rels thus will not clutter the map simply by being
-	 * vacuumed.
-	 */
-	fsmrel = lookup_fsm_rel(rel);
-	if (fsmrel)
-	{
-		int			curAlloc;
-		int			curAllocPages;
-		FSMPageData *newLocation;
-
-		curAlloc = realloc_fsm_rel(fsmrel, interestingPages, false);
-		curAllocPages = curAlloc * CHUNKPAGES;
-
-		/*
-		 * If the data fits in our current allocation, just copy it; otherwise
-		 * must compress.
-		 */
-		newLocation = (FSMPageData *)
-			(FreeSpaceMap->arena + fsmrel->firstChunk * CHUNKBYTES);
-		if (nPages <= curAllocPages)
-		{
-			int			i;
-
-			for (i = 0; i < nPages; i++)
-			{
-				BlockNumber page = FSMPageGetPageNum(&pageSpaces[i]);
-
-				/* Check caller provides sorted data */
-				if (i > 0 && page <= FSMPageGetPageNum(&pageSpaces[i - 1]))
-					elog(ERROR, "free-space data is not in page order");
-				*newLocation = pageSpaces[i];
-				newLocation++;
-			}
-			fsmrel->storedPages = nPages;
-		}
-		else
-		{
-			pack_incoming_pages(newLocation, curAllocPages,
-								pageSpaces, nPages);
-			fsmrel->storedPages = curAllocPages;
-		}
-	}
-	LWLockRelease(FreeSpaceLock);
+	fsm_set_and_search(rel, addr, slot, new_cat, 0);
 }
 
 /*
- * GetFreeIndexPage - like GetPageWithFreeSpace, but for indexes
+ * GetRecordedFreeSpace - return the amount of free space on a particular page,
+ *		according to the FSM.
  */
-BlockNumber
-GetFreeIndexPage(RelFileNode *rel)
+Size
+GetRecordedFreeSpace(Relation rel, BlockNumber heapBlk)
 {
-	FSMRelation *fsmrel;
-	BlockNumber freepage;
+	FSMAddress	addr;
+	uint16		slot;
+	Buffer		buf;
+	uint8		cat;
 
-	LWLockAcquire(FreeSpaceLock, LW_EXCLUSIVE);
+	/* Get the location of the FSM byte representing the heap block */
+	addr = fsm_get_location(heapBlk, &slot);
 
-	/*
-	 * We always add a rel to the hashtable when it is inquired about.
-	 */
-	fsmrel = create_fsm_rel(rel);
+	buf = fsm_readbuf(rel, addr, false);
+	if (!BufferIsValid(buf))
+		return 0;
+	cat = fsm_get_avail(BufferGetPage(buf), slot);
+	ReleaseBuffer(buf);
 
-	freepage = find_index_free_space(fsmrel);
-	LWLockRelease(FreeSpaceLock);
-	return freepage;
+	return fsm_space_cat_to_avail(cat);
 }
 
 /*
- * RecordIndexFreeSpace - like RecordRelationFreeSpace, but for indexes
+ * FreeSpaceMapTruncateRel - adjust for truncation of a relation.
+ *
+ * The caller must hold AccessExclusiveLock on the relation, to ensure
+ * that other backends receive the relcache invalidation event that this
+ * function sends, before accessing the FSM again.
+ *
+ * nblocks is the new size of the heap.
  */
 void
-RecordIndexFreeSpace(RelFileNode *rel,
-					 BlockNumber interestingPages,
-					 int nPages,
-					 BlockNumber *pages)
+FreeSpaceMapTruncateRel(Relation rel, BlockNumber nblocks)
 {
-	FSMRelation *fsmrel;
+	BlockNumber	new_nfsmblocks;
+	FSMAddress	first_removed_address;
+	uint16		first_removed_slot;
+	Buffer		buf;
 
-	/* Limit nPages to something sane */
-	if (nPages < 0)
-		nPages = 0;
-	else if (nPages > MaxFSMPages)
-		nPages = MaxFSMPages;
+	RelationOpenSmgr(rel);
 
-	LWLockAcquire(FreeSpaceLock, LW_EXCLUSIVE);
+	/* Get the location in the FSM of the first removed heap block */
+	first_removed_address = fsm_get_location(nblocks, &first_removed_slot);
 
 	/*
-	 * Note we don't record info about a relation unless there's already an
-	 * FSM entry for it, implying someone has done GetFreeIndexPage for it.
-	 * Inactive rels thus will not clutter the map simply by being vacuumed.
+	 * Zero out the tail of the last remaining FSM page. If the slot
+	 * representing the first removed heap block is at a page boundary, as
+	 * the first slot on the FSM page that first_removed_address points to,
+	 * we can just truncate that page altogether.
 	 */
-	fsmrel = lookup_fsm_rel(rel);
-	if (fsmrel)
+	if (first_removed_slot > 0)
 	{
-		int			curAlloc;
-		int			curAllocPages;
-		int			i;
-		IndexFSMPageData *newLocation;
+		buf = fsm_readbuf(rel, first_removed_address, false);
+		if (!BufferIsValid(buf))
+			return; /* nothing to do; the FSM was already smaller */
+		LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
+		fsm_truncate_avail(BufferGetPage(buf), first_removed_slot);
+		MarkBufferDirty(buf);
+		UnlockReleaseBuffer(buf);
 
-		curAlloc = realloc_fsm_rel(fsmrel, interestingPages, true);
-		curAllocPages = curAlloc * INDEXCHUNKPAGES;
+		new_nfsmblocks = fsm_logical_to_physical(first_removed_address) + 1;
+	}
+	else
+	{
+		new_nfsmblocks = fsm_logical_to_physical(first_removed_address);
+		if (smgrnblocks(rel->rd_smgr, FSM_FORKNUM) <= new_nfsmblocks)
+			return; /* nothing to do; the FSM was already smaller */
+	}
 
-		/*
-		 * If the data fits in our current allocation, just copy it; otherwise
-		 * must compress.  But compression is easy: we merely forget extra
-		 * pages.
-		 */
-		newLocation = (IndexFSMPageData *)
-			(FreeSpaceMap->arena + fsmrel->firstChunk * CHUNKBYTES);
-		if (nPages > curAllocPages)
-			nPages = curAllocPages;
+	/* Truncate the unused FSM pages */
+	smgrtruncate(rel->rd_smgr, FSM_FORKNUM, new_nfsmblocks, rel->rd_istemp);
 
-		for (i = 0; i < nPages; i++)
-		{
-			BlockNumber page = pages[i];
+	/*
+	 * FSM truncations are WAL-logged, because we must never return a block
+	 * that doesn't exist in the heap, not even if we crash before the FSM
+	 * truncation has made it to disk. smgrtruncate() writes its own WAL
+	 * record, but that's not enough to zero out the last remaining FSM page.
+	 * (if we didn't need to zero out anything above, we can skip this)
+	 */
+	if (!rel->rd_istemp && !InRecovery && first_removed_slot != 0)
+	{
+		xl_fsm_truncate xlrec;
+		XLogRecData		rdata;
+		XLogRecPtr		recptr;
 
-			/* Check caller provides sorted data */
-			if (i > 0 && page <= pages[i - 1])
-				elog(ERROR, "free-space data is not in page order");
-			IndexFSMPageSetPageNum(newLocation, page);
-			newLocation++;
-		}
-		fsmrel->storedPages = nPages;
-	}
-	LWLockRelease(FreeSpaceLock);
-}
+		xlrec.node = rel->rd_node;
+		xlrec.nheapblocks = nblocks;
 
-/*
- * FreeSpaceMapTruncateRel - adjust for truncation of a relation.
- *
- * We need to delete any stored data past the new relation length, so that
- * we don't bogusly return removed block numbers.
- */
-void
-FreeSpaceMapTruncateRel(RelFileNode *rel, BlockNumber nblocks)
-{
-	FSMRelation *fsmrel;
+		rdata.data = (char *) &xlrec;
+		rdata.len = sizeof(xl_fsm_truncate);
+		rdata.buffer = InvalidBuffer;
+		rdata.next = NULL;
 
-	LWLockAcquire(FreeSpaceLock, LW_EXCLUSIVE);
-	fsmrel = lookup_fsm_rel(rel);
-	if (fsmrel)
-	{
-		int			pageIndex;
+		recptr = XLogInsert(RM_FREESPACE_ID, XLOG_FSM_TRUNCATE, &rdata);
 
-		/* Use lookup to locate first entry >= nblocks */
-		(void) lookup_fsm_page_entry(fsmrel, nblocks, &pageIndex);
-		/* Delete all such entries */
-		fsmrel->storedPages = pageIndex;
-		/* XXX should we adjust rel's interestingPages and sumRequests? */
+		/*
+		 * Flush, because otherwise the truncation of the main relation
+		 * might hit the disk before the WAL record of truncating the
+		 * FSM is flushed. If we crashed during that window, we'd be
+		 * left with a truncated heap, without a truncated FSM.
+		 */
+		XLogFlush(recptr);
 	}
-	LWLockRelease(FreeSpaceLock);
+
+	/*
+	 * Need to invalidate the relcache entry, because rd_fsm_nblocks_cache
+	 * seen by other backends is no longer valid.
+	 */
+	if (!InRecovery)
+		CacheInvalidateRelcache(rel);
+	rel->rd_fsm_nblocks_cache = new_nfsmblocks;
 }
 
 /*
- * FreeSpaceMapForgetRel - forget all about a relation.
- *
- * This is called when a relation is deleted.  Although we could just let
- * the rel age out of the map, it's better to reclaim and reuse the space
- * sooner.
+ * FreeSpaceMapVacuum - scan and fix any inconsistencies in the FSM
  */
 void
-FreeSpaceMapForgetRel(RelFileNode *rel)
+FreeSpaceMapVacuum(Relation rel)
 {
-	FSMRelation *fsmrel;
+	bool dummy;
 
-	LWLockAcquire(FreeSpaceLock, LW_EXCLUSIVE);
-	fsmrel = lookup_fsm_rel(rel);
-	if (fsmrel)
-		delete_fsm_rel(fsmrel);
-	LWLockRelease(FreeSpaceLock);
+	/*
+	 * Traverse the tree in depth-first order. The tree is stored physically
+	 * in depth-first order, so this should be pretty I/O efficient.
+	 */
+	fsm_vacuum_page(rel, FSM_ROOT_ADDRESS, &dummy);
 }
 
+/******** Internal routines ********/
+
 /*
- * FreeSpaceMapForgetDatabase - forget all relations of a database.
- *
- * This is called during DROP DATABASE.  As above, might as well reclaim
- * map space sooner instead of later.
+ * Return category corresponding to x bytes of free space
  */
-void
-FreeSpaceMapForgetDatabase(Oid dbid)
+static uint8
+fsm_space_avail_to_cat(Size avail)
 {
-	FSMRelation *fsmrel,
-			   *nextrel;
+	int cat;
 
-	LWLockAcquire(FreeSpaceLock, LW_EXCLUSIVE);
-	for (fsmrel = FreeSpaceMap->usageList; fsmrel; fsmrel = nextrel)
-	{
-		nextrel = fsmrel->nextUsage;	/* in case we delete it */
-		if (fsmrel->key.dbNode == dbid)
-			delete_fsm_rel(fsmrel);
-	}
-	LWLockRelease(FreeSpaceLock);
-}
+	Assert(avail < BLCKSZ);
 
-/*
- * PrintFreeSpaceMapStatistics - print statistics about FSM contents
- *
- * The info is sent to ereport() with the specified message level.	This is
- * intended for use during VACUUM.
- */
-void
-PrintFreeSpaceMapStatistics(int elevel)
-{
-	FSMRelation *fsmrel;
-	int			storedPages = 0;
-	double		sumRequests = 0;
-	int			numRels;
-	double		needed;
+	if (avail >= MaxFSMRequestSize)
+		return 255;
 
-	LWLockAcquire(FreeSpaceLock, LW_EXCLUSIVE);
+	cat = avail / FSM_CAT_STEP;
 
 	/*
-	 * Count total space actually used, as well as the unclamped request total
+	 * The highest category, 255, is reserved for MaxFSMRequestSize bytes or
+	 * more.
 	 */
-	for (fsmrel = FreeSpaceMap->firstRel;
-		 fsmrel != NULL;
-		 fsmrel = fsmrel->nextPhysical)
-	{
-		storedPages += fsmrel->storedPages;
-		sumRequests += fsm_calc_request_unclamped(fsmrel);
-	}
+	if (cat > 254)
+		cat = 254;
 
-	/* Copy other stats before dropping lock */
-	numRels = FreeSpaceMap->numRels;
-	LWLockRelease(FreeSpaceLock);
-
-	/* Convert stats to actual number of page slots needed */
-	needed = (sumRequests + numRels) * CHUNKPAGES;
-
-	ereport(elevel,
-			(errmsg("free space map contains %d pages in %d relations",
-					storedPages, numRels),
-	errdetail("A total of %.0f page slots are in use (including overhead).\n"
-			  "%.0f page slots are required to track all free space.\n"
-		  "Current limits are:  %d page slots, %d relations, using %.0f kB.",
-			  Min(needed, MaxFSMPages),
-			  needed,
-			  MaxFSMPages, MaxFSMRelations,
-			  (double) FreeSpaceShmemSize() / 1024.0)));
-
-	CheckFreeSpaceMapStatistics(NOTICE, numRels, needed);
-	/* Print to server logs too because is deals with a config variable. */
-	CheckFreeSpaceMapStatistics(LOG, numRels, needed);
+	return (uint8) cat;
 }
 
-static void
-CheckFreeSpaceMapStatistics(int elevel, int numRels, double needed)
+/*
+ * Return the lower bound of the range of free space represented by given
+ * category.
+ */
+static Size
+fsm_space_cat_to_avail(uint8 cat)
 {
-	if (numRels == MaxFSMRelations)
-		ereport(elevel,
-				(errmsg("max_fsm_relations(%d) equals the number of relations checked",
-						MaxFSMRelations),
-				 errhint("You have at least %d relations.  "
-						 "Consider increasing the configuration parameter \"max_fsm_relations\".",
-						 numRels)));
-	else if (needed > MaxFSMPages)
-		ereport(elevel,
-				(errmsg("number of page slots needed (%.0f) exceeds max_fsm_pages (%d)",
-						needed, MaxFSMPages),
-				 errhint("Consider increasing the configuration parameter \"max_fsm_pages\" "
-						 "to a value over %.0f.", needed)));
+	/* The highest category represents exactly MaxFSMRequestSize bytes. */
+	if (cat == 255)
+		return MaxFSMRequestSize;
+	else
+		return cat * FSM_CAT_STEP;
 }
 
 /*
- * DumpFreeSpaceMap - dump contents of FSM into a disk file for later reload
- *
- * This is expected to be called during database shutdown, after updates to
- * the FSM have stopped.  We lock the FreeSpaceLock but that's purely pro
- * forma --- if anyone else is still accessing FSM, there's a problem.
+ * Which category does a page need to have, to accommodate x bytes of data?
+ * While fsm_space_avail_to_cat() rounds down, this needs to round up.
  */
-void
-DumpFreeSpaceMap(int code, Datum arg)
+static uint8
+fsm_space_needed_to_cat(Size needed)
 {
-	FILE	   *fp;
-	FsmCacheFileHeader header;
-	FSMRelation *fsmrel;
-
-	/* Try to create file */
-	unlink(FSM_CACHE_FILENAME); /* in case it exists w/wrong permissions */
-
-	fp = AllocateFile(FSM_CACHE_FILENAME, PG_BINARY_W);
-	if (fp == NULL)
-	{
-		elog(LOG, "could not write \"%s\": %m", FSM_CACHE_FILENAME);
-		return;
-	}
-
-	LWLockAcquire(FreeSpaceLock, LW_EXCLUSIVE);
-
-	/* Write file header */
-	MemSet(&header, 0, sizeof(header));
-	strcpy(header.label, FSM_CACHE_LABEL);
-	header.endian = FSM_CACHE_ENDIAN;
-	header.version = FSM_CACHE_VERSION;
-	header.numRels = FreeSpaceMap->numRels;
-	if (fwrite(&header, 1, sizeof(header), fp) != sizeof(header))
-		goto write_failed;
-
-	/* For each relation, in order from least to most recently used... */
-	for (fsmrel = FreeSpaceMap->usageListTail;
-		 fsmrel != NULL;
-		 fsmrel = fsmrel->priorUsage)
-	{
-		FsmCacheRelHeader relheader;
-		int			nPages;
-
-		/* Write relation header */
-		MemSet(&relheader, 0, sizeof(relheader));
-		relheader.key = fsmrel->key;
-		relheader.isIndex = fsmrel->isIndex;
-		relheader.avgRequest = fsmrel->avgRequest;
-		relheader.interestingPages = fsmrel->interestingPages;
-		relheader.storedPages = fsmrel->storedPages;
-		if (fwrite(&relheader, 1, sizeof(relheader), fp) != sizeof(relheader))
-			goto write_failed;
-
-		/* Write the per-page data directly from the arena */
-		nPages = fsmrel->storedPages;
-		if (nPages > 0)
-		{
-			Size		len;
-			char	   *data;
-
-			if (fsmrel->isIndex)
-				len = nPages * sizeof(IndexFSMPageData);
-			else
-				len = nPages * sizeof(FSMPageData);
-			data = (char *)
-				(FreeSpaceMap->arena + fsmrel->firstChunk * CHUNKBYTES);
-			if (fwrite(data, 1, len, fp) != len)
-				goto write_failed;
-		}
-	}
-
-	/* Clean up */
-	LWLockRelease(FreeSpaceLock);
-
-	if (FreeFile(fp))
-	{
-		elog(LOG, "could not write \"%s\": %m", FSM_CACHE_FILENAME);
-		/* Remove busted cache file */
-		unlink(FSM_CACHE_FILENAME);
-	}
+	int cat;
 
-	return;
+	/* Can't ask for more space than the highest category represents */
+	if (needed > MaxFSMRequestSize)
+		elog(ERROR, "invalid FSM request size %lu", (unsigned long) needed);
 
-write_failed:
-	elog(LOG, "could not write \"%s\": %m", FSM_CACHE_FILENAME);
+	if (needed == 0)
+		return 1;
 
-	/* Clean up */
-	LWLockRelease(FreeSpaceLock);
+	cat = (needed + FSM_CAT_STEP - 1) / FSM_CAT_STEP;	/* ceiling division */
 
-	FreeFile(fp);
+	if (cat > 255)
+		cat = 255;				/* clamp to what fits in a uint8 */
 
-	/* Remove busted cache file */
-	unlink(FSM_CACHE_FILENAME);
+	return (uint8) cat;
 }
 
 /*
- * LoadFreeSpaceMap - load contents of FSM from a disk file
- *
- * This is expected to be called during database startup, before any FSM
- * updates begin.  We lock the FreeSpaceLock but that's purely pro
- * forma --- if anyone else is accessing FSM yet, there's a problem.
- *
- * Notes: no complaint is issued if no cache file is found.  If the file is
- * found, it is deleted after reading.	Thus, if we crash without a clean
- * shutdown, the next cycle of life starts with no FSM data.  To do otherwise,
- * we'd need to do significantly more validation in this routine, because of
- * the likelihood that what is in the dump file would be out-of-date, eg
- * there might be entries for deleted or truncated rels.
+ * Returns the physical block number of an FSM page
  */
-void
-LoadFreeSpaceMap(void)
+static BlockNumber
+fsm_logical_to_physical(FSMAddress addr)
 {
-	FILE	   *fp;
-	FsmCacheFileHeader header;
-	int			relno;
-
-	/* Try to open file */
-	fp = AllocateFile(FSM_CACHE_FILENAME, PG_BINARY_R);
-	if (fp == NULL)
-	{
-		if (errno != ENOENT)
-			elog(LOG, "could not read \"%s\": %m", FSM_CACHE_FILENAME);
-		return;
-	}
+	BlockNumber pages;
+	int leafno;
+	int l;
 
-	LWLockAcquire(FreeSpaceLock, LW_EXCLUSIVE);
+	/*
+	 * Calculate the logical page number of the first leaf page below the
+	 * given page.
+	 */
+	leafno = addr.logpageno;
+	for (l = 0; l < addr.level; l++)
+		leafno *= SlotsPerFSMPage;
 
-	/* Read and verify file header */
-	if (fread(&header, 1, sizeof(header), fp) != sizeof(header) ||
-		strcmp(header.label, FSM_CACHE_LABEL) != 0 ||
-		header.endian != FSM_CACHE_ENDIAN ||
-		header.version != FSM_CACHE_VERSION ||
-		header.numRels < 0)
+	/* Count upper level nodes required to address the leaf page */
+	pages = 0;
+	for (l = 0; l < FSM_TREE_DEPTH; l++)
 	{
-		elog(LOG, "bogus file header in \"%s\"", FSM_CACHE_FILENAME);
-		goto read_failed;
+		pages += leafno + 1;
+		leafno /= SlotsPerFSMPage;
 	}
 
-	/* For each relation, in order from least to most recently used... */
-	for (relno = 0; relno < header.numRels; relno++)
-	{
-		FsmCacheRelHeader relheader;
-		Size		len;
-		char	   *data;
-		FSMRelation *fsmrel;
-		int			nPages;
-		int			curAlloc;
-		int			curAllocPages;
-
-		/* Read and verify relation header, as best we can */
-		if (fread(&relheader, 1, sizeof(relheader), fp) != sizeof(relheader) ||
-			(relheader.isIndex != false && relheader.isIndex != true) ||
-			relheader.avgRequest >= BLCKSZ ||
-			relheader.storedPages < 0)
-		{
-			elog(LOG, "bogus rel header in \"%s\"", FSM_CACHE_FILENAME);
-			goto read_failed;
-		}
-
-		/* Read the per-page data */
-		nPages = relheader.storedPages;
-		if (relheader.isIndex)
-			len = nPages * sizeof(IndexFSMPageData);
-		else
-			len = nPages * sizeof(FSMPageData);
-		data = (char *) palloc(len);
-		if (fread(data, 1, len, fp) != len)
-		{
-			elog(LOG, "premature EOF in \"%s\"", FSM_CACHE_FILENAME);
-			pfree(data);
-			goto read_failed;
-		}
-
-		/*
-		 * Okay, create the FSM entry and insert data into it.	Since the rels
-		 * were stored in reverse usage order, at the end of the loop they
-		 * will be correctly usage-ordered in memory; and if MaxFSMRelations
-		 * is less than it used to be, we will correctly drop the least
-		 * recently used ones.
-		 */
-		fsmrel = create_fsm_rel(&relheader.key);
-		fsmrel->avgRequest = relheader.avgRequest;
-
-		curAlloc = realloc_fsm_rel(fsmrel, relheader.interestingPages,
-								   relheader.isIndex);
-		if (relheader.isIndex)
-		{
-			IndexFSMPageData *newLocation;
-
-			curAllocPages = curAlloc * INDEXCHUNKPAGES;
-
-			/*
-			 * If the data fits in our current allocation, just copy it;
-			 * otherwise must compress.  But compression is easy: we merely
-			 * forget extra pages.
-			 */
-			newLocation = (IndexFSMPageData *)
-				(FreeSpaceMap->arena + fsmrel->firstChunk * CHUNKBYTES);
-			if (nPages > curAllocPages)
-				nPages = curAllocPages;
-			memcpy(newLocation, data, nPages * sizeof(IndexFSMPageData));
-			fsmrel->storedPages = nPages;
-		}
-		else
-		{
-			FSMPageData *newLocation;
-
-			curAllocPages = curAlloc * CHUNKPAGES;
-
-			/*
-			 * If the data fits in our current allocation, just copy it;
-			 * otherwise must compress.
-			 */
-			newLocation = (FSMPageData *)
-				(FreeSpaceMap->arena + fsmrel->firstChunk * CHUNKBYTES);
-			if (nPages <= curAllocPages)
-			{
-				memcpy(newLocation, data, nPages * sizeof(FSMPageData));
-				fsmrel->storedPages = nPages;
-			}
-			else
-			{
-				pack_existing_pages(newLocation, curAllocPages,
-									(FSMPageData *) data, nPages);
-				fsmrel->storedPages = curAllocPages;
-			}
-		}
-
-		pfree(data);
-	}
-
-read_failed:
-
-	/* Clean up */
-	LWLockRelease(FreeSpaceLock);
-
-	FreeFile(fp);
+	/*
+	 * If the page we were asked for wasn't at the bottom level, subtract
+	 * the additional lower level pages we counted above.
+	 */
+	pages -= addr.level;
 
-	/* Remove cache file before it can become stale; see notes above */
-	unlink(FSM_CACHE_FILENAME);
+	/* Turn the page count into 0-based block number */
+	return pages - 1;
 }
 
-
-/*
- * Internal routines.  These all assume the caller holds the FreeSpaceLock.
- */
-
 /*
- * Lookup a relation in the hash table.  If not present, return NULL.
- *
- * The relation's position in the LRU list is not changed.
+ * Return the FSM location corresponding to given heap block.
  */
-static FSMRelation *
-lookup_fsm_rel(RelFileNode *rel)
+static FSMAddress
+fsm_get_location(BlockNumber heapblk, uint16 *slot)
 {
-	FSMRelation *fsmrel;
+	FSMAddress addr;
 
-	fsmrel = (FSMRelation *) hash_search(FreeSpaceMapRelHash,
-										 (void *) rel,
-										 HASH_FIND,
-										 NULL);
-	if (!fsmrel)
-		return NULL;
+	addr.level = FSM_BOTTOM_LEVEL;
+	addr.logpageno = heapblk / SlotsPerFSMPage;
+	*slot = heapblk % SlotsPerFSMPage;
 
-	return fsmrel;
+	return addr;
 }
 
 /*
- * Lookup a relation in the hash table, creating an entry if not present.
- *
- * On successful lookup, the relation is moved to the front of the LRU list.
+ * Return the heap block number corresponding to given location in the FSM.
  */
-static FSMRelation *
-create_fsm_rel(RelFileNode *rel)
+static BlockNumber
+fsm_get_heap_blk(FSMAddress addr, uint16 slot)
 {
-	FSMRelation *fsmrel;
-	bool		found;
-
-	fsmrel = (FSMRelation *) hash_search(FreeSpaceMapRelHash,
-										 (void *) rel,
-										 HASH_ENTER,
-										 &found);
-
-	if (!found)
-	{
-		/* New hashtable entry, initialize it (hash_search set the key) */
-		fsmrel->isIndex = false;	/* until we learn different */
-		fsmrel->avgRequest = INITIAL_AVERAGE;
-		fsmrel->interestingPages = 0;
-		fsmrel->firstChunk = -1;	/* no space allocated */
-		fsmrel->storedPages = 0;
-		fsmrel->nextPage = 0;
-
-		/* Discard lowest-priority existing rel, if we are over limit */
-		if (FreeSpaceMap->numRels >= MaxFSMRelations)
-			delete_fsm_rel(FreeSpaceMap->usageListTail);
-
-		/* Add new entry at front of LRU list */
-		link_fsm_rel_usage(fsmrel);
-		fsmrel->nextPhysical = NULL;	/* not in physical-storage list */
-		fsmrel->priorPhysical = NULL;
-		FreeSpaceMap->numRels++;
-		/* sumRequests is unchanged because request must be zero */
-	}
-	else
-	{
-		/* Existing entry, move to front of LRU list */
-		if (fsmrel->priorUsage != NULL)
-		{
-			unlink_fsm_rel_usage(fsmrel);
-			link_fsm_rel_usage(fsmrel);
-		}
-	}
-
-	return fsmrel;
+	Assert(addr.level == FSM_BOTTOM_LEVEL);
+	return ((unsigned int) addr.logpageno) * SlotsPerFSMPage + slot;
 }
 
 /*
- * Remove an existing FSMRelation entry.
+ * Given a logical address of a child page, get the logical address of the
+ * parent, and the slot within the parent corresponding to the child.
  */
-static void
-delete_fsm_rel(FSMRelation *fsmrel)
+static FSMAddress
+fsm_get_parent(FSMAddress child, uint16 *slot)
 {
-	FSMRelation *result;
-
-	FreeSpaceMap->sumRequests -= fsm_calc_request(fsmrel);
-	unlink_fsm_rel_usage(fsmrel);
-	unlink_fsm_rel_storage(fsmrel);
-	FreeSpaceMap->numRels--;
-	result = (FSMRelation *) hash_search(FreeSpaceMapRelHash,
-										 (void *) &(fsmrel->key),
-										 HASH_REMOVE,
-										 NULL);
-	if (!result)
-		elog(ERROR, "FreeSpaceMap hashtable corrupted");
-}
+	FSMAddress parent;
 
-/*
- * Reallocate space for a FSMRelation.
- *
- * This is shared code for RecordRelationFreeSpace and RecordIndexFreeSpace.
- * The return value is the actual new allocation, in chunks.
- */
-static int
-realloc_fsm_rel(FSMRelation *fsmrel, BlockNumber interestingPages,
-				bool isIndex)
-{
-	int			myRequest;
-	int			myAlloc;
-	int			curAlloc;
+	Assert(child.level < FSM_ROOT_LEVEL);
 
-	/*
-	 * Delete any existing entries, and update request status.
-	 */
-	fsmrel->storedPages = 0;
-	FreeSpaceMap->sumRequests -= fsm_calc_request(fsmrel);
-	fsmrel->interestingPages = interestingPages;
-	fsmrel->isIndex = isIndex;
-	myRequest = fsm_calc_request(fsmrel);
-	FreeSpaceMap->sumRequests += myRequest;
-	myAlloc = fsm_calc_target_allocation(myRequest);
+	parent.level = child.level + 1;
+	parent.logpageno = child.logpageno / SlotsPerFSMPage;
+	*slot = child.logpageno % SlotsPerFSMPage;
 
-	/*
-	 * Need to reallocate space if (a) my target allocation is more than my
-	 * current allocation, AND (b) my actual immediate need (myRequest+1
-	 * chunks) is more than my current allocation. Otherwise just store the
-	 * new data in-place.
-	 */
-	curAlloc = fsm_current_allocation(fsmrel);
-	if (myAlloc > curAlloc && (myRequest + 1) > curAlloc && interestingPages > 0)
-	{
-		/* Remove entry from storage list, and compact */
-		unlink_fsm_rel_storage(fsmrel);
-		compact_fsm_storage();
-		/* Reattach to end of storage list */
-		link_fsm_rel_storage(fsmrel);
-		/* And allocate storage */
-		fsmrel->firstChunk = FreeSpaceMap->usedChunks;
-		FreeSpaceMap->usedChunks += myAlloc;
-		curAlloc = myAlloc;
-		/* Watch out for roundoff error */
-		if (FreeSpaceMap->usedChunks > FreeSpaceMap->totalChunks)
-		{
-			FreeSpaceMap->usedChunks = FreeSpaceMap->totalChunks;
-			curAlloc = FreeSpaceMap->totalChunks - fsmrel->firstChunk;
-		}
-	}
-	return curAlloc;
+	return parent;
 }
 
 /*
- * Link a FSMRelation into the LRU list (always at the head).
+ * Given a logical address of a parent page and a slot number, get the
+ * logical address of the corresponding child page.
  */
-static void
-link_fsm_rel_usage(FSMRelation *fsmrel)
+static FSMAddress
+fsm_get_child(FSMAddress parent, uint16 slot)
 {
-	fsmrel->priorUsage = NULL;
-	fsmrel->nextUsage = FreeSpaceMap->usageList;
-	FreeSpaceMap->usageList = fsmrel;
-	if (fsmrel->nextUsage != NULL)
-		fsmrel->nextUsage->priorUsage = fsmrel;
-	else
-		FreeSpaceMap->usageListTail = fsmrel;
-}
+	FSMAddress child;
 
-/*
- * Delink a FSMRelation from the LRU list.
- */
-static void
-unlink_fsm_rel_usage(FSMRelation *fsmrel)
-{
-	if (fsmrel->priorUsage != NULL)
-		fsmrel->priorUsage->nextUsage = fsmrel->nextUsage;
-	else
-		FreeSpaceMap->usageList = fsmrel->nextUsage;
-	if (fsmrel->nextUsage != NULL)
-		fsmrel->nextUsage->priorUsage = fsmrel->priorUsage;
-	else
-		FreeSpaceMap->usageListTail = fsmrel->priorUsage;
+	Assert(parent.level > FSM_BOTTOM_LEVEL);
 
-	/*
-	 * We don't bother resetting fsmrel's links, since it's about to be
-	 * deleted or relinked at the head.
-	 */
-}
+	child.level = parent.level - 1;
+	child.logpageno = parent.logpageno * SlotsPerFSMPage + slot;
 
-/*
- * Link a FSMRelation into the storage-order list (always at the tail).
- */
-static void
-link_fsm_rel_storage(FSMRelation *fsmrel)
-{
-	fsmrel->nextPhysical = NULL;
-	fsmrel->priorPhysical = FreeSpaceMap->lastRel;
-	if (FreeSpaceMap->lastRel != NULL)
-		FreeSpaceMap->lastRel->nextPhysical = fsmrel;
-	else
-		FreeSpaceMap->firstRel = fsmrel;
-	FreeSpaceMap->lastRel = fsmrel;
+	return child;
 }
 
 /*
- * Delink a FSMRelation from the storage-order list, if it's in it.
+ * Read an FSM page.
+ *
+ * If the page lies beyond the current end of the FSM fork, the fork is
+ * extended when 'extend' is true; otherwise InvalidBuffer is returned.
  */
-static void
-unlink_fsm_rel_storage(FSMRelation *fsmrel)
+static Buffer
+fsm_readbuf(Relation rel, FSMAddress addr, bool extend)
 {
-	if (fsmrel->priorPhysical != NULL || FreeSpaceMap->firstRel == fsmrel)
+	BlockNumber blkno = fsm_logical_to_physical(addr);
+
+	RelationOpenSmgr(rel);
+
+	if (rel->rd_fsm_nblocks_cache == InvalidBlockNumber ||
+		rel->rd_fsm_nblocks_cache <= blkno)
+		rel->rd_fsm_nblocks_cache = smgrnblocks(rel->rd_smgr, FSM_FORKNUM);
+
+	if (blkno >= rel->rd_fsm_nblocks_cache)
 	{
-		if (fsmrel->priorPhysical != NULL)
-			fsmrel->priorPhysical->nextPhysical = fsmrel->nextPhysical;
+		if (extend)
+			fsm_extend(rel, blkno + 1);
 		else
-			FreeSpaceMap->firstRel = fsmrel->nextPhysical;
-		if (fsmrel->nextPhysical != NULL)
-			fsmrel->nextPhysical->priorPhysical = fsmrel->priorPhysical;
-		else
-			FreeSpaceMap->lastRel = fsmrel->priorPhysical;
+			return InvalidBuffer;	/* page is past the end of the FSM fork */
 	}
-	/* mark as not in list, since we may not put it back immediately */
-	fsmrel->nextPhysical = NULL;
-	fsmrel->priorPhysical = NULL;
-	/* Also mark it as having no storage */
-	fsmrel->firstChunk = -1;
-	fsmrel->storedPages = 0;
+	return ReadBufferWithFork(rel, FSM_FORKNUM, blkno);
 }
 
 /*
- * Look to see if a page with at least the specified amount of space is
- * available in the given FSMRelation.	If so, return its page number,
- * and advance the nextPage counter so that the next inquiry will return
- * a different page if possible; also update the entry to show that the
- * requested space is not available anymore.  Return InvalidBlockNumber
- * if no success.
+ * Ensure that the FSM fork is at least n_fsmblocks long, extending
+ * it if necessary with empty pages. And by empty, I mean pages filled
+ * with zeros, meaning there's no free space.
  */
-static BlockNumber
-find_free_space(FSMRelation *fsmrel, Size spaceNeeded)
+static void
+fsm_extend(Relation rel, BlockNumber n_fsmblocks)
 {
-	FSMPageData *info;
-	int			pagesToCheck,	/* outer loop counter */
-				pageIndex;		/* current page index */
-
-	if (fsmrel->isIndex)
-		elog(ERROR, "find_free_space called for an index relation");
-	info = (FSMPageData *)
-		(FreeSpaceMap->arena + fsmrel->firstChunk * CHUNKBYTES);
-	pageIndex = fsmrel->nextPage;
-	/* Last operation may have left nextPage pointing past end */
-	if (pageIndex >= fsmrel->storedPages)
-		pageIndex = 0;
-
-	for (pagesToCheck = fsmrel->storedPages; pagesToCheck > 0; pagesToCheck--)
-	{
-		FSMPageData *page = info + pageIndex;
-		Size		spaceAvail = FSMPageGetSpace(page);
-
-		/* Check this page */
-		if (spaceAvail >= spaceNeeded)
-		{
-			/*
-			 * Found what we want --- adjust the entry, and update nextPage.
-			 */
-			FSMPageSetSpace(page, spaceAvail - spaceNeeded);
-			fsmrel->nextPage = pageIndex + 1;
-			return FSMPageGetPageNum(page);
-		}
-		/* Advance pageIndex, wrapping around if needed */
-		if (++pageIndex >= fsmrel->storedPages)
-			pageIndex = 0;
-	}
+	BlockNumber n_fsmblocks_now;
+	Page pg;
 
-	return InvalidBlockNumber;	/* nothing found */
-}
-
-/*
- * As above, but for index case --- we only deal in whole pages.
- */
-static BlockNumber
-find_index_free_space(FSMRelation *fsmrel)
-{
-	IndexFSMPageData *info;
-	BlockNumber result;
+	pg = (Page) palloc(BLCKSZ);
+	PageInit(pg, BLCKSZ, 0);
 
 	/*
-	 * If isIndex isn't set, it could be that RecordIndexFreeSpace() has never
-	 * yet been called on this relation, and we're still looking at the
-	 * default setting from create_fsm_rel().  If so, just act as though
-	 * there's no space.
+	 * We use the relation extension lock to lock out other backends
+	 * trying to extend the FSM at the same time. It also locks out
+	 * extension of the main fork, unnecessarily, but extending the
+	 * FSM happens seldom enough that it doesn't seem worthwhile to
+	 * have a separate lock tag type for it.
+	 *
+	 * Note that another backend might have extended the relation
+	 * before we get the lock.
 	 */
-	if (!fsmrel->isIndex)
+	LockRelationForExtension(rel, ExclusiveLock);
+
+	n_fsmblocks_now = smgrnblocks(rel->rd_smgr, FSM_FORKNUM);
+	while (n_fsmblocks_now < n_fsmblocks)
 	{
-		if (fsmrel->storedPages == 0)
-			return InvalidBlockNumber;
-		elog(ERROR, "find_index_free_space called for a non-index relation");
+		smgrextend(rel->rd_smgr, FSM_FORKNUM, n_fsmblocks_now,
+				   (char *) pg, rel->rd_istemp);
+		n_fsmblocks_now++;
 	}
 
-	/*
-	 * For indexes, there's no need for the nextPage state variable; we just
-	 * remove and return the first available page.	(We could save cycles here
-	 * by returning the last page, but it seems better to encourage re-use of
-	 * lower-numbered pages.)
-	 */
-	if (fsmrel->storedPages <= 0)
-		return InvalidBlockNumber;		/* no pages available */
-	info = (IndexFSMPageData *)
-		(FreeSpaceMap->arena + fsmrel->firstChunk * CHUNKBYTES);
-	result = IndexFSMPageGetPageNum(info);
-	fsmrel->storedPages--;
-	memmove(info, info + 1, fsmrel->storedPages * sizeof(IndexFSMPageData));
-	return result;
-}
-
-/*
- * fsm_record_free_space - guts of RecordFreeSpace operation (now only
- * provided as part of RecordAndGetPageWithFreeSpace).
- */
-static void
-fsm_record_free_space(FSMRelation *fsmrel, BlockNumber page, Size spaceAvail)
-{
-	int			pageIndex;
+	UnlockRelationForExtension(rel, ExclusiveLock);
 
-	if (fsmrel->isIndex)
-		elog(ERROR, "fsm_record_free_space called for an index relation");
-	if (lookup_fsm_page_entry(fsmrel, page, &pageIndex))
-	{
-		/* Found an existing entry for page; update it */
-		FSMPageData *info;
+	pfree(pg);
 
-		info = (FSMPageData *)
-			(FreeSpaceMap->arena + fsmrel->firstChunk * CHUNKBYTES);
-		info += pageIndex;
-		FSMPageSetSpace(info, spaceAvail);
-	}
-	else
-	{
-		/*
-		 * No existing entry; ignore the call.	We used to add the page to the
-		 * FSM --- but in practice, if the page hasn't got enough space to
-		 * satisfy the caller who's kicking it back to us, then it's probably
-		 * uninteresting to everyone else as well.
-		 */
-	}
+	/* update the cache with the up-to-date size */
+	rel->rd_fsm_nblocks_cache = n_fsmblocks_now;
 }
 
 /*
- * Look for an entry for a specific page (block number) in a FSMRelation.
- * Returns TRUE if a matching entry exists, else FALSE.
+ * Set value in given FSM page and slot.
  *
- * The output argument *outPageIndex is set to indicate where the entry exists
- * (if TRUE result) or could be inserted (if FALSE result).
+ * If minValue > 0, the updated page is also searched for a page with at
+ * least minValue of free space. If one is found, its slot number is
+ * returned, -1 otherwise.
  */
-static bool
-lookup_fsm_page_entry(FSMRelation *fsmrel, BlockNumber page,
-					  int *outPageIndex)
+static int
+fsm_set_and_search(Relation rel, FSMAddress addr, uint16 slot,
+				   uint8 newValue, uint8 minValue)
 {
-	/* Check for empty relation */
-	if (fsmrel->storedPages <= 0)
-	{
-		*outPageIndex = 0;
-		return false;
-	}
+	Buffer		buf;
+	Page		page;
+	int			newslot = -1;
 
-	/* Do binary search */
-	if (fsmrel->isIndex)
-	{
-		IndexFSMPageData *info;
-		int			low,
-					high;
-
-		info = (IndexFSMPageData *)
-			(FreeSpaceMap->arena + fsmrel->firstChunk * CHUNKBYTES);
-		low = 0;
-		high = fsmrel->storedPages - 1;
-		while (low <= high)
-		{
-			int			middle;
-			BlockNumber probe;
+	buf = fsm_readbuf(rel, addr, true);
+	LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
 
-			middle = low + (high - low) / 2;
-			probe = IndexFSMPageGetPageNum(info + middle);
-			if (probe == page)
-			{
-				*outPageIndex = middle;
-				return true;
-			}
-			else if (probe < page)
-				low = middle + 1;
-			else
-				high = middle - 1;
-		}
-		*outPageIndex = low;
-		return false;
-	}
-	else
-	{
-		FSMPageData *info;
-		int			low,
-					high;
-
-		info = (FSMPageData *)
-			(FreeSpaceMap->arena + fsmrel->firstChunk * CHUNKBYTES);
-		low = 0;
-		high = fsmrel->storedPages - 1;
-		while (low <= high)
-		{
-			int			middle;
-			BlockNumber probe;
+	page = BufferGetPage(buf);
 
-			middle = low + (high - low) / 2;
-			probe = FSMPageGetPageNum(info + middle);
-			if (probe == page)
-			{
-				*outPageIndex = middle;
-				return true;
-			}
-			else if (probe < page)
-				low = middle + 1;
-			else
-				high = middle - 1;
-		}
-		*outPageIndex = low;
-		return false;
+	if (fsm_set_avail(page, slot, newValue))
+		MarkBufferDirty(buf);
+
+	if (minValue != 0)
+	{
+		/* Search while we still hold the lock */
+		newslot = fsm_search_avail(buf, minValue,
+								   addr.level == FSM_BOTTOM_LEVEL,
+								   true);
 	}
+
+	UnlockReleaseBuffer(buf);
+
+	return newslot;
 }
 
 /*
- * Re-pack the FSM storage arena, dropping data if necessary to meet the
- * current allocation target for each relation.  At conclusion, all available
- * space in the arena will be coalesced at the end.
+ * Search the tree for a heap page with at least min_cat of free space
  */
-static void
-compact_fsm_storage(void)
+static BlockNumber
+fsm_search(Relation rel, uint8 min_cat)
 {
-	int			nextChunkIndex = 0;
-	bool		did_push = false;
-	FSMRelation *fsmrel;
+	int restarts = 0;
+	FSMAddress addr = FSM_ROOT_ADDRESS;
 
-	for (fsmrel = FreeSpaceMap->firstRel;
-		 fsmrel != NULL;
-		 fsmrel = fsmrel->nextPhysical)
+	for (;;)
 	{
-		int			newAlloc;
-		int			newAllocPages;
-		int			newChunkIndex;
-		int			oldChunkIndex;
-		int			curChunks;
-		char	   *newLocation;
-		char	   *oldLocation;
+		int slot;
+		Buffer buf;
+		uint8 max_avail;
 
 		/*
-		 * Calculate target allocation, make sure we don't overrun due to
-		 * roundoff error
+		 * Read the FSM page. The root page is created if it doesn't exist
+		 * yet, to save future searchers the effort of having to call
+		 * smgrnblocks() in fsm_readbuf(), only to see that the FSM is
+		 * completely empty.
 		 */
-		newAlloc = fsm_calc_target_allocation(fsm_calc_request(fsmrel));
-		if (newAlloc > FreeSpaceMap->totalChunks - nextChunkIndex)
-			newAlloc = FreeSpaceMap->totalChunks - nextChunkIndex;
-		if (fsmrel->isIndex)
-			newAllocPages = newAlloc * INDEXCHUNKPAGES;
-		else
-			newAllocPages = newAlloc * CHUNKPAGES;
-
-		/*
-		 * Determine current size, current and new locations
-		 */
-		curChunks = fsm_current_chunks(fsmrel);
-		oldChunkIndex = fsmrel->firstChunk;
-		oldLocation = FreeSpaceMap->arena + oldChunkIndex * CHUNKBYTES;
-		newChunkIndex = nextChunkIndex;
-		newLocation = FreeSpaceMap->arena + newChunkIndex * CHUNKBYTES;
+		buf = fsm_readbuf(rel, addr, (addr.level != FSM_ROOT_LEVEL));
 
-		/*
-		 * It's possible that we have to move data down, not up, if the
-		 * allocations of previous rels expanded.  This normally means that
-		 * our allocation expanded too (or at least got no worse), and ditto
-		 * for later rels.	So there should be room to move all our data down
-		 * without dropping any --- but we might have to push down following
-		 * rels to acquire the room.  We don't want to do the push more than
-		 * once, so pack everything against the end of the arena if so.
-		 *
-		 * In corner cases where we are on the short end of a roundoff choice
-		 * that we were formerly on the long end of, it's possible that we
-		 * have to move down and compress our data too.  In fact, even after
-		 * pushing down the following rels, there might not be as much space
-		 * as we computed for this rel above --- that would imply that some
-		 * following rel(s) are also on the losing end of roundoff choices. We
-		 * could handle this fairly by doing the per-rel compactions
-		 * out-of-order, but that seems like way too much complexity to deal
-		 * with a very infrequent corner case. Instead, we simply drop pages
-		 * from the end of the current rel's data until it fits.
-		 */
-		if (newChunkIndex > oldChunkIndex)
+		/* Search within the page */
+		if (BufferIsValid(buf))
 		{
-			int			limitChunkIndex;
-
-			if (newAllocPages < fsmrel->storedPages)
-			{
-				/* move and compress --- just drop excess pages */
-				fsmrel->storedPages = newAllocPages;
-				curChunks = fsm_current_chunks(fsmrel);
-			}
-			/* is there enough space? */
-			if (fsmrel->nextPhysical != NULL)
-				limitChunkIndex = fsmrel->nextPhysical->firstChunk;
-			else
-				limitChunkIndex = FreeSpaceMap->totalChunks;
-			if (newChunkIndex + curChunks > limitChunkIndex)
-			{
-				/* not enough space, push down following rels */
-				if (!did_push)
-				{
-					push_fsm_rels_after(fsmrel);
-					did_push = true;
-				}
-				/* now is there enough space? */
-				if (fsmrel->nextPhysical != NULL)
-					limitChunkIndex = fsmrel->nextPhysical->firstChunk;
-				else
-					limitChunkIndex = FreeSpaceMap->totalChunks;
-				if (newChunkIndex + curChunks > limitChunkIndex)
-				{
-					/* uh-oh, forcibly cut the allocation to fit */
-					newAlloc = limitChunkIndex - newChunkIndex;
-
-					/*
-					 * If newAlloc < 0 at this point, we are moving the rel's
-					 * firstChunk into territory currently assigned to a later
-					 * rel.  This is okay so long as we do not copy any data.
-					 * The rels will be back in nondecreasing firstChunk order
-					 * at completion of the compaction pass.
-					 */
-					if (newAlloc < 0)
-						newAlloc = 0;
-					if (fsmrel->isIndex)
-						newAllocPages = newAlloc * INDEXCHUNKPAGES;
-					else
-						newAllocPages = newAlloc * CHUNKPAGES;
-					fsmrel->storedPages = newAllocPages;
-					curChunks = fsm_current_chunks(fsmrel);
-				}
-			}
-			memmove(newLocation, oldLocation, curChunks * CHUNKBYTES);
+			LockBuffer(buf, BUFFER_LOCK_SHARE);
+			slot = fsm_search_avail(buf, min_cat,
+									(addr.level == FSM_BOTTOM_LEVEL),
+									false);
+			if (slot == -1)
+				max_avail = fsm_get_max_avail(BufferGetPage(buf));
+			UnlockReleaseBuffer(buf);
+		}
+		else
+		{
+			slot = -1;
+			max_avail = 0;
 		}
-		else if (newAllocPages < fsmrel->storedPages)
+
+		if (slot != -1)
 		{
 			/*
-			 * Need to compress the page data.	For an index, "compression"
-			 * just means dropping excess pages; otherwise we try to keep the
-			 * ones with the most space.
+			 * Descend the tree, or return the found block if we're at the
+			 * bottom.
 			 */
-			if (fsmrel->isIndex)
-			{
-				fsmrel->storedPages = newAllocPages;
-				/* may need to move data */
-				if (newChunkIndex != oldChunkIndex)
-					memmove(newLocation, oldLocation, newAlloc * CHUNKBYTES);
-			}
-			else
-			{
-				pack_existing_pages((FSMPageData *) newLocation,
-									newAllocPages,
-									(FSMPageData *) oldLocation,
-									fsmrel->storedPages);
-				fsmrel->storedPages = newAllocPages;
-			}
+			if (addr.level == FSM_BOTTOM_LEVEL)
+				return fsm_get_heap_blk(addr, slot);
+
+			addr = fsm_get_child(addr, slot);
 		}
-		else if (newChunkIndex != oldChunkIndex)
+		else if (addr.level == FSM_ROOT_LEVEL)
 		{
 			/*
-			 * No compression needed, but must copy the data up
+			 * At the root, failure means there's no page with enough free
+			 * space in the FSM. Give up.
 			 */
-			memmove(newLocation, oldLocation, curChunks * CHUNKBYTES);
+			return InvalidBlockNumber;
 		}
-		fsmrel->firstChunk = newChunkIndex;
-		nextChunkIndex += newAlloc;
-	}
-	Assert(nextChunkIndex <= FreeSpaceMap->totalChunks);
-	FreeSpaceMap->usedChunks = nextChunkIndex;
-}
-
-/*
- * Push all FSMRels physically after afterRel to the end of the storage arena.
- *
- * We sometimes have to do this when deletion or truncation of a relation
- * causes the allocations of remaining rels to expand markedly.  We must
- * temporarily push existing data down to the end so that we can move it
- * back up in an orderly fashion.
- */
-static void
-push_fsm_rels_after(FSMRelation *afterRel)
-{
-	int			nextChunkIndex = FreeSpaceMap->totalChunks;
-	FSMRelation *fsmrel;
-
-	FreeSpaceMap->usedChunks = FreeSpaceMap->totalChunks;
+		else
+		{
+			uint16 parentslot;
+			FSMAddress parent;
 
-	for (fsmrel = FreeSpaceMap->lastRel;
-		 fsmrel != NULL;
-		 fsmrel = fsmrel->priorPhysical)
-	{
-		int			chunkCount;
-		int			newChunkIndex;
-		int			oldChunkIndex;
-		char	   *newLocation;
-		char	   *oldLocation;
+			/*
+			 * At lower level, failure can happen if the value in the upper-
+			 * level node didn't reflect the value on the lower page. Update
+			 * the upper node, to avoid falling into the same trap again, and
+			 * start over.
+			 *
+			 * There's a race condition here, if another backend updates this
+			 * page right after we release it, and gets the lock on the parent
+			 * page before us. We'll then update the parent page with the now
+			 * stale information we had. It's OK, because it should happen
+			 * rarely, and will be fixed by the next vacuum.
+			 */
+			parent = fsm_get_parent(addr, &parentslot);
+			fsm_set_and_search(rel, parent, parentslot, max_avail, 0);
 
-		if (fsmrel == afterRel)
-			break;
+			/*
+			 * If the upper pages are badly out of date, we might need to
+			 * loop quite a few times, updating them as we go. Any
+			 * inconsistencies should eventually be corrected and the loop
+			 * should end. Looping indefinitely is nevertheless scary, so
+			 * provide an emergency valve.
+			 */
+			if (restarts++ > 10000)
+				return InvalidBlockNumber;
 
-		chunkCount = fsm_current_chunks(fsmrel);
-		nextChunkIndex -= chunkCount;
-		newChunkIndex = nextChunkIndex;
-		oldChunkIndex = fsmrel->firstChunk;
-		if (newChunkIndex < oldChunkIndex)
-		{
-			/* we're pushing down, how can it move up? */
-			elog(PANIC, "inconsistent entry sizes in FSM");
-		}
-		else if (newChunkIndex > oldChunkIndex)
-		{
-			/* need to move it */
-			newLocation = FreeSpaceMap->arena + newChunkIndex * CHUNKBYTES;
-			oldLocation = FreeSpaceMap->arena + oldChunkIndex * CHUNKBYTES;
-			memmove(newLocation, oldLocation, chunkCount * CHUNKBYTES);
-			fsmrel->firstChunk = newChunkIndex;
+			/* Start search all over from the root */
+			addr = FSM_ROOT_ADDRESS;
 		}
 	}
-	Assert(nextChunkIndex >= 0);
 }
 
+
 /*
- * Pack a set of per-page freespace data into a smaller amount of space.
- *
- * The method is to compute a low-resolution histogram of the free space
- * amounts, then determine which histogram bin contains the break point.
- * We then keep all pages above that bin, none below it, and just enough
- * of the pages in that bin to fill the output area exactly.
+ * Recursive guts of FreeSpaceMapVacuum
  */
-#define HISTOGRAM_BINS	64
-
-static void
-pack_incoming_pages(FSMPageData *newLocation, int newPages,
-					FSMPageData *pageSpaces, int nPages)
+static uint8
+fsm_vacuum_page(Relation rel, FSMAddress addr, bool *eof_p)
 {
-	int			histogram[HISTOGRAM_BINS];
-	int			above,
-				binct,
-				i;
-	Size		thresholdL,
-				thresholdU;
-
-	Assert(newPages < nPages);	/* else I shouldn't have been called */
-	/* Build histogram */
-	MemSet(histogram, 0, sizeof(histogram));
-	for (i = 0; i < nPages; i++)
-	{
-		Size		avail = FSMPageGetSpace(&pageSpaces[i]);
-
-		if (avail >= BLCKSZ)
-			elog(ERROR, "bogus freespace amount");
-		avail /= (BLCKSZ / HISTOGRAM_BINS);
-		histogram[avail]++;
-	}
-	/* Find the breakpoint bin */
-	above = 0;
-	for (i = HISTOGRAM_BINS - 1; i >= 0; i--)
-	{
-		int			sum = above + histogram[i];
+	Buffer buf;
+	Page page;
+	uint8 max_avail;
 
-		if (sum > newPages)
-			break;
-		above = sum;
-	}
-	Assert(i >= 0);
-	thresholdL = i * BLCKSZ / HISTOGRAM_BINS;	/* low bound of bp bin */
-	thresholdU = (i + 1) * BLCKSZ / HISTOGRAM_BINS;		/* hi bound */
-	binct = newPages - above;	/* number to take from bp bin */
-	/* And copy the appropriate data */
-	for (i = 0; i < nPages; i++)
+	/* Read the page if it exists, or return EOF */
+	buf = fsm_readbuf(rel, addr, false);
+	if (!BufferIsValid(buf))
 	{
-		BlockNumber	page = FSMPageGetPageNum(&pageSpaces[i]);
-		Size		avail = FSMPageGetSpace(&pageSpaces[i]);
-
-		/* Check caller provides sorted data */
-		if (i > 0 && page <= FSMPageGetPageNum(&pageSpaces[i - 1]))
-			elog(ERROR, "free-space data is not in page order");
-		/* Save this page? */
-		if (avail >= thresholdU ||
-			(avail >= thresholdL && (--binct >= 0)))
-		{
-			*newLocation = pageSpaces[i];
-			newLocation++;
-			newPages--;
-		}
+		*eof_p = true;
+		return 0;
 	}
-	Assert(newPages == 0);
-}
-
-/*
- * Pack a set of per-page freespace data into a smaller amount of space.
- *
- * This is algorithmically identical to pack_incoming_pages(), but accepts
- * a different input representation.  Also, we assume the input data has
- * previously been checked for validity (size in bounds, pages in order).
- *
- * Note: it is possible for the source and destination arrays to overlap.
- * The caller is responsible for making sure newLocation is at lower addresses
- * so that we can copy data moving forward in the arrays without problem.
- */
-static void
-pack_existing_pages(FSMPageData *newLocation, int newPages,
-					FSMPageData *oldLocation, int oldPages)
-{
-	int			histogram[HISTOGRAM_BINS];
-	int			above,
-				binct,
-				i;
-	Size		thresholdL,
-				thresholdU;
-
-	Assert(newPages < oldPages);	/* else I shouldn't have been called */
-	/* Build histogram */
-	MemSet(histogram, 0, sizeof(histogram));
-	for (i = 0; i < oldPages; i++)
-	{
-		Size		avail = FSMPageGetSpace(oldLocation + i);
+	else
+		*eof_p = false;
 
-		/* Shouldn't happen, but test to protect against stack clobber */
-		if (avail >= BLCKSZ)
-			elog(ERROR, "bogus freespace amount");
-		avail /= (BLCKSZ / HISTOGRAM_BINS);
-		histogram[avail]++;
-	}
-	/* Find the breakpoint bin */
-	above = 0;
-	for (i = HISTOGRAM_BINS - 1; i >= 0; i--)
-	{
-		int			sum = above + histogram[i];
+	page = BufferGetPage(buf);
 
-		if (sum > newPages)
-			break;
-		above = sum;
-	}
-	Assert(i >= 0);
-	thresholdL = i * BLCKSZ / HISTOGRAM_BINS;	/* low bound of bp bin */
-	thresholdU = (i + 1) * BLCKSZ / HISTOGRAM_BINS;		/* hi bound */
-	binct = newPages - above;	/* number to take from bp bin */
-	/* And copy the appropriate data */
-	for (i = 0; i < oldPages; i++)
+	/*
+	 * Recurse into children, and fix the information stored about them
+	 * at this level.
+	 */
+	if (addr.level > FSM_BOTTOM_LEVEL)
 	{
-		BlockNumber page = FSMPageGetPageNum(oldLocation + i);
-		Size		avail = FSMPageGetSpace(oldLocation + i);
+		int slot;
+		bool eof = false;
 
-		/* Save this page? */
-		if (avail >= thresholdU ||
-			(avail >= thresholdL && (--binct >= 0)))
+		for (slot = 0; slot < SlotsPerFSMPage; slot++)
 		{
-			FSMPageSetPageNum(newLocation, page);
-			FSMPageSetSpace(newLocation, avail);
-			newLocation++;
-			newPages--;
-		}
-	}
-	Assert(newPages == 0);
-}
+			int child_avail;
 
-/*
- * Calculate number of chunks "requested" by a rel.  The "request" is
- * anything beyond the rel's one guaranteed chunk.
- *
- * Rel's interestingPages and isIndex settings must be up-to-date when called.
- *
- * See notes at top of file for details.
- */
-static int
-fsm_calc_request(FSMRelation *fsmrel)
-{
-	int			req;
+			/* After we hit end-of-file, just clear the rest of the slots */
+			if (!eof)
+				child_avail = fsm_vacuum_page(rel, fsm_get_child(addr, slot), &eof);
+			else
+				child_avail = 0;
 
-	/* Convert page count to chunk count */
-	if (fsmrel->isIndex)
-	{
-		/* test to avoid unsigned underflow at zero */
-		if (fsmrel->interestingPages <= INDEXCHUNKPAGES)
-			return 0;
-		/* quotient will fit in int, even if interestingPages doesn't */
-		req = (fsmrel->interestingPages - 1) / INDEXCHUNKPAGES;
-	}
-	else
-	{
-		if (fsmrel->interestingPages <= CHUNKPAGES)
-			return 0;
-		req = (fsmrel->interestingPages - 1) / CHUNKPAGES;
+			/* Update information about the child */
+			if (fsm_get_avail(page, slot) != child_avail)
+			{
+				LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
+				fsm_set_avail(BufferGetPage(buf), slot, child_avail);
+				MarkBufferDirty(buf);
+				LockBuffer(buf, BUFFER_LOCK_UNLOCK);
+			}
+		}
 	}
 
+	max_avail = fsm_get_max_avail(BufferGetPage(buf));
+
 	/*
-	 * We clamp the per-relation requests to at most half the arena size; this
-	 * is intended to prevent a single bloated relation from crowding out FSM
-	 * service for every other rel.
+	 * Reset the next slot pointer. This encourages the use of low-numbered
+	 * pages, increasing the chances that a later vacuum can truncate the
+	 * relation.
 	 */
-	req = Min(req, FreeSpaceMap->totalChunks / 2);
-
-	return req;
-}
+	((FSMPage) PageGetContents(page))->fp_next_slot = 0;
 
-/*
- * Same as above, but without the clamp ... this is just intended for
- * reporting the total space needed to store all information.
- */
-static int
-fsm_calc_request_unclamped(FSMRelation *fsmrel)
-{
-	int			req;
-
-	/* Convert page count to chunk count */
-	if (fsmrel->isIndex)
-	{
-		/* test to avoid unsigned underflow at zero */
-		if (fsmrel->interestingPages <= INDEXCHUNKPAGES)
-			return 0;
-		/* quotient will fit in int, even if interestingPages doesn't */
-		req = (fsmrel->interestingPages - 1) / INDEXCHUNKPAGES;
-	}
-	else
-	{
-		if (fsmrel->interestingPages <= CHUNKPAGES)
-			return 0;
-		req = (fsmrel->interestingPages - 1) / CHUNKPAGES;
-	}
+	ReleaseBuffer(buf);
 
-	return req;
+	return max_avail;
 }
 
-/*
- * Calculate target allocation (number of chunks) for a rel
- *
- * Parameter is the result from fsm_calc_request().  The global sumRequests
- * and numRels totals must be up-to-date already.
- *
- * See notes at top of file for details.
- */
-static int
-fsm_calc_target_allocation(int myRequest)
-{
-	double		spareChunks;
-	int			extra;
 
-	spareChunks = FreeSpaceMap->totalChunks - FreeSpaceMap->numRels;
-	Assert(spareChunks > 0);
-	if (spareChunks >= FreeSpaceMap->sumRequests)
-	{
-		/* We aren't oversubscribed, so allocate exactly the request */
-		extra = myRequest;
-	}
-	else
-	{
-		extra = (int) rint(spareChunks * myRequest / FreeSpaceMap->sumRequests);
-		if (extra < 0)			/* shouldn't happen, but make sure */
-			extra = 0;
-	}
-	return 1 + extra;
-}
+/****** WAL-logging ******/
 
-/*
- * Calculate number of chunks actually used to store current data
- */
-static int
-fsm_current_chunks(FSMRelation *fsmrel)
+void
+fsm_redo(XLogRecPtr lsn, XLogRecord *record)
 {
-	int			chunkCount;
-
-	/* Make sure storedPages==0 produces right answer */
-	if (fsmrel->storedPages <= 0)
-		return 0;
-	/* Convert page count to chunk count */
-	if (fsmrel->isIndex)
-		chunkCount = (fsmrel->storedPages - 1) / INDEXCHUNKPAGES + 1;
-	else
-		chunkCount = (fsmrel->storedPages - 1) / CHUNKPAGES + 1;
-	return chunkCount;
-}
+	uint8		info = record->xl_info & ~XLR_INFO_MASK;
 
-/*
- * Calculate current actual allocation (number of chunks) for a rel
- */
-static int
-fsm_current_allocation(FSMRelation *fsmrel)
-{
-	if (fsmrel->nextPhysical != NULL)
-		return fsmrel->nextPhysical->firstChunk - fsmrel->firstChunk;
-	else if (fsmrel == FreeSpaceMap->lastRel)
-		return FreeSpaceMap->usedChunks - fsmrel->firstChunk;
-	else
+	switch (info)
 	{
-		/* it's not in the storage-order list */
-		Assert(fsmrel->firstChunk < 0 && fsmrel->storedPages == 0);
-		return 0;
-	}
-}
-
-
-/*
- * Return the FreeSpaceMap structure for examination.
- */
-FSMHeader *
-GetFreeSpaceMap(void)
-{
+		case XLOG_FSM_TRUNCATE:
+			{
+				xl_fsm_truncate *xlrec;
+				Relation rel;
 
-	return FreeSpaceMap;
+				xlrec = (xl_fsm_truncate *) XLogRecGetData(record);
+				rel = CreateFakeRelcacheEntry(xlrec->node);
+				FreeSpaceMapTruncateRel(rel, xlrec->nheapblocks);
+				FreeFakeRelcacheEntry(rel);
+			}
+			break;
+		default:
+			elog(PANIC, "fsm_redo: unknown op code %u", info);
+	}
 }
 
-
-#ifdef FREESPACE_DEBUG
-/*
- * Dump contents of freespace map for debugging.
- *
- * We assume caller holds the FreeSpaceLock, or is otherwise unconcerned
- * about other processes.
- */
 void
-DumpFreeSpace(void)
+fsm_desc(StringInfo buf, uint8 xl_info, char *rec)
 {
-	FSMRelation *fsmrel;
-	FSMRelation *prevrel = NULL;
-	int			relNum = 0;
-	int			nPages;
+	uint8           info = xl_info & ~XLR_INFO_MASK;
 
-	for (fsmrel = FreeSpaceMap->usageList; fsmrel; fsmrel = fsmrel->nextUsage)
+	switch (info)
 	{
-		relNum++;
-		fprintf(stderr, "Map %d: rel %u/%u/%u isIndex %d avgRequest %u interestingPages %u nextPage %d\nMap= ",
-				relNum,
-				fsmrel->key.spcNode, fsmrel->key.dbNode, fsmrel->key.relNode,
-				(int) fsmrel->isIndex, fsmrel->avgRequest,
-				fsmrel->interestingPages, fsmrel->nextPage);
-		if (fsmrel->isIndex)
-		{
-			IndexFSMPageData *page;
-
-			page = (IndexFSMPageData *)
-				(FreeSpaceMap->arena + fsmrel->firstChunk * CHUNKBYTES);
-			for (nPages = 0; nPages < fsmrel->storedPages; nPages++)
-			{
-				fprintf(stderr, " %u",
-						IndexFSMPageGetPageNum(page));
-				page++;
-			}
-		}
-		else
+		case XLOG_FSM_TRUNCATE:
 		{
-			FSMPageData *page;
+			xl_fsm_truncate *xlrec = (xl_fsm_truncate *) rec;
 
-			page = (FSMPageData *)
-				(FreeSpaceMap->arena + fsmrel->firstChunk * CHUNKBYTES);
-			for (nPages = 0; nPages < fsmrel->storedPages; nPages++)
-			{
-				fprintf(stderr, " %u:%u",
-						FSMPageGetPageNum(page),
-						FSMPageGetSpace(page));
-				page++;
-			}
+			appendStringInfo(buf, "truncate: rel %u/%u/%u; nheapblocks %u;",
+							 xlrec->node.spcNode, xlrec->node.dbNode,
+							 xlrec->node.relNode, xlrec->nheapblocks);
+			break;
 		}
-		fprintf(stderr, "\n");
-		/* Cross-check list links */
-		if (prevrel != fsmrel->priorUsage)
-			fprintf(stderr, "DumpFreeSpace: broken list links\n");
-		prevrel = fsmrel;
+		default:
+			appendStringInfo(buf, "UNKNOWN");
+			break;
 	}
-	if (prevrel != FreeSpaceMap->usageListTail)
-		fprintf(stderr, "DumpFreeSpace: broken list links\n");
-	/* Cross-check global counters */
-	if (relNum != FreeSpaceMap->numRels)
-		fprintf(stderr, "DumpFreeSpace: %d rels in list, but numRels = %d\n",
-				relNum, FreeSpaceMap->numRels);
 }
-
-#endif   /* FREESPACE_DEBUG */
diff --git a/src/backend/storage/freespace/fsmpage.c b/src/backend/storage/freespace/fsmpage.c
new file mode 100644
index 00000000000..ce6f47e8b91
--- /dev/null
+++ b/src/backend/storage/freespace/fsmpage.c
@@ -0,0 +1,352 @@
+/*-------------------------------------------------------------------------
+ *
+ * fsmpage.c
+ *	  routines to search and manipulate one FSM page.
+ *
+ *
+ * Portions Copyright (c) 1996-2008, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ *	  $PostgreSQL: pgsql/src/backend/storage/freespace/fsmpage.c,v 1.1 2008/09/30 10:52:13 heikki Exp $
+ *
+ * NOTES:
+ *
+ *  The public functions in this file form an API that hides the internal
+ *  structure of a FSM page. This allows freespace.c to treat each FSM page
+ *  as a black box with SlotsPerPage "slots". fsm_set_avail() and
+ *  fsm_get_avail() let you get/set the value of a slot, and
+ *  fsm_search_avail() lets you search for a slot with value >= X.
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "storage/bufmgr.h"
+#include "storage/fsm_internals.h"
+
+/* macros to navigate the tree within a page. */
+#define leftchild(x)	(2 * (x) + 1)
+#define rightchild(x)	(2 * (x) + 2)
+#define parentof(x)		(((x) - 1) / 2)
+
+/* returns right sibling of x, wrapping around within the level */
+static int
+rightsibling(int x)
+{
+	/*
+	 * Move right. This might wrap around, stepping to the leftmost node at
+	 * the next level.
+	 */
+	x++;
+
+	/*
+	 * Check if we stepped to the leftmost node at next level, and correct
+	 * if so. The leftmost nodes at each level are of form x = 2^level - 1, so
+	 * check if (x + 1) is a power of two.
+	 */
+	if (((x + 1) & x) == 0)
+		x = parentof(x);
+
+	return x;
+}
+
+/*
+ * Sets the value of a slot on page. Returns true if the page was
+ * modified.
+ *
+ * The caller must hold an exclusive lock on the page.
+ */
+bool
+fsm_set_avail(Page page, int slot, uint8 value)
+{
+	int nodeno = NonLeafNodesPerPage + slot;
+	FSMPage fsmpage = (FSMPage) PageGetContents(page);
+	uint8 oldvalue;
+
+	Assert(slot < LeafNodesPerPage);
+
+	oldvalue = fsmpage->fp_nodes[nodeno];
+
+	/* If the value hasn't changed, we don't need to do anything */
+	if (oldvalue == value && value <= fsmpage->fp_nodes[0])
+		return false;
+
+	fsmpage->fp_nodes[nodeno] = value;
+
+	/*
+	 * Propagate up, until we hit the root or a node that doesn't
+	 * need to be updated.
+	 */
+	do
+	{
+		uint8 newvalue = 0;
+		int lchild;
+		int rchild;
+
+		nodeno = parentof(nodeno);
+		lchild = leftchild(nodeno);
+		rchild = lchild + 1;
+
+		newvalue = fsmpage->fp_nodes[lchild];
+		if (rchild < NodesPerPage)
+			newvalue = Max(newvalue,
+						   fsmpage->fp_nodes[rchild]);
+
+		oldvalue = fsmpage->fp_nodes[nodeno];
+		if (oldvalue == newvalue)
+			break;
+
+		fsmpage->fp_nodes[nodeno] = newvalue;
+	} while (nodeno > 0);
+
+	/*
+	 * sanity check: if the new value is higher than the value
+	 * at the top, the tree is corrupt.
+	 */
+	if (value > fsmpage->fp_nodes[0])
+		fsm_rebuild_page(page);
+
+	return true;
+}
+
+/*
+ * Returns the value of given slot on page.
+ *
+ * Since this is just a read-only access of a single byte, the page doesn't
+ * need to be locked.
+ */
+uint8
+fsm_get_avail(Page page, int slot)
+{
+	FSMPage fsmpage = (FSMPage) PageGetContents(page);
+
+	return fsmpage->fp_nodes[NonLeafNodesPerPage + slot];
+}
+
+/*
+ * Returns the value at the root of a page.
+ * Since this is just a read-only access of a single byte, the page doesn't
+ * need to be locked.
+ */
+uint8
+fsm_get_max_avail(Page page)
+{
+	FSMPage fsmpage = (FSMPage) PageGetContents(page);
+	return fsmpage->fp_nodes[0];
+}
+
+/*
+ * Searches for a slot with value at least minvalue. Returns the slot number,
+ * or -1 if none found.
+ *
+ * The caller must hold at least a shared lock on the page, and this
+ * function can unlock and lock the page again in exclusive mode if it
+ * needs to be updated. exclusive_lock_held should be set to true if the
+ * caller is already holding an exclusive lock, to avoid extra work.
+ *
+ * If advancenext is false, fp_next_slot is set to point to the returned
+ * slot, and if it's true, to the slot next to the returned slot.
+ */
+int
+fsm_search_avail(Buffer buf, uint8 minvalue, bool advancenext,
+				 bool exclusive_lock_held)
+{
+	Page page = BufferGetPage(buf);
+	FSMPage fsmpage = (FSMPage) PageGetContents(page);
+	int nodeno;
+	int target;
+	uint16 slot;
+
+ restart:
+	/*
+	 * Check the root first, and exit quickly if there's no page with
+	 * enough free space
+	 */
+	if (fsmpage->fp_nodes[0] < minvalue)
+		return -1;
+
+
+	/* fp_next_slot is just a hint, so check that it's sane */
+	target = fsmpage->fp_next_slot;
+	if (target < 0 || target >= LeafNodesPerPage)
+		target = 0;
+	target += NonLeafNodesPerPage;
+
+	/*
+	 * Start the search from the target slot. At every step, move one
+	 * node to the right, and climb up to the parent. Stop when we reach a
+	 * node with enough free space. (note that moving to the right only
+	 * makes a difference if we're on the right child of the parent)
+	 *
+	 * The idea is to gradually expand our "search triangle", that is, all
+	 * nodes covered by the current node. In the beginning, just the target
+	 * node is included, and more nodes to the right of the target node,
+	 * taking wrap-around into account, are included at each step. Nodes are
+	 * added to the search triangle in left-to-right order, starting from
+	 * the target node. This ensures that we'll find the first suitable node
+	 * to the right of the target node, and not some other node with enough
+	 * free space.
+	 *
+	 * For example, consider this tree:
+	 *
+	 *         7
+	 *     7       6
+	 *   5   7   6   5
+	 *  4 5 5 7 2 6 5 2
+	 *              T
+	 *
+	 * Imagine that target node is the node indicated by the letter T, and
+	 * we're searching for a node with value of 6 or higher. The search
+	 * begins at T. At first iteration, we move to the right, and to the
+	 * parent, arriving at the rightmost 5. At the 2nd iteration, we move to the
+	 * right, wrapping around, and climb up, arriving at the 7 at the 2nd
+	 * level. 7 satisfies our search, so we descend down to the bottom,
+	 * following the path of sevens.
+	 */
+	nodeno = target;
+	while (nodeno > 0)
+	{
+		if (fsmpage->fp_nodes[nodeno] >= minvalue)
+			break;
+		
+		/*
+		 * Move to the right, wrapping around at the level if necessary, and
+		 * climb up.
+		 */
+		nodeno = parentof(rightsibling(nodeno));
+	}
+
+	/*
+	 * We're now at a node with enough free space, somewhere in the middle of
+	 * the tree. Descend to the bottom, following a path with enough free
+	 * space, preferring to move left if there's a choice.
+	 */
+	while (nodeno < NonLeafNodesPerPage)
+	{
+		int leftnodeno = leftchild(nodeno);
+		int rightnodeno = leftnodeno + 1;
+		bool leftok = (leftnodeno < NodesPerPage) &&
+			(fsmpage->fp_nodes[leftnodeno] >= minvalue);
+		bool rightok = (rightnodeno < NodesPerPage) &&
+			(fsmpage->fp_nodes[rightnodeno] >= minvalue);
+
+		if (leftok)
+			nodeno = leftnodeno;
+		else if (rightok)
+			nodeno = rightnodeno;
+		else
+		{
+			/*
+			 * Oops. The parent node promised that either left or right
+			 * child has enough space, but neither actually did. This can
+			 * happen in case of a "torn page", IOW if we crashed earlier
+			 * while writing the page to disk, and only part of the page
+			 * made it to disk.
+			 *
+			 * Fix the corruption and restart.
+			 */
+			RelFileNode	rnode;
+			ForkNumber	forknum;
+			BlockNumber	blknum;
+
+			BufferGetTag(buf, &rnode, &forknum, &blknum);
+			elog(DEBUG1, "fixing corrupt FSM block %u, relation %u/%u/%u",
+				 blknum, rnode.spcNode, rnode.dbNode, rnode.relNode);
+
+			/* make sure we hold an exclusive lock */
+			if (!exclusive_lock_held)
+			{
+				LockBuffer(buf, BUFFER_LOCK_UNLOCK);
+				LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
+				exclusive_lock_held = true;
+			}
+			fsm_rebuild_page(page);
+			MarkBufferDirty(buf);
+			goto restart;
+		}
+	}
+
+	/* We're now at the bottom level, at a node with enough space. */
+	slot = nodeno - NonLeafNodesPerPage;
+
+	/*
+	 * Update the next slot pointer. Note that we do this even if we're only
+	 * holding a shared lock, on the grounds that it's better to use a shared
+	 * lock and get a garbled next pointer every now and then, than take the
+	 * concurrency hit of an exclusive lock.
+	 *
+	 * Wrap-around is handled at the beginning of this function.
+	 */
+	fsmpage->fp_next_slot = slot + (advancenext ? 1 : 0);
+
+	return slot;
+}
+
+/*
+ * Sets the available space to zero for all slots numbered >= nslots.
+ * Returns true if the page was modified.
+ */
+bool
+fsm_truncate_avail(Page page, int nslots)
+{
+	FSMPage fsmpage = (FSMPage) PageGetContents(page);
+	uint8 *ptr;
+	bool changed = false;
+
+	Assert(nslots >= 0 && nslots < LeafNodesPerPage);
+
+	/* Clear all truncated leaf nodes */
+	ptr = &fsmpage->fp_nodes[NonLeafNodesPerPage + nslots];
+	for (; ptr < &fsmpage->fp_nodes[NodesPerPage]; ptr++)
+	{
+		if (*ptr != 0)
+			changed = true;
+		*ptr = 0;
+	}
+
+	/* Fix upper nodes. */
+	if (changed)
+		fsm_rebuild_page(page);
+
+	return changed;
+}
+
+/*
+ * Reconstructs the upper levels of a page. Returns true if the page
+ * was modified.
+ */
+bool
+fsm_rebuild_page(Page page)
+{
+	FSMPage fsmpage = (FSMPage) PageGetContents(page);
+	bool	changed = false;
+	int		nodeno;
+
+	/*
+	 * Start from the lowest non-leaf level, at the last node, working our way
+	 * backwards, through all non-leaf nodes at all levels, up to the root.
+	 */
+	for (nodeno = NonLeafNodesPerPage - 1; nodeno >= 0; nodeno--)
+	{
+		int lchild = leftchild(nodeno);
+		int rchild = lchild + 1;
+		uint8 newvalue = 0;
+
+		if (lchild < NodesPerPage)
+			newvalue = fsmpage->fp_nodes[lchild];
+
+		if (rchild < NodesPerPage)
+			newvalue = Max(newvalue,
+						   fsmpage->fp_nodes[rchild]);
+
+		if (fsmpage->fp_nodes[nodeno] != newvalue)
+		{
+			fsmpage->fp_nodes[nodeno] = newvalue;
+			changed = true;
+		}
+	}
+
+	return changed;
+}
+
diff --git a/src/backend/storage/freespace/indexfsm.c b/src/backend/storage/freespace/indexfsm.c
new file mode 100644
index 00000000000..62fd3d37942
--- /dev/null
+++ b/src/backend/storage/freespace/indexfsm.c
@@ -0,0 +1,92 @@
+/*-------------------------------------------------------------------------
+ *
+ * indexfsm.c
+ *	  POSTGRES free space map for quickly finding free pages in relations
+ *
+ *
+ * Portions Copyright (c) 1996-2008, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ *	  $PostgreSQL: pgsql/src/backend/storage/freespace/indexfsm.c,v 1.1 2008/09/30 10:52:13 heikki Exp $
+ *
+ *
+ * NOTES:
+ *
+ *  This is similar to the FSM used for heap, in freespace.c, but instead
+ *  of tracking the amount of free space on pages, we only track whether
+ *  pages are completely free or in-use. We use the same FSM implementation
+ *  as for heaps, using BLCKSZ - 1 to denote free pages, and 0 for used.
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "storage/freespace.h"
+#include "storage/indexfsm.h"
+#include "storage/smgr.h"
+
+/*
+ * Exported routines
+ */
+
+/*
+ * InitIndexFreeSpaceMap - Create or reset the FSM fork for relation.
+ */
+void
+InitIndexFreeSpaceMap(Relation rel)
+{
+	/* Create FSM fork if it doesn't exist yet, or truncate it if it does */
+	RelationOpenSmgr(rel);
+	if (!smgrexists(rel->rd_smgr, FSM_FORKNUM))
+		smgrcreate(rel->rd_smgr, FSM_FORKNUM, rel->rd_istemp, false);
+	else
+		smgrtruncate(rel->rd_smgr, FSM_FORKNUM, 0, rel->rd_istemp);
+}
+
+/*
+ * GetFreeIndexPage - return a free page from the FSM
+ *
+ * As a side effect, the page is marked as used in the FSM.
+ */
+BlockNumber
+GetFreeIndexPage(Relation rel)
+{
+	BlockNumber blkno = GetPageWithFreeSpace(rel, BLCKSZ/2);
+
+	if (blkno != InvalidBlockNumber)
+		RecordUsedIndexPage(rel, blkno);
+
+	return blkno;
+}
+
+/*
+ * RecordFreeIndexPage - mark a page as free in the FSM
+ */
+void
+RecordFreeIndexPage(Relation rel, BlockNumber freeBlock)
+{
+	RecordPageWithFreeSpace(rel, freeBlock, BLCKSZ - 1);
+}
+
+
+/*
+ * RecordUsedIndexPage - mark a page as used in the FSM
+ */
+void
+RecordUsedIndexPage(Relation rel, BlockNumber usedBlock)
+{
+	RecordPageWithFreeSpace(rel, usedBlock, 0);
+}
+
+/*
+ * IndexFreeSpaceMapTruncate - adjust for truncation of a relation.
+ *
+ * We need to delete any stored data past the new relation length, so that
+ * we don't bogusly return removed block numbers.
+ */
+void
+IndexFreeSpaceMapTruncate(Relation rel, BlockNumber nblocks)
+{
+	FreeSpaceMapTruncateRel(rel, nblocks);
+}
diff --git a/src/backend/storage/ipc/ipci.c b/src/backend/storage/ipc/ipci.c
index d388321e9c2..0365e566093 100644
--- a/src/backend/storage/ipc/ipci.c
+++ b/src/backend/storage/ipc/ipci.c
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/storage/ipc/ipci.c,v 1.96 2008/05/12 00:00:50 alvherre Exp $
+ *	  $PostgreSQL: pgsql/src/backend/storage/ipc/ipci.c,v 1.97 2008/09/30 10:52:13 heikki Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -26,7 +26,6 @@
 #include "postmaster/bgwriter.h"
 #include "postmaster/postmaster.h"
 #include "storage/bufmgr.h"
-#include "storage/freespace.h"
 #include "storage/ipc.h"
 #include "storage/pg_shmem.h"
 #include "storage/pmsignal.h"
@@ -110,7 +109,6 @@ CreateSharedMemoryAndSemaphores(bool makePrivate, int port)
 		size = add_size(size, ProcArrayShmemSize());
 		size = add_size(size, BackendStatusShmemSize());
 		size = add_size(size, SInvalShmemSize());
-		size = add_size(size, FreeSpaceShmemSize());
 		size = add_size(size, BgWriterShmemSize());
 		size = add_size(size, AutoVacuumShmemSize());
 		size = add_size(size, BTreeShmemSize());
@@ -203,11 +201,6 @@ CreateSharedMemoryAndSemaphores(bool makePrivate, int port)
 	 */
 	CreateSharedInvalidationState();
 
-	/*
-	 * Set up free-space map
-	 */
-	InitFreeSpaceMap();
-
 	/*
 	 * Set up interprocess signaling mechanisms
 	 */
diff --git a/src/backend/storage/smgr/smgr.c b/src/backend/storage/smgr/smgr.c
index da4a9766ca9..4909256cc1d 100644
--- a/src/backend/storage/smgr/smgr.c
+++ b/src/backend/storage/smgr/smgr.c
@@ -11,7 +11,7 @@
  *
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/storage/smgr/smgr.c,v 1.111 2008/08/11 11:05:11 heikki Exp $
+ *	  $PostgreSQL: pgsql/src/backend/storage/smgr/smgr.c,v 1.112 2008/09/30 10:52:13 heikki Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -21,7 +21,6 @@
 #include "access/xlogutils.h"
 #include "commands/tablespace.h"
 #include "storage/bufmgr.h"
-#include "storage/freespace.h"
 #include "storage/ipc.h"
 #include "storage/smgr.h"
 #include "utils/hsearch.h"
@@ -474,13 +473,6 @@ smgr_internal_unlink(RelFileNode rnode, ForkNumber forknum,
 	 */
 	DropRelFileNodeBuffers(rnode, forknum, isTemp, 0);
 
-	/*
-	 * Tell the free space map to forget this relation.  It won't be accessed
-	 * any more anyway, but we may as well recycle the map space quickly.
-	 */
-	if (forknum == MAIN_FORKNUM)
-		FreeSpaceMapForgetRel(&rnode);
-
 	/*
 	 * It'd be nice to tell the stats collector to forget it immediately, too.
 	 * But we can't because we don't know the OID (and in cases involving
@@ -577,13 +569,6 @@ smgrtruncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks,
 	 */
 	DropRelFileNodeBuffers(reln->smgr_rnode, forknum, isTemp, nblocks);
 
-	/*
-	 * Tell the free space map to forget anything it may have stored for the
-	 * about-to-be-deleted blocks.	We want to be sure it won't return bogus
-	 * block numbers later on.
-	 */
-	FreeSpaceMapTruncateRel(&reln->smgr_rnode, nblocks);
-
 	/* Do the truncation */
 	(*(smgrsw[reln->smgr_which].smgr_truncate)) (reln, forknum, nblocks,
 												 isTemp);
@@ -905,13 +890,6 @@ smgr_redo(XLogRecPtr lsn, XLogRecord *record)
 		DropRelFileNodeBuffers(xlrec->rnode, xlrec->forknum, false,
 							   xlrec->blkno);
 
-		/*
-		 * Tell the free space map to forget anything it may have stored for
-		 * the about-to-be-deleted blocks.	We want to be sure it won't return
-		 * bogus block numbers later on.
-		 */
-		FreeSpaceMapTruncateRel(&reln->smgr_rnode, xlrec->blkno);
-
 		/* Do the truncation */
 		(*(smgrsw[reln->smgr_which].smgr_truncate)) (reln,
 													 xlrec->forknum,
diff --git a/src/backend/tcop/postgres.c b/src/backend/tcop/postgres.c
index 8449cb4d4c1..c9e7b5e6267 100644
--- a/src/backend/tcop/postgres.c
+++ b/src/backend/tcop/postgres.c
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/tcop/postgres.c,v 1.556 2008/08/19 18:30:04 tgl Exp $
+ *	  $PostgreSQL: pgsql/src/backend/tcop/postgres.c,v 1.557 2008/09/30 10:52:13 heikki Exp $
  *
  * NOTES
  *	  this is the "main" module of the postgres backend and
@@ -57,7 +57,6 @@
 #include "postmaster/autovacuum.h"
 #include "rewrite/rewriteHandler.h"
 #include "storage/bufmgr.h"
-#include "storage/freespace.h"
 #include "storage/ipc.h"
 #include "storage/proc.h"
 #include "storage/sinval.h"
@@ -3258,13 +3257,6 @@ PostgresMain(int argc, char *argv[], const char *username)
 		StartupXLOG();
 		on_shmem_exit(ShutdownXLOG, 0);
 
-		/*
-		 * Read any existing FSM cache file, and register to write one out at
-		 * exit.
-		 */
-		LoadFreeSpaceMap();
-		on_shmem_exit(DumpFreeSpaceMap, 0);
-
 		/*
 		 * We have to build the flat file for pg_database, but not for the
 		 * user and group tables, since we won't try to do authentication.
diff --git a/src/backend/utils/cache/relcache.c b/src/backend/utils/cache/relcache.c
index 7bfb23aaf08..cec75ada720 100644
--- a/src/backend/utils/cache/relcache.c
+++ b/src/backend/utils/cache/relcache.c
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/utils/cache/relcache.c,v 1.273 2008/08/10 19:02:33 tgl Exp $
+ *	  $PostgreSQL: pgsql/src/backend/utils/cache/relcache.c,v 1.274 2008/09/30 10:52:13 heikki Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -304,6 +304,7 @@ AllocateRelationDesc(Relation relation, Form_pg_class relp)
 	 */
 	MemSet(relation, 0, sizeof(RelationData));
 	relation->rd_targblock = InvalidBlockNumber;
+	relation->rd_fsm_nblocks_cache = InvalidBlockNumber;
 
 	/* make sure relation is marked as having no open file yet */
 	relation->rd_smgr = NULL;
@@ -1364,6 +1365,7 @@ formrdesc(const char *relationName, Oid relationReltype,
 	 */
 	relation = (Relation) palloc0(sizeof(RelationData));
 	relation->rd_targblock = InvalidBlockNumber;
+	relation->rd_fsm_nblocks_cache = InvalidBlockNumber;
 
 	/* make sure relation is marked as having no open file yet */
 	relation->rd_smgr = NULL;
@@ -1652,8 +1654,9 @@ RelationReloadIndexInfo(Relation relation)
 	heap_freetuple(pg_class_tuple);
 	/* We must recalculate physical address in case it changed */
 	RelationInitPhysicalAddr(relation);
-	/* Make sure targblock is reset in case rel was truncated */
+	/* Must reset targblock and fsm_nblocks_cache in case rel was truncated */
 	relation->rd_targblock = InvalidBlockNumber;
+	relation->rd_fsm_nblocks_cache = InvalidBlockNumber;
 	/* Must free any AM cached data, too */
 	if (relation->rd_amcache)
 		pfree(relation->rd_amcache);
@@ -1736,6 +1739,7 @@ RelationClearRelation(Relation relation, bool rebuild)
 	if (relation->rd_isnailed)
 	{
 		relation->rd_targblock = InvalidBlockNumber;
+		relation->rd_fsm_nblocks_cache = InvalidBlockNumber;
 		if (relation->rd_rel->relkind == RELKIND_INDEX)
 		{
 			relation->rd_isvalid = false;		/* needs to be revalidated */
@@ -2330,6 +2334,7 @@ RelationBuildLocalRelation(const char *relname,
 	rel = (Relation) palloc0(sizeof(RelationData));
 
 	rel->rd_targblock = InvalidBlockNumber;
+	rel->rd_fsm_nblocks_cache = InvalidBlockNumber;
 
 	/* make sure relation is marked as having no open file yet */
 	rel->rd_smgr = NULL;
@@ -3586,6 +3591,7 @@ load_relcache_init_file(void)
 		 */
 		rel->rd_smgr = NULL;
 		rel->rd_targblock = InvalidBlockNumber;
+		rel->rd_fsm_nblocks_cache = InvalidBlockNumber;
 		if (rel->rd_isnailed)
 			rel->rd_refcnt = 1;
 		else
diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c
index f0f49538e78..93f20eef350 100644
--- a/src/backend/utils/misc/guc.c
+++ b/src/backend/utils/misc/guc.c
@@ -10,7 +10,7 @@
  * Written by Peter Eisentraut <peter_e@gmx.net>.
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/utils/misc/guc.c,v 1.473 2008/09/23 21:12:03 mha Exp $
+ *	  $PostgreSQL: pgsql/src/backend/utils/misc/guc.c,v 1.474 2008/09/30 10:52:13 heikki Exp $
  *
  *--------------------------------------------------------------------
  */
@@ -57,7 +57,6 @@
 #include "regex/regex.h"
 #include "storage/bufmgr.h"
 #include "storage/fd.h"
-#include "storage/freespace.h"
 #include "tcop/tcopprot.h"
 #include "tsearch/ts_cache.h"
 #include "utils/builtins.h"
@@ -446,8 +445,6 @@ const char *const config_group_names[] =
 	gettext_noop("Resource Usage"),
 	/* RESOURCES_MEM */
 	gettext_noop("Resource Usage / Memory"),
-	/* RESOURCES_FSM */
-	gettext_noop("Resource Usage / Free Space Map"),
 	/* RESOURCES_KERNEL */
 	gettext_noop("Resource Usage / Kernel Resources"),
 	/* WAL */
@@ -1528,23 +1525,6 @@ static struct config_int ConfigureNamesInt[] =
 		100000000, 0, 1000000000, NULL, NULL
 	},
 
-	{
-		{"max_fsm_relations", PGC_POSTMASTER, RESOURCES_FSM,
-			gettext_noop("Sets the maximum number of tables and indexes for which free space is tracked."),
-			NULL
-		},
-		&MaxFSMRelations,
-		1000, 100, INT_MAX, NULL, NULL
-	},
-	{
-		{"max_fsm_pages", PGC_POSTMASTER, RESOURCES_FSM,
-			gettext_noop("Sets the maximum number of disk pages for which free space is tracked."),
-			NULL
-		},
-		&MaxFSMPages,
-		20000, 1000, INT_MAX, NULL, NULL
-	},
-
 	{
 		{"max_locks_per_transaction", PGC_POSTMASTER, LOCK_MANAGEMENT,
 			gettext_noop("Sets the maximum number of locks per transaction."),
diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample
index 342be9d6c3c..56afb2e4885 100644
--- a/src/backend/utils/misc/postgresql.conf.sample
+++ b/src/backend/utils/misc/postgresql.conf.sample
@@ -114,13 +114,6 @@
 #maintenance_work_mem = 16MB		# min 1MB
 #max_stack_depth = 2MB			# min 100kB
 
-# - Free Space Map -
-
-#max_fsm_pages = 204800			# min max_fsm_relations*16, 6 bytes each
-					# (change requires restart)
-#max_fsm_relations = 1000		# min 100, ~70 bytes each
-					# (change requires restart)
-
 # - Kernel Resource Usage -
 
 #max_files_per_process = 1000		# min 25
diff --git a/src/bin/initdb/initdb.c b/src/bin/initdb/initdb.c
index 4caee3e2154..02105ac57ea 100644
--- a/src/bin/initdb/initdb.c
+++ b/src/bin/initdb/initdb.c
@@ -42,7 +42,7 @@
  * Portions Copyright (c) 1994, Regents of the University of California
  * Portions taken from FreeBSD.
  *
- * $PostgreSQL: pgsql/src/bin/initdb/initdb.c,v 1.161 2008/09/23 10:58:03 heikki Exp $
+ * $PostgreSQL: pgsql/src/bin/initdb/initdb.c,v 1.162 2008/09/30 10:52:13 heikki Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -119,7 +119,6 @@ static int	output_errno = 0;
 /* defaults */
 static int	n_connections = 10;
 static int	n_buffers = 50;
-static int	n_fsm_pages = 20000;
 
 /*
  * Warning messages for authentication methods
@@ -1041,13 +1040,10 @@ static void
 test_config_settings(void)
 {
 	/*
-	 * These macros define the minimum shared_buffers we want for a given
-	 * max_connections value, and the max_fsm_pages setting to be used for a
-	 * given shared_buffers value.	The arrays show the settings to try.
+	 * This macro defines the minimum shared_buffers we want for a given
+	 * max_connections value. The arrays show the settings to try.
 	 */
-
 #define MIN_BUFS_FOR_CONNS(nconns)	((nconns) * 10)
-#define FSM_FOR_BUFS(nbuffers)	((nbuffers) > 1000 ? 50 * (nbuffers) : 20000)
 
 	static const int trial_conns[] = {
 		100, 50, 40, 30, 20, 10
@@ -1065,7 +1061,6 @@ test_config_settings(void)
 				status,
 				test_conns,
 				test_buffs,
-				test_max_fsm,
 				ok_buffers = 0;
 
 
@@ -1076,16 +1071,14 @@ test_config_settings(void)
 	{
 		test_conns = trial_conns[i];
 		test_buffs = MIN_BUFS_FOR_CONNS(test_conns);
-		test_max_fsm = FSM_FOR_BUFS(test_buffs);
 
 		snprintf(cmd, sizeof(cmd),
 				 SYSTEMQUOTE "\"%s\" --boot -x0 %s "
 				 "-c max_connections=%d "
 				 "-c shared_buffers=%d "
-				 "-c max_fsm_pages=%d "
 				 "< \"%s\" > \"%s\" 2>&1" SYSTEMQUOTE,
 				 backend_exec, boot_options,
-				 test_conns, test_buffs, test_max_fsm,
+				 test_conns, test_buffs,
 				 DEVNULL, DEVNULL);
 		status = system(cmd);
 		if (status == 0)
@@ -1100,7 +1093,7 @@ test_config_settings(void)
 
 	printf("%d\n", n_connections);
 
-	printf(_("selecting default shared_buffers/max_fsm_pages ... "));
+	printf(_("selecting default shared_buffers ... "));
 	fflush(stdout);
 
 	for (i = 0; i < bufslen; i++)
@@ -1112,28 +1105,25 @@ test_config_settings(void)
 			test_buffs = ok_buffers;
 			break;
 		}
-		test_max_fsm = FSM_FOR_BUFS(test_buffs);
 
 		snprintf(cmd, sizeof(cmd),
 				 SYSTEMQUOTE "\"%s\" --boot -x0 %s "
 				 "-c max_connections=%d "
 				 "-c shared_buffers=%d "
-				 "-c max_fsm_pages=%d "
 				 "< \"%s\" > \"%s\" 2>&1" SYSTEMQUOTE,
 				 backend_exec, boot_options,
-				 n_connections, test_buffs, test_max_fsm,
+				 n_connections, test_buffs,
 				 DEVNULL, DEVNULL);
 		status = system(cmd);
 		if (status == 0)
 			break;
 	}
 	n_buffers = test_buffs;
-	n_fsm_pages = FSM_FOR_BUFS(n_buffers);
 
 	if ((n_buffers * (BLCKSZ / 1024)) % 1024 == 0)
-		printf("%dMB/%d\n", (n_buffers * (BLCKSZ / 1024)) / 1024, n_fsm_pages);
+		printf("%dMB\n", (n_buffers * (BLCKSZ / 1024)) / 1024);
 	else
-		printf("%dkB/%d\n", n_buffers * (BLCKSZ / 1024), n_fsm_pages);
+		printf("%dkB\n", n_buffers * (BLCKSZ / 1024));
 }
 
 /*
@@ -1164,9 +1154,6 @@ setup_config(void)
 				 n_buffers * (BLCKSZ / 1024));
 	conflines = replace_token(conflines, "#shared_buffers = 32MB", repltok);
 
-	snprintf(repltok, sizeof(repltok), "max_fsm_pages = %d", n_fsm_pages);
-	conflines = replace_token(conflines, "#max_fsm_pages = 204800", repltok);
-
 #if DEF_PGPORT != 5432
 	snprintf(repltok, sizeof(repltok), "#port = %d", DEF_PGPORT);
 	conflines = replace_token(conflines, "#port = 5432", repltok);
diff --git a/src/include/access/rmgr.h b/src/include/access/rmgr.h
index 7be2dfc9f65..6f018f0bee2 100644
--- a/src/include/access/rmgr.h
+++ b/src/include/access/rmgr.h
@@ -3,7 +3,7 @@
  *
  * Resource managers definition
  *
- * $PostgreSQL: pgsql/src/include/access/rmgr.h,v 1.17 2006/11/05 22:42:10 tgl Exp $
+ * $PostgreSQL: pgsql/src/include/access/rmgr.h,v 1.18 2008/09/30 10:52:13 heikki Exp $
  */
 #ifndef RMGR_H
 #define RMGR_H
@@ -23,6 +23,7 @@ typedef uint8 RmgrId;
 #define RM_DBASE_ID				4
 #define RM_TBLSPC_ID			5
 #define RM_MULTIXACT_ID			6
+#define RM_FREESPACE_ID			7
 #define RM_HEAP2_ID				9
 #define RM_HEAP_ID				10
 #define RM_BTREE_ID				11
diff --git a/src/include/storage/freespace.h b/src/include/storage/freespace.h
index 86dd22647c3..d417e8c9805 100644
--- a/src/include/storage/freespace.h
+++ b/src/include/storage/freespace.h
@@ -7,152 +7,32 @@
  * Portions Copyright (c) 1996-2008, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $PostgreSQL: pgsql/src/include/storage/freespace.h,v 1.28 2008/03/10 02:04:10 tgl Exp $
+ * $PostgreSQL: pgsql/src/include/storage/freespace.h,v 1.29 2008/09/30 10:52:13 heikki Exp $
  *
  *-------------------------------------------------------------------------
  */
 #ifndef FREESPACE_H_
 #define FREESPACE_H_
 
-#include "storage/relfilenode.h"
-#include "storage/itemptr.h"
+#include "utils/rel.h"
+#include "storage/bufpage.h"
+#include "access/xlog.h"
 
-
-/* Initial value for average-request moving average */
-#define INITIAL_AVERAGE ((Size) (BLCKSZ / 32))
-
-/*
- * Number of pages and bytes per allocation chunk.	Indexes can squeeze 50%
- * more pages into the same space because they don't need to remember how much
- * free space on each page.  The nominal number of pages, CHUNKPAGES, is for
- * regular rels, and INDEXCHUNKPAGES is for indexes.  CHUNKPAGES should be
- * even so that no space is wasted in the index case.
- */
-#define CHUNKPAGES	16
-#define CHUNKBYTES	(CHUNKPAGES * sizeof(FSMPageData))
-#define INDEXCHUNKPAGES ((int) (CHUNKBYTES / sizeof(IndexFSMPageData)))
-
-
-/*
- * Typedefs and macros for items in the page-storage arena.  We use the
- * existing ItemPointer and BlockId data structures, which are designed
- * to pack well (they should be 6 and 4 bytes apiece regardless of machine
- * alignment issues).  Unfortunately we can't use the ItemPointer access
- * macros, because they include Asserts insisting that ip_posid != 0.
- */
-typedef ItemPointerData FSMPageData;
-typedef BlockIdData IndexFSMPageData;
-
-#define FSMPageGetPageNum(ptr)	\
-	BlockIdGetBlockNumber(&(ptr)->ip_blkid)
-#define FSMPageGetSpace(ptr)	\
-	((Size) (ptr)->ip_posid)
-#define FSMPageSetPageNum(ptr, pg)	\
-	BlockIdSet(&(ptr)->ip_blkid, pg)
-#define FSMPageSetSpace(ptr, sz)	\
-	((ptr)->ip_posid = (OffsetNumber) (sz))
-#define IndexFSMPageGetPageNum(ptr) \
-	BlockIdGetBlockNumber(ptr)
-#define IndexFSMPageSetPageNum(ptr, pg) \
-	BlockIdSet(ptr, pg)
-
-/*
- * Shared free-space-map objects
- *
- * The per-relation objects are indexed by a hash table, and are also members
- * of two linked lists: one ordered by recency of usage (most recent first),
- * and the other ordered by physical location of the associated storage in
- * the page-info arena.
- *
- * Each relation owns one or more chunks of per-page storage in the "arena".
- * The chunks for each relation are always consecutive, so that it can treat
- * its page storage as a simple array.	We further insist that its page data
- * be ordered by block number, so that binary search is possible.
- *
- * Note: we handle pointers to these items as pointers, not as SHMEM_OFFSETs.
- * This assumes that all processes accessing the map will have the shared
- * memory segment mapped at the same place in their address space.
- */
-typedef struct FSMHeader FSMHeader;
-typedef struct FSMRelation FSMRelation;
-
-/* Header for whole map */
-struct FSMHeader
-{
-	FSMRelation *usageList;		/* FSMRelations in usage-recency order */
-	FSMRelation *usageListTail; /* tail of usage-recency list */
-	FSMRelation *firstRel;		/* FSMRelations in arena storage order */
-	FSMRelation *lastRel;		/* tail of storage-order list */
-	int			numRels;		/* number of FSMRelations now in use */
-	double		sumRequests;	/* sum of requested chunks over all rels */
-	char	   *arena;			/* arena for page-info storage */
-	int			totalChunks;	/* total size of arena, in chunks */
-	int			usedChunks;		/* # of chunks assigned */
-	/* NB: there are totalChunks - usedChunks free chunks at end of arena */
-};
-
-/*
- * Per-relation struct --- this is an entry in the shared hash table.
- * The hash key is the RelFileNode value (hence, we look at the physical
- * relation ID, not the logical ID, which is appropriate).
- */
-struct FSMRelation
-{
-	RelFileNode key;			/* hash key (must be first) */
-	FSMRelation *nextUsage;		/* next rel in usage-recency order */
-	FSMRelation *priorUsage;	/* prior rel in usage-recency order */
-	FSMRelation *nextPhysical;	/* next rel in arena-storage order */
-	FSMRelation *priorPhysical; /* prior rel in arena-storage order */
-	bool		isIndex;		/* if true, we store only page numbers */
-	Size		avgRequest;		/* moving average of space requests */
-	BlockNumber interestingPages;		/* # of pages with useful free space */
-	int			firstChunk;		/* chunk # of my first chunk in arena */
-	int			storedPages;	/* # of pages stored in arena */
-	int			nextPage;		/* index (from 0) to start next search at */
-};
-
-
-
-/* GUC variables */
-extern PGDLLIMPORT int MaxFSMRelations;
-extern PGDLLIMPORT int MaxFSMPages;
-
-
-/*
- * function prototypes
- */
-extern void InitFreeSpaceMap(void);
-extern Size FreeSpaceShmemSize(void);
-extern FSMHeader *GetFreeSpaceMap(void);
-
-extern BlockNumber GetPageWithFreeSpace(RelFileNode *rel, Size spaceNeeded);
-extern BlockNumber RecordAndGetPageWithFreeSpace(RelFileNode *rel,
+/* prototypes for public functions in freespace.c */
+extern Size GetRecordedFreeSpace(Relation rel, BlockNumber heapBlk);
+extern BlockNumber GetPageWithFreeSpace(Relation rel, Size spaceNeeded);
+extern BlockNumber RecordAndGetPageWithFreeSpace(Relation rel,
 							  BlockNumber oldPage,
 							  Size oldSpaceAvail,
 							  Size spaceNeeded);
-extern Size GetAvgFSMRequestSize(RelFileNode *rel);
-extern void RecordRelationFreeSpace(RelFileNode *rel,
-						BlockNumber interestingPages,
-						int nPages,
-						FSMPageData *pageSpaces);
-
-extern BlockNumber GetFreeIndexPage(RelFileNode *rel);
-extern void RecordIndexFreeSpace(RelFileNode *rel,
-					 BlockNumber interestingPages,
-					 int nPages,
-					 BlockNumber *pages);
-
-extern void FreeSpaceMapTruncateRel(RelFileNode *rel, BlockNumber nblocks);
-extern void FreeSpaceMapForgetRel(RelFileNode *rel);
-extern void FreeSpaceMapForgetDatabase(Oid dbid);
-
-extern void PrintFreeSpaceMapStatistics(int elevel);
+extern void RecordPageWithFreeSpace(Relation rel, BlockNumber heapBlk,
+									Size spaceAvail);
 
-extern void DumpFreeSpaceMap(int code, Datum arg);
-extern void LoadFreeSpaceMap(void);
+extern void FreeSpaceMapTruncateRel(Relation rel, BlockNumber nblocks);
+extern void FreeSpaceMapVacuum(Relation rel);
 
-#ifdef FREESPACE_DEBUG
-extern void DumpFreeSpace(void);
-#endif
+/* WAL prototypes */
+extern void fsm_desc(StringInfo buf, uint8 xl_info, char *rec);
+extern void fsm_redo(XLogRecPtr lsn, XLogRecord *record);
 
 #endif   /* FREESPACE_H */
diff --git a/src/include/storage/fsm_internals.h b/src/include/storage/fsm_internals.h
new file mode 100644
index 00000000000..e7fbbf2b9bf
--- /dev/null
+++ b/src/include/storage/fsm_internals.h
@@ -0,0 +1,73 @@
+/*-------------------------------------------------------------------------
+ *
+ * fsm_internals.h
+ *	  internal functions for free space map
+ *
+ *
+ * Portions Copyright (c) 1996-2008, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * $PostgreSQL: pgsql/src/include/storage/fsm_internals.h,v 1.1 2008/09/30 10:52:14 heikki Exp $
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef FSM_INTERNALS_H
+#define FSM_INTERNALS_H
+
+#include "storage/buf.h"
+#include "storage/bufpage.h"
+#include "lib/stringinfo.h"
+
+/*
+ * Structure of a FSM page. See src/backend/storage/freespace/README for
+ * details.
+ */
+typedef struct
+{
+	/*
+	 * fsm_search_avail() tries to spread the load of multiple backends
+	 * by returning different pages to different backends in a round-robin
+	 * fashion. fp_next_slot points to the next slot to be returned
+	 * (assuming there's enough space on it for the request). It's defined
+	 * as an int, because it's updated without an exclusive lock. uint16
+	 * would be more appropriate, but int is more likely to be atomically
+	 * fetchable/storable.
+	 */
+	int fp_next_slot;
+
+	/*
+	 * fp_nodes contains the binary tree, stored as an array. The first
+	 * NonLeafNodesPerPage elements are upper nodes, and the following
+	 * LeafNodesPerPage elements are leaf nodes. Unused nodes are zero.
+	 */
+	uint8	fp_nodes[1];
+} FSMPageData;
+
+typedef FSMPageData *FSMPage;
+
+/*
+ * Number of non-leaf and leaf nodes, and nodes in total, on an FSM page.
+ * These definitions are internal to fsmpage.c.
+ */
+#define NodesPerPage (BLCKSZ - MAXALIGN(SizeOfPageHeaderData) - \
+					  offsetof(FSMPageData, fp_nodes))
+
+#define NonLeafNodesPerPage (BLCKSZ / 2 - 1)
+#define LeafNodesPerPage (NodesPerPage - NonLeafNodesPerPage)
+
+/*
+ * Number of FSM "slots" on a FSM page. This is what should be used
+ * outside fsmpage.c.
+ */
+#define SlotsPerFSMPage LeafNodesPerPage
+
+/* Prototypes for functions in fsmpage.c */
+extern int fsm_search_avail(Buffer buf, uint8 min_cat, bool advancenext,
+							bool exclusive_lock_held);
+extern uint8 fsm_get_avail(Page page, int slot);
+extern uint8 fsm_get_max_avail(Page page);
+extern bool fsm_set_avail(Page page, int slot, uint8 value);
+extern bool fsm_truncate_avail(Page page, int nslots);
+extern bool fsm_rebuild_page(Page page);
+
+#endif   /* FSM_INTERNALS_H */
diff --git a/src/include/storage/indexfsm.h b/src/include/storage/indexfsm.h
new file mode 100644
index 00000000000..76bb26f7bc8
--- /dev/null
+++ b/src/include/storage/indexfsm.h
@@ -0,0 +1,27 @@
+/*-------------------------------------------------------------------------
+ *
+ * indexfsm.h
+ *	  POSTGRES free space map for quickly finding an unused page in an index
+ *
+ *
+ * Portions Copyright (c) 1996-2008, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * $PostgreSQL: pgsql/src/include/storage/indexfsm.h,v 1.1 2008/09/30 10:52:14 heikki Exp $
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef INDEXFSM_H_
+#define INDEXFSM_H_
+
+#include "utils/rel.h"
+
+extern void InitIndexFreeSpaceMap(Relation rel);
+
+extern BlockNumber GetFreeIndexPage(Relation rel);
+extern void RecordFreeIndexPage(Relation rel, BlockNumber page);
+extern void RecordUsedIndexPage(Relation rel, BlockNumber page);
+
+extern void IndexFreeSpaceMapTruncate(Relation rel, BlockNumber nblocks);
+
+#endif   /* INDEXFSM_H_ */
diff --git a/src/include/storage/lwlock.h b/src/include/storage/lwlock.h
index b1088fcd33d..5f993fa2bac 100644
--- a/src/include/storage/lwlock.h
+++ b/src/include/storage/lwlock.h
@@ -7,7 +7,7 @@
  * Portions Copyright (c) 1996-2008, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $PostgreSQL: pgsql/src/include/storage/lwlock.h,v 1.39 2008/06/19 21:32:56 tgl Exp $
+ * $PostgreSQL: pgsql/src/include/storage/lwlock.h,v 1.40 2008/09/30 10:52:14 heikki Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -35,6 +35,10 @@
  * by allowing values not listed in the enum declaration to be assigned.
  * The extra value MaxDynamicLWLock is there to keep the compiler from
  * deciding that the enum can be represented as char or short ...
+ *
+ * If you remove a lock, please replace it with a placeholder, as was done
+ * for FreeSpaceLock. This retains the lock numbering, which is helpful for
+ * DTrace and other external debugging scripts.
  */
 typedef enum LWLockId
 {
@@ -45,7 +49,7 @@ typedef enum LWLockId
 	ProcArrayLock,
 	SInvalReadLock,
 	SInvalWriteLock,
-	FreeSpaceLock,
+	UnusedLock1,				/* FreeSpaceLock used to be here */
 	WALInsertLock,
 	WALWriteLock,
 	ControlFileLock,
diff --git a/src/include/storage/relfilenode.h b/src/include/storage/relfilenode.h
index 8ac8147ed93..571f261c3d8 100644
--- a/src/include/storage/relfilenode.h
+++ b/src/include/storage/relfilenode.h
@@ -7,7 +7,7 @@
  * Portions Copyright (c) 1996-2008, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $PostgreSQL: pgsql/src/include/storage/relfilenode.h,v 1.16 2008/08/11 11:05:11 heikki Exp $
+ * $PostgreSQL: pgsql/src/include/storage/relfilenode.h,v 1.17 2008/09/30 10:52:14 heikki Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -23,11 +23,12 @@
 typedef enum ForkNumber
 {
 	InvalidForkNumber = -1,
-	MAIN_FORKNUM = 0
-	/* NOTE: change NUM_FORKS below when you add new forks */
+	MAIN_FORKNUM = 0,
+	FSM_FORKNUM
+	/* NOTE: change MAX_FORKNUM below when you add new forks */
 } ForkNumber;
 
-#define MAX_FORKNUM		MAIN_FORKNUM
+#define MAX_FORKNUM		FSM_FORKNUM
 
 /*
  * RelFileNode must provide all that we need to know to physically access
diff --git a/src/include/utils/guc_tables.h b/src/include/utils/guc_tables.h
index 436b003286c..0eca0f54a3b 100644
--- a/src/include/utils/guc_tables.h
+++ b/src/include/utils/guc_tables.h
@@ -7,7 +7,7 @@
  *
  * Portions Copyright (c) 1996-2008, PostgreSQL Global Development Group
  *
- *	  $PostgreSQL: pgsql/src/include/utils/guc_tables.h,v 1.42 2008/09/10 18:09:20 alvherre Exp $
+ *	  $PostgreSQL: pgsql/src/include/utils/guc_tables.h,v 1.43 2008/09/30 10:52:14 heikki Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -49,7 +49,6 @@ enum config_group
 	CONN_AUTH_SECURITY,
 	RESOURCES,
 	RESOURCES_MEM,
-	RESOURCES_FSM,
 	RESOURCES_KERNEL,
 	WAL,
 	WAL_SETTINGS,
diff --git a/src/include/utils/rel.h b/src/include/utils/rel.h
index 0d9d75dd8b1..71ad936d274 100644
--- a/src/include/utils/rel.h
+++ b/src/include/utils/rel.h
@@ -7,7 +7,7 @@
  * Portions Copyright (c) 1996-2008, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $PostgreSQL: pgsql/src/include/utils/rel.h,v 1.107 2008/06/19 00:46:06 alvherre Exp $
+ * $PostgreSQL: pgsql/src/include/utils/rel.h,v 1.108 2008/09/30 10:52:14 heikki Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -195,6 +195,9 @@ typedef struct RelationData
 	List	   *rd_indpred;		/* index predicate tree, if any */
 	void	   *rd_amcache;		/* available for use by index AM */
 
+	/* Cached last-seen size of the FSM */
+	BlockNumber	rd_fsm_nblocks_cache;
+
 	/* use "struct" here to avoid needing to include pgstat.h: */
 	struct PgStat_TableStatus *pgstat_info;		/* statistics collection area */
 } RelationData;
-- 
GitLab