diff --git a/src/backend/storage/page/Makefile b/src/backend/storage/page/Makefile
index a59a450643de9f5079b09f472b3d6373041a9387..82d9c37fda52cc33435b9b529b0695dccfe99417 100644
--- a/src/backend/storage/page/Makefile
+++ b/src/backend/storage/page/Makefile
@@ -12,6 +12,6 @@ subdir = src/backend/storage/page
 top_builddir = ../../../..
 include $(top_builddir)/src/Makefile.global
 
-OBJS =  bufpage.o itemptr.o
+OBJS =  bufpage.o checksum.o itemptr.o
 
 include $(top_srcdir)/src/backend/common.mk
diff --git a/src/backend/storage/page/bufpage.c b/src/backend/storage/page/bufpage.c
index 326d3de88895c4eaa98bfa0e6bca729a707be01c..f0e365379a489b5ca18b4264e8819dcfc2ad5e44 100644
--- a/src/backend/storage/page/bufpage.c
+++ b/src/backend/storage/page/bufpage.c
@@ -16,6 +16,7 @@
 
 #include "access/htup_details.h"
 #include "access/xlog.h"
+#include "storage/checksum.h"
 
 bool ignore_checksum_failure = false;
 
@@ -948,33 +949,30 @@ PageSetChecksumInplace(Page page, BlockNumber blkno)
 static uint16
 PageCalcChecksum16(Page page, BlockNumber blkno)
 {
-	pg_crc32    		crc;
-	PageHeader	p = (PageHeader) page;
+	PageHeader	phdr   = (PageHeader) page;
+	uint16		save_checksum;
+	uint32		checksum;
 
 	/* only calculate the checksum for properly-initialized pages */
 	Assert(!PageIsNew(page));
 
-	INIT_CRC32(crc);
-
 	/*
-	 * Initialize the checksum calculation with the block number. This helps
-	 * catch corruption from whole blocks being transposed with other whole
-	 * blocks.
+	 * Save pd_checksum and set it to zero, so that the checksum calculation
+	 * isn't affected by the checksum stored on the page. We do this to
+	 * allow optimization of the checksum calculation on the whole block
+	 * in one go.
 	 */
-	COMP_CRC32(crc, &blkno, sizeof(blkno));
+	save_checksum = phdr->pd_checksum;
+	phdr->pd_checksum = 0;
+	checksum = checksum_block(page, BLCKSZ);
+	phdr->pd_checksum = save_checksum;
 
-	/*
-	 * Now add in the LSN, which is always the first field on the page.
-	 */
-	COMP_CRC32(crc, page, sizeof(p->pd_lsn));
+	/* mix in the block number to detect transposed pages */
+	checksum ^= blkno;
 
 	/*
-	 * Now add the rest of the page, skipping the pd_checksum field.
+	 * Reduce to a uint16 (to fit in the pd_checksum field) with an offset of
+	 * one. That avoids checksums of zero, which seems like a good idea.
 	 */
-	COMP_CRC32(crc, page + sizeof(p->pd_lsn) + sizeof(p->pd_checksum),
-				  BLCKSZ - sizeof(p->pd_lsn) - sizeof(p->pd_checksum));
-
-	FIN_CRC32(crc);
-
-	return (uint16) crc;
+	return (checksum % 65535) + 1;
 }
diff --git a/src/backend/storage/page/checksum.c b/src/backend/storage/page/checksum.c
new file mode 100644
index 0000000000000000000000000000000000000000..d9348ee3c29bc33a88a5a48690dc563497ffeaf6
--- /dev/null
+++ b/src/backend/storage/page/checksum.c
@@ -0,0 +1,160 @@
+/*-------------------------------------------------------------------------
+ *
+ * checksum.c
+ *	  Checksum implementation for data pages.
+ *
+ * Portions Copyright (c) 1996-2013, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ *	  src/backend/storage/page/checksum.c
+ *
+ *-------------------------------------------------------------------------
+ *
+ * Checksum algorithm
+ *
+ * The algorithm used to checksum pages is chosen for very fast calculation.
+ * Workloads where the database working set fits into OS file cache but not
+ * into shared buffers can read in pages at a very fast pace and the checksum
+ * algorithm itself can become the largest bottleneck.
+ *
+ * The checksum algorithm itself is based on the FNV-1a hash (FNV is shorthand
+ * for Fowler/Noll/Vo) The primitive of a plain FNV-1a hash folds in data 1
+ * byte at a time according to the formula:
+ *
+ *     hash = (hash ^ value) * FNV_PRIME
+ *
+ * FNV-1a algorithm is described at http://www.isthe.com/chongo/tech/comp/fnv/
+ *
+ * PostgreSQL doesn't use FNV-1a hash directly because it has bad mixing of
+ * high bits - high order bits in input data only affect high order bits in
+ * output data. To resolve this we xor in the value prior to multiplication
+ * shifted right by 17 bits. The number 17 was chosen because it doesn't
+ * have common denominator with set bit positions in FNV_PRIME and empirically
+ * provides the fastest mixing for high order bits of final iterations quickly
+ * avalanche into lower positions. For performance reasons we choose to combine
+ * 4 bytes at a time. The actual hash formula used as the basis is:
+ *
+ *     hash = (hash ^ value) * FNV_PRIME ^ ((hash ^ value) >> 17)
+ *
+ * The main bottleneck in this calculation is the multiplication latency. To
+ * hide the latency and to make use of SIMD parallelism multiple hash values
+ * are calculated in parallel. The page is treated as a 32 column two
+ * dimensional array of 32 bit values. Each column is aggregated separately
+ * into a partial checksum. Each partial checksum uses a different initial
+ * value (offset basis in FNV terminology). The initial values actually used
+ * were chosen randomly, as the values themselves don't matter as much as that
+ * they are different and don't match anything in real data. After initializing
+ * partial checksums each value in the column is aggregated according to the
+ * above formula. Finally two more iterations of the formula are performed with
+ * value 0 to mix the bits of the last value added.
+ *
+ * The partial checksums are then folded together using xor to form a single
+ * 32-bit checksum. The caller can safely reduce the value to 16 bits
+ * using modulo 2^16-1. That will cause a very slight bias towards lower
+ * values but this is not significant for the performance of the
+ * checksum.
+ *
+ * The algorithm choice was based on what instructions are available in SIMD
+ * instruction sets. This meant that a fast and good algorithm needed to use
+ * multiplication as the main mixing operator. The simplest multiplication
+ * based checksum primitive is the one used by FNV. The prime used is chosen
+ * for good dispersion of values. It has no known simple patterns that result
+ * in collisions. Test of 5-bit differentials of the primitive over 64bit keys
+ * reveals no differentials with 3 or more values out of 100000 random keys
+ * colliding. Avalanche test shows that only high order bits of the last word
+ * have a bias. Tests of 1-4 uncorrelated bit errors, stray 0 and 0xFF bytes,
+ * overwriting page from random position to end with 0 bytes, and overwriting
+ * random segments of page with 0x00, 0xFF and random data all show optimal
+ * 2e-16 false positive rate within margin of error.
+ *
+ * Vectorization of the algorithm requires 32bit x 32bit -> 32bit integer
+ * multiplication instruction. As of 2013 the corresponding instruction is
+ * available on x86 SSE4.1 extensions (pmulld) and ARM NEON (vmul.i32).
+ * Vectorization requires a compiler to do the vectorization for us. For recent
+ * GCC versions the flags -msse4.1 -funroll-loops -ftree-vectorize are enough
+ * to achieve vectorization.
+ *
+ * The optimal amount of parallelism to use depends on CPU specific instruction
+ * latency, SIMD instruction width, throughput and the amount of registers
+ * available to hold intermediate state. Generally, more parallelism is better
+ * up to the point that state doesn't fit in registers and extra load-store
+ * instructions are needed to swap values in/out. The number chosen is a fixed
+ * part of the algorithm because changing the parallelism changes the checksum
+ * result.
+ *
+ * The parallelism number 32 was chosen based on the fact that it is the
+ * largest state that fits into architecturally visible x86 SSE registers while
+ * leaving some free registers for intermediate values. For future processors
+ * with 256bit vector registers this will leave some performance on the table.
+ * When vectorization is not available it might be beneficial to restructure
+ * the computation to calculate a subset of the columns at a time and perform
+ * multiple passes to avoid register spilling. This optimization opportunity
+ * is not used. Current coding also assumes that the compiler has the ability
+ * to unroll the inner loop to avoid loop overhead and minimize register
+ * spilling. For less sophisticated compilers it might be beneficial to manually
+ * unroll the inner loop.
+ */
+#include "postgres.h"
+
+#include "storage/checksum.h"
+
+/* number of checksums to calculate in parallel */
+#define N_SUMS 32
+/* prime multiplier of FNV-1a hash */
+#define FNV_PRIME 16777619
+
+/*
+ * Base offsets to initialize each of the parallel FNV hashes into a
+ * different initial state.
+ */
+static const uint32 checksumBaseOffsets[N_SUMS] = {
+	0x5B1F36E9, 0xB8525960, 0x02AB50AA, 0x1DE66D2A,
+	0x79FF467A, 0x9BB9F8A3, 0x217E7CD2, 0x83E13D2C,
+	0xF8D4474F, 0xE39EB970, 0x42C6AE16, 0x993216FA,
+	0x7B093B5D, 0x98DAFF3C, 0xF718902A, 0x0B1C9CDB,
+	0xE58F764B, 0x187636BC, 0x5D7B3BB1, 0xE73DE7DE,
+	0x92BEC979, 0xCCA6C0B2, 0x304A0979, 0x85AA43D4,
+	0x783125BB, 0x6CA8EAA2, 0xE407EAC6, 0x4B5CFC3E,
+	0x9FBF8C76, 0x15CA20BE, 0xF2CA9FD3, 0x959BD756
+};
+
+/*
+ * Calculate one round of the checksum.
+ */
+#define CHECKSUM_COMP(checksum, value) do {\
+	uint32 __tmp = (checksum) ^ (value);\
+	(checksum) = __tmp * FNV_PRIME ^ (__tmp >> 17);\
+} while (0)
+
+uint32
+checksum_block(char *data, uint32 size)
+{
+	uint32 sums[N_SUMS];
+	uint32 (*dataArr)[N_SUMS] = (uint32 (*)[N_SUMS]) data;
+	uint32 result = 0;
+	int i, j;
+
+	/* ensure that the size is compatible with the algorithm */
+	Assert((size % (sizeof(uint32)*N_SUMS)) == 0);
+
+	/* initialize partial checksums to their corresponding offsets */
+	memcpy(sums, checksumBaseOffsets, sizeof(checksumBaseOffsets));
+
+	/* main checksum calculation */
+	for (i = 0; i < size/sizeof(uint32)/N_SUMS; i++)
+		for (j = 0; j < N_SUMS; j++)
+			CHECKSUM_COMP(sums[j], dataArr[i][j]);
+
+	/* finally add in two rounds of zeroes for additional mixing */
+	for (i = 0; i < 2; i++)
+		for (j = 0; j < N_SUMS; j++)
+			CHECKSUM_COMP(sums[j], 0);
+
+	/* xor fold partial checksums together */
+	for (i = 0; i < N_SUMS; i++)
+		result ^= sums[i];
+
+	return result;
+}
diff --git a/src/include/storage/checksum.h b/src/include/storage/checksum.h
new file mode 100644
index 0000000000000000000000000000000000000000..e41fd9804b94513604f60e1dd0523e690b469a29
--- /dev/null
+++ b/src/include/storage/checksum.h
@@ -0,0 +1,23 @@
+/*-------------------------------------------------------------------------
+ *
+ * checksum.h
+ *	  Checksum implementation for data pages.
+ *
+ *
+ * Portions Copyright (c) 1996-2013, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/storage/checksum.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef CHECKSUM_H
+#define CHECKSUM_H
+
+/*
+ * Fowler-Noll-Vo 1a block checksum algorithm. The data argument should be
+ * aligned on a 4-byte boundary.
+ */
+extern uint32 checksum_block(char *data, uint32 size);
+
+#endif   /* CHECKSUM_H */