diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index 4404aa53d36e9ceb52ccb41816765477d60dff93..dc7749522db3bc65a3759d30d991d1257890dece 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -7,7 +7,7 @@
  *
  *
  * IDENTIFICATION
- *	  $Header: /cvsroot/pgsql/src/backend/storage/buffer/bufmgr.c,v 1.70 2000/01/15 02:59:33 petere Exp $
+ *	  $Header: /cvsroot/pgsql/src/backend/storage/buffer/bufmgr.c,v 1.71 2000/01/17 01:15:17 inoue Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -74,6 +74,18 @@ static int	WriteMode = BUFFER_LATE_WRITE;		/* Delayed write is
 
 static void WaitIO(BufferDesc *buf, SPINLOCK spinlock);
 
+static void StartBufferIO(BufferDesc *buf, bool forInput);
+static void TerminateBufferIO(BufferDesc *buf);
+static void ContinueBufferIO(BufferDesc *buf, bool forInput);
+extern void InitBufferIO(void);
+extern void AbortBufferIO(void);
+
+/*
+ * Macro : BUFFER_IS_BROKEN
+ *      Note that write error doesn't mean the buffer broken
+*/
+#define BUFFER_IS_BROKEN(buf) ((buf->flags & BM_IO_ERROR) && !(buf->flags & BM_DIRTY))
+
 #ifndef HAS_TEST_AND_SET
 static void SignalIO(BufferDesc *buf);
 extern long *NWaitIOBackendP;	/* defined in buf_init.c */
@@ -312,12 +324,7 @@ ReadBufferWithBufferLock(Relation reln,
 	}
 
 	/* If anyone was waiting for IO to complete, wake them up now */
-#ifdef HAS_TEST_AND_SET
-	S_UNLOCK(&(bufHdr->io_in_progress_lock));
-#else
-	if (bufHdr->refcount > 1)
-		SignalIO(bufHdr);
-#endif
+	TerminateBufferIO(bufHdr);
 
 	SpinRelease(BufMgrLock);
 
@@ -375,13 +382,19 @@ BufferAlloc(Relation reln,
 		inProgress = (buf->flags & BM_IO_IN_PROGRESS);
 
 		*foundPtr = TRUE;
-		if (inProgress)
+		if (inProgress) /* confirm end of IO */
 		{
 			WaitIO(buf, BufMgrLock);
-			if (buf->flags & BM_IO_ERROR)
-			{
+			inProgress = (buf->flags & BM_IO_IN_PROGRESS);
+		}
+		if (BUFFER_IS_BROKEN(buf))
+		{
+			/* I couldn't understand the following old comment.
+			 * If there's no IO for the buffer and the buffer
+			 * is BROKEN,it should be read again. So start a
+			 * new buffer IO here. 
 
-				/*
+				 *
 				 * wierd race condition:
 				 *
 				 * We were waiting for someone else to read the buffer. While
@@ -396,13 +409,14 @@ BufferAlloc(Relation reln,
 				 *
 				 * This is never going to happen, don't worry about it.
 				 */
-				*foundPtr = FALSE;
-			}
+			*foundPtr = FALSE;
 		}
 #ifdef BMTRACE
 		_bm_trace((reln->rd_rel->relisshared ? 0 : MyDatabaseId), RelationGetRelid(reln), blockNum, BufferDescriptorGetBuffer(buf), BMT_ALLOCFND);
 #endif	 /* BMTRACE */
 
+		if (!(*foundPtr))
+			StartBufferIO(buf, true);
 		SpinRelease(BufMgrLock);
 
 		return buf;
@@ -454,7 +468,6 @@ BufferAlloc(Relation reln,
 			 * in WaitIO until we're done.
 			 */
 			inProgress = TRUE;
-			buf->flags |= BM_IO_IN_PROGRESS;
 #ifdef HAS_TEST_AND_SET
 
 			/*
@@ -462,9 +475,8 @@ BufferAlloc(Relation reln,
 			 * since no one had it pinned (it just came off the free
 			 * list), no one else can have this lock.
 			 */
-			Assert(S_LOCK_FREE(&(buf->io_in_progress_lock)));
-			S_LOCK(&(buf->io_in_progress_lock));
 #endif	 /* HAS_TEST_AND_SET */
+			StartBufferIO(buf, false);
 
 			/*
 			 * Write the buffer out, being careful to release BufMgrLock
@@ -486,12 +498,7 @@ BufferAlloc(Relation reln,
 				inProgress = FALSE;
 				buf->flags |= BM_IO_ERROR;
 				buf->flags &= ~BM_IO_IN_PROGRESS;
-#ifdef HAS_TEST_AND_SET
-				S_UNLOCK(&(buf->io_in_progress_lock));
-#else							/* !HAS_TEST_AND_SET */
-				if (buf->refcount > 1)
-					SignalIO(buf);
-#endif	 /* !HAS_TEST_AND_SET */
+				TerminateBufferIO(buf);
 				PrivateRefCount[BufferDescriptorGetBuffer(buf) - 1] = 0;
 				Assert(buf->refcount > 0);
 				buf->refcount--;
@@ -533,12 +540,7 @@ BufferAlloc(Relation reln,
 			{
 				inProgress = FALSE;
 				buf->flags &= ~BM_IO_IN_PROGRESS;
-#ifdef HAS_TEST_AND_SET
-				S_UNLOCK(&(buf->io_in_progress_lock));
-#else							/* !HAS_TEST_AND_SET */
-				if (buf->refcount > 1)
-					SignalIO(buf);
-#endif	 /* !HAS_TEST_AND_SET */
+				TerminateBufferIO(buf);
 				PrivateRefCount[BufferDescriptorGetBuffer(buf) - 1] = 0;
 				buf->refcount--;
 				buf = (BufferDesc *) NULL;
@@ -563,12 +565,7 @@ BufferAlloc(Relation reln,
 				 */
 				if (buf != NULL)
 				{
-#ifdef HAS_TEST_AND_SET
-					S_UNLOCK(&(buf->io_in_progress_lock));
-#else							/* !HAS_TEST_AND_SET */
-					if (buf->refcount > 1)
-						SignalIO(buf);
-#endif	 /* !HAS_TEST_AND_SET */
+					TerminateBufferIO(buf);
 					/* give up the buffer since we don't need it any more */
 					PrivateRefCount[BufferDescriptorGetBuffer(buf) - 1] = 0;
 					Assert(buf->refcount > 0);
@@ -588,10 +585,13 @@ BufferAlloc(Relation reln,
 				if (inProgress)
 				{
 					WaitIO(buf2, BufMgrLock);
-					if (buf2->flags & BM_IO_ERROR)
-						*foundPtr = FALSE;
+					inProgress = (buf2->flags & BM_IO_IN_PROGRESS);
 				}
+				if (BUFFER_IS_BROKEN(buf2))
+					*foundPtr = FALSE;
 
+				if (!(*foundPtr))
+					StartBufferIO(buf2, true);
 				SpinRelease(BufMgrLock);
 
 				return buf2;
@@ -640,12 +640,10 @@ BufferAlloc(Relation reln,
 	 */
 	if (!inProgress)
 	{
-		buf->flags |= BM_IO_IN_PROGRESS;
-#ifdef HAS_TEST_AND_SET
-		Assert(S_LOCK_FREE(&(buf->io_in_progress_lock)));
-		S_LOCK(&(buf->io_in_progress_lock));
-#endif	 /* HAS_TEST_AND_SET */
+		StartBufferIO(buf, true);
 	}
+	else
+		ContinueBufferIO(buf, true);
 
 #ifdef BMTRACE
 	_bm_trace((reln->rd_rel->relisshared ? 0 : MyDatabaseId), RelationGetRelid(reln), blockNum, BufferDescriptorGetBuffer(buf), BMT_ALLOCNOTFND);
@@ -806,7 +804,9 @@ FlushBuffer(Buffer buffer, bool release)
 
 	/* To check if block content changed while flushing. - vadim 01/17/97 */
 	SpinAcquire(BufMgrLock);
+	WaitIO(bufHdr, BufMgrLock);	/* confirm end of IO */
 	bufHdr->flags &= ~BM_JUST_DIRTIED;
+	StartBufferIO(bufHdr, false);	/* output IO start */
 	SpinRelease(BufMgrLock);
 
 	status = smgrflush(DEFAULT_SMGR, bufrel, bufHdr->tag.blockNum,
@@ -824,6 +824,8 @@ FlushBuffer(Buffer buffer, bool release)
 	BufferFlushCount++;
 
 	SpinAcquire(BufMgrLock);
+	bufHdr->flags &= ~BM_IO_IN_PROGRESS;	/* mark IO finished */
+	TerminateBufferIO(bufHdr);		/* output IO finished */
 
 	/*
 	 * If this buffer was marked by someone as DIRTY while we were
@@ -998,7 +1000,9 @@ BufferSync()
 				 * To check if block content changed while flushing (see
 				 * below). - vadim 01/17/97
 				 */
+				WaitIO(bufHdr, BufMgrLock);	/* confirm end of IO */
 				bufHdr->flags &= ~BM_JUST_DIRTIED;
+				StartBufferIO(bufHdr, false);	/* output IO start */
 
 				/*
 				 * If we didn't have the reldesc in our local cache, flush
@@ -1034,6 +1038,8 @@ BufferSync()
 					elog(ERROR, "BufferSync: cannot write %u for %s",
 						 bufHdr->tag.blockNum, bufHdr->sb_relname);
 				}
+				bufHdr->flags &= ~BM_IO_IN_PROGRESS;	/* mark IO finished */
+				TerminateBufferIO(bufHdr);	/* Sync IO finished */
 				BufferFlushCount++;
 
 				/*
@@ -1084,10 +1090,16 @@ BufferSync()
 static void
 WaitIO(BufferDesc *buf, SPINLOCK spinlock)
 {
-	SpinRelease(spinlock);
-	S_LOCK(&(buf->io_in_progress_lock));
-	S_UNLOCK(&(buf->io_in_progress_lock));
-	SpinAcquire(spinlock);
+	/*
+	 * Changed to wait until there's no IO - Inoue 01/13/2000
+	 */
+	while ((buf->flags & BM_IO_IN_PROGRESS) != 0)
+	{
+		SpinRelease(spinlock);
+		S_LOCK(&(buf->io_in_progress_lock));
+		S_UNLOCK(&(buf->io_in_progress_lock));
+		SpinAcquire(spinlock);
+	}
 }
 
 #else							/* !HAS_TEST_AND_SET */
@@ -2163,3 +2175,112 @@ LockBuffer(Buffer buffer, int mode)
 #endif
 
 }
+
+/*
+ *	Functions for IO error handling
+ *
+ *	Note : We assume that nested buffer IO never occur.
+ *	i.e at most one io_in_progress spinlock is held
+ *	per proc.
+*/
+static	BufferDesc *InProgressBuf = (BufferDesc *)NULL;
+static	bool	IsForInput;
+
+/*
+ * Function:StartBufferIO
+ *	(Assumptions)
+ *	My process is executing no IO
+ *	BufMgrLock is held
+ *	BM_IO_IN_PROGRESS mask is not set for the buffer 
+ *	The buffer is Pinned
+ *
+*/
+static void StartBufferIO(BufferDesc *buf, bool forInput)
+{
+	Assert(!InProgressBuf);
+	Assert(!(buf->flags & BM_IO_IN_PROGRESS));
+	buf->flags |= BM_IO_IN_PROGRESS;
+#ifdef	HAS_TEST_AND_SET
+	Assert(S_LOCK_FREE(&(buf->io_in_progress_lock)))
+	S_LOCK(&(buf->io_in_progress_lock));
+#endif /* HAS_TEST_AND_SET */
+	InProgressBuf = buf;
+	IsForInput = forInput;
+}
+
+/*
+ * Function:TerminateBufferIO
+ *	(Assumptions)
+ *	My process is executing IO for the buffer
+ *	BufMgrLock is held
+ *	The buffer is Pinned
+ *
+*/
+static void TerminateBufferIO(BufferDesc *buf)
+{
+	Assert(buf == InProgressBuf);
+#ifdef	HAS_TEST_AND_SET
+	S_UNLOCK(&(buf->io_in_progress_lock));
+#else
+	if (buf->refcount > 1)
+		SignalIO(buf);
+#endif /* HAS_TEST_AND_SET */
+	InProgressBuf = (BufferDesc *)0;
+}
+
+/*
+ * Function:ContinueBufferIO
+ *	(Assumptions)
+ *	My process is executing IO for the buffer
+ *	BufMgrLock is held
+ *	The buffer is Pinned
+ *
+*/
+static void ContinueBufferIO(BufferDesc *buf, bool forInput)
+{
+	Assert(buf == InProgressBuf);
+	Assert(buf->flags & BM_IO_IN_PROGRESS);
+	IsForInput = forInput;
+}
+
+extern void	InitBufferIO(void)
+{
+	InProgressBuf = (BufferDesc *)0;
+}
+
+/*
+ *	This function is called from ProcReleaseSpins().
+ *	BufMgrLock isn't held when this function is called. 
+ *	BM_IO_ERROR is always set. If BM_IO_ERROR was already
+ *	set in case of output,this routine would kill all 
+ *	backends and reset postmaster.
+ */
+extern void	AbortBufferIO(void)
+{
+	BufferDesc *buf = InProgressBuf;
+	if (buf)
+	{
+		Assert(buf->flags & BM_IO_IN_PROGRESS);
+		SpinAcquire(BufMgrLock);
+		if (IsForInput)
+		{
+			Assert(!(buf->flags & BM_DIRTY));
+		}
+		else
+		{
+			Assert(!(buf->flags & BM_DIRTY));
+			/* Assert(!(buf->flags & BM_IO_ERROR)); */
+			if (buf->flags & BM_IO_ERROR)
+			{
+				elog(NOTICE, "!!! write error seems permanent !!!");
+				elog(NOTICE, "!!! now kill all backends and reset postmaster !!!");
+				proc_exit(255);
+			}
+			buf->flags |= BM_DIRTY;
+		}
+		buf->flags |= BM_IO_ERROR;
+		TerminateBufferIO(buf);
+		buf->flags &= ~BM_IO_IN_PROGRESS;
+		SpinRelease(BufMgrLock);
+	}
+}
diff --git a/src/backend/storage/lmgr/proc.c b/src/backend/storage/lmgr/proc.c
index c685b1ad93c53672f8acbfb9273a14a288b9d0d1..2a41bdc317a91e50657eea37bff9f0c732e2a958 100644
--- a/src/backend/storage/lmgr/proc.c
+++ b/src/backend/storage/lmgr/proc.c
@@ -7,7 +7,7 @@
  *
  *
  * IDENTIFICATION
- *	  $Header: /cvsroot/pgsql/src/backend/storage/lmgr/proc.c,v 1.65 1999/12/16 01:25:08 momjian Exp $
+ *	  $Header: /cvsroot/pgsql/src/backend/storage/lmgr/proc.c,v 1.66 2000/01/17 01:15:18 inoue Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -46,7 +46,7 @@
  *		This is so that we can support more backends. (system-wide semaphore
  *		sets run out pretty fast.)				  -ay 4/95
  *
- * $Header: /cvsroot/pgsql/src/backend/storage/lmgr/proc.c,v 1.65 1999/12/16 01:25:08 momjian Exp $
+ * $Header: /cvsroot/pgsql/src/backend/storage/lmgr/proc.c,v 1.66 2000/01/17 01:15:18 inoue Exp $
  */
 #include <sys/time.h>
 #include <unistd.h>
@@ -848,6 +848,7 @@ ProcReleaseSpins(PROC *proc)
 			SpinRelease(i);
 		}
 	}
+	AbortBufferIO();
 }
 
 /*****************************************************************************
diff --git a/src/include/storage/bufmgr.h b/src/include/storage/bufmgr.h
index fcf9e730ac70fc9db143fab5bd893fe324e67fec..21b43a076d0cf7a38aff6654ffc03b0c00b4f7fd 100644
--- a/src/include/storage/bufmgr.h
+++ b/src/include/storage/bufmgr.h
@@ -6,7 +6,7 @@
  *
  * Copyright (c) 1994, Regents of the University of California
  *
- * $Id: bufmgr.h,v 1.32 1999/09/28 11:40:53 vadim Exp $
+ * $Id: bufmgr.h,v 1.33 2000/01/17 01:15:19 inoue Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -186,5 +186,6 @@ extern void SetBufferCommitInfoNeedsSave(Buffer buffer);
 
 extern void UnlockBuffers(void);
 extern void LockBuffer(Buffer buffer, int mode);
+extern void AbortBufferIO(void);
 
 #endif