diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c
index ce52e2dd48ed8e4cbdad1fdf0acb4c8ec27f01dd..2d81383ae8ad467427c6161694517856df26d055 100644
--- a/src/backend/access/heap/heapam.c
+++ b/src/backend/access/heap/heapam.c
@@ -2149,15 +2149,6 @@ heap_multi_insert(Relation relation, HeapTuple *tuples, int ntuples,
 										   &vmbuffer, NULL);
 		page = BufferGetPage(buffer);
 
-		if (PageIsAllVisible(page))
-		{
-			all_visible_cleared = true;
-			PageClearAllVisible(page);
-			visibilitymap_clear(relation,
-								BufferGetBlockNumber(buffer),
-								vmbuffer);
-		}
-
 		/* NO EREPORT(ERROR) from here till changes are logged */
 		START_CRIT_SECTION();
 
@@ -2172,6 +2163,15 @@ heap_multi_insert(Relation relation, HeapTuple *tuples, int ntuples,
 			RelationPutHeapTuple(relation, buffer, heaptup);
 		}
 
+		if (PageIsAllVisible(page))
+		{
+			all_visible_cleared = true;
+			PageClearAllVisible(page);
+			visibilitymap_clear(relation,
+								BufferGetBlockNumber(buffer),
+								vmbuffer);
+		}
+
 		/*
 		 * XXX Should we set PageSetPrunable on this page ? See heap_insert()
 		 */
diff --git a/src/backend/access/heap/visibilitymap.c b/src/backend/access/heap/visibilitymap.c
index 5696abe4d2a11cec7ab97e8525458dcc124dad39..9152c7d151150573fbdbd5bb8d7a8125da18adf6 100644
--- a/src/backend/access/heap/visibilitymap.c
+++ b/src/backend/access/heap/visibilitymap.c
@@ -293,6 +293,13 @@ visibilitymap_set(Relation rel, BlockNumber heapBlk, XLogRecPtr recptr,
 * relation. On return, *buf is a valid buffer with the map page containing
 * the bit for heapBlk, or InvalidBuffer. The caller is responsible for
 * releasing *buf after it's done testing and setting bits.
+ *
+ * NOTE: This function is typically called without a lock on the heap page,
+ * so somebody else could change the bit just after we look at it. In fact,
+ * since we don't lock the visibility map page either, it's even possible
+ * that someone else could have changed the bit just before we look at it,
+ * yet we might still see the old value. It is the caller's responsibility
+ * to deal with all concurrency issues!
 */
 bool
 visibilitymap_test(Relation rel, BlockNumber heapBlk, Buffer *buf)
@@ -327,7 +334,9 @@ visibilitymap_test(Relation rel, BlockNumber heapBlk, Buffer *buf)
 	map = PageGetContents(BufferGetPage(*buf));
 
 	/*
-	 * We don't need to lock the page, as we're only looking at a single bit.
+	 * A single-bit read is atomic. There could be memory-ordering effects
+	 * here, but for performance reasons we make it the caller's job to worry
+	 * about that.
 	 */
 	result = (map[mapByte] & (1 << mapBit)) ? true : false;
 
diff --git a/src/backend/commands/vacuumlazy.c b/src/backend/commands/vacuumlazy.c
index 0e0193d40e1818f4177bcb5bd68148c2739eb1ec..3ff56a736648c5ab45533b376892b21e9ec569e3 100644
--- a/src/backend/commands/vacuumlazy.c
+++ b/src/backend/commands/vacuumlazy.c
@@ -419,6 +419,15 @@ lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats,
 	 * Note: if scan_all is true, we won't actually skip any pages; but we
 	 * maintain next_not_all_visible_block anyway, so as to set up the
 	 * all_visible_according_to_vm flag correctly for each page.
+	 *
+	 * Note: The value returned by visibilitymap_test could be slightly
+	 * out-of-date, since we make this test before reading the corresponding
+	 * heap page or locking the buffer. This is OK. If we mistakenly think
+	 * that the page is all-visible when in fact the flag's just been cleared,
+	 * we might fail to vacuum the page. But it's OK to skip pages when
+	 * scan_all is not set, so no great harm done; the next vacuum will find
+	 * them. If we make the reverse mistake and vacuum a page unnecessarily,
+	 * it'll just be a no-op.
 	 */
 	for (next_not_all_visible_block = 0;
 		 next_not_all_visible_block < nblocks;
@@ -852,22 +861,37 @@ lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats,
 		freespace = PageGetHeapFreeSpace(page);
 
 		/* mark page all-visible, if appropriate */
-		if (all_visible && !all_visible_according_to_vm)
+		if (all_visible)
 		{
 			if (!PageIsAllVisible(page))
 			{
 				PageSetAllVisible(page);
 				MarkBufferDirty(buf);
+				visibilitymap_set(onerel, blkno, InvalidXLogRecPtr, vmbuffer,
+								  visibility_cutoff_xid);
+			}
+			else if (!all_visible_according_to_vm)
+			{
+				/*
+				 * It should never be the case that the visibility map bit
+				 * is set while the page-level bit is clear, but the reverse
+				 * is allowed. Set the visibility map bit as well so that
+				 * we get back in sync.
+				 */
+				visibilitymap_set(onerel, blkno, InvalidXLogRecPtr, vmbuffer,
+								  visibility_cutoff_xid);
 			}
-			visibilitymap_set(onerel, blkno, InvalidXLogRecPtr, vmbuffer,
-							  visibility_cutoff_xid);
 		}
 
 		/*
 		 * As of PostgreSQL 9.2, the visibility map bit should never be set if
-		 * the page-level bit is clear.
+		 * the page-level bit is clear. However, it's possible that the bit
+		 * got cleared after we checked it and before we took the buffer
+		 * content lock, so we must recheck before jumping to the conclusion
+		 * that something bad has happened.
 		 */
-		else if (all_visible_according_to_vm && !PageIsAllVisible(page))
+		else if (all_visible_according_to_vm && !PageIsAllVisible(page)
+				 && visibilitymap_test(onerel, blkno, &vmbuffer))
 		{
 			elog(WARNING, "page is not marked all-visible but visibility map bit is set in relation \"%s\" page %u",
 				 relname, blkno);
diff --git a/src/backend/executor/nodeIndexonlyscan.c b/src/backend/executor/nodeIndexonlyscan.c
index 4abd805aa31733fafc0b18e80a65c4f91695fae5..af31671b3eb60979fdfbcacd4c362cecc231c32c 100644
--- a/src/backend/executor/nodeIndexonlyscan.c
+++ b/src/backend/executor/nodeIndexonlyscan.c
@@ -82,6 +82,19 @@ IndexOnlyNext(IndexOnlyScanState *node)
 		 * We can skip the heap fetch if the TID references a heap page on
 		 * which all tuples are known visible to everybody. In any case,
 		 * we'll use the index tuple not the heap tuple as the data source.
+		 *
+		 * Note on Memory Ordering Effects: visibilitymap_test does not lock
+		 * the visibility map buffer, and therefore the result we read here
+		 * could be slightly stale. However, it can't be stale enough to
+		 * matter. It suffices to show that (1) there is a read barrier
+		 * between the time we read the index TID and the time we test the
+		 * visibility map; and (2) there is a write barrier between the time
+		 * some other concurrent process clears the visibility map bit and the
+		 * time it inserts the index TID. Since acquiring or releasing a
+		 * LWLock interposes a full barrier, this is easy to show: (1) is
+		 * satisfied by the release of the index buffer content lock after
+		 * reading the TID; and (2) is satisfied by the acquisition of the
+		 * buffer content lock in order to insert the TID.
 		 */
 		if (!visibilitymap_test(scandesc->heapRelation,
 								ItemPointerGetBlockNumber(tid),
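
Illustrative aside, not part of the patch: the barrier-pairing argument in the IndexOnlyNext() comment above can be reduced to a two-flag message-passing sketch. The standalone C11 example below is an analogy under stated assumptions rather than PostgreSQL code: vm_bit stands in for the visibility map bit, tid_published stands in for the index TID becoming visible to the scan, and explicit atomic fences stand in for the full barriers that LWLock acquire/release provide. The point it demonstrates is the one the comment relies on: a reader that has observed the published TID is also guaranteed to observe the cleared bit, so the unlocked visibilitymap_test read cannot be stale enough to matter.

/* sketch.c -- hypothetical names; compile with: cc -std=c11 -c sketch.c */
#include <stdatomic.h>
#include <stdbool.h>

static atomic_bool vm_bit = true;          /* "visibility map bit": page starts all-visible */
static atomic_bool tid_published = false;  /* "index TID": not yet visible to the scan */

/*
 * Writer: models clearing the visibility map bit before inserting the index
 * TID. The release fence plays the role of the full barrier interposed by
 * acquiring the buffer content lock for the insertion.
 */
void
writer(void)
{
	atomic_store_explicit(&vm_bit, false, memory_order_relaxed);
	atomic_thread_fence(memory_order_release);	/* "write barrier" */
	atomic_store_explicit(&tid_published, true, memory_order_relaxed);
}

/*
 * Reader: models IndexOnlyNext reading a TID from the index and then testing
 * the visibility map without any lock. The acquire fence plays the role of
 * the full barrier interposed by releasing the index buffer content lock
 * after the TID was read.
 */
bool
reader_sees_consistent_state(void)
{
	if (!atomic_load_explicit(&tid_published, memory_order_relaxed))
		return true;			/* TID not visible yet; nothing to check */
	atomic_thread_fence(memory_order_acquire);	/* "read barrier" */
	/* Having seen the TID, we are guaranteed to see the cleared bit. */
	return !atomic_load_explicit(&vm_bit, memory_order_relaxed);
}

The analogy only works because LWLock acquire and release act as full memory barriers on all supported platforms, which is exactly the property the patch's comment invokes.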