=== Applying patches on top of PostgreSQL commit ID d4c3a6b8ad830882066122081a7141ecd573f45d === /etc/rc.d/jail: WARNING: Per-jail configuration via jail_* variables is obsolete. Please consider migrating to /etc/jail.conf. Fri Jan 31 19:50:34 UTC 2025 On branch cf/3986 nothing to commit, working tree clean === applying patch ./v4-0000-squashed-prerequisites.patch Applied patch to 'contrib/amcheck/verify_nbtree.c' cleanly. Applied patch to 'contrib/pg_surgery/heap_surgery.c' cleanly. Applied patch to 'src/backend/access/brin/brin_bloom.c' cleanly. Applied patch to 'src/backend/access/brin/brin_minmax_multi.c' cleanly. Applied patch to 'src/backend/access/gin/ginpostinglist.c' cleanly. Applied patch to 'src/backend/access/gist/gist.c' cleanly. Applied patch to 'src/backend/access/gist/gistget.c' cleanly. Applied patch to 'src/backend/access/hash/hash.c' cleanly. Applied patch to 'src/backend/access/hash/hashovfl.c' cleanly. Applied patch to 'src/backend/access/hash/hashpage.c' cleanly. Applied patch to 'src/backend/access/hash/hashsearch.c' cleanly. Applied patch to 'src/backend/access/heap/README.HOT' cleanly. Applied patch to 'src/backend/access/heap/heapam.c' with conflicts. Applied patch to 'src/backend/access/heap/heapam_handler.c' cleanly. Applied patch to 'src/backend/access/heap/hio.c' cleanly. Applied patch to 'src/backend/access/heap/pruneheap.c' with conflicts. Applied patch to 'src/backend/access/heap/rewriteheap.c' cleanly. Applied patch to 'src/backend/access/heap/vacuumlazy.c' with conflicts. Applied patch to 'src/backend/access/nbtree/nbtdedup.c' cleanly. Applied patch to 'src/backend/access/nbtree/nbtinsert.c' cleanly. Applied patch to 'src/backend/access/nbtree/nbtpage.c' cleanly. Applied patch to 'src/backend/access/nbtree/nbtree.c' cleanly. Applied patch to 'src/backend/access/nbtree/nbtsearch.c' cleanly. Applied patch to 'src/backend/access/nbtree/nbtxlog.c' cleanly. Applied patch to 'src/backend/access/spgist/spgdoinsert.c' cleanly. Applied patch to 'src/backend/access/spgist/spgscan.c' cleanly. Applied patch to 'src/backend/access/spgist/spgvacuum.c' cleanly. Applied patch to 'src/backend/nodes/tidbitmap.c' cleanly. Applied patch to 'src/backend/replication/logical/reorderbuffer.c' cleanly. Applied patch to 'src/backend/storage/freespace/freespace.c' cleanly. Applied patch to 'src/backend/storage/page/bufpage.c' cleanly. Applied patch to 'src/include/access/ginblock.h' cleanly. Applied patch to 'src/include/access/hash.h' cleanly. Applied patch to 'src/include/access/heapam.h' with conflicts. Applied patch to 'src/include/access/heaptoast.h' cleanly. Applied patch to 'src/include/access/htup_details.h' cleanly. Applied patch to 'src/include/access/itup.h' cleanly. Applied patch to 'src/include/access/nbtree.h' cleanly. Applied patch to 'src/include/access/spgist_private.h' cleanly. Applied patch to 'src/test/modules/test_ginpostinglist/test_ginpostinglist.c' cleanly. Applied patch to 'src/test/regress/expected/insert.out' cleanly. Applied patch to 'src/test/regress/sql/insert.sql' cleanly. U src/backend/access/heap/heapam.c U src/backend/access/heap/pruneheap.c U src/backend/access/heap/vacuumlazy.c U src/include/access/heapam.h diff --cc src/backend/access/heap/heapam.c index ea0a12b39a,633c6e4303..0000000000 --- a/src/backend/access/heap/heapam.c +++ b/src/backend/access/heap/heapam.c @@@ -539,79 -445,37 +539,84 @@@ heap_prepare_pagescan(TableScanDesc ssc * tuple for visibility the hard way. */ all_visible = PageIsAllVisible(page) && !snapshot->takenDuringRecovery; + check_serializable = + CheckForSerializableConflictOutNeeded(scan->rs_base.rs_rd, snapshot); - for (lineoff = FirstOffsetNumber; lineoff <= lines; lineoff++) + /* + * We call page_collect_tuples() with constant arguments, to get the + * compiler to constant fold the constant arguments. Separate calls with + * constant arguments, rather than variables, are needed on several + * compilers to actually perform constant folding. + */ + if (likely(all_visible)) { - ItemId lpp = PageGetItemId(page, lineoff); - HeapTupleData loctup; - bool valid; + if (likely(!check_serializable)) + scan->rs_ntuples = page_collect_tuples(scan, snapshot, page, buffer, + block, lines, true, false); + else + scan->rs_ntuples = page_collect_tuples(scan, snapshot, page, buffer, + block, lines, true, true); + } + else + { + if (likely(!check_serializable)) + scan->rs_ntuples = page_collect_tuples(scan, snapshot, page, buffer, + block, lines, false, false); + else + scan->rs_ntuples = page_collect_tuples(scan, snapshot, page, buffer, + block, lines, false, true); + } - if (!ItemIdIsNormal(lpp)) - continue; + LockBuffer(buffer, BUFFER_LOCK_UNLOCK); +} - loctup.t_tableOid = RelationGetRelid(scan->rs_base.rs_rd); - loctup.t_data = (HeapTupleHeader) PageGetItem(page, lpp); - loctup.t_len = ItemIdGetLength(lpp); - ItemPointerSet(&(loctup.t_self), block, lineoff); +/* + * heap_fetch_next_buffer - read and pin the next block from MAIN_FORKNUM. + * + * Read the next block of the scan relation from the read stream and save it + * in the scan descriptor. It is already pinned. + */ +static inline void +heap_fetch_next_buffer(HeapScanDesc scan, ScanDirection dir) +{ + Assert(scan->rs_read_stream); - if (all_visible) - valid = true; - else - valid = HeapTupleSatisfiesVisibility(&loctup, snapshot, buffer); + /* release previous scan buffer, if any */ + if (BufferIsValid(scan->rs_cbuf)) + { + ReleaseBuffer(scan->rs_cbuf); + scan->rs_cbuf = InvalidBuffer; + } - HeapCheckForSerializableConflictOut(valid, scan->rs_base.rs_rd, - &loctup, buffer, snapshot); + /* + * Be sure to check for interrupts at least once per page. Checks at + * higher code levels won't be able to stop a seqscan that encounters many + * pages' worth of consecutive dead tuples. + */ + CHECK_FOR_INTERRUPTS(); - if (valid) - scan->rs_vistuples[ntup++] = lineoff; + /* + * If the scan direction is changing, reset the prefetch block to the + * current block. Otherwise, we will incorrectly prefetch the blocks + * between the prefetch block and the current block again before + * prefetching blocks in the new, correct scan direction. + */ + if (unlikely(scan->rs_dir != dir)) + { + scan->rs_prefetch_block = scan->rs_cblock; + read_stream_reset(scan->rs_read_stream); } - LockBuffer(buffer, BUFFER_LOCK_UNLOCK); + scan->rs_dir = dir; ++<<<<<<< ours + scan->rs_cbuf = read_stream_next_buffer(scan->rs_read_stream, NULL); + if (BufferIsValid(scan->rs_cbuf)) + scan->rs_cblock = BufferGetBlockNumber(scan->rs_cbuf); ++======= + Assert(ntup <= ClusterMaxHeapTuplesPerPage); + scan->rs_ntuples = ntup; ++>>>>>>> theirs } /* @@@ -7298,6 -6740,183 +7303,48 @@@ heap_freeze_prepared_tuples(Buffer buff htup = (HeapTupleHeader) PageGetItem(page, itemid); heap_execute_freeze_tuple(htup, frz); } ++<<<<<<< ours ++======= + + MarkBufferDirty(buffer); + + /* Now WAL-log freezing if necessary */ + if (RelationNeedsWAL(rel)) + { + xl_heap_freeze_plan plans[MaxHeapTuplesPerPageLimit]; + OffsetNumber offsets[MaxHeapTuplesPerPageLimit]; + int nplans; + xl_heap_freeze_page xlrec; + XLogRecPtr recptr; + + /* Prepare deduplicated representation for use in WAL record */ + nplans = heap_log_freeze_plan(tuples, ntuples, plans, offsets); + + xlrec.snapshotConflictHorizon = snapshotConflictHorizon; + xlrec.isCatalogRel = RelationIsAccessibleInLogicalDecoding(rel); + xlrec.nplans = nplans; + + XLogBeginInsert(); + XLogRegisterData((char *) &xlrec, SizeOfHeapFreezePage); + + /* + * The freeze plan array and offset array are not actually in the + * buffer, but pretend that they are. When XLogInsert stores the + * whole buffer, the arrays need not be stored too. + */ + XLogRegisterBuffer(0, buffer, REGBUF_STANDARD); + XLogRegisterBufData(0, (char *) plans, + nplans * sizeof(xl_heap_freeze_plan)); + XLogRegisterBufData(0, (char *) offsets, + ntuples * sizeof(OffsetNumber)); + + recptr = XLogInsert(RM_HEAP2_ID, XLOG_HEAP2_FREEZE_PAGE); + + PageSetLSN(page, recptr); + } + + END_CRIT_SECTION(); -} - -/* - * Comparator used to deduplicate XLOG_HEAP2_FREEZE_PAGE freeze plans - */ -static int -heap_log_freeze_cmp(const void *arg1, const void *arg2) -{ - HeapTupleFreeze *frz1 = (HeapTupleFreeze *) arg1; - HeapTupleFreeze *frz2 = (HeapTupleFreeze *) arg2; - - if (frz1->xmax < frz2->xmax) - return -1; - else if (frz1->xmax > frz2->xmax) - return 1; - - if (frz1->t_infomask2 < frz2->t_infomask2) - return -1; - else if (frz1->t_infomask2 > frz2->t_infomask2) - return 1; - - if (frz1->t_infomask < frz2->t_infomask) - return -1; - else if (frz1->t_infomask > frz2->t_infomask) - return 1; - - if (frz1->frzflags < frz2->frzflags) - return -1; - else if (frz1->frzflags > frz2->frzflags) - return 1; - - /* - * heap_log_freeze_eq would consider these tuple-wise plans to be equal. - * (So the tuples will share a single canonical freeze plan.) - * - * We tiebreak on page offset number to keep each freeze plan's page - * offset number array individually sorted. (Unnecessary, but be tidy.) - */ - if (frz1->offset < frz2->offset) - return -1; - else if (frz1->offset > frz2->offset) - return 1; - - Assert(false); - return 0; -} - -/* - * Compare fields that describe actions required to freeze tuple with caller's - * open plan. If everything matches then the frz tuple plan is equivalent to - * caller's plan. - */ -static inline bool -heap_log_freeze_eq(xl_heap_freeze_plan *plan, HeapTupleFreeze *frz) -{ - if (plan->xmax == frz->xmax && - plan->t_infomask2 == frz->t_infomask2 && - plan->t_infomask == frz->t_infomask && - plan->frzflags == frz->frzflags) - return true; - - /* Caller must call heap_log_freeze_new_plan again for frz */ - return false; -} - -/* - * Start new plan initialized using tuple-level actions. At least one tuple - * will have steps required to freeze described by caller's plan during REDO. - */ -static inline void -heap_log_freeze_new_plan(xl_heap_freeze_plan *plan, HeapTupleFreeze *frz) -{ - plan->xmax = frz->xmax; - plan->t_infomask2 = frz->t_infomask2; - plan->t_infomask = frz->t_infomask; - plan->frzflags = frz->frzflags; - plan->ntuples = 1; /* for now */ -} - -/* - * Deduplicate tuple-based freeze plans so that each distinct set of - * processing steps is only stored once in XLOG_HEAP2_FREEZE_PAGE records. - * Called during original execution of freezing (for logged relations). - * - * Return value is number of plans set in *plans_out for caller. Also writes - * an array of offset numbers into *offsets_out output argument for caller - * (actually there is one array per freeze plan, but that's not of immediate - * concern to our caller). - */ -static int -heap_log_freeze_plan(HeapTupleFreeze *tuples, int ntuples, - xl_heap_freeze_plan *plans_out, - OffsetNumber *offsets_out) -{ - int nplans = 0; - - /* Sort tuple-based freeze plans in the order required to deduplicate */ - qsort(tuples, ntuples, sizeof(HeapTupleFreeze), heap_log_freeze_cmp); - - for (int i = 0; i < ntuples; i++) - { - HeapTupleFreeze *frz = tuples + i; - - if (i == 0) - { - /* New canonical freeze plan starting with first tup */ - heap_log_freeze_new_plan(plans_out, frz); - nplans++; - } - else if (heap_log_freeze_eq(plans_out, frz)) - { - /* tup matches open canonical plan -- include tup in it */ - Assert(offsets_out[i - 1] < frz->offset); - plans_out->ntuples++; - } - else - { - /* Tup doesn't match current plan -- done with it now */ - plans_out++; - - /* New canonical freeze plan starting with this tup */ - heap_log_freeze_new_plan(plans_out, frz); - nplans++; - } - - /* - * Save page offset number in dedicated buffer in passing. - * - * REDO routine relies on the record's offset numbers array grouping - * offset numbers by freeze plan. The sort order within each grouping - * is ascending offset number order, just to keep things tidy. - */ - offsets_out[i] = frz->offset; - } - - Assert(nplans > 0 && nplans <= ntuples); - - return nplans; ++>>>>>>> theirs } /* @@@ -9133,6 -8753,1388 +9180,1391 @@@ ExtractReplicaIdentity(Relation relatio } /* ++<<<<<<< ours ++======= + * Handles XLOG_HEAP2_PRUNE record type. + * + * Acquires a full cleanup lock. + */ + static void + heap_xlog_prune(XLogReaderState *record) + { + XLogRecPtr lsn = record->EndRecPtr; + xl_heap_prune *xlrec = (xl_heap_prune *) XLogRecGetData(record); + Buffer buffer; + RelFileLocator rlocator; + BlockNumber blkno; + XLogRedoAction action; + + XLogRecGetBlockTag(record, 0, &rlocator, NULL, &blkno); + + /* + * We're about to remove tuples. In Hot Standby mode, ensure that there's + * no queries running for which the removed tuples are still visible. + */ + if (InHotStandby) + ResolveRecoveryConflictWithSnapshot(xlrec->snapshotConflictHorizon, + xlrec->isCatalogRel, + rlocator); + + /* + * If we have a full-page image, restore it (using a cleanup lock) and + * we're done. + */ + action = XLogReadBufferForRedoExtended(record, 0, RBM_NORMAL, true, + &buffer); + if (action == BLK_NEEDS_REDO) + { + Page page = (Page) BufferGetPage(buffer); + OffsetNumber *end; + OffsetNumber *redirected; + OffsetNumber *nowdead; + OffsetNumber *nowunused; + int nredirected; + int ndead; + int nunused; + Size datalen; + + redirected = (OffsetNumber *) XLogRecGetBlockData(record, 0, &datalen); + + nredirected = xlrec->nredirected; + ndead = xlrec->ndead; + end = (OffsetNumber *) ((char *) redirected + datalen); + nowdead = redirected + (nredirected * 2); + nowunused = nowdead + ndead; + nunused = (end - nowunused); + Assert(nunused >= 0); + + /* Update all line pointers per the record, and repair fragmentation */ + heap_page_prune_execute(buffer, + redirected, nredirected, + nowdead, ndead, + nowunused, nunused); + + /* + * Note: we don't worry about updating the page's prunability hints. + * At worst this will cause an extra prune cycle to occur soon. + */ + + PageSetLSN(page, lsn); + MarkBufferDirty(buffer); + } + + if (BufferIsValid(buffer)) + { + Size freespace = PageGetHeapFreeSpace(BufferGetPage(buffer)); + + UnlockReleaseBuffer(buffer); + + /* + * After pruning records from a page, it's useful to update the FSM + * about it, as it may cause the page become target for insertions + * later even if vacuum decides not to visit it (which is possible if + * gets marked all-visible.) + * + * Do this regardless of a full-page image being applied, since the + * FSM data is not in the page anyway. + */ + XLogRecordPageWithFreeSpace(rlocator, blkno, freespace); + } + } + + /* + * Handles XLOG_HEAP2_VACUUM record type. + * + * Acquires an ordinary exclusive lock only. + */ + static void + heap_xlog_vacuum(XLogReaderState *record) + { + XLogRecPtr lsn = record->EndRecPtr; + xl_heap_vacuum *xlrec = (xl_heap_vacuum *) XLogRecGetData(record); + Buffer buffer; + BlockNumber blkno; + XLogRedoAction action; + + /* + * If we have a full-page image, restore it (without using a cleanup lock) + * and we're done. + */ + action = XLogReadBufferForRedoExtended(record, 0, RBM_NORMAL, false, + &buffer); + if (action == BLK_NEEDS_REDO) + { + Page page = (Page) BufferGetPage(buffer); + OffsetNumber *nowunused; + Size datalen; + OffsetNumber *offnum; + + nowunused = (OffsetNumber *) XLogRecGetBlockData(record, 0, &datalen); + + /* Shouldn't be a record unless there's something to do */ + Assert(xlrec->nunused > 0); + + /* Update all now-unused line pointers */ + offnum = nowunused; + for (int i = 0; i < xlrec->nunused; i++) + { + OffsetNumber off = *offnum++; + ItemId lp = PageGetItemId(page, off); + + Assert(ItemIdIsDead(lp) && !ItemIdHasStorage(lp)); + ItemIdSetUnused(lp); + } + + /* Attempt to truncate line pointer array now */ + PageTruncateLinePointerArray(page); + + PageSetLSN(page, lsn); + MarkBufferDirty(buffer); + } + + if (BufferIsValid(buffer)) + { + Size freespace = PageGetHeapFreeSpace(BufferGetPage(buffer)); + RelFileLocator rlocator; + + XLogRecGetBlockTag(record, 0, &rlocator, NULL, &blkno); + + UnlockReleaseBuffer(buffer); + + /* + * After vacuuming LP_DEAD items from a page, it's useful to update + * the FSM about it, as it may cause the page become target for + * insertions later even if vacuum decides not to visit it (which is + * possible if gets marked all-visible.) + * + * Do this regardless of a full-page image being applied, since the + * FSM data is not in the page anyway. + */ + XLogRecordPageWithFreeSpace(rlocator, blkno, freespace); + } + } + + /* + * Replay XLOG_HEAP2_VISIBLE record. + * + * The critical integrity requirement here is that we must never end up with + * a situation where the visibility map bit is set, and the page-level + * PD_ALL_VISIBLE bit is clear. If that were to occur, then a subsequent + * page modification would fail to clear the visibility map bit. + */ + static void + heap_xlog_visible(XLogReaderState *record) + { + XLogRecPtr lsn = record->EndRecPtr; + xl_heap_visible *xlrec = (xl_heap_visible *) XLogRecGetData(record); + Buffer vmbuffer = InvalidBuffer; + Buffer buffer; + Page page; + RelFileLocator rlocator; + BlockNumber blkno; + XLogRedoAction action; + + Assert((xlrec->flags & VISIBILITYMAP_XLOG_VALID_BITS) == xlrec->flags); + + XLogRecGetBlockTag(record, 1, &rlocator, NULL, &blkno); + + /* + * If there are any Hot Standby transactions running that have an xmin + * horizon old enough that this page isn't all-visible for them, they + * might incorrectly decide that an index-only scan can skip a heap fetch. + * + * NB: It might be better to throw some kind of "soft" conflict here that + * forces any index-only scan that is in flight to perform heap fetches, + * rather than killing the transaction outright. + */ + if (InHotStandby) + ResolveRecoveryConflictWithSnapshot(xlrec->snapshotConflictHorizon, + xlrec->flags & VISIBILITYMAP_XLOG_CATALOG_REL, + rlocator); + + /* + * Read the heap page, if it still exists. If the heap file has dropped or + * truncated later in recovery, we don't need to update the page, but we'd + * better still update the visibility map. + */ + action = XLogReadBufferForRedo(record, 1, &buffer); + if (action == BLK_NEEDS_REDO) + { + /* + * We don't bump the LSN of the heap page when setting the visibility + * map bit (unless checksums or wal_hint_bits is enabled, in which + * case we must). This exposes us to torn page hazards, but since + * we're not inspecting the existing page contents in any way, we + * don't care. + */ + page = BufferGetPage(buffer); + + PageSetAllVisible(page); + + if (XLogHintBitIsNeeded()) + PageSetLSN(page, lsn); + + MarkBufferDirty(buffer); + } + else if (action == BLK_RESTORED) + { + /* + * If heap block was backed up, we already restored it and there's + * nothing more to do. (This can only happen with checksums or + * wal_log_hints enabled.) + */ + } + + if (BufferIsValid(buffer)) + { + Size space = PageGetFreeSpace(BufferGetPage(buffer)); + + UnlockReleaseBuffer(buffer); + + /* + * Since FSM is not WAL-logged and only updated heuristically, it + * easily becomes stale in standbys. If the standby is later promoted + * and runs VACUUM, it will skip updating individual free space + * figures for pages that became all-visible (or all-frozen, depending + * on the vacuum mode,) which is troublesome when FreeSpaceMapVacuum + * propagates too optimistic free space values to upper FSM layers; + * later inserters try to use such pages only to find out that they + * are unusable. This can cause long stalls when there are many such + * pages. + * + * Forestall those problems by updating FSM's idea about a page that + * is becoming all-visible or all-frozen. + * + * Do this regardless of a full-page image being applied, since the + * FSM data is not in the page anyway. + */ + if (xlrec->flags & VISIBILITYMAP_VALID_BITS) + XLogRecordPageWithFreeSpace(rlocator, blkno, space); + } + + /* + * Even if we skipped the heap page update due to the LSN interlock, it's + * still safe to update the visibility map. Any WAL record that clears + * the visibility map bit does so before checking the page LSN, so any + * bits that need to be cleared will still be cleared. + */ + if (XLogReadBufferForRedoExtended(record, 0, RBM_ZERO_ON_ERROR, false, + &vmbuffer) == BLK_NEEDS_REDO) + { + Page vmpage = BufferGetPage(vmbuffer); + Relation reln; + uint8 vmbits; + + /* initialize the page if it was read as zeros */ + if (PageIsNew(vmpage)) + PageInit(vmpage, BLCKSZ, 0); + + /* remove VISIBILITYMAP_XLOG_* */ + vmbits = xlrec->flags & VISIBILITYMAP_VALID_BITS; + + /* + * XLogReadBufferForRedoExtended locked the buffer. But + * visibilitymap_set will handle locking itself. + */ + LockBuffer(vmbuffer, BUFFER_LOCK_UNLOCK); + + reln = CreateFakeRelcacheEntry(rlocator); + visibilitymap_pin(reln, blkno, &vmbuffer); + + visibilitymap_set(reln, blkno, InvalidBuffer, lsn, vmbuffer, + xlrec->snapshotConflictHorizon, vmbits); + + ReleaseBuffer(vmbuffer); + FreeFakeRelcacheEntry(reln); + } + else if (BufferIsValid(vmbuffer)) + UnlockReleaseBuffer(vmbuffer); + } + + /* + * Replay XLOG_HEAP2_FREEZE_PAGE records + */ + static void + heap_xlog_freeze_page(XLogReaderState *record) + { + XLogRecPtr lsn = record->EndRecPtr; + xl_heap_freeze_page *xlrec = (xl_heap_freeze_page *) XLogRecGetData(record); + Buffer buffer; + + /* + * In Hot Standby mode, ensure that there's no queries running which still + * consider the frozen xids as running. + */ + if (InHotStandby) + { + RelFileLocator rlocator; + + XLogRecGetBlockTag(record, 0, &rlocator, NULL, NULL); + ResolveRecoveryConflictWithSnapshot(xlrec->snapshotConflictHorizon, + xlrec->isCatalogRel, + rlocator); + } + + if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO) + { + Page page = BufferGetPage(buffer); + xl_heap_freeze_plan *plans; + OffsetNumber *offsets; + int curoff = 0; + + plans = (xl_heap_freeze_plan *) XLogRecGetBlockData(record, 0, NULL); + offsets = (OffsetNumber *) ((char *) plans + + (xlrec->nplans * + sizeof(xl_heap_freeze_plan))); + for (int p = 0; p < xlrec->nplans; p++) + { + HeapTupleFreeze frz; + + /* + * Convert freeze plan representation from WAL record into + * per-tuple format used by heap_execute_freeze_tuple + */ + frz.xmax = plans[p].xmax; + frz.t_infomask2 = plans[p].t_infomask2; + frz.t_infomask = plans[p].t_infomask; + frz.frzflags = plans[p].frzflags; + frz.offset = InvalidOffsetNumber; /* unused, but be tidy */ + + for (int i = 0; i < plans[p].ntuples; i++) + { + OffsetNumber offset = offsets[curoff++]; + ItemId lp; + HeapTupleHeader tuple; + + lp = PageGetItemId(page, offset); + tuple = (HeapTupleHeader) PageGetItem(page, lp); + heap_execute_freeze_tuple(tuple, &frz); + } + } + + PageSetLSN(page, lsn); + MarkBufferDirty(buffer); + } + if (BufferIsValid(buffer)) + UnlockReleaseBuffer(buffer); + } + + /* + * Given an "infobits" field from an XLog record, set the correct bits in the + * given infomask and infomask2 for the tuple touched by the record. + * + * (This is the reverse of compute_infobits). + */ + static void + fix_infomask_from_infobits(uint8 infobits, uint16 *infomask, uint16 *infomask2) + { + *infomask &= ~(HEAP_XMAX_IS_MULTI | HEAP_XMAX_LOCK_ONLY | + HEAP_XMAX_KEYSHR_LOCK | HEAP_XMAX_EXCL_LOCK); + *infomask2 &= ~HEAP_KEYS_UPDATED; + + if (infobits & XLHL_XMAX_IS_MULTI) + *infomask |= HEAP_XMAX_IS_MULTI; + if (infobits & XLHL_XMAX_LOCK_ONLY) + *infomask |= HEAP_XMAX_LOCK_ONLY; + if (infobits & XLHL_XMAX_EXCL_LOCK) + *infomask |= HEAP_XMAX_EXCL_LOCK; + /* note HEAP_XMAX_SHR_LOCK isn't considered here */ + if (infobits & XLHL_XMAX_KEYSHR_LOCK) + *infomask |= HEAP_XMAX_KEYSHR_LOCK; + + if (infobits & XLHL_KEYS_UPDATED) + *infomask2 |= HEAP_KEYS_UPDATED; + } + + static void + heap_xlog_delete(XLogReaderState *record) + { + XLogRecPtr lsn = record->EndRecPtr; + xl_heap_delete *xlrec = (xl_heap_delete *) XLogRecGetData(record); + Buffer buffer; + Page page; + ItemId lp = NULL; + HeapTupleHeader htup; + BlockNumber blkno; + RelFileLocator target_locator; + ItemPointerData target_tid; + + XLogRecGetBlockTag(record, 0, &target_locator, NULL, &blkno); + ItemPointerSetBlockNumber(&target_tid, blkno); + ItemPointerSetOffsetNumber(&target_tid, xlrec->offnum); + + /* + * The visibility map may need to be fixed even if the heap page is + * already up-to-date. + */ + if (xlrec->flags & XLH_DELETE_ALL_VISIBLE_CLEARED) + { + Relation reln = CreateFakeRelcacheEntry(target_locator); + Buffer vmbuffer = InvalidBuffer; + + visibilitymap_pin(reln, blkno, &vmbuffer); + visibilitymap_clear(reln, blkno, vmbuffer, VISIBILITYMAP_VALID_BITS); + ReleaseBuffer(vmbuffer); + FreeFakeRelcacheEntry(reln); + } + + if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO) + { + page = BufferGetPage(buffer); + + if (PageGetMaxOffsetNumber(page) >= xlrec->offnum) + lp = PageGetItemId(page, xlrec->offnum); + + if (PageGetMaxOffsetNumber(page) < xlrec->offnum || !ItemIdIsNormal(lp)) + elog(PANIC, "invalid lp"); + + htup = (HeapTupleHeader) PageGetItem(page, lp); + + htup->t_infomask &= ~(HEAP_XMAX_BITS | HEAP_MOVED); + htup->t_infomask2 &= ~HEAP_KEYS_UPDATED; + HeapTupleHeaderClearHotUpdated(htup); + fix_infomask_from_infobits(xlrec->infobits_set, + &htup->t_infomask, &htup->t_infomask2); + if (!(xlrec->flags & XLH_DELETE_IS_SUPER)) + HeapTupleHeaderSetXmax(htup, xlrec->xmax); + else + HeapTupleHeaderSetXmin(htup, InvalidTransactionId); + HeapTupleHeaderSetCmax(htup, FirstCommandId, false); + + /* Mark the page as a candidate for pruning */ + PageSetPrunable(page, XLogRecGetXid(record)); + + if (xlrec->flags & XLH_DELETE_ALL_VISIBLE_CLEARED) + PageClearAllVisible(page); + + /* Make sure t_ctid is set correctly */ + if (xlrec->flags & XLH_DELETE_IS_PARTITION_MOVE) + HeapTupleHeaderSetMovedPartitions(htup); + else + htup->t_ctid = target_tid; + PageSetLSN(page, lsn); + MarkBufferDirty(buffer); + } + if (BufferIsValid(buffer)) + UnlockReleaseBuffer(buffer); + } + + static void + heap_xlog_insert(XLogReaderState *record) + { + XLogRecPtr lsn = record->EndRecPtr; + xl_heap_insert *xlrec = (xl_heap_insert *) XLogRecGetData(record); + Buffer buffer; + Page page; + union + { + HeapTupleHeaderData hdr; + char data[MaxHeapTupleSizeLimit]; + } tbuf; + HeapTupleHeader htup; + xl_heap_header xlhdr; + uint32 newlen; + Size freespace = 0; + RelFileLocator target_locator; + BlockNumber blkno; + ItemPointerData target_tid; + XLogRedoAction action; + + XLogRecGetBlockTag(record, 0, &target_locator, NULL, &blkno); + ItemPointerSetBlockNumber(&target_tid, blkno); + ItemPointerSetOffsetNumber(&target_tid, xlrec->offnum); + + /* + * The visibility map may need to be fixed even if the heap page is + * already up-to-date. + */ + if (xlrec->flags & XLH_INSERT_ALL_VISIBLE_CLEARED) + { + Relation reln = CreateFakeRelcacheEntry(target_locator); + Buffer vmbuffer = InvalidBuffer; + + visibilitymap_pin(reln, blkno, &vmbuffer); + visibilitymap_clear(reln, blkno, vmbuffer, VISIBILITYMAP_VALID_BITS); + ReleaseBuffer(vmbuffer); + FreeFakeRelcacheEntry(reln); + } + + /* + * If we inserted the first and only tuple on the page, re-initialize the + * page from scratch. + */ + if (XLogRecGetInfo(record) & XLOG_HEAP_INIT_PAGE) + { + buffer = XLogInitBufferForRedo(record, 0); + page = BufferGetPage(buffer); + PageInit(page, BufferGetPageSize(buffer), 0); + action = BLK_NEEDS_REDO; + } + else + action = XLogReadBufferForRedo(record, 0, &buffer); + if (action == BLK_NEEDS_REDO) + { + Size datalen; + char *data; + + page = BufferGetPage(buffer); + + if (PageGetMaxOffsetNumber(page) + 1 < xlrec->offnum) + elog(PANIC, "invalid max offset number"); + + data = XLogRecGetBlockData(record, 0, &datalen); + + newlen = datalen - SizeOfHeapHeader; + Assert(datalen > SizeOfHeapHeader && newlen <= ClusterMaxHeapTupleSize); + memcpy((char *) &xlhdr, data, SizeOfHeapHeader); + data += SizeOfHeapHeader; + + htup = &tbuf.hdr; + MemSet((char *) htup, 0, SizeofHeapTupleHeader); + /* PG73FORMAT: get bitmap [+ padding] [+ oid] + data */ + memcpy((char *) htup + SizeofHeapTupleHeader, + data, + newlen); + newlen += SizeofHeapTupleHeader; + htup->t_infomask2 = xlhdr.t_infomask2; + htup->t_infomask = xlhdr.t_infomask; + htup->t_hoff = xlhdr.t_hoff; + HeapTupleHeaderSetXmin(htup, XLogRecGetXid(record)); + HeapTupleHeaderSetCmin(htup, FirstCommandId); + htup->t_ctid = target_tid; + + if (PageAddItem(page, (Item) htup, newlen, xlrec->offnum, + true, true) == InvalidOffsetNumber) + elog(PANIC, "failed to add tuple"); + + freespace = PageGetHeapFreeSpace(page); /* needed to update FSM below */ + + PageSetLSN(page, lsn); + + if (xlrec->flags & XLH_INSERT_ALL_VISIBLE_CLEARED) + PageClearAllVisible(page); + + /* XLH_INSERT_ALL_FROZEN_SET implies that all tuples are visible */ + if (xlrec->flags & XLH_INSERT_ALL_FROZEN_SET) + PageSetAllVisible(page); + + MarkBufferDirty(buffer); + } + if (BufferIsValid(buffer)) + UnlockReleaseBuffer(buffer); + + /* + * If the page is running low on free space, update the FSM as well. + * Arbitrarily, our definition of "low" is less than 20%. We can't do much + * better than that without knowing the fill-factor for the table. + * + * XXX: Don't do this if the page was restored from full page image. We + * don't bother to update the FSM in that case, it doesn't need to be + * totally accurate anyway. + */ + if (action == BLK_NEEDS_REDO && freespace < BLCKSZ / 5) + XLogRecordPageWithFreeSpace(target_locator, blkno, freespace); + } + + /* + * Handles MULTI_INSERT record type. + */ + static void + heap_xlog_multi_insert(XLogReaderState *record) + { + XLogRecPtr lsn = record->EndRecPtr; + xl_heap_multi_insert *xlrec; + RelFileLocator rlocator; + BlockNumber blkno; + Buffer buffer; + Page page; + union + { + HeapTupleHeaderData hdr; + char data[MaxHeapTupleSizeLimit]; + } tbuf; + HeapTupleHeader htup; + uint32 newlen; + Size freespace = 0; + int i; + bool isinit = (XLogRecGetInfo(record) & XLOG_HEAP_INIT_PAGE) != 0; + XLogRedoAction action; + + /* + * Insertion doesn't overwrite MVCC data, so no conflict processing is + * required. + */ + xlrec = (xl_heap_multi_insert *) XLogRecGetData(record); + + XLogRecGetBlockTag(record, 0, &rlocator, NULL, &blkno); + + /* check that the mutually exclusive flags are not both set */ + Assert(!((xlrec->flags & XLH_INSERT_ALL_VISIBLE_CLEARED) && + (xlrec->flags & XLH_INSERT_ALL_FROZEN_SET))); + + /* + * The visibility map may need to be fixed even if the heap page is + * already up-to-date. + */ + if (xlrec->flags & XLH_INSERT_ALL_VISIBLE_CLEARED) + { + Relation reln = CreateFakeRelcacheEntry(rlocator); + Buffer vmbuffer = InvalidBuffer; + + visibilitymap_pin(reln, blkno, &vmbuffer); + visibilitymap_clear(reln, blkno, vmbuffer, VISIBILITYMAP_VALID_BITS); + ReleaseBuffer(vmbuffer); + FreeFakeRelcacheEntry(reln); + } + + if (isinit) + { + buffer = XLogInitBufferForRedo(record, 0); + page = BufferGetPage(buffer); + PageInit(page, BufferGetPageSize(buffer), 0); + action = BLK_NEEDS_REDO; + } + else + action = XLogReadBufferForRedo(record, 0, &buffer); + if (action == BLK_NEEDS_REDO) + { + char *tupdata; + char *endptr; + Size len; + + /* Tuples are stored as block data */ + tupdata = XLogRecGetBlockData(record, 0, &len); + endptr = tupdata + len; + + page = (Page) BufferGetPage(buffer); + + for (i = 0; i < xlrec->ntuples; i++) + { + OffsetNumber offnum; + xl_multi_insert_tuple *xlhdr; + + /* + * If we're reinitializing the page, the tuples are stored in + * order from FirstOffsetNumber. Otherwise there's an array of + * offsets in the WAL record, and the tuples come after that. + */ + if (isinit) + offnum = FirstOffsetNumber + i; + else + offnum = xlrec->offsets[i]; + if (PageGetMaxOffsetNumber(page) + 1 < offnum) + elog(PANIC, "invalid max offset number"); + + xlhdr = (xl_multi_insert_tuple *) SHORTALIGN(tupdata); + tupdata = ((char *) xlhdr) + SizeOfMultiInsertTuple; + + newlen = xlhdr->datalen; + Assert(newlen <= ClusterMaxHeapTupleSize); + htup = &tbuf.hdr; + MemSet((char *) htup, 0, SizeofHeapTupleHeader); + /* PG73FORMAT: get bitmap [+ padding] [+ oid] + data */ + memcpy((char *) htup + SizeofHeapTupleHeader, + (char *) tupdata, + newlen); + tupdata += newlen; + + newlen += SizeofHeapTupleHeader; + htup->t_infomask2 = xlhdr->t_infomask2; + htup->t_infomask = xlhdr->t_infomask; + htup->t_hoff = xlhdr->t_hoff; + HeapTupleHeaderSetXmin(htup, XLogRecGetXid(record)); + HeapTupleHeaderSetCmin(htup, FirstCommandId); + ItemPointerSetBlockNumber(&htup->t_ctid, blkno); + ItemPointerSetOffsetNumber(&htup->t_ctid, offnum); + + offnum = PageAddItem(page, (Item) htup, newlen, offnum, true, true); + if (offnum == InvalidOffsetNumber) + elog(PANIC, "failed to add tuple"); + } + if (tupdata != endptr) + elog(PANIC, "total tuple length mismatch"); + + freespace = PageGetHeapFreeSpace(page); /* needed to update FSM below */ + + PageSetLSN(page, lsn); + + if (xlrec->flags & XLH_INSERT_ALL_VISIBLE_CLEARED) + PageClearAllVisible(page); + + /* XLH_INSERT_ALL_FROZEN_SET implies that all tuples are visible */ + if (xlrec->flags & XLH_INSERT_ALL_FROZEN_SET) + PageSetAllVisible(page); + + MarkBufferDirty(buffer); + } + if (BufferIsValid(buffer)) + UnlockReleaseBuffer(buffer); + + /* + * If the page is running low on free space, update the FSM as well. + * Arbitrarily, our definition of "low" is less than 20%. We can't do much + * better than that without knowing the fill-factor for the table. + * + * XXX: Don't do this if the page was restored from full page image. We + * don't bother to update the FSM in that case, it doesn't need to be + * totally accurate anyway. + */ + if (action == BLK_NEEDS_REDO && freespace < BLCKSZ / 5) + XLogRecordPageWithFreeSpace(rlocator, blkno, freespace); + } + + /* + * Handles UPDATE and HOT_UPDATE + */ + static void + heap_xlog_update(XLogReaderState *record, bool hot_update) + { + XLogRecPtr lsn = record->EndRecPtr; + xl_heap_update *xlrec = (xl_heap_update *) XLogRecGetData(record); + RelFileLocator rlocator; + BlockNumber oldblk; + BlockNumber newblk; + ItemPointerData newtid; + Buffer obuffer, + nbuffer; + Page page; + OffsetNumber offnum; + ItemId lp = NULL; + HeapTupleData oldtup; + HeapTupleHeader htup; + uint16 prefixlen = 0, + suffixlen = 0; + char *newp; + union + { + HeapTupleHeaderData hdr; + char data[MaxHeapTupleSizeLimit]; + } tbuf; + xl_heap_header xlhdr; + uint32 newlen; + Size freespace = 0; + XLogRedoAction oldaction; + XLogRedoAction newaction; + + /* initialize to keep the compiler quiet */ + oldtup.t_data = NULL; + oldtup.t_len = 0; + + XLogRecGetBlockTag(record, 0, &rlocator, NULL, &newblk); + if (XLogRecGetBlockTagExtended(record, 1, NULL, NULL, &oldblk, NULL)) + { + /* HOT updates are never done across pages */ + Assert(!hot_update); + } + else + oldblk = newblk; + + ItemPointerSet(&newtid, newblk, xlrec->new_offnum); + + /* + * The visibility map may need to be fixed even if the heap page is + * already up-to-date. + */ + if (xlrec->flags & XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED) + { + Relation reln = CreateFakeRelcacheEntry(rlocator); + Buffer vmbuffer = InvalidBuffer; + + visibilitymap_pin(reln, oldblk, &vmbuffer); + visibilitymap_clear(reln, oldblk, vmbuffer, VISIBILITYMAP_VALID_BITS); + ReleaseBuffer(vmbuffer); + FreeFakeRelcacheEntry(reln); + } + + /* + * In normal operation, it is important to lock the two pages in + * page-number order, to avoid possible deadlocks against other update + * operations going the other way. However, during WAL replay there can + * be no other update happening, so we don't need to worry about that. But + * we *do* need to worry that we don't expose an inconsistent state to Hot + * Standby queries --- so the original page can't be unlocked before we've + * added the new tuple to the new page. + */ + + /* Deal with old tuple version */ + oldaction = XLogReadBufferForRedo(record, (oldblk == newblk) ? 0 : 1, + &obuffer); + if (oldaction == BLK_NEEDS_REDO) + { + page = BufferGetPage(obuffer); + offnum = xlrec->old_offnum; + if (PageGetMaxOffsetNumber(page) >= offnum) + lp = PageGetItemId(page, offnum); + + if (PageGetMaxOffsetNumber(page) < offnum || !ItemIdIsNormal(lp)) + elog(PANIC, "invalid lp"); + + htup = (HeapTupleHeader) PageGetItem(page, lp); + + oldtup.t_data = htup; + oldtup.t_len = ItemIdGetLength(lp); + + htup->t_infomask &= ~(HEAP_XMAX_BITS | HEAP_MOVED); + htup->t_infomask2 &= ~HEAP_KEYS_UPDATED; + if (hot_update) + HeapTupleHeaderSetHotUpdated(htup); + else + HeapTupleHeaderClearHotUpdated(htup); + fix_infomask_from_infobits(xlrec->old_infobits_set, &htup->t_infomask, + &htup->t_infomask2); + HeapTupleHeaderSetXmax(htup, xlrec->old_xmax); + HeapTupleHeaderSetCmax(htup, FirstCommandId, false); + /* Set forward chain link in t_ctid */ + htup->t_ctid = newtid; + + /* Mark the page as a candidate for pruning */ + PageSetPrunable(page, XLogRecGetXid(record)); + + if (xlrec->flags & XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED) + PageClearAllVisible(page); + + PageSetLSN(page, lsn); + MarkBufferDirty(obuffer); + } + + /* + * Read the page the new tuple goes into, if different from old. + */ + if (oldblk == newblk) + { + nbuffer = obuffer; + newaction = oldaction; + } + else if (XLogRecGetInfo(record) & XLOG_HEAP_INIT_PAGE) + { + nbuffer = XLogInitBufferForRedo(record, 0); + page = (Page) BufferGetPage(nbuffer); + PageInit(page, BufferGetPageSize(nbuffer), 0); + newaction = BLK_NEEDS_REDO; + } + else + newaction = XLogReadBufferForRedo(record, 0, &nbuffer); + + /* + * The visibility map may need to be fixed even if the heap page is + * already up-to-date. + */ + if (xlrec->flags & XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED) + { + Relation reln = CreateFakeRelcacheEntry(rlocator); + Buffer vmbuffer = InvalidBuffer; + + visibilitymap_pin(reln, newblk, &vmbuffer); + visibilitymap_clear(reln, newblk, vmbuffer, VISIBILITYMAP_VALID_BITS); + ReleaseBuffer(vmbuffer); + FreeFakeRelcacheEntry(reln); + } + + /* Deal with new tuple */ + if (newaction == BLK_NEEDS_REDO) + { + char *recdata; + char *recdata_end; + Size datalen; + Size tuplen; + + recdata = XLogRecGetBlockData(record, 0, &datalen); + recdata_end = recdata + datalen; + + page = BufferGetPage(nbuffer); + + offnum = xlrec->new_offnum; + if (PageGetMaxOffsetNumber(page) + 1 < offnum) + elog(PANIC, "invalid max offset number"); + + if (xlrec->flags & XLH_UPDATE_PREFIX_FROM_OLD) + { + Assert(newblk == oldblk); + memcpy(&prefixlen, recdata, sizeof(uint16)); + recdata += sizeof(uint16); + } + if (xlrec->flags & XLH_UPDATE_SUFFIX_FROM_OLD) + { + Assert(newblk == oldblk); + memcpy(&suffixlen, recdata, sizeof(uint16)); + recdata += sizeof(uint16); + } + + memcpy((char *) &xlhdr, recdata, SizeOfHeapHeader); + recdata += SizeOfHeapHeader; + + tuplen = recdata_end - recdata; + Assert(tuplen <= ClusterMaxHeapTupleSize); + + htup = &tbuf.hdr; + MemSet((char *) htup, 0, SizeofHeapTupleHeader); + + /* + * Reconstruct the new tuple using the prefix and/or suffix from the + * old tuple, and the data stored in the WAL record. + */ + newp = (char *) htup + SizeofHeapTupleHeader; + if (prefixlen > 0) + { + int len; + + /* copy bitmap [+ padding] [+ oid] from WAL record */ + len = xlhdr.t_hoff - SizeofHeapTupleHeader; + memcpy(newp, recdata, len); + recdata += len; + newp += len; + + /* copy prefix from old tuple */ + memcpy(newp, (char *) oldtup.t_data + oldtup.t_data->t_hoff, prefixlen); + newp += prefixlen; + + /* copy new tuple data from WAL record */ + len = tuplen - (xlhdr.t_hoff - SizeofHeapTupleHeader); + memcpy(newp, recdata, len); + recdata += len; + newp += len; + } + else + { + /* + * copy bitmap [+ padding] [+ oid] + data from record, all in one + * go + */ + memcpy(newp, recdata, tuplen); + recdata += tuplen; + newp += tuplen; + } + Assert(recdata == recdata_end); + + /* copy suffix from old tuple */ + if (suffixlen > 0) + memcpy(newp, (char *) oldtup.t_data + oldtup.t_len - suffixlen, suffixlen); + + newlen = SizeofHeapTupleHeader + tuplen + prefixlen + suffixlen; + htup->t_infomask2 = xlhdr.t_infomask2; + htup->t_infomask = xlhdr.t_infomask; + htup->t_hoff = xlhdr.t_hoff; + + HeapTupleHeaderSetXmin(htup, XLogRecGetXid(record)); + HeapTupleHeaderSetCmin(htup, FirstCommandId); + HeapTupleHeaderSetXmax(htup, xlrec->new_xmax); + /* Make sure there is no forward chain link in t_ctid */ + htup->t_ctid = newtid; + + offnum = PageAddItem(page, (Item) htup, newlen, offnum, true, true); + if (offnum == InvalidOffsetNumber) + elog(PANIC, "failed to add tuple"); + + if (xlrec->flags & XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED) + PageClearAllVisible(page); + + freespace = PageGetHeapFreeSpace(page); /* needed to update FSM below */ + + PageSetLSN(page, lsn); + MarkBufferDirty(nbuffer); + } + + if (BufferIsValid(nbuffer) && nbuffer != obuffer) + UnlockReleaseBuffer(nbuffer); + if (BufferIsValid(obuffer)) + UnlockReleaseBuffer(obuffer); + + /* + * If the new page is running low on free space, update the FSM as well. + * Arbitrarily, our definition of "low" is less than 20%. We can't do much + * better than that without knowing the fill-factor for the table. + * + * However, don't update the FSM on HOT updates, because after crash + * recovery, either the old or the new tuple will certainly be dead and + * prunable. After pruning, the page will have roughly as much free space + * as it did before the update, assuming the new tuple is about the same + * size as the old one. + * + * XXX: Don't do this if the page was restored from full page image. We + * don't bother to update the FSM in that case, it doesn't need to be + * totally accurate anyway. + */ + if (newaction == BLK_NEEDS_REDO && !hot_update && freespace < BLCKSZ / 5) + XLogRecordPageWithFreeSpace(rlocator, newblk, freespace); + } + + static void + heap_xlog_confirm(XLogReaderState *record) + { + XLogRecPtr lsn = record->EndRecPtr; + xl_heap_confirm *xlrec = (xl_heap_confirm *) XLogRecGetData(record); + Buffer buffer; + Page page; + OffsetNumber offnum; + ItemId lp = NULL; + HeapTupleHeader htup; + + if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO) + { + page = BufferGetPage(buffer); + + offnum = xlrec->offnum; + if (PageGetMaxOffsetNumber(page) >= offnum) + lp = PageGetItemId(page, offnum); + + if (PageGetMaxOffsetNumber(page) < offnum || !ItemIdIsNormal(lp)) + elog(PANIC, "invalid lp"); + + htup = (HeapTupleHeader) PageGetItem(page, lp); + + /* + * Confirm tuple as actually inserted + */ + ItemPointerSet(&htup->t_ctid, BufferGetBlockNumber(buffer), offnum); + + PageSetLSN(page, lsn); + MarkBufferDirty(buffer); + } + if (BufferIsValid(buffer)) + UnlockReleaseBuffer(buffer); + } + + static void + heap_xlog_lock(XLogReaderState *record) + { + XLogRecPtr lsn = record->EndRecPtr; + xl_heap_lock *xlrec = (xl_heap_lock *) XLogRecGetData(record); + Buffer buffer; + Page page; + OffsetNumber offnum; + ItemId lp = NULL; + HeapTupleHeader htup; + + /* + * The visibility map may need to be fixed even if the heap page is + * already up-to-date. + */ + if (xlrec->flags & XLH_LOCK_ALL_FROZEN_CLEARED) + { + RelFileLocator rlocator; + Buffer vmbuffer = InvalidBuffer; + BlockNumber block; + Relation reln; + + XLogRecGetBlockTag(record, 0, &rlocator, NULL, &block); + reln = CreateFakeRelcacheEntry(rlocator); + + visibilitymap_pin(reln, block, &vmbuffer); + visibilitymap_clear(reln, block, vmbuffer, VISIBILITYMAP_ALL_FROZEN); + + ReleaseBuffer(vmbuffer); + FreeFakeRelcacheEntry(reln); + } + + if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO) + { + page = (Page) BufferGetPage(buffer); + + offnum = xlrec->offnum; + if (PageGetMaxOffsetNumber(page) >= offnum) + lp = PageGetItemId(page, offnum); + + if (PageGetMaxOffsetNumber(page) < offnum || !ItemIdIsNormal(lp)) + elog(PANIC, "invalid lp"); + + htup = (HeapTupleHeader) PageGetItem(page, lp); + + htup->t_infomask &= ~(HEAP_XMAX_BITS | HEAP_MOVED); + htup->t_infomask2 &= ~HEAP_KEYS_UPDATED; + fix_infomask_from_infobits(xlrec->infobits_set, &htup->t_infomask, + &htup->t_infomask2); + + /* + * Clear relevant update flags, but only if the modified infomask says + * there's no update. + */ + if (HEAP_XMAX_IS_LOCKED_ONLY(htup->t_infomask)) + { + HeapTupleHeaderClearHotUpdated(htup); + /* Make sure there is no forward chain link in t_ctid */ + ItemPointerSet(&htup->t_ctid, + BufferGetBlockNumber(buffer), + offnum); + } + HeapTupleHeaderSetXmax(htup, xlrec->xmax); + HeapTupleHeaderSetCmax(htup, FirstCommandId, false); + PageSetLSN(page, lsn); + MarkBufferDirty(buffer); + } + if (BufferIsValid(buffer)) + UnlockReleaseBuffer(buffer); + } + + static void + heap_xlog_lock_updated(XLogReaderState *record) + { + XLogRecPtr lsn = record->EndRecPtr; + xl_heap_lock_updated *xlrec; + Buffer buffer; + Page page; + OffsetNumber offnum; + ItemId lp = NULL; + HeapTupleHeader htup; + + xlrec = (xl_heap_lock_updated *) XLogRecGetData(record); + + /* + * The visibility map may need to be fixed even if the heap page is + * already up-to-date. + */ + if (xlrec->flags & XLH_LOCK_ALL_FROZEN_CLEARED) + { + RelFileLocator rlocator; + Buffer vmbuffer = InvalidBuffer; + BlockNumber block; + Relation reln; + + XLogRecGetBlockTag(record, 0, &rlocator, NULL, &block); + reln = CreateFakeRelcacheEntry(rlocator); + + visibilitymap_pin(reln, block, &vmbuffer); + visibilitymap_clear(reln, block, vmbuffer, VISIBILITYMAP_ALL_FROZEN); + + ReleaseBuffer(vmbuffer); + FreeFakeRelcacheEntry(reln); + } + + if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO) + { + page = BufferGetPage(buffer); + + offnum = xlrec->offnum; + if (PageGetMaxOffsetNumber(page) >= offnum) + lp = PageGetItemId(page, offnum); + + if (PageGetMaxOffsetNumber(page) < offnum || !ItemIdIsNormal(lp)) + elog(PANIC, "invalid lp"); + + htup = (HeapTupleHeader) PageGetItem(page, lp); + + htup->t_infomask &= ~(HEAP_XMAX_BITS | HEAP_MOVED); + htup->t_infomask2 &= ~HEAP_KEYS_UPDATED; + fix_infomask_from_infobits(xlrec->infobits_set, &htup->t_infomask, + &htup->t_infomask2); + HeapTupleHeaderSetXmax(htup, xlrec->xmax); + + PageSetLSN(page, lsn); + MarkBufferDirty(buffer); + } + if (BufferIsValid(buffer)) + UnlockReleaseBuffer(buffer); + } + + static void + heap_xlog_inplace(XLogReaderState *record) + { + XLogRecPtr lsn = record->EndRecPtr; + xl_heap_inplace *xlrec = (xl_heap_inplace *) XLogRecGetData(record); + Buffer buffer; + Page page; + OffsetNumber offnum; + ItemId lp = NULL; + HeapTupleHeader htup; + uint32 oldlen; + Size newlen; + + if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO) + { + char *newtup = XLogRecGetBlockData(record, 0, &newlen); + + page = BufferGetPage(buffer); + + offnum = xlrec->offnum; + if (PageGetMaxOffsetNumber(page) >= offnum) + lp = PageGetItemId(page, offnum); + + if (PageGetMaxOffsetNumber(page) < offnum || !ItemIdIsNormal(lp)) + elog(PANIC, "invalid lp"); + + htup = (HeapTupleHeader) PageGetItem(page, lp); + + oldlen = ItemIdGetLength(lp) - htup->t_hoff; + if (oldlen != newlen) + elog(PANIC, "wrong tuple length"); + + memcpy((char *) htup + htup->t_hoff, newtup, newlen); + + PageSetLSN(page, lsn); + MarkBufferDirty(buffer); + } + if (BufferIsValid(buffer)) + UnlockReleaseBuffer(buffer); + } + + void + heap_redo(XLogReaderState *record) + { + uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK; + + /* + * These operations don't overwrite MVCC data so no conflict processing is + * required. The ones in heap2 rmgr do. + */ + + switch (info & XLOG_HEAP_OPMASK) + { + case XLOG_HEAP_INSERT: + heap_xlog_insert(record); + break; + case XLOG_HEAP_DELETE: + heap_xlog_delete(record); + break; + case XLOG_HEAP_UPDATE: + heap_xlog_update(record, false); + break; + case XLOG_HEAP_TRUNCATE: + + /* + * TRUNCATE is a no-op because the actions are already logged as + * SMGR WAL records. TRUNCATE WAL record only exists for logical + * decoding. + */ + break; + case XLOG_HEAP_HOT_UPDATE: + heap_xlog_update(record, true); + break; + case XLOG_HEAP_CONFIRM: + heap_xlog_confirm(record); + break; + case XLOG_HEAP_LOCK: + heap_xlog_lock(record); + break; + case XLOG_HEAP_INPLACE: + heap_xlog_inplace(record); + break; + default: + elog(PANIC, "heap_redo: unknown op code %u", info); + } + } + + void + heap2_redo(XLogReaderState *record) + { + uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK; + + switch (info & XLOG_HEAP_OPMASK) + { + case XLOG_HEAP2_PRUNE: + heap_xlog_prune(record); + break; + case XLOG_HEAP2_VACUUM: + heap_xlog_vacuum(record); + break; + case XLOG_HEAP2_FREEZE_PAGE: + heap_xlog_freeze_page(record); + break; + case XLOG_HEAP2_VISIBLE: + heap_xlog_visible(record); + break; + case XLOG_HEAP2_MULTI_INSERT: + heap_xlog_multi_insert(record); + break; + case XLOG_HEAP2_LOCK_UPDATED: + heap_xlog_lock_updated(record); + break; + case XLOG_HEAP2_NEW_CID: + + /* + * Nothing to do on a real replay, only used during logical + * decoding. + */ + break; + case XLOG_HEAP2_REWRITE: + heap_xlog_logical_rewrite(record); + break; + default: + elog(PANIC, "heap2_redo: unknown op code %u", info); + } + } + + /* + * Mask a heap page before performing consistency checks on it. + */ + void + heap_mask(char *pagedata, BlockNumber blkno) + { + Page page = (Page) pagedata; + OffsetNumber off; + + mask_page_lsn_and_checksum(page); + + mask_page_hint_bits(page); + mask_unused_space(page); + + for (off = 1; off <= PageGetMaxOffsetNumber(page); off++) + { + ItemId iid = PageGetItemId(page, off); + char *page_item; + + page_item = (char *) (page + ItemIdGetOffset(iid)); + + if (ItemIdIsNormal(iid)) + { + HeapTupleHeader page_htup = (HeapTupleHeader) page_item; + + /* + * If xmin of a tuple is not yet frozen, we should ignore + * differences in hint bits, since they can be set without + * emitting WAL. + */ + if (!HeapTupleHeaderXminFrozen(page_htup)) + page_htup->t_infomask &= ~HEAP_XACT_MASK; + else + { + /* Still we need to mask xmax hint bits. */ + page_htup->t_infomask &= ~HEAP_XMAX_INVALID; + page_htup->t_infomask &= ~HEAP_XMAX_COMMITTED; + } + + /* + * During replay, we set Command Id to FirstCommandId. Hence, mask + * it. See heap_xlog_insert() for details. + */ + page_htup->t_choice.t_heap.t_field3.t_cid = MASK_MARKER; + + /* + * For a speculative tuple, heap_insert() does not set ctid in the + * caller-passed heap tuple itself, leaving the ctid field to + * contain a speculative token value - a per-backend monotonically + * increasing identifier. Besides, it does not WAL-log ctid under + * any circumstances. + * + * During redo, heap_xlog_insert() sets t_ctid to current block + * number and self offset number. It doesn't care about any + * speculative insertions on the primary. Hence, we set t_ctid to + * current block number and self offset number to ignore any + * inconsistency. + */ + if (HeapTupleHeaderIsSpeculative(page_htup)) + ItemPointerSet(&page_htup->t_ctid, blkno, off); + + /* + * NB: Not ignoring ctid changes due to the tuple having moved + * (i.e. HeapTupleHeaderIndicatesMovedPartitions), because that's + * important information that needs to be in-sync between primary + * and standby, and thus is WAL logged. + */ + } + + /* + * Ignore any padding bytes after the tuple, when the length of the + * item is not MAXALIGNed. + */ + if (ItemIdHasStorage(iid)) + { + int len = ItemIdGetLength(iid); + int padlen = MAXALIGN(len) - len; + + if (padlen > 0) + memset(page_item + len, MASK_MARKER, padlen); + } + } + } + + /* ++>>>>>>> theirs * HeapCheckForSerializableConflictOut * We are reading a tuple. If it's not visible, there may be a * rw-conflict out with the inserter. Otherwise, if it is visible to us diff --cc src/backend/access/heap/pruneheap.c index 5e2d2645dd,deb153198f..0000000000 --- a/src/backend/access/heap/pruneheap.c +++ b/src/backend/access/heap/pruneheap.c @@@ -54,102 -41,18 +54,112 @@@ typedef struc int nredirected; /* numbers of entries in arrays below */ int ndead; int nunused; + int nfrozen; /* arrays that accumulate indexes of items to be changed */ ++<<<<<<< ours + OffsetNumber redirected[MaxHeapTuplesPerPage * 2]; + OffsetNumber nowdead[MaxHeapTuplesPerPage]; + OffsetNumber nowunused[MaxHeapTuplesPerPage]; + HeapTupleFreeze frozen[MaxHeapTuplesPerPage]; + + /*------------------------------------------------------- + * Working state for HOT chain processing + *------------------------------------------------------- + */ + + /* + * 'root_items' contains offsets of all LP_REDIRECT line pointers and + * normal non-HOT tuples. They can be stand-alone items or the first item + * in a HOT chain. 'heaponly_items' contains heap-only tuples which can + * only be removed as part of a HOT chain. + */ + int nroot_items; + OffsetNumber root_items[MaxHeapTuplesPerPage]; + int nheaponly_items; + OffsetNumber heaponly_items[MaxHeapTuplesPerPage]; + + /* + * processed[offnum] is true if item at offnum has been processed. + * + * This needs to be MaxHeapTuplesPerPage + 1 long as FirstOffsetNumber is + * 1. Otherwise every access would need to subtract 1. + */ + bool processed[MaxHeapTuplesPerPage + 1]; ++======= + OffsetNumber redirected[MaxHeapTuplesPerPageLimit * 2]; + OffsetNumber nowdead[MaxHeapTuplesPerPageLimit]; + OffsetNumber nowunused[MaxHeapTuplesPerPageLimit]; ++>>>>>>> theirs /* - * marked[i] is true if item i is entered in one of the above arrays. + * Tuple visibility is only computed once for each tuple, for correctness + * and efficiency reasons; see comment in heap_page_prune_and_freeze() for + * details. This is of type int8[], instead of HTSV_Result[], so we can + * use -1 to indicate no visibility has been computed, e.g. for LP_DEAD + * items. * - * This needs to be MaxHeapTuplesPerPage + 1 long as FirstOffsetNumber is + * This needs to be ClusterMaxHeapTuplesPerPage + 1 long as FirstOffsetNumber is * 1. Otherwise every access would need to subtract 1. */ ++<<<<<<< ours + int8 htsv[MaxHeapTuplesPerPage + 1]; + + /* + * Freezing-related state. + */ + HeapPageFreeze pagefrz; + + /*------------------------------------------------------- + * Information about what was done + * + * These fields are not used by pruning itself for the most part, but are + * used to collect information about what was pruned and what state the + * page is in after pruning, for the benefit of the caller. They are + * copied to the caller's PruneFreezeResult at the end. + * ------------------------------------------------------- + */ + + int ndeleted; /* Number of tuples deleted from the page */ + + /* Number of live and recently dead tuples, after pruning */ + int live_tuples; + int recently_dead_tuples; + + /* Whether or not the page makes rel truncation unsafe */ + bool hastup; + + /* + * LP_DEAD items on the page after pruning. Includes existing LP_DEAD + * items + */ + int lpdead_items; /* number of items in the array */ + OffsetNumber *deadoffsets; /* points directly to presult->deadoffsets */ + + /* + * all_visible and all_frozen indicate if the all-visible and all-frozen + * bits in the visibility map can be set for this page after pruning. + * + * visibility_cutoff_xid is the newest xmin of live tuples on the page. + * The caller can use it as the conflict horizon, when setting the VM + * bits. It is only valid if we froze some tuples, and all_frozen is + * true. + * + * NOTE: all_visible and all_frozen don't include LP_DEAD items. That's + * convenient for heap_page_prune_and_freeze(), to use them to decide + * whether to freeze the page or not. The all_visible and all_frozen + * values returned to the caller are adjusted to include LP_DEAD items at + * the end. + * + * all_frozen should only be considered valid if all_visible is also set; + * we don't bother to clear the all_frozen flag every time we clear the + * all_visible flag. + */ + bool all_visible; + bool all_frozen; + TransactionId visibility_cutoff_xid; ++======= + bool marked[MaxHeapTuplesPerPageLimit + 1]; ++>>>>>>> theirs } PruneState; /* Local functions */ @@@ -987,31 -478,69 +997,43 @@@ htsv_get_valid_status(int status * prstate showing the changes to be made. Items to be redirected are added * to the redirected[] array (two entries per redirection); items to be set to * LP_DEAD state are added to nowdead[]; and items to be set to LP_UNUSED - * state are added to nowunused[]. - * - * Returns the number of tuples (to be) deleted from the page. + * state are added to nowunused[]. We perform bookkeeping of live tuples, + * visibility etc. based on what the page will look like after the changes + * applied. All that bookkeeping is performed in the heap_prune_record_*() + * subroutines. The division of labor is that heap_prune_chain() decides the + * fate of each tuple, ie. whether it's going to be removed, redirected or + * left unchanged, and the heap_prune_record_*() subroutines update PruneState + * based on that outcome. */ -static int -heap_prune_chain(Buffer buffer, OffsetNumber rootoffnum, - int8 *htsv, PruneState *prstate) +static void +heap_prune_chain(Page page, BlockNumber blockno, OffsetNumber maxoff, + OffsetNumber rootoffnum, PruneState *prstate) { - int ndeleted = 0; - Page dp = (Page) BufferGetPage(buffer); TransactionId priorXmax = InvalidTransactionId; ItemId rootlp; ++<<<<<<< ours + OffsetNumber offnum; + OffsetNumber chainitems[MaxHeapTuplesPerPage]; ++======= + HeapTupleHeader htup; + OffsetNumber latestdead = InvalidOffsetNumber, + maxoff = PageGetMaxOffsetNumber(dp), + offnum; + OffsetNumber chainitems[MaxHeapTuplesPerPageLimit]; + int nchain = 0, + i; + + rootlp = PageGetItemId(dp, rootoffnum); ++>>>>>>> theirs /* - * If it's a heap-only tuple, then it is not the start of a HOT chain. + * After traversing the HOT chain, ndeadchain is the index in chainitems + * of the first live successor after the last dead item. */ - if (ItemIdIsNormal(rootlp)) - { - Assert(htsv[rootoffnum] != -1); - htup = (HeapTupleHeader) PageGetItem(dp, rootlp); - - if (HeapTupleHeaderIsHeapOnly(htup)) - { - /* - * If the tuple is DEAD and doesn't chain to anything else, mark - * it unused immediately. (If it does chain, we can only remove - * it as part of pruning its chain.) - * - * We need this primarily to handle aborted HOT updates, that is, - * XMIN_INVALID heap-only tuples. Those might not be linked to by - * any chain, since the parent tuple might be re-updated before - * any pruning occurs. So we have to be able to reap them - * separately from chain-pruning. (Note that - * HeapTupleHeaderIsHotUpdated will never return true for an - * XMIN_INVALID tuple, so this code will work even when there were - * sequential updates within the aborted transaction.) - * - * Note that we might first arrive at a dead heap-only tuple - * either here or while following a chain below. Whichever path - * gets there first will mark the tuple unused. - */ - if (htsv[rootoffnum] == HEAPTUPLE_DEAD && - !HeapTupleHeaderIsHotUpdated(htup)) - { - heap_prune_record_unused(prstate, rootoffnum); - HeapTupleHeaderAdvanceConflictHorizon(htup, - &prstate->snapshotConflictHorizon); - ndeleted++; - } + int ndeadchain = 0, + nchain = 0; - /* Nothing more to do */ - return ndeleted; - } - } + rootlp = PageGetItemId(page, rootoffnum); /* Start from the root tuple */ offnum = rootoffnum; @@@ -1213,61 -773,27 +1235,69 @@@ heap_prune_record_prunable(PruneState * /* Record line pointer to be redirected */ static void heap_prune_record_redirect(PruneState *prstate, - OffsetNumber offnum, OffsetNumber rdoffnum) + OffsetNumber offnum, OffsetNumber rdoffnum, + bool was_normal) { ++<<<<<<< ours + Assert(!prstate->processed[offnum]); + prstate->processed[offnum] = true; + + /* + * Do not mark the redirect target here. It needs to be counted + * separately as an unchanged tuple. + */ + + Assert(prstate->nredirected < MaxHeapTuplesPerPage); ++======= + Assert(prstate->nredirected < ClusterMaxHeapTuplesPerPage); ++>>>>>>> theirs prstate->redirected[prstate->nredirected * 2] = offnum; prstate->redirected[prstate->nredirected * 2 + 1] = rdoffnum; + prstate->nredirected++; - Assert(!prstate->marked[offnum]); - prstate->marked[offnum] = true; - Assert(!prstate->marked[rdoffnum]); - prstate->marked[rdoffnum] = true; + + /* + * If the root entry had been a normal tuple, we are deleting it, so count + * it in the result. But changing a redirect (even to DEAD state) doesn't + * count. + */ + if (was_normal) + prstate->ndeleted++; + + prstate->hastup = true; } /* Record line pointer to be marked dead */ static void -heap_prune_record_dead(PruneState *prstate, OffsetNumber offnum) +heap_prune_record_dead(PruneState *prstate, OffsetNumber offnum, + bool was_normal) { ++<<<<<<< ours + Assert(!prstate->processed[offnum]); + prstate->processed[offnum] = true; + + Assert(prstate->ndead < MaxHeapTuplesPerPage); ++======= + Assert(prstate->ndead < ClusterMaxHeapTuplesPerPage); ++>>>>>>> theirs prstate->nowdead[prstate->ndead] = offnum; prstate->ndead++; - Assert(!prstate->marked[offnum]); - prstate->marked[offnum] = true; + + /* + * Deliberately delay unsetting all_visible until later during pruning. + * Removable dead tuples shouldn't preclude freezing the page. + */ + + /* Record the dead offset for vacuum */ + prstate->deadoffsets[prstate->lpdead_items++] = offnum; + + /* + * If the root entry had been a normal tuple, we are deleting it, so count + * it in the result. But changing a redirect (even to DEAD state) doesn't + * count. + */ + if (was_normal) + prstate->ndeleted++; } /* @@@ -1294,271 -819,23 +1324,275 @@@ heap_prune_record_dead_or_unused(PruneS /* Record line pointer to be marked unused */ static void -heap_prune_record_unused(PruneState *prstate, OffsetNumber offnum) +heap_prune_record_unused(PruneState *prstate, OffsetNumber offnum, bool was_normal) { ++<<<<<<< ours + Assert(!prstate->processed[offnum]); + prstate->processed[offnum] = true; + + Assert(prstate->nunused < MaxHeapTuplesPerPage); ++======= + Assert(prstate->nunused < ClusterMaxHeapTuplesPerPage); ++>>>>>>> theirs prstate->nowunused[prstate->nunused] = offnum; prstate->nunused++; - Assert(!prstate->marked[offnum]); - prstate->marked[offnum] = true; + + /* + * If the root entry had been a normal tuple, we are deleting it, so count + * it in the result. But changing a redirect (even to DEAD state) doesn't + * count. + */ + if (was_normal) + prstate->ndeleted++; +} + +/* + * Record an unused line pointer that is left unchanged. + */ +static void +heap_prune_record_unchanged_lp_unused(Page page, PruneState *prstate, OffsetNumber offnum) +{ + Assert(!prstate->processed[offnum]); + prstate->processed[offnum] = true; +} + +/* + * Record line pointer that is left unchanged. We consider freezing it, and + * update bookkeeping of tuple counts and page visibility. + */ +static void +heap_prune_record_unchanged_lp_normal(Page page, PruneState *prstate, OffsetNumber offnum) +{ + HeapTupleHeader htup; + + Assert(!prstate->processed[offnum]); + prstate->processed[offnum] = true; + + prstate->hastup = true; /* the page is not empty */ + + /* + * The criteria for counting a tuple as live in this block need to match + * what analyze.c's acquire_sample_rows() does, otherwise VACUUM and + * ANALYZE may produce wildly different reltuples values, e.g. when there + * are many recently-dead tuples. + * + * The logic here is a bit simpler than acquire_sample_rows(), as VACUUM + * can't run inside a transaction block, which makes some cases impossible + * (e.g. in-progress insert from the same transaction). + * + * HEAPTUPLE_DEAD are handled by the other heap_prune_record_*() + * subroutines. They don't count dead items like acquire_sample_rows() + * does, because we assume that all dead items will become LP_UNUSED + * before VACUUM finishes. This difference is only superficial. VACUUM + * effectively agrees with ANALYZE about DEAD items, in the end. VACUUM + * won't remember LP_DEAD items, but only because they're not supposed to + * be left behind when it is done. (Cases where we bypass index vacuuming + * will violate this optimistic assumption, but the overall impact of that + * should be negligible.) + */ + htup = (HeapTupleHeader) PageGetItem(page, PageGetItemId(page, offnum)); + + switch (prstate->htsv[offnum]) + { + case HEAPTUPLE_LIVE: + + /* + * Count it as live. Not only is this natural, but it's also what + * acquire_sample_rows() does. + */ + prstate->live_tuples++; + + /* + * Is the tuple definitely visible to all transactions? + * + * NB: Like with per-tuple hint bits, we can't set the + * PD_ALL_VISIBLE flag if the inserter committed asynchronously. + * See SetHintBits for more info. Check that the tuple is hinted + * xmin-committed because of that. + */ + if (prstate->all_visible) + { + TransactionId xmin; + + if (!HeapTupleHeaderXminCommitted(htup)) + { + prstate->all_visible = false; + break; + } + + /* + * The inserter definitely committed. But is it old enough + * that everyone sees it as committed? A FrozenTransactionId + * is seen as committed to everyone. Otherwise, we check if + * there is a snapshot that considers this xid to still be + * running, and if so, we don't consider the page all-visible. + */ + xmin = HeapTupleHeaderGetXmin(htup); + + /* + * For now always use prstate->cutoffs for this test, because + * we only update 'all_visible' when freezing is requested. We + * could use GlobalVisTestIsRemovableXid instead, if a + * non-freezing caller wanted to set the VM bit. + */ + Assert(prstate->cutoffs); + if (!TransactionIdPrecedes(xmin, prstate->cutoffs->OldestXmin)) + { + prstate->all_visible = false; + break; + } + + /* Track newest xmin on page. */ + if (TransactionIdFollows(xmin, prstate->visibility_cutoff_xid) && + TransactionIdIsNormal(xmin)) + prstate->visibility_cutoff_xid = xmin; + } + break; + + case HEAPTUPLE_RECENTLY_DEAD: + prstate->recently_dead_tuples++; + prstate->all_visible = false; + + /* + * This tuple will soon become DEAD. Update the hint field so + * that the page is reconsidered for pruning in future. + */ + heap_prune_record_prunable(prstate, + HeapTupleHeaderGetUpdateXid(htup)); + break; + + case HEAPTUPLE_INSERT_IN_PROGRESS: + + /* + * We do not count these rows as live, because we expect the + * inserting transaction to update the counters at commit, and we + * assume that will happen only after we report our results. This + * assumption is a bit shaky, but it is what acquire_sample_rows() + * does, so be consistent. + */ + prstate->all_visible = false; + + /* + * If we wanted to optimize for aborts, we might consider marking + * the page prunable when we see INSERT_IN_PROGRESS. But we + * don't. See related decisions about when to mark the page + * prunable in heapam.c. + */ + break; + + case HEAPTUPLE_DELETE_IN_PROGRESS: + + /* + * This an expected case during concurrent vacuum. Count such + * rows as live. As above, we assume the deleting transaction + * will commit and update the counters after we report. + */ + prstate->live_tuples++; + prstate->all_visible = false; + + /* + * This tuple may soon become DEAD. Update the hint field so that + * the page is reconsidered for pruning in future. + */ + heap_prune_record_prunable(prstate, + HeapTupleHeaderGetUpdateXid(htup)); + break; + + default: + + /* + * DEAD tuples should've been passed to heap_prune_record_dead() + * or heap_prune_record_unused() instead. + */ + elog(ERROR, "unexpected HeapTupleSatisfiesVacuum result %d", + prstate->htsv[offnum]); + break; + } + + /* Consider freezing any normal tuples which will not be removed */ + if (prstate->freeze) + { + bool totally_frozen; + + if ((heap_prepare_freeze_tuple(htup, + prstate->cutoffs, + &prstate->pagefrz, + &prstate->frozen[prstate->nfrozen], + &totally_frozen))) + { + /* Save prepared freeze plan for later */ + prstate->frozen[prstate->nfrozen++].offset = offnum; + } + + /* + * If any tuple isn't either totally frozen already or eligible to + * become totally frozen (according to its freeze plan), then the page + * definitely cannot be set all-frozen in the visibility map later on. + */ + if (!totally_frozen) + prstate->all_frozen = false; + } +} + + +/* + * Record line pointer that was already LP_DEAD and is left unchanged. + */ +static void +heap_prune_record_unchanged_lp_dead(Page page, PruneState *prstate, OffsetNumber offnum) +{ + Assert(!prstate->processed[offnum]); + prstate->processed[offnum] = true; + + /* + * Deliberately don't set hastup for LP_DEAD items. We make the soft + * assumption that any LP_DEAD items encountered here will become + * LP_UNUSED later on, before count_nondeletable_pages is reached. If we + * don't make this assumption then rel truncation will only happen every + * other VACUUM, at most. Besides, VACUUM must treat + * hastup/nonempty_pages as provisional no matter how LP_DEAD items are + * handled (handled here, or handled later on). + * + * Similarly, don't unset all_visible until later, at the end of + * heap_page_prune_and_freeze(). This will allow us to attempt to freeze + * the page after pruning. As long as we unset it before updating the + * visibility map, this will be correct. + */ + + /* Record the dead offset for vacuum */ + prstate->deadoffsets[prstate->lpdead_items++] = offnum; } +/* + * Record LP_REDIRECT that is left unchanged. + */ +static void +heap_prune_record_unchanged_lp_redirect(PruneState *prstate, OffsetNumber offnum) +{ + /* + * A redirect line pointer doesn't count as a live tuple. + * + * If we leave a redirect line pointer in place, there will be another + * tuple on the page that it points to. We will do the bookkeeping for + * that separately. So we have nothing to do here, except remember that + * we processed this item. + */ + Assert(!prstate->processed[offnum]); + prstate->processed[offnum] = true; +} /* - * Perform the actual page changes needed by heap_page_prune. - * It is expected that the caller has a full cleanup lock on the - * buffer. + * Perform the actual page changes needed by heap_page_prune_and_freeze(). + * + * If 'lp_truncate_only' is set, we are merely marking LP_DEAD line pointers + * as unused, not redirecting or removing anything else. The + * PageRepairFragmentation() call is skipped in that case. + * + * If 'lp_truncate_only' is not set, the caller must hold a cleanup lock on + * the buffer. If it is set, an ordinary exclusive lock suffices. */ void -heap_page_prune_execute(Buffer buffer, +heap_page_prune_execute(Buffer buffer, bool lp_truncate_only, OffsetNumber *redirected, int nredirected, OffsetNumber *nowdead, int ndead, OffsetNumber *nowunused, int nunused) diff --cc src/backend/access/heap/vacuumlazy.c index 075af385cd,c147c77984..0000000000 --- a/src/backend/access/heap/vacuumlazy.c +++ b/src/backend/access/heap/vacuumlazy.c @@@ -966,7 -866,8 +966,12 @@@ lazy_scan_heap(LVRelState *vacrel * dead_items TIDs, pause and do a cycle of vacuuming before we tackle * this page. */ ++<<<<<<< ours + if (TidStoreMemoryUsage(vacrel->dead_items) > vacrel->dead_items_info->max_bytes) ++======= + Assert(dead_items->max_items >= ClusterMaxHeapTuplesPerPage); + if (dead_items->max_items - dead_items->num_items < ClusterMaxHeapTuplesPerPage) ++>>>>>>> theirs { /* * Before beginning index vacuuming, we release any pin we may @@@ -1518,8 -1412,22 +1523,27 @@@ lazy_scan_prune(LVRelState *vacrel bool *has_lpdead_items) { Relation rel = vacrel->rel; ++<<<<<<< ours + PruneFreezeResult presult; + int prune_options = 0; ++======= + OffsetNumber offnum, + maxoff; + ItemId itemid; + PruneResult presult; + int tuples_frozen, + lpdead_items, + live_tuples, + recently_dead_tuples; + HeapPageFreeze pagefrz; + bool hastup = false; + bool all_visible, + all_frozen; + TransactionId visibility_cutoff_xid; + int64 fpi_before = pgWalUsage.wal_fpi; + OffsetNumber deadoffsets[MaxHeapTuplesPerPageLimit]; + HeapTupleFreeze frozen[MaxHeapTuplesPerPageLimit]; ++>>>>>>> theirs Assert(BufferGetBlockNumber(buf) == blkno); @@@ -2335,14 -2488,18 +2359,14 @@@ lazy_vacuum_heap_rel(LVRelState *vacrel * Caller must have an exclusive buffer lock on the buffer (though a full * cleanup lock is also acceptable). vmbuffer must be valid and already have * a pin on blkno's visibility map page. - * - * index is an offset into the vacrel->dead_items array for the first listed - * LP_DEAD item on the page. The return value is the first index immediately - * after all LP_DEAD items for the same page in the array. */ -static int +static void lazy_vacuum_heap_page(LVRelState *vacrel, BlockNumber blkno, Buffer buffer, - int index, Buffer vmbuffer) + OffsetNumber *deadoffsets, int num_offsets, + Buffer vmbuffer) { - VacDeadItems *dead_items = vacrel->dead_items; Page page = BufferGetPage(buffer); - OffsetNumber unused[MaxHeapTuplesPerPage]; + OffsetNumber unused[MaxHeapTuplesPerPageLimit]; int nunused = 0; TransactionId visibility_cutoff_xid; bool all_frozen; @@@ -2976,8 -3126,48 +3000,53 @@@ count_nondeletable_pages(LVRelState *va } /* ++<<<<<<< ours + * Allocate dead_items and dead_items_info (either using palloc, or in dynamic + * shared memory). Sets both in vacrel for caller. ++======= + * Returns the number of dead TIDs that VACUUM should allocate space to + * store, given a heap rel of size vacrel->rel_pages, and given current + * maintenance_work_mem setting (or current autovacuum_work_mem setting, + * when applicable). + * + * See the comments at the head of this file for rationale. + */ + static int + dead_items_max_items(LVRelState *vacrel) + { + int64 max_items; + int vac_work_mem = AmAutoVacuumWorkerProcess() && + autovacuum_work_mem != -1 ? + autovacuum_work_mem : maintenance_work_mem; + + if (vacrel->nindexes > 0) + { + BlockNumber rel_pages = vacrel->rel_pages; + + max_items = MAXDEADITEMS(vac_work_mem * 1024L); + max_items = Min(max_items, INT_MAX); + max_items = Min(max_items, MAXDEADITEMS(MaxAllocSize)); + + /* curious coding here to ensure the multiplication can't overflow */ + if ((BlockNumber) (max_items / ClusterMaxHeapTuplesPerPage) > rel_pages) + max_items = rel_pages * ClusterMaxHeapTuplesPerPage; + + /* stay sane if small maintenance_work_mem */ + max_items = Max(max_items, ClusterMaxHeapTuplesPerPage); + } + else + { + /* One-pass case only stores a single heap page's TIDs at a time */ + max_items = ClusterMaxHeapTuplesPerPage; + } + + return (int) max_items; + } + + /* + * Allocate dead_items (either using palloc, or in dynamic shared memory). + * Sets dead_items in vacrel for caller. ++>>>>>>> theirs * * Also handles parallel initialization as part of allocating dead_items in * DSM when required. @@@ -2985,10 -3175,11 +3054,18 @@@ static void dead_items_alloc(LVRelState *vacrel, int nworkers) { ++<<<<<<< ours + VacDeadItemsInfo *dead_items_info; + int vac_work_mem = AmAutoVacuumWorkerProcess() && + autovacuum_work_mem != -1 ? + autovacuum_work_mem : maintenance_work_mem; ++======= + VacDeadItems *dead_items; + int max_items; + + max_items = dead_items_max_items(vacrel); + Assert(max_items >= ClusterMaxHeapTuplesPerPage); ++>>>>>>> theirs /* * Initialize state for a parallel vacuum. As of now, only one worker can diff --cc src/include/access/heapam.h index 1640d9c32f,3217d72f99..0000000000 --- a/src/include/access/heapam.h +++ b/src/include/access/heapam.h @@@ -93,9 -73,9 +93,15 @@@ typedef struct HeapScanDescDat ParallelBlockTableScanWorkerData *rs_parallelworkerdata; /* these fields only used in page-at-a-time mode and for bitmap scans */ ++<<<<<<< ours + uint32 rs_cindex; /* current tuple's index in vistuples */ + uint32 rs_ntuples; /* number of visible tuples on page */ + OffsetNumber rs_vistuples[MaxHeapTuplesPerPage]; /* their offsets */ ++======= + int rs_cindex; /* current tuple's index in vistuples */ + int rs_ntuples; /* number of visible tuples on page */ + OffsetNumber rs_vistuples[MaxHeapTuplesPerPageLimit]; /* their offsets */ ++>>>>>>> theirs } HeapScanDescData; typedef struct HeapScanDescData *HeapScanDesc; @@@ -236,49 -198,32 +242,57 @@@ typedef struct PruneFreezeResul { int ndeleted; /* Number of tuples deleted from the page */ int nnewlpdead; /* Number of newly LP_DEAD items */ + int nfrozen; /* Number of tuples we froze */ + + /* Number of live and recently dead tuples on the page, after pruning */ + int live_tuples; + int recently_dead_tuples; /* - * Tuple visibility is only computed once for each tuple, for correctness - * and efficiency reasons; see comment in heap_page_prune() for details. - * This is of type int8[], instead of HTSV_Result[], so we can use -1 to - * indicate no visibility has been computed, e.g. for LP_DEAD items. + * all_visible and all_frozen indicate if the all-visible and all-frozen + * bits in the visibility map can be set for this page, after pruning. + * + * vm_conflict_horizon is the newest xmin of live tuples on the page. The + * caller can use it as the conflict horizon when setting the VM bits. It + * is only valid if we froze some tuples (nfrozen > 0), and all_frozen is + * true. * ++<<<<<<< ours + * These are only set if the HEAP_PRUNE_FREEZE option is set. + */ + bool all_visible; + bool all_frozen; + TransactionId vm_conflict_horizon; ++======= + * This needs to be ClusterMaxHeapTuplesPerPage + 1 long as FirstOffsetNumber is + * 1. Otherwise every access would need to subtract 1. + */ + int8 htsv[MaxHeapTuplesPerPageLimit + 1]; + } PruneResult; ++>>>>>>> theirs -/* - * Pruning calculates tuple visibility once and saves the results in an array - * of int8. See PruneResult.htsv for details. This helper function is meant to - * guard against examining visibility status array members which have not yet - * been computed. - */ -static inline HTSV_Result -htsv_get_valid_status(int status) + /* + * Whether or not the page makes rel truncation unsafe. This is set to + * 'true', even if the page contains LP_DEAD items. VACUUM will remove + * them before attempting to truncate. + */ + bool hastup; + + /* + * LP_DEAD items on the page after pruning. Includes existing LP_DEAD + * items. + */ + int lpdead_items; + OffsetNumber deadoffsets[MaxHeapTuplesPerPage]; +} PruneFreezeResult; + +/* 'reason' codes for heap_page_prune_and_freeze() */ +typedef enum { - Assert(status >= HEAPTUPLE_DEAD && - status <= HEAPTUPLE_DELETE_IN_PROGRESS); - return (HTSV_Result) status; -} + PRUNE_ON_ACCESS, /* on-access pruning */ + PRUNE_VACUUM_SCAN, /* VACUUM 1st heap pass */ + PRUNE_VACUUM_CLEANUP, /* VACUUM 2nd heap pass */ +} PruneReason; /* ---------------- * function prototypes for heap access method