=== Applying patches on top of PostgreSQL commit ID 53a49365052026907afff7613929710d1e7f0da0 === /etc/rc.d/jail: WARNING: Per-jail configuration via jail_* variables is obsolete. Please consider migrating to /etc/jail.conf. Sat Feb 1 02:02:32 UTC 2025 On branch cf/5299 nothing to commit, working tree clean === applying patch ./v1-0001-Avoid-full-btree-index-scans-when-skipping-is-pos.patch /work/patches/./v1-0001-Avoid-full-btree-index-scans-when-skipping-is-pos.patch:79: trailing whitespace. /work/patches/./v1-0001-Avoid-full-btree-index-scans-when-skipping-is-pos.patch:167: trailing whitespace. * Instead, we transition to a position which will /work/patches/./v1-0001-Avoid-full-btree-index-scans-when-skipping-is-pos.patch:190: trailing whitespace. * own page. /work/patches/./v1-0001-Avoid-full-btree-index-scans-when-skipping-is-pos.patch:198: trailing whitespace. Applied patch to 'src/include/access/nbtree.h' with conflicts. Applied patch to 'src/backend/access/nbtree/nbtree.c' with conflicts. Applied patch to 'src/backend/access/nbtree/nbtsearch.c' with conflicts. U src/backend/access/nbtree/nbtree.c U src/backend/access/nbtree/nbtsearch.c U src/include/access/nbtree.h warning: 4 lines add whitespace errors. diff --cc src/backend/access/nbtree/nbtree.c index 3d617f168f,04c8d6e786..0000000000 --- a/src/backend/access/nbtree/nbtree.c +++ b/src/backend/access/nbtree/nbtree.c @@@ -549,9 -554,10 +555,11 @@@ btinitparallelscan(void *target BTParallelScanDesc bt_target = (BTParallelScanDesc) target; SpinLockInit(&bt_target->btps_mutex); - bt_target->btps_scanPage = InvalidBlockNumber; + bt_target->btps_nextScanPage = InvalidBlockNumber; + bt_target->btps_lastCurrPage = InvalidBlockNumber; bt_target->btps_pageStatus = BTPARALLEL_NOT_INITIALIZED; + bt_target->btps_arrElemsGen = 0; + bt_target->btps_checkPrimScan = false; ConditionVariableInit(&bt_target->btps_cv); } @@@ -575,9 -581,10 +583,11 @@@ btparallelrescan(IndexScanDesc scan * consistency. */ SpinLockAcquire(&btscan->btps_mutex); - btscan->btps_scanPage = InvalidBlockNumber; + btscan->btps_nextScanPage = InvalidBlockNumber; + btscan->btps_lastCurrPage = InvalidBlockNumber; btscan->btps_pageStatus = BTPARALLEL_NOT_INITIALIZED; + btscan->btps_arrElemsGen = 0; + btscan->btps_checkPrimScan = false; SpinLockRelease(&btscan->btps_mutex); } @@@ -655,14 -650,9 +665,16 @@@ _bt_parallel_seize(IndexScanDesc scan, { /* We're done with this parallel index scan */ status = false; + so->testPrimScan = false; + so->arrElemsGen = 0; } + else if (btscan->btps_pageStatus == BTPARALLEL_IDLE && + btscan->btps_nextScanPage == P_NONE) + { + /* End this parallel index scan */ + status = false; + endscan = true; + } else if (btscan->btps_pageStatus == BTPARALLEL_NEED_PRIMSCAN) { Assert(so->numArrayKeys); @@@ -679,7 -669,10 +691,9 @@@ array->cur_elem = btscan->btps_arrElems[i]; skey->sk_argument = array->elem_values[array->cur_elem]; } - *pageno = InvalidBlockNumber; exit_loop = true; + so->arrElemsGen = btscan->btps_arrElemsGen; + so->testPrimScan = false; } else { @@@ -706,9 -698,10 +720,16 @@@ * of advancing it to a new page! 
*/ btscan->btps_pageStatus = BTPARALLEL_ADVANCING; ++<<<<<<< ours + Assert(btscan->btps_nextScanPage != P_NONE); + *next_scan_page = btscan->btps_nextScanPage; + *last_curr_page = btscan->btps_lastCurrPage; ++======= + *pageno = btscan->btps_scanPage; + + so->arrElemsGen = btscan->btps_arrElemsGen; + ++>>>>>>> theirs exit_loop = true; } SpinLockRelease(&btscan->btps_mutex); @@@ -726,41 -715,87 +747,111 @@@ } /* ++<<<<<<< ours + * _bt_parallel_release() -- Complete the process of advancing the scan to a + * new page. We now have the new value btps_nextScanPage; another backend ++======= + * _bt_parallel_opt_release_early() -- Complete the process of advancing the scan to a + * new page. We now have the new value btps_scanPage; some other backend ++>>>>>>> theirs * can now begin advancing the scan. * - * Callers whose scan uses array keys must save their scan_page argument so + * Callers whose scan uses array keys must save their curr_page argument so * that it can be passed to _bt_parallel_primscan_schedule, should caller - * determine that another primitive index scan is required. If that happens, - * scan_page won't be scanned by any backend (unless the next primitive index - * scan lands on scan_page). + * determine that another primitive index scan is required. + * + * If caller's next_scan_page is P_NONE, the scan has reached the index's + * rightmost/leftmost page. This is treated as reaching the end of the scan + * within _bt_parallel_seize. + * + * Note: unlike the serial case, parallel scans don't need to remember both + * sibling links. next_scan_page is whichever link is next given the scan's + * direction. That's all we'll ever need, since the direction of a parallel + * scan can never change. */ void ++<<<<<<< ours +_bt_parallel_release(IndexScanDesc scan, BlockNumber next_scan_page, + BlockNumber curr_page) ++======= + _bt_parallel_opt_release_early(IndexScanDesc scan, BlockNumber scan_page) { ParallelIndexScanDesc parallel_scan = scan->parallel_scan; BTParallelScanDesc btscan; + btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan, + parallel_scan->ps_offset); + + SpinLockAcquire(&btscan->btps_mutex); + /* + * If a parallel worker noticed that it had skipped past the end of a + * primitive scan after another backend already acquired the parallel scan + * status, we don't release the scan before reading the page's contents. + * Instead, we transition to a position which will + */ + if (likely(!btscan->btps_checkPrimScan)) + { + btscan->btps_scanPage = scan_page; + btscan->btps_pageStatus = BTPARALLEL_IDLE; + SpinLockRelease(&btscan->btps_mutex); + ConditionVariableSignal(&btscan->btps_cv); + } + else + { + BTScanOpaque so = (BTScanOpaque) scan->opaque; + so->testPrimScan = true; + SpinLockRelease(&btscan->btps_mutex); + } + } + + /* + * _bt_parallel_opt_release_late() -- Complete the process of advancing the + * scan to a new page. + * + * We're only called when a concurrent backend wanted to schedule a skip scan, + * but failed to do so because the parallel scan already advanced past its + * own page. 
+ */ + void + _bt_parallel_opt_release_late(IndexScanDesc scan, BlockNumber scan_page) ++>>>>>>> theirs + { + ParallelIndexScanDesc parallel_scan = scan->parallel_scan; + BTScanOpaque so = (BTScanOpaque) scan->opaque; + BTParallelScanDesc btscan; + + if (!so->testPrimScan) + return; + - btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan, + Assert(BlockNumberIsValid(next_scan_page)); + + btscan = (BTParallelScanDesc) OffsetToPointer(parallel_scan, parallel_scan->ps_offset); SpinLockAcquire(&btscan->btps_mutex); ++<<<<<<< ours + btscan->btps_nextScanPage = next_scan_page; + btscan->btps_lastCurrPage = curr_page; ++======= + Assert(btscan->btps_checkPrimScan); + btscan->btps_scanPage = scan_page; ++>>>>>>> theirs btscan->btps_pageStatus = BTPARALLEL_IDLE; + + /* + * A late release implies that 1) a concurrent backend noticed we + * should've started a new primitive scan, and that 2) the current scan + * position is already at-or-past the point where that scan would've + * started. So, we do what a new primitive scan would've done with the + * shared state: we increase the generation number, and unset + * checkPrimScan. + */ + btscan->btps_checkPrimScan = false; + btscan->btps_arrElemsGen += 1; SpinLockRelease(&btscan->btps_mutex); ConditionVariableSignal(&btscan->btps_cv); + + so->testPrimScan = false; } /* @@@ -777,9 -812,8 +868,10 @@@ _bt_parallel_done(IndexScanDesc scan ParallelIndexScanDesc parallel_scan = scan->parallel_scan; BTParallelScanDesc btscan; bool status_changed = false; + so->testPrimScan = false; + Assert(!BTScanPosIsValid(so->currPos)); + /* Do nothing, for non-parallel scans */ if (parallel_scan == NULL) return; @@@ -815,13 -849,13 +907,20 @@@ /* * _bt_parallel_primscan_schedule() -- Schedule another primitive index scan. * ++<<<<<<< ours + * Caller passes the curr_page most recently passed to _bt_parallel_release + * by its backend. Caller successfully schedules the next primitive index scan + * if the shared parallel state hasn't been seized since caller's backend last + * advanced the scan. ++======= + * Caller passes the block number most recently passed to + * _bt_parallel_opt_release_early by its backend. Caller successfully + * schedules the next primitive index scan if the shared parallel state hasn't + * been seized since caller's backend last advanced the scan. 
++>>>>>>> theirs */ void -_bt_parallel_primscan_schedule(IndexScanDesc scan, BlockNumber prev_scan_page) +_bt_parallel_primscan_schedule(IndexScanDesc scan, BlockNumber curr_page) { BTScanOpaque so = (BTScanOpaque) scan->opaque; ParallelIndexScanDesc parallel_scan = scan->parallel_scan; @@@ -833,12 -867,13 +932,19 @@@ parallel_scan->ps_offset); SpinLockAcquire(&btscan->btps_mutex); ++<<<<<<< ours + if (btscan->btps_lastCurrPage == curr_page && + btscan->btps_pageStatus == BTPARALLEL_IDLE) ++======= + if ((btscan->btps_scanPage == prev_scan_page && + btscan->btps_pageStatus == BTPARALLEL_IDLE) || + unlikely(so->testPrimScan)) ++>>>>>>> theirs { - btscan->btps_scanPage = InvalidBlockNumber; + btscan->btps_nextScanPage = InvalidBlockNumber; + btscan->btps_lastCurrPage = InvalidBlockNumber; btscan->btps_pageStatus = BTPARALLEL_NEED_PRIMSCAN; + btscan->btps_arrElemsGen += 1; /* Serialize scan's current array keys */ for (int i = 0; i < so->numArrayKeys; i++) diff --cc src/backend/access/nbtree/nbtsearch.c index 472ce06f19,8a9a0e6626..0000000000 --- a/src/backend/access/nbtree/nbtsearch.c +++ b/src/backend/access/nbtree/nbtsearch.c @@@ -1490,9 -1545,17 +1490,9 @@@ _bt_next(IndexScanDesc scan, ScanDirect * that there can be no more matching tuples in the current scan direction * (could just be for the current primitive index scan when scan has arrays). * - * _bt_first caller passes us an offnum returned by _bt_binsrch, which might - * be an out of bounds offnum such as "maxoff + 1" in certain corner cases. - * _bt_checkkeys will stop the scan as soon as an equality qual fails (when - * its scan key was marked required), so _bt_first _must_ pass us an offnum - * exactly at the beginning of where equal tuples are to be found. When we're - * passed an offnum past the end of the page, we might still manage to stop - * the scan on this page by calling _bt_checkkeys against the high key. - * * In the case of a parallel scan, caller must have called _bt_parallel_seize * prior to calling this function; this function will invoke - * _bt_parallel_release before returning. + * _bt_parallel_opt_release_early before returning. * * Returns true if any matching items found on the page, false if none. */ @@@ -1511,40 -1573,27 +1511,46 @@@ _bt_readpage(IndexScanDesc scan, ScanDi int itemIndex, indnatts; - /* - * We must have the buffer pinned and locked, but the usual macro can't be - * used here; this function is what makes it good for currPos. 
- */ - Assert(BufferIsValid(so->currPos.buf)); - + /* save the page/buffer block number, along with its sibling links */ page = BufferGetPage(so->currPos.buf); opaque = BTPageGetOpaque(page); + so->currPos.currPage = BufferGetBlockNumber(so->currPos.buf); + so->currPos.prevPage = opaque->btpo_prev; + so->currPos.nextPage = opaque->btpo_next; + + Assert(!P_IGNORE(opaque)); + Assert(BTScanPosIsPinned(so->currPos)); + Assert(!so->needPrimScan); - /* allow next page be processed by parallel worker */ if (scan->parallel_scan) { + /* allow next/prev page to be read by other worker without delay */ if (ScanDirectionIsForward(dir)) - pstate.prev_scan_page = opaque->btpo_next; + _bt_parallel_release(scan, so->currPos.nextPage, + so->currPos.currPage); else ++<<<<<<< ours + _bt_parallel_release(scan, so->currPos.prevPage, + so->currPos.currPage); ++======= + pstate.prev_scan_page = BufferGetBlockNumber(so->currPos.buf); + + _bt_parallel_opt_release_early(scan, pstate.prev_scan_page); ++>>>>>>> theirs } - indnatts = IndexRelationGetNumberOfAttributes(scan->indexRelation); + /* initialize remaining currPos fields related to current page */ + so->currPos.lsn = BufferGetLSNAtomic(so->currPos.buf); + so->currPos.dir = dir; + so->currPos.nextTupleOffset = 0; + /* either moreLeft or moreRight should be set now (may be unset later) */ + Assert(ScanDirectionIsForward(dir) ? so->currPos.moreRight : + so->currPos.moreLeft); + + PredicateLockPage(rel, so->currPos.currPage, scan->xs_snapshot); + + /* initialize local variables */ + indnatts = IndexRelationGetNumberOfAttributes(rel); arrayKeys = so->numArrayKeys != 0; minoff = P_FIRSTDATAKEY(opaque); maxoff = PageGetMaxOffsetNumber(page); @@@ -2167,77 -2173,96 +2180,132 @@@ _bt_readfirstpage(IndexScanDesc scan, O } /* - * _bt_readnextpage() -- Read next page containing valid data for scan + * _bt_readnextpage() -- Read next page containing valid data for _bt_next + * + * Caller's blkno is the next interesting page's link, taken from either the + * previously-saved right link or left link. lastcurrblkno is the page that + * was current at the point where the blkno link was saved, which we use to + * reason about concurrent page splits/page deletions during backwards scans. + * + * On entry, caller shouldn't hold any locks or pins on any page (we work + * directly off of blkno and lastcurrblkno instead). Parallel scan callers + * that seized the scan before calling here should pass seized=true; such a + * caller's blkno and lastcurrblkno arguments come from the seized scan. + * seized=false callers just pass us the blkno/lastcurrblkno taken from their + * so->currPos, which (along with so->currPos itself) can be used to end the + * scan. A seized=false caller's blkno can never be assumed to be the page + * that must be read next during a parallel scan, though. We must figure that + * part out for ourselves by seizing the scan (the correct page to read might + * already be beyond the seized=false caller's blkno during a parallel scan). * * On success exit, so->currPos is updated to contain data from the next - * interesting page, and we return true. Caller must release the lock (and - * maybe the pin) on the buffer on success exit. + * interesting page, and we return true. We hold a pin on the buffer on + * success exit, except when _bt_drop_lock_and_maybe_pin decided it was safe + * to eagerly drop the pin (to avoid blocking VACUUM). 
* * If there are no more matching records in the given direction, we drop all - * locks and pins, set so->currPos.buf to InvalidBuffer, and return false. + * locks and pins, invalidate so->currPos, and return false. + * + * We always release the scan for a parallel scan caller, regardless of + * success or failure; we'll call _bt_parallel_release as soon as possible. */ static bool -_bt_readnextpage(IndexScanDesc scan, BlockNumber blkno, ScanDirection dir) +_bt_readnextpage(IndexScanDesc scan, BlockNumber blkno, + BlockNumber lastcurrblkno, ScanDirection dir, bool seized) { + Relation rel = scan->indexRelation; BTScanOpaque so = (BTScanOpaque) scan->opaque; - Relation rel; - Page page; - BTPageOpaque opaque; - bool status; - rel = scan->indexRelation; + Assert(so->currPos.currPage == lastcurrblkno || seized); + Assert(!BTScanPosIsPinned(so->currPos)); + /* + * Remember that the scan already read lastcurrblkno, a page to the left + * of blkno (or remember reading a page to the right, for backwards scans) + */ if (ScanDirectionIsForward(dir)) ++<<<<<<< ours + so->currPos.moreLeft = true; ++======= + { + for (;;) + { + /* + * if we're at end of scan, give up and mark parallel scan as + * done, so that all the workers can finish their scan + */ + if (blkno == P_NONE || !so->currPos.moreRight) + { + _bt_parallel_done(scan); + BTScanPosInvalidate(so->currPos); + return false; + } + /* check for interrupts while we're not holding any buffer lock */ + CHECK_FOR_INTERRUPTS(); + /* step right one page */ + so->currPos.buf = _bt_getbuf(rel, blkno, BT_READ); + page = BufferGetPage(so->currPos.buf); + opaque = BTPageGetOpaque(page); + /* check for deleted page */ + if (!P_IGNORE(opaque)) + { + PredicateLockPage(rel, blkno, scan->xs_snapshot); + /* see if there are any matches on this page */ + /* note that this will clear moreRight if we can stop */ + if (_bt_readpage(scan, dir, P_FIRSTDATAKEY(opaque), false)) + break; + } + else if (scan->parallel_scan != NULL) + { + /* allow next page be processed by parallel worker */ + _bt_parallel_opt_release_early(scan, opaque->btpo_next); + } + + /* nope, keep going */ + if (scan->parallel_scan != NULL) + { + _bt_relbuf(rel, so->currPos.buf); + status = _bt_parallel_seize(scan, &blkno, false); + if (!status) + { + BTScanPosInvalidate(so->currPos); + return false; + } + } + else + { + blkno = opaque->btpo_next; + _bt_relbuf(rel, so->currPos.buf); + } + } + } ++>>>>>>> theirs else + so->currPos.moreRight = true; + + for (;;) { - /* - * Should only happen in parallel cases, when some other backend - * advanced the scan. - */ - if (so->currPos.currPage != blkno) + Page page; + BTPageOpaque opaque; + + if (blkno == P_NONE || + (ScanDirectionIsForward(dir) ? 
+ !so->currPos.moreRight : !so->currPos.moreLeft)) { - BTScanPosUnpinIfPinned(so->currPos); - so->currPos.currPage = blkno; + /* most recent _bt_readpage call (for lastcurrblkno) ended scan */ + Assert(so->currPos.currPage == lastcurrblkno && !seized); + BTScanPosInvalidate(so->currPos); + _bt_parallel_done(scan); /* iff !so->needPrimScan */ + return false; } - /* Done if we know that the left sibling link isn't of interest */ - if (!so->currPos.moreLeft) + Assert(!so->needPrimScan); + + /* parallel scan must never actually visit so->currPos blkno */ + if (!seized && scan->parallel_scan != NULL && + !_bt_parallel_seize(scan, &blkno, &lastcurrblkno, false)) { - BTScanPosUnpinIfPinned(so->currPos); - _bt_parallel_done(scan); + /* whole scan is now done (or another primitive scan required) */ BTScanPosInvalidate(so->currPos); return false; } @@@ -2280,31 -2319,57 +2348,46 @@@ /* note that this will clear moreLeft if we can stop */ if (_bt_readpage(scan, dir, PageGetMaxOffsetNumber(page), false)) break; + blkno = so->currPos.prevPage; } ++<<<<<<< ours + } + else + { + /* _bt_readpage not called, so do all this for ourselves */ + if (ScanDirectionIsForward(dir)) + blkno = opaque->btpo_next; + else + blkno = opaque->btpo_prev; ++======= + else if (scan->parallel_scan != NULL) + { + /* allow next page be processed by parallel worker */ + _bt_parallel_opt_release_early(scan, BufferGetBlockNumber(so->currPos.buf)); + } + + /* + * For parallel scans, get the last page scanned as it is quite + * possible that by the time we try to seize the scan, some other + * worker has already advanced the scan to a different page. We + * must continue based on the latest page scanned by any worker. + */ ++>>>>>>> theirs if (scan->parallel_scan != NULL) - { - _bt_relbuf(rel, so->currPos.buf); - status = _bt_parallel_seize(scan, &blkno, false); - if (!status) - { - BTScanPosInvalidate(so->currPos); - return false; - } - so->currPos.buf = _bt_getbuf(rel, blkno, BT_READ); - } + _bt_parallel_release(scan, blkno, lastcurrblkno); } - } - - return true; -} -/* - * _bt_parallel_readpage() -- Read current page containing valid data for scan - * - * On success, release lock and maybe pin on buffer. We return true to - * indicate success. - */ -static bool -_bt_parallel_readpage(IndexScanDesc scan, BlockNumber blkno, ScanDirection dir) -{ - BTScanOpaque so = (BTScanOpaque) scan->opaque; - - Assert(!so->needPrimScan); - - _bt_initialize_more_data(so, dir); - - if (!_bt_readnextpage(scan, blkno, dir)) - return false; + /* no matching tuples on this page */ + _bt_relbuf(rel, so->currPos.buf); + seized = false; /* released by _bt_readpage (or by us) */ + } - /* We have at least one item to return as scan's next item */ + /* + * _bt_readpage succeeded. Drop the lock (and maybe the pin) on + * so->currPos.buf in preparation for btgettuple returning tuples. 
+ */ + Assert(so->currPos.currPage == blkno); + Assert(BTScanPosIsPinned(so->currPos)); _bt_drop_lock_and_maybe_pin(scan, &so->currPos); return true; diff --cc src/include/access/nbtree.h index 6a501537e1,efe6728dd3..0000000000 --- a/src/include/access/nbtree.h +++ b/src/include/access/nbtree.h @@@ -1186,14 -1196,15 +1191,23 @@@ extern int btgettreeheight(Relation rel /* * prototypes for internal functions in nbtree.c */ ++<<<<<<< ours +extern bool _bt_parallel_seize(IndexScanDesc scan, BlockNumber *next_scan_page, + BlockNumber *last_curr_page, bool first); +extern void _bt_parallel_release(IndexScanDesc scan, + BlockNumber next_scan_page, + BlockNumber curr_page); ++======= + extern bool _bt_parallel_seize(IndexScanDesc scan, BlockNumber *pageno, + bool first); + extern void _bt_parallel_opt_release_early(IndexScanDesc scan, + BlockNumber scan_page); + extern void _bt_parallel_opt_release_late(IndexScanDesc scan, + BlockNumber scan_page); ++>>>>>>> theirs extern void _bt_parallel_done(IndexScanDesc scan); extern void _bt_parallel_primscan_schedule(IndexScanDesc scan, - BlockNumber prev_scan_page); + BlockNumber curr_page); /* * prototypes for functions in nbtdedup.c
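
Both resolutions of the nbtree.c hunks above preserve the same core handoff over the shared parallel-scan state: a worker seizes the scan (waiting while some other worker holds it in BTPARALLEL_ADVANCING), reads out the next page to process, and releases the scan as soon as the new sibling link is known, so that other workers can keep advancing while the page itself is still being read. The standalone C sketch below models only that handoff, under stated assumptions: every name in it is hypothetical, pthread primitives stand in for PostgreSQL's spinlock and ConditionVariable, page number 0 plays the role of P_NONE, _bt_parallel_done's end-of-scan transition is folded into a failing seize for brevity, and the lastCurrPage/arrElems bookkeeping that both sides add for scheduling new primitive scans is omitted. It is an illustration of the protocol's shape, not PostgreSQL code.

/* parallel_scan_sketch.c -- hypothetical toy model; build: cc -pthread parallel_scan_sketch.c */
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

#define P_NONE 0u               /* stand-in for nbtree's "no sibling page" sentinel */

typedef enum
{
    SCAN_IDLE,                  /* cf. BTPARALLEL_IDLE: next_page is up for grabs */
    SCAN_ADVANCING,             /* cf. BTPARALLEL_ADVANCING: one worker owns the advance */
    SCAN_DONE                   /* cf. BTPARALLEL_DONE: no more pages in this direction */
} ScanStatus;

typedef struct
{
    pthread_mutex_t mutex;      /* plays the role of btps_mutex */
    pthread_cond_t cv;          /* plays the role of btps_cv */
    ScanStatus status;          /* plays the role of btps_pageStatus */
    unsigned    next_page;      /* plays the role of btps_nextScanPage */
} SharedScan;

static SharedScan scan = {
    PTHREAD_MUTEX_INITIALIZER, PTHREAD_COND_INITIALIZER, SCAN_IDLE, 1
};

/*
 * Seize the scan: wait while another worker is advancing it, then claim the
 * next page and mark the scan ADVANCING so everyone else waits on us.
 * Returns false once the scan has run out of pages.
 */
static bool
scan_seize(SharedScan *s, unsigned *page)
{
    bool        ok = false;

    pthread_mutex_lock(&s->mutex);
    while (s->status == SCAN_ADVANCING)
        pthread_cond_wait(&s->cv, &s->mutex);
    if (s->status == SCAN_IDLE && s->next_page != P_NONE)
    {
        *page = s->next_page;
        s->status = SCAN_ADVANCING;
        ok = true;
    }
    else
        s->status = SCAN_DONE;  /* ran off the end: every worker should stop */
    pthread_mutex_unlock(&s->mutex);
    if (!ok)
        pthread_cond_broadcast(&s->cv); /* wake workers still blocked in seize */
    return ok;
}

/*
 * Release the scan the moment the new page's sibling link is known, before
 * the page's contents are processed -- the early release that both
 * _bt_parallel_release() and _bt_parallel_opt_release_early() perform above.
 */
static void
scan_release(SharedScan *s, unsigned next_page)
{
    pthread_mutex_lock(&s->mutex);
    s->next_page = next_page;
    s->status = SCAN_IDLE;
    pthread_mutex_unlock(&s->mutex);
    pthread_cond_signal(&s->cv);
}

static void *
worker(void *arg)
{
    int         id = *(int *) arg;
    unsigned    page;

    while (scan_seize(&scan, &page))
    {
        /* Pretend pages 1..8 are chained left to right by sibling links. */
        scan_release(&scan, page < 8 ? page + 1 : P_NONE);
        /* Only now "read" the page, while other workers advance past it. */
        printf("worker %d read page %u\n", id, page);
    }
    return NULL;
}

int
main(void)
{
    int         ids[] = {1, 2};
    pthread_t   threads[2];

    for (int i = 0; i < 2; i++)
        pthread_create(&threads[i], NULL, worker, &ids[i]);
    for (int i = 0; i < 2; i++)
        pthread_join(threads[i], NULL);
    return 0;
}

The array-key machinery in the conflicted hunks (BTPARALLEL_NEED_PRIMSCAN, btps_arrElems[], and the btps_arrElemsGen generation counter) layers onto this same state machine rather than replacing it: _bt_parallel_primscan_schedule may flip an IDLE scan back to NEED_PRIMSCAN only if no other worker has advanced the scan since the scheduling worker's own release, a condition the two sides express differently -- btps_lastCurrPage == curr_page on one side, btps_scanPage == prev_scan_page (plus the unlikely(so->testPrimScan) path guarded by btps_checkPrimScan and the late release) on the other.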