=== Applying patches on top of PostgreSQL commit ID 53a49365052026907afff7613929710d1e7f0da0 === /etc/rc.d/jail: WARNING: Per-jail configuration via jail_* variables is obsolete. Please consider migrating to /etc/jail.conf. Fri Jan 31 23:10:31 UTC 2025 On branch cf/4871 nothing to commit, working tree clean === applying patch ./v19_0001-Introduce-IndexAmRoutine.ammorderbyopfirstcol.patch Applied patch to 'contrib/bloom/blutils.c' cleanly. Applied patch to 'doc/src/sgml/indexam.sgml' cleanly. Applied patch to 'src/backend/access/brin/brin.c' cleanly. Applied patch to 'src/backend/access/gin/ginutil.c' cleanly. Applied patch to 'src/backend/access/gist/gist.c' cleanly. Applied patch to 'src/backend/access/hash/hash.c' cleanly. Applied patch to 'src/backend/access/nbtree/nbtree.c' cleanly. Applied patch to 'src/backend/access/spgist/spgutils.c' cleanly. Applied patch to 'src/backend/optimizer/path/indxpath.c' cleanly. Applied patch to 'src/include/access/amapi.h' cleanly. Applied patch to 'src/include/nodes/pathnodes.h' cleanly. [cf/4871 47421ffb4f] Introduce IndexAmRoutine.ammorderbyopfirstcol Author: Anton A. Melnikov Date: Sun Dec 31 14:10:23 2023 +0300 11 files changed, 38 insertions(+), 8 deletions(-) === applying patch ./v19_0002-Allow-ordering-by-operator-in-ordered-indexes.patch Applied patch to 'src/backend/executor/execAmi.c' cleanly. Applied patch to 'src/backend/executor/nodeIndexscan.c' cleanly. Applied patch to 'src/backend/optimizer/path/indxpath.c' cleanly. [cf/4871 34efa02359] Allow ordering by operator in ordered indexes Author: Anton A. Melnikov Date: Sun Dec 31 14:34:57 2023 +0300 3 files changed, 18 insertions(+), 13 deletions(-) === applying patch ./v19_0003-Extract-BTScanState-from-BTScanOpaqueData.patch Applied patch to 'src/backend/access/nbtree/nbtree.c' with conflicts. Applied patch to 'src/backend/access/nbtree/nbtsearch.c' with conflicts. Applied patch to 'src/backend/access/nbtree/nbtutils.c' cleanly. Applied patch to 'src/include/access/nbtree.h' with conflicts. U src/backend/access/nbtree/nbtree.c U src/backend/access/nbtree/nbtsearch.c U src/include/access/nbtree.h diff --cc src/backend/access/nbtree/nbtree.c index da576f84ca,fc3954cc15..0000000000 --- a/src/backend/access/nbtree/nbtree.c +++ b/src/backend/access/nbtree/nbtree.c @@@ -362,23 -402,12 +403,18 @@@ btrescan(IndexScanDesc scan, ScanKey sc ScanKey orderbys, int norderbys) { BTScanOpaque so = (BTScanOpaque) scan->opaque; + BTScanState state = &so->state; - /* we aren't holding any read locks, but gotta drop the pins */ - if (BTScanPosIsValid(so->currPos)) - { - /* Before leaving current page, deal with any killed items */ - if (so->numKilled > 0) - _bt_killitems(scan); - BTScanPosUnpinIfPinned(so->currPos); - BTScanPosInvalidate(so->currPos); - } + _bt_release_scan_state(scan, state, false); - so->markItemIndex = -1; so->needPrimScan = false; so->scanBehind = false; ++<<<<<<< ours + so->oppositeDirCheck = false; + BTScanPosUnpinIfPinned(so->markPos); + BTScanPosInvalidate(so->markPos); ++======= ++>>>>>>> theirs /* * Allocate tuple workspace arrays, if needed for an index-only scan and diff --cc src/backend/access/nbtree/nbtsearch.c index 472ce06f19,368857d3aa..0000000000 --- a/src/backend/access/nbtree/nbtsearch.c +++ b/src/backend/access/nbtree/nbtsearch.c @@@ -32,26 -29,26 +32,39 @@@ static Buffer _bt_moveright(Relation re static OffsetNumber _bt_binsrch(Relation rel, BTScanInsert key, Buffer buf); static int _bt_binsrch_posting(BTScanInsert key, Page page, OffsetNumber offnum); - static bool _bt_readpage(IndexScanDesc scan, ScanDirection dir, - OffsetNumber offnum, bool firstPage); - static void _bt_saveitem(BTScanOpaque so, int itemIndex, + static bool _bt_readpage(IndexScanDesc scan, BTScanState state, + ScanDirection dir, OffsetNumber offnum, + bool firstPage); + static void _bt_saveitem(BTScanState state, int itemIndex, OffsetNumber offnum, IndexTuple itup); - static int _bt_setuppostingitems(BTScanOpaque so, int itemIndex, + static int _bt_setuppostingitems(BTScanState state, int itemIndex, OffsetNumber offnum, ItemPointer heapTid, IndexTuple itup); - static inline void _bt_savepostingitem(BTScanOpaque so, int itemIndex, + static inline void _bt_savepostingitem(BTScanState state, int itemIndex, OffsetNumber offnum, ItemPointer heapTid, int tupleOffset); ++<<<<<<< ours +static inline void _bt_returnitem(IndexScanDesc scan, BTScanOpaque so); +static bool _bt_steppage(IndexScanDesc scan, ScanDirection dir); +static bool _bt_readfirstpage(IndexScanDesc scan, OffsetNumber offnum, + ScanDirection dir); +static bool _bt_readnextpage(IndexScanDesc scan, BlockNumber blkno, + BlockNumber lastcurrblkno, ScanDirection dir, + bool seized); +static Buffer _bt_lock_and_validate_left(Relation rel, BlockNumber *blkno, + BlockNumber lastcurrblkno); +static bool _bt_endpoint(IndexScanDesc scan, ScanDirection dir); ++======= + static bool _bt_steppage(IndexScanDesc scan, BTScanState state, + ScanDirection dir); + static bool _bt_readnextpage(IndexScanDesc scan, BTScanState state, + BlockNumber blkno, ScanDirection dir); + static bool _bt_parallel_readpage(IndexScanDesc scan, BlockNumber blkno, + ScanDirection dir); + static Buffer _bt_walk_left(Relation rel, Buffer buf); + static bool _bt_endpoint(IndexScanDesc scan, ScanDirection dir); + static inline void _bt_initialize_more_data(IndexScanDesc scan, BTScanState state, ScanDirection dir); ++>>>>>>> theirs /* @@@ -883,18 -932,24 +948,27 @@@ _bt_first(IndexScanDesc scan, ScanDirec { Relation rel = scan->indexRelation; BTScanOpaque so = (BTScanOpaque) scan->opaque; ++<<<<<<< ours ++======= + BTScanPos currPos = &so->state.currPos; + Buffer buf; ++>>>>>>> theirs BTStack stack; OffsetNumber offnum; - StrategyNumber strat; BTScanInsertData inskey; ScanKey startKeys[INDEX_MAX_KEYS]; ScanKeyData notnullkeys[INDEX_MAX_KEYS]; int keysz = 0; - int i; - bool status; StrategyNumber strat_total; ++<<<<<<< ours + BlockNumber blkno = InvalidBlockNumber, + lastcurrblkno; ++======= + BlockNumber blkno; ++>>>>>>> theirs - Assert(!BTScanPosIsValid(so->currPos)); + Assert(!BTScanPosIsValid(*currPos)); - pgstat_count_index_scan(rel); - /* * Examine the scan keys and eliminate any redundant keys; also mark the * keys that must be matched to continue the scan. @@@ -1392,20 -1463,29 +1466,35 @@@ _bt_freestack(stack); } - if (!BufferIsValid(buf)) + if (!BufferIsValid(so->currPos.buf)) { - /* - * Mark parallel scan as done, so that all the workers can finish - * their scan. - */ + Assert(!so->needPrimScan); _bt_parallel_done(scan); ++<<<<<<< ours ++======= + BTScanPosInvalidate(*currPos); ++>>>>>>> theirs return false; } } ++<<<<<<< ours + /* position to the precise item on the page */ + offnum = _bt_binsrch(rel, &inskey, so->currPos.buf); ++======= + PredicateLockPage(rel, BufferGetBlockNumber(buf), scan->xs_snapshot); + + _bt_initialize_more_data(scan, &so->state, dir); + + /* position to the precise item on the page */ + offnum = _bt_binsrch(rel, &inskey, buf); + Assert(!BTScanPosIsValid(*currPos)); + currPos->buf = buf; ++>>>>>>> theirs /* - * Now load data from the first page of the scan. + * Now load data from the first page of the scan (usually the page + * currently in so->currPos.buf). * * If inskey.nextkey = false and inskey.backward = false, offnum is * positioned at the first non-pivot tuple >= inskey.scankeys. @@@ -1423,11 -1503,33 +1512,41 @@@ * for the page. For example, when inskey is both < the leaf page's high * key and > all of its non-pivot tuples, offnum will be "maxoff + 1". */ ++<<<<<<< ours + if (!_bt_readfirstpage(scan, offnum, dir)) + return false; + + _bt_returnitem(scan, so); + return true; ++======= + if (!_bt_load_first_page(scan, &so->state, dir, offnum)) + return false; + + readcomplete: + /* OK, currPos->itemIndex says what to return */ + return _bt_return_current_item(scan, &so->state); + } + + /* + * Advance to next tuple on current page; or if there's no more, + * try to step to the next page with data. + */ + static bool + _bt_next_item(IndexScanDesc scan, BTScanState state, ScanDirection dir) + { + if (ScanDirectionIsForward(dir)) + { + if (++state->currPos.itemIndex <= state->currPos.lastItem) + return true; + } + else + { + if (--state->currPos.itemIndex >= state->currPos.firstItem) + return true; + } + + return _bt_steppage(scan, state, dir); ++>>>>>>> theirs } /* @@@ -1448,32 -1550,12 +1567,22 @@@ boo _bt_next(IndexScanDesc scan, ScanDirection dir) { BTScanOpaque so = (BTScanOpaque) scan->opaque; ++<<<<<<< ours + + Assert(BTScanPosIsValid(so->currPos)); ++======= ++>>>>>>> theirs - /* - * Advance to next tuple on current page; or if there's no more, try to - * step to the next page with data. - */ - if (ScanDirectionIsForward(dir)) - { - if (++so->currPos.itemIndex > so->currPos.lastItem) - { - if (!_bt_steppage(scan, dir)) - return false; - } - } - else - { - if (--so->currPos.itemIndex < so->currPos.firstItem) - { - if (!_bt_steppage(scan, dir)) - return false; - } - } + if (!_bt_next_item(scan, &so->state, dir)) + return false; ++<<<<<<< ours + _bt_returnitem(scan, so); + return true; ++======= + /* OK, itemIndex says what to return */ + return _bt_return_current_item(scan, &so->state); ++>>>>>>> theirs } /* @@@ -1497,11 -1587,11 +1606,12 @@@ * Returns true if any matching items found on the page, false if none. */ static bool - _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum, + _bt_readpage(IndexScanDesc scan, BTScanState state, ScanDirection dir, OffsetNumber offnum, bool firstPage) { + Relation rel = scan->indexRelation; BTScanOpaque so = (BTScanOpaque) scan->opaque; + BTScanPos pos = &state->currPos; Page page; BTPageOpaque opaque; OffsetNumber minoff; @@@ -1511,40 -1601,27 +1621,56 @@@ int itemIndex, indnatts; ++<<<<<<< ours + /* save the page/buffer block number, along with its sibling links */ + page = BufferGetPage(so->currPos.buf); ++======= + /* + * We must have the buffer pinned and locked, but the usual macro can't be + * used here; this function is what makes it good for currPos. + */ + Assert(BufferIsValid(pos->buf)); + + page = BufferGetPage(pos->buf); ++>>>>>>> theirs opaque = BTPageGetOpaque(page); + so->currPos.currPage = BufferGetBlockNumber(so->currPos.buf); + so->currPos.prevPage = opaque->btpo_prev; + so->currPos.nextPage = opaque->btpo_next; + + Assert(!P_IGNORE(opaque)); + Assert(BTScanPosIsPinned(so->currPos)); + Assert(!so->needPrimScan); - /* allow next page be processed by parallel worker */ if (scan->parallel_scan) { + /* allow next/prev page to be read by other worker without delay */ if (ScanDirectionIsForward(dir)) - pstate.prev_scan_page = opaque->btpo_next; + _bt_parallel_release(scan, so->currPos.nextPage, + so->currPos.currPage); else ++<<<<<<< ours + _bt_parallel_release(scan, so->currPos.prevPage, + so->currPos.currPage); ++======= + pstate.prev_scan_page = BufferGetBlockNumber(pos->buf); + + _bt_parallel_release(scan, pstate.prev_scan_page); ++>>>>>>> theirs } - indnatts = IndexRelationGetNumberOfAttributes(scan->indexRelation); + /* initialize remaining currPos fields related to current page */ + so->currPos.lsn = BufferGetLSNAtomic(so->currPos.buf); + so->currPos.dir = dir; + so->currPos.nextTupleOffset = 0; + /* either moreLeft or moreRight should be set now (may be unset later) */ + Assert(ScanDirectionIsForward(dir) ? so->currPos.moreRight : + so->currPos.moreLeft); + + PredicateLockPage(rel, so->currPos.currPage, scan->xs_snapshot); + + /* initialize local variables */ + indnatts = IndexRelationGetNumberOfAttributes(rel); arrayKeys = so->numArrayKeys != 0; minoff = P_FIRSTDATAKEY(opaque); maxoff = PageGetMaxOffsetNumber(page); @@@ -1563,6 -1641,35 +1689,38 @@@ pstate.targetdistance = 0; /* ++<<<<<<< ours ++======= + * We note the buffer's block number so that we can release the pin later. + * This allows us to re-read the buffer if it is needed again for hinting. + */ + pos->currPage = BufferGetBlockNumber(pos->buf); + + /* + * We save the LSN of the page as we read it, so that we know whether it + * safe to apply LP_DEAD hints to the page later. This allows us to drop + * the pin for MVCC scans, which allows vacuum to avoid blocking. + */ + pos->lsn = BufferGetLSNAtomic(pos->buf); + + /* + * we must save the page's right-link while scanning it; this tells us + * where to step right to after we're done with these items. There is no + * corresponding need for the left-link, since splits always go right. + */ + pos->nextPage = opaque->btpo_next; + + /* initialize tuple workspace to empty */ + pos->nextTupleOffset = 0; + + /* + * Now that the current page has been made consistent, the macro should be + * good. + */ + Assert(BTScanPosIsPinned(*pos)); + + /* ++>>>>>>> theirs * Prechecking the value of the continuescan flag for the last item on the * page (for backwards scan it will be the first item on a page). If we * observe it to be true, then it should be true for all other items. This @@@ -1868,27 -1951,23 +2026,33 @@@ } } } + /* When !continuescan, there can't be any more matches, so stop */ if (!pstate.continuescan) ++<<<<<<< ours ++======= + { + /* there can't be any more matches, so stop */ + pos->moreLeft = false; ++>>>>>>> theirs break; - } offnum = OffsetNumberPrev(offnum); } + /* + * We don't need to visit page to the left when no more matches will + * be found there + */ + if (!pstate.continuescan) + so->currPos.moreLeft = false; + Assert(itemIndex >= 0); - so->currPos.firstItem = itemIndex; - so->currPos.lastItem = MaxTIDsPerBTreePage - 1; - so->currPos.itemIndex = MaxTIDsPerBTreePage - 1; + pos->firstItem = itemIndex; + pos->lastItem = MaxTIDsPerBTreePage - 1; + pos->itemIndex = MaxTIDsPerBTreePage - 1; } - return (so->currPos.firstItem <= so->currPos.lastItem); + return (pos->firstItem <= pos->lastItem); } /* Save an index item into so->currPos.items[itemIndex] */ @@@ -2000,25 -2060,28 +2165,32 @@@ _bt_returnitem(IndexScanDesc scan, BTSc /* * _bt_steppage() -- Step to next page containing valid data for scan * - * On entry, if so->currPos.buf is valid the buffer is pinned but not locked; - * if pinned, we'll drop the pin before moving to next page. The buffer is - * not locked on entry. + * Wrapper on _bt_readnextpage that performs final steps for the current page. * - * For success on a scan using a non-MVCC snapshot we hold a pin, but not a - * read lock, on that page. If we do not hold the pin, we set so->currPos.buf - * to InvalidBuffer. We return true to indicate success. + * On entry, if so->currPos.buf is valid the buffer is pinned but not locked. + * If there's no pin held, it's because _bt_drop_lock_and_maybe_pin dropped + * the pin eagerly earlier on. The scan must have so->currPos.currPage set to + * a valid block, in any case. */ static bool - _bt_steppage(IndexScanDesc scan, ScanDirection dir) + _bt_steppage(IndexScanDesc scan, BTScanState state, ScanDirection dir) { BTScanOpaque so = (BTScanOpaque) scan->opaque; ++<<<<<<< ours + BlockNumber blkno, + lastcurrblkno; ++======= + BTScanPos currPos = &state->currPos; + Relation rel = scan->indexRelation; + BlockNumber blkno = InvalidBlockNumber; + bool status; ++>>>>>>> theirs - Assert(BTScanPosIsValid(so->currPos)); + Assert(BTScanPosIsValid(*currPos)); /* Before leaving current page, deal with any killed items */ - if (so->numKilled > 0) - _bt_killitems(scan); + if (state->numKilled > 0) + _bt_killitems(state, rel); /* * Before we modify currPos, make a copy of the page data if there was a @@@ -2050,262 -2113,276 +2222,512 @@@ * markPos state. But depending on the current array state like this * would add complexity. Instead, we just unset markPos's copy of * moreRight or moreLeft (whichever might be affected), while making - * btrestpos reset the scan's arrays to their initial scan positions. - * In effect, btrestpos leaves advancing the arrays up to the first + * btrestrpos reset the scan's arrays to their initial scan positions. + * In effect, btrestrpos leaves advancing the arrays up to the first * _bt_readpage call (that takes place after it has restored markPos). */ ++<<<<<<< ours + if (so->needPrimScan) + { + if (ScanDirectionIsForward(so->currPos.dir)) + so->markPos.moreRight = true; ++======= + Assert(state->markPos.dir == dir); + if (so->needPrimScan) + { + if (ScanDirectionIsForward(dir)) + state->markPos.moreRight = true; ++>>>>>>> theirs else - so->markPos.moreLeft = true; + state->markPos.moreLeft = true; } + + /* mark/restore not supported by parallel scans */ + Assert(!scan->parallel_scan); } + BTScanPosUnpinIfPinned(so->currPos); + + /* Walk to the next page with data */ if (ScanDirectionIsForward(dir)) + blkno = so->currPos.nextPage; + else + blkno = so->currPos.prevPage; + lastcurrblkno = so->currPos.currPage; + + /* + * Cancel primitive index scans that were scheduled when the call to + * _bt_readpage for currPos happened to use the opposite direction to the + * one that we're stepping in now. (It's okay to leave the scan's array + * keys as-is, since the next _bt_readpage will advance them.) + */ + if (so->currPos.dir != dir) + so->needPrimScan = false; + + return _bt_readnextpage(scan, blkno, lastcurrblkno, dir, false); +} + +/* + * _bt_readfirstpage() -- Read first page containing valid data for _bt_first + * + * _bt_first caller passes us an offnum returned by _bt_binsrch, which might + * be an out of bounds offnum such as "maxoff + 1" in certain corner cases. + * _bt_checkkeys will stop the scan as soon as an equality qual fails (when + * its scan key was marked required), so _bt_first _must_ pass us an offnum + * exactly at the beginning of where equal tuples are to be found. When we're + * passed an offnum past the end of the page, we might still manage to stop + * the scan on this page by calling _bt_checkkeys against the high key. See + * _bt_readpage for full details. + * + * On entry, so->currPos must be pinned and locked (so offnum stays valid). + * Parallel scan callers must have seized the scan before calling here. + * + * On exit, we'll have updated so->currPos and retained locks and pins + * according to the same rules as those laid out for _bt_readnextpage exit. + * Like _bt_readnextpage, our return value indicates if there are any matching + * records in the given direction. + * + * We always release the scan for a parallel scan caller, regardless of + * success or failure; we'll call _bt_parallel_release as soon as possible. + */ +static bool +_bt_readfirstpage(IndexScanDesc scan, OffsetNumber offnum, ScanDirection dir) +{ + BTScanOpaque so = (BTScanOpaque) scan->opaque; + + so->numKilled = 0; /* just paranoia */ + so->markItemIndex = -1; /* ditto */ + + /* Initialize so->currPos for the first page (page in so->currPos.buf) */ + if (so->needPrimScan) { ++<<<<<<< ours + Assert(so->numArrayKeys); + + so->currPos.moreLeft = true; + so->currPos.moreRight = true; + so->needPrimScan = false; ++======= + /* Walk right to the next page with data */ + if (scan->parallel_scan != NULL) + { + /* + * Seize the scan to get the next block number; if the scan has + * ended already, bail out. + */ + status = _bt_parallel_seize(scan, &blkno, false); + if (!status) + { + /* release the previous buffer, if pinned */ + BTScanPosUnpinIfPinned(*currPos); + BTScanPosInvalidate(*currPos); + return false; + } + } + else + { + /* Not parallel, so use the previously-saved nextPage link. */ + blkno = currPos->nextPage; + } + + /* Remember we left a page with data */ + currPos->moreLeft = true; + + /* release the previous buffer, if pinned */ + BTScanPosUnpinIfPinned(*currPos); ++>>>>>>> theirs + } + else if (ScanDirectionIsForward(dir)) + { ++<<<<<<< ours + so->currPos.moreLeft = false; + so->currPos.moreRight = true; } else { + so->currPos.moreLeft = true; + so->currPos.moreRight = false; + } + + /* + * Attempt to load matching tuples from the first page. + * + * Note that _bt_readpage will finish initializing the so->currPos fields. + * _bt_readpage also releases parallel scan (even when it returns false). + */ + if (_bt_readpage(scan, dir, offnum, true)) + { + /* + * _bt_readpage succeeded. Drop the lock (and maybe the pin) on + * so->currPos.buf in preparation for btgettuple returning tuples. + */ + Assert(BTScanPosIsPinned(so->currPos)); + _bt_drop_lock_and_maybe_pin(scan, &so->currPos); + return true; + } + + /* There's no actually-matching data on the page in so->currPos.buf */ + _bt_unlockbuf(scan->indexRelation, so->currPos.buf); + + /* Call _bt_readnextpage using its _bt_steppage wrapper function */ + if (!_bt_steppage(scan, dir)) + return false; ++======= + /* Remember we left a page with data */ + currPos->moreRight = true; + + if (scan->parallel_scan != NULL) + { + /* + * Seize the scan to get the current block number; if the scan has + * ended already, bail out. + */ + status = _bt_parallel_seize(scan, &blkno, false); + BTScanPosUnpinIfPinned(*currPos); + if (!status) + { + BTScanPosInvalidate(*currPos); + return false; + } + } + else + { + /* Not parallel, so just use our own notion of the current page */ + blkno = currPos->currPage; + } + } + + if (!_bt_readnextpage(scan, state, blkno, dir)) + return false; + + /* We have at least one item to return as scan's next item */ + _bt_drop_lock_and_maybe_pin(scan, currPos); ++>>>>>>> theirs + /* _bt_readpage for a later page (now in so->currPos) succeeded */ return true; } /* - * _bt_readnextpage() -- Read next page containing valid data for scan + * _bt_readnextpage() -- Read next page containing valid data for _bt_next + * + * Caller's blkno is the next interesting page's link, taken from either the + * previously-saved right link or left link. lastcurrblkno is the page that + * was current at the point where the blkno link was saved, which we use to + * reason about concurrent page splits/page deletions during backwards scans. + * + * On entry, caller shouldn't hold any locks or pins on any page (we work + * directly off of blkno and lastcurrblkno instead). Parallel scan callers + * that seized the scan before calling here should pass seized=true; such a + * caller's blkno and lastcurrblkno arguments come from the seized scan. + * seized=false callers just pass us the blkno/lastcurrblkno taken from their + * so->currPos, which (along with so->currPos itself) can be used to end the + * scan. A seized=false caller's blkno can never be assumed to be the page + * that must be read next during a parallel scan, though. We must figure that + * part out for ourselves by seizing the scan (the correct page to read might + * already be beyond the seized=false caller's blkno during a parallel scan). * * On success exit, so->currPos is updated to contain data from the next - * interesting page, and we return true. Caller must release the lock (and - * maybe the pin) on the buffer on success exit. + * interesting page, and we return true. We hold a pin on the buffer on + * success exit, except when _bt_drop_lock_and_maybe_pin decided it was safe + * to eagerly drop the pin (to avoid blocking VACUUM). * * If there are no more matching records in the given direction, we drop all - * locks and pins, set so->currPos.buf to InvalidBuffer, and return false. + * locks and pins, invalidate so->currPos, and return false. + * + * We always release the scan for a parallel scan caller, regardless of + * success or failure; we'll call _bt_parallel_release as soon as possible. */ static bool ++<<<<<<< ours +_bt_readnextpage(IndexScanDesc scan, BlockNumber blkno, + BlockNumber lastcurrblkno, ScanDirection dir, bool seized) +{ + Relation rel = scan->indexRelation; + BTScanOpaque so = (BTScanOpaque) scan->opaque; ++======= + _bt_readnextpage(IndexScanDesc scan, BTScanState state, BlockNumber blkno, + ScanDirection dir) + { + BTScanPos currPos = &state->currPos; + Relation rel; + Page page; + BTPageOpaque opaque; + bool status; ++>>>>>>> theirs - rel = scan->indexRelation; + Assert(so->currPos.currPage == lastcurrblkno || seized); + Assert(!BTScanPosIsPinned(so->currPos)); + /* + * Remember that the scan already read lastcurrblkno, a page to the left + * of blkno (or remember reading a page to the right, for backwards scans) + */ if (ScanDirectionIsForward(dir)) + so->currPos.moreLeft = true; + else + so->currPos.moreRight = true; + + for (;;) { ++<<<<<<< ours + Page page; + BTPageOpaque opaque; + + if (blkno == P_NONE || + (ScanDirectionIsForward(dir) ? + !so->currPos.moreRight : !so->currPos.moreLeft)) + { + /* most recent _bt_readpage call (for lastcurrblkno) ended scan */ + Assert(so->currPos.currPage == lastcurrblkno && !seized); + BTScanPosInvalidate(so->currPos); + _bt_parallel_done(scan); /* iff !so->needPrimScan */ + return false; + } + + Assert(!so->needPrimScan); + + /* parallel scan must never actually visit so->currPos blkno */ + if (!seized && scan->parallel_scan != NULL && + !_bt_parallel_seize(scan, &blkno, &lastcurrblkno, false)) + { + /* whole scan is now done (or another primitive scan required) */ + BTScanPosInvalidate(so->currPos); + return false; + } + + if (ScanDirectionIsForward(dir)) + { + /* read blkno, but check for interrupts first */ + CHECK_FOR_INTERRUPTS(); + so->currPos.buf = _bt_getbuf(rel, blkno, BT_READ); + } + else + { + /* read blkno, avoiding race (also checks for interrupts) */ + so->currPos.buf = _bt_lock_and_validate_left(rel, &blkno, + lastcurrblkno); + if (so->currPos.buf == InvalidBuffer) + { + /* must have been a concurrent deletion of leftmost page */ + BTScanPosInvalidate(so->currPos); + _bt_parallel_done(scan); ++======= + for (;;) + { + /* + * if we're at end of scan, give up and mark parallel scan as + * done, so that all the workers can finish their scan + */ + if (blkno == P_NONE || !currPos->moreRight) + { + _bt_parallel_done(scan); + BTScanPosInvalidate(*currPos); + return false; + } + /* check for interrupts while we're not holding any buffer lock */ + CHECK_FOR_INTERRUPTS(); + /* step right one page */ + currPos->buf = _bt_getbuf(rel, blkno, BT_READ); + page = BufferGetPage(currPos->buf); + opaque = BTPageGetOpaque(page); + /* check for deleted page */ + if (!P_IGNORE(opaque)) + { + PredicateLockPage(rel, blkno, scan->xs_snapshot); + /* see if there are any matches on this page */ + /* note that this will clear moreRight if we can stop */ + if (_bt_readpage(scan, state, dir, P_FIRSTDATAKEY(opaque), false)) + break; + } + else if (scan->parallel_scan != NULL) + { + /* allow next page be processed by parallel worker */ + _bt_parallel_release(scan, opaque->btpo_next); + } + + /* nope, keep going */ + if (scan->parallel_scan != NULL) + { + _bt_relbuf(rel, currPos->buf); + status = _bt_parallel_seize(scan, &blkno, false); + if (!status) + { + BTScanPosInvalidate(*currPos); + return false; + } + } + else + { + blkno = opaque->btpo_next; + _bt_relbuf(rel, currPos->buf); + } + } + } + else + { + /* + * Should only happen in parallel cases, when some other backend + * advanced the scan. + */ + if (currPos->currPage != blkno) + { + BTScanPosUnpinIfPinned(*currPos); + currPos->currPage = blkno; + } + + /* + * Walk left to the next page with data. This is much more complex + * than the walk-right case because of the possibility that the page + * to our left splits while we are in flight to it, plus the + * possibility that the page we were on gets deleted after we leave + * it. See nbtree/README for details. + * + * It might be possible to rearrange this code to have less overhead + * in pinning and locking, but that would require capturing the left + * sibling block number when the page is initially read, and then + * optimistically starting there (rather than pinning the page twice). + * It is not clear that this would be worth the complexity. + */ + if (BTScanPosIsPinned(*currPos)) + _bt_lockbuf(rel, currPos->buf, BT_READ); + else + currPos->buf = _bt_getbuf(rel, currPos->currPage, BT_READ); + + for (;;) + { + /* Done if we know there are no matching keys to the left */ + if (!currPos->moreLeft) + { + _bt_relbuf(rel, currPos->buf); + _bt_parallel_done(scan); + BTScanPosInvalidate(*currPos); ++>>>>>>> theirs return false; } + } ++<<<<<<< ours + page = BufferGetPage(so->currPos.buf); + opaque = BTPageGetOpaque(page); + lastcurrblkno = blkno; + if (likely(!P_IGNORE(opaque))) + { + /* see if there are any matches on this page */ + if (ScanDirectionIsForward(dir)) + { + /* note that this will clear moreRight if we can stop */ + if (_bt_readpage(scan, dir, P_FIRSTDATAKEY(opaque), false)) + break; + blkno = so->currPos.nextPage; + } + else + { ++======= + /* Step to next physical page */ + currPos->buf = _bt_walk_left(rel, currPos->buf); + + /* if we're physically at end of index, return failure */ + if (currPos->buf == InvalidBuffer) + { + _bt_parallel_done(scan); + BTScanPosInvalidate(*currPos); + return false; + } + + /* + * Okay, we managed to move left to a non-deleted page. Done if + * it's not half-dead and contains matching tuples. Else loop back + * and do it all again. + */ + page = BufferGetPage(currPos->buf); + opaque = BTPageGetOpaque(page); + if (!P_IGNORE(opaque)) + { + PredicateLockPage(rel, BufferGetBlockNumber(currPos->buf), scan->xs_snapshot); + /* see if there are any matches on this page */ ++>>>>>>> theirs /* note that this will clear moreLeft if we can stop */ - if (_bt_readpage(scan, dir, PageGetMaxOffsetNumber(page), false)) + if (_bt_readpage(scan, state, dir, PageGetMaxOffsetNumber(page), false)) break; + blkno = so->currPos.prevPage; } ++<<<<<<< ours + } + else + { + /* _bt_readpage not called, so do all this for ourselves */ + if (ScanDirectionIsForward(dir)) + blkno = opaque->btpo_next; + else + blkno = opaque->btpo_prev; + if (scan->parallel_scan != NULL) + _bt_parallel_release(scan, blkno, lastcurrblkno); ++======= + else if (scan->parallel_scan != NULL) + { + /* allow next page be processed by parallel worker */ + _bt_parallel_release(scan, BufferGetBlockNumber(currPos->buf)); + } + + /* + * For parallel scans, get the last page scanned as it is quite + * possible that by the time we try to seize the scan, some other + * worker has already advanced the scan to a different page. We + * must continue based on the latest page scanned by any worker. + */ + if (scan->parallel_scan != NULL) + { + _bt_relbuf(rel, currPos->buf); + status = _bt_parallel_seize(scan, &blkno, false); + if (!status) + { + BTScanPosInvalidate(*currPos); + return false; + } + currPos->buf = _bt_getbuf(rel, blkno, BT_READ); + } ++>>>>>>> theirs } + ++<<<<<<< ours + /* no matching tuples on this page */ + _bt_relbuf(rel, so->currPos.buf); + seized = false; /* released by _bt_readpage (or by us) */ } + /* + * _bt_readpage succeeded. Drop the lock (and maybe the pin) on + * so->currPos.buf in preparation for btgettuple returning tuples. + */ + Assert(so->currPos.currPage == blkno); + Assert(BTScanPosIsPinned(so->currPos)); + _bt_drop_lock_and_maybe_pin(scan, &so->currPos); ++======= + return true; + } + + /* + * _bt_parallel_readpage() -- Read current page containing valid data for scan + * + * On success, release lock and maybe pin on buffer. We return true to + * indicate success. + */ + static bool + _bt_parallel_readpage(IndexScanDesc scan, BlockNumber blkno, ScanDirection dir) + { + BTScanOpaque so = (BTScanOpaque) scan->opaque; + + Assert(!so->needPrimScan); + + _bt_initialize_more_data(scan, &so->state, dir); + + if (!_bt_readnextpage(scan, &so->state, blkno, dir)) + return false; + + /* We have at least one item to return as scan's next item */ + _bt_drop_lock_and_maybe_pin(scan, &so->state.currPos); ++>>>>>>> theirs return true; } @@@ -2535,12 -2600,11 +2957,20 @@@ _bt_endpoint(IndexScanDesc scan, ScanDi { Relation rel = scan->indexRelation; BTScanOpaque so = (BTScanOpaque) scan->opaque; ++<<<<<<< ours + Page page; + BTPageOpaque opaque; + OffsetNumber start; + + Assert(!BTScanPosIsValid(so->currPos)); + Assert(!so->needPrimScan); ++======= + BTScanPos currPos = &so->state.currPos; + Buffer buf; + Page page; + BTPageOpaque opaque; + OffsetNumber start; ++>>>>>>> theirs /* * Scan down to the leftmost or rightmost leaf page. This is a simplified @@@ -2555,7 -2620,7 +2985,11 @@@ * exists. */ PredicateLockRelation(rel, scan->xs_snapshot); ++<<<<<<< ours + _bt_parallel_done(scan); ++======= + BTScanPosInvalidate(*currPos); ++>>>>>>> theirs return false; } @@@ -2582,12 -2648,46 +3016,58 @@@ start = 0; /* keep compiler quiet */ } ++<<<<<<< ours + /* + * Now load data from the first page of the scan. + */ + if (!_bt_readfirstpage(scan, start, dir)) + return false; + + _bt_returnitem(scan, so); + return true; +} ++======= + /* remember which buffer we have pinned */ + currPos->buf = buf; + + _bt_initialize_more_data(scan, &so->state, dir); + + if (!_bt_load_first_page(scan, &so->state, dir, start)) + return false; + + /* OK, currPos->itemIndex says what to return */ + return _bt_return_current_item(scan, &so->state); + } + + /* + * _bt_initialize_more_data() -- initialize moreLeft, moreRight and scan dir + * from currPos + */ + static inline void + _bt_initialize_more_data(IndexScanDesc scan, BTScanState state, ScanDirection dir) + { + BTScanOpaque so = (BTScanOpaque) scan->opaque; + + state->currPos.dir = dir; + if (so->needPrimScan) + { + Assert(so->numArrayKeys); + + state->currPos.moreLeft = true; + state->currPos.moreRight = true; + so->needPrimScan = false; + } + else if (ScanDirectionIsForward(dir)) + { + state->currPos.moreLeft = false; + state->currPos.moreRight = true; + } + else + { + state->currPos.moreLeft = true; + state->currPos.moreRight = false; + } + state->numKilled = 0; /* just paranoia */ + state->markItemIndex = -1; /* ditto */ + } ++>>>>>>> theirs diff --cc src/include/access/nbtree.h index 6a501537e1,c60cecf722..0000000000 --- a/src/include/access/nbtree.h +++ b/src/include/access/nbtree.h @@@ -1033,22 -1038,8 +1034,25 @@@ typedef struct BTArrayKeyInf Datum *elem_values; /* array of num_elems Datums */ } BTArrayKeyInfo; - typedef struct BTScanOpaqueData + typedef struct BTScanStateData { ++<<<<<<< ours + /* these fields are set by _bt_preprocess_keys(): */ + bool qual_ok; /* false if qual can never be satisfied */ + int numberOfKeys; /* number of preprocessed scan keys */ + ScanKey keyData; /* array of preprocessed scan keys */ + + /* workspace for SK_SEARCHARRAY support */ + int numArrayKeys; /* number of equality-type array keys */ + bool needPrimScan; /* New prim scan to continue in current dir? */ + bool scanBehind; /* Last array advancement matched -inf attr? */ + bool oppositeDirCheck; /* explicit scanBehind recheck needed? */ + BTArrayKeyInfo *arrayKeys; /* info about each equality-type array key */ + FmgrInfo *orderProcs; /* ORDER procs for required equality keys */ + MemoryContext arrayContext; /* scan-lifespan context for array data */ + ++======= ++>>>>>>> theirs /* info about killed items if any (killedItems is NULL if never used) */ int *killedItems; /* currPos.items indexes of killed items */ int numKilled; /* number of currently stored items */ @@@ -1287,17 -1296,11 +1312,21 @@@ extern Buffer _bt_get_endpoint(Relatio extern BTScanInsert _bt_mkscankey(Relation rel, IndexTuple itup); extern void _bt_freestack(BTStack stack); extern bool _bt_start_prim_scan(IndexScanDesc scan, ScanDirection dir); +extern int _bt_binsrch_array_skey(FmgrInfo *orderproc, + bool cur_elem_trig, ScanDirection dir, + Datum tupdatum, bool tupnull, + BTArrayKeyInfo *array, ScanKey cur, + int32 *set_elem_result); extern void _bt_start_array_keys(IndexScanDesc scan, ScanDirection dir); -extern void _bt_preprocess_keys(IndexScanDesc scan); extern bool _bt_checkkeys(IndexScanDesc scan, BTReadPageState *pstate, bool arrayKeys, IndexTuple tuple, int tupnatts); ++<<<<<<< ours +extern bool _bt_oppodir_checkkeys(IndexScanDesc scan, ScanDirection dir, + IndexTuple finaltup); +extern void _bt_killitems(IndexScanDesc scan); ++======= + extern void _bt_killitems(BTScanState state, Relation indexRelation); ++>>>>>>> theirs extern BTCycleId _bt_vacuum_cycleid(Relation rel); extern BTCycleId _bt_start_vacuum(Relation rel); extern void _bt_end_vacuum(Relation rel);