LCOV - Code coverage - src/backend/access/nbtree/nbtinsert.c

LCOV - code coverage report

Current view:	top level - src/backend/access/nbtree - nbtinsert.c (source / functions)		Coverage	Total	Hit
Test:	Code coverage	Lines:	88.4 %	1067	943
Test Date:	2026-01-26 10:56:24	Functions:	93.8 %	16	15
Legend:	Lines: hit not hit Branches: + taken - not taken # not executed	Branches:	62.2 %	722	449

             Branch data     Line data    Source code

       1                 :             : /*-------------------------------------------------------------------------
       2                 :             :  *
       3                 :             :  * nbtinsert.c
       4                 :             :  *        Item insertion in Lehman and Yao btrees for Postgres.
       5                 :             :  *
       6                 :             :  * Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group
       7                 :             :  * Portions Copyright (c) 1994, Regents of the University of California
       8                 :             :  *
       9                 :             :  *
      10                 :             :  * IDENTIFICATION
      11                 :             :  *        src/backend/access/nbtree/nbtinsert.c
      12                 :             :  *
      13                 :             :  *-------------------------------------------------------------------------
      14                 :             :  */
      15                 :             : 
      16                 :             : #include "postgres.h"
      17                 :             : 
      18                 :             : #include "access/nbtree.h"
      19                 :             : #include "access/nbtxlog.h"
      20                 :             : #include "access/tableam.h"
      21                 :             : #include "access/transam.h"
      22                 :             : #include "access/xloginsert.h"
      23                 :             : #include "common/int.h"
      24                 :             : #include "common/pg_prng.h"
      25                 :             : #include "lib/qunique.h"
      26                 :             : #include "miscadmin.h"
      27                 :             : #include "storage/lmgr.h"
      28                 :             : #include "storage/predicate.h"
      29                 :             : #include "utils/injection_point.h"
      30                 :             : 
      31                 :             : /* Minimum tree height for application of fastpath optimization */
      32                 :             : #define BTREE_FASTPATH_MIN_LEVEL        2
      33                 :             : 
      34                 :             : 
      35                 :             : static BTStack _bt_search_insert(Relation rel, Relation heaprel,
      36                 :             :                                                                  BTInsertState insertstate);
      37                 :             : static TransactionId _bt_check_unique(Relation rel, BTInsertState insertstate,
      38                 :             :                                                                           Relation heapRel,
      39                 :             :                                                                           IndexUniqueCheck checkUnique, bool *is_unique,
      40                 :             :                                                                           uint32 *speculativeToken);
      41                 :             : static OffsetNumber _bt_findinsertloc(Relation rel,
      42                 :             :                                                                           BTInsertState insertstate,
      43                 :             :                                                                           bool checkingunique,
      44                 :             :                                                                           bool indexUnchanged,
      45                 :             :                                                                           BTStack stack,
      46                 :             :                                                                           Relation heapRel);
      47                 :             : static void _bt_stepright(Relation rel, Relation heaprel,
      48                 :             :                                                   BTInsertState insertstate, BTStack stack);
      49                 :             : static void _bt_insertonpg(Relation rel, Relation heaprel, BTScanInsert itup_key,
      50                 :             :                                                    Buffer buf,
      51                 :             :                                                    Buffer cbuf,
      52                 :             :                                                    BTStack stack,
      53                 :             :                                                    IndexTuple itup,
      54                 :             :                                                    Size itemsz,
      55                 :             :                                                    OffsetNumber newitemoff,
      56                 :             :                                                    int postingoff,
      57                 :             :                                                    bool split_only_page);
      58                 :             : static Buffer _bt_split(Relation rel, Relation heaprel, BTScanInsert itup_key,
      59                 :             :                                                 Buffer buf, Buffer cbuf, OffsetNumber newitemoff,
      60                 :             :                                                 Size newitemsz, IndexTuple newitem, IndexTuple orignewitem,
      61                 :             :                                                 IndexTuple nposting, uint16 postingoff);
      62                 :             : static void _bt_insert_parent(Relation rel, Relation heaprel, Buffer buf,
      63                 :             :                                                           Buffer rbuf, BTStack stack, bool isroot, bool isonly);
      64                 :             : static Buffer _bt_newlevel(Relation rel, Relation heaprel, Buffer lbuf, Buffer rbuf);
      65                 :             : static inline bool _bt_pgaddtup(Page page, Size itemsize, const IndexTupleData *itup,
      66                 :             :                                                                 OffsetNumber itup_off, bool newfirstdataitem);
      67                 :             : static void _bt_delete_or_dedup_one_page(Relation rel, Relation heapRel,
      68                 :             :                                                                                  BTInsertState insertstate,
      69                 :             :                                                                                  bool simpleonly, bool checkingunique,
      70                 :             :                                                                                  bool uniquedup, bool indexUnchanged);
      71                 :             : static void _bt_simpledel_pass(Relation rel, Buffer buffer, Relation heapRel,
      72                 :             :                                                            OffsetNumber *deletable, int ndeletable,
      73                 :             :                                                            IndexTuple newitem, OffsetNumber minoff,
      74                 :             :                                                            OffsetNumber maxoff);
      75                 :             : static BlockNumber *_bt_deadblocks(Page page, OffsetNumber *deletable,
      76                 :             :                                                                    int ndeletable, IndexTuple newitem,
      77                 :             :                                                                    int *nblocks);
      78                 :             : static inline int _bt_blk_cmp(const void *arg1, const void *arg2);
      79                 :             : 
      80                 :             : /*
      81                 :             :  *      _bt_doinsert() -- Handle insertion of a single index tuple in the tree.
      82                 :             :  *
      83                 :             :  *              This routine is called by the public interface routine, btinsert.
      84                 :             :  *              By here, itup is filled in, including the TID.
      85                 :             :  *
      86                 :             :  *              If checkUnique is UNIQUE_CHECK_NO or UNIQUE_CHECK_PARTIAL, this
      87                 :             :  *              will allow duplicates.  Otherwise (UNIQUE_CHECK_YES or
      88                 :             :  *              UNIQUE_CHECK_EXISTING) it will throw error for a duplicate.
      89                 :             :  *              For UNIQUE_CHECK_EXISTING we merely run the duplicate check, and
      90                 :             :  *              don't actually insert.
      91                 :             :  *
      92                 :             :  *              indexUnchanged executor hint indicates if itup is from an
      93                 :             :  *              UPDATE that didn't logically change the indexed value, but
      94                 :             :  *              must nevertheless have a new entry to point to a successor
      95                 :             :  *              version.
      96                 :             :  *
      97                 :             :  *              The result value is only significant for UNIQUE_CHECK_PARTIAL:
      98                 :             :  *              it must be true if the entry is known unique, else false.
      99                 :             :  *              (In the current implementation we'll also return true after a
     100                 :             :  *              successful UNIQUE_CHECK_YES or UNIQUE_CHECK_EXISTING call, but
     101                 :             :  *              that's just a coding artifact.)
     102                 :             :  */
     103                 :             : bool
     104                 :      790536 : _bt_doinsert(Relation rel, IndexTuple itup,
     105                 :             :                          IndexUniqueCheck checkUnique, bool indexUnchanged,
     106                 :             :                          Relation heapRel)
     107                 :             : {
     108                 :      790536 :         bool            is_unique = false;
     109                 :      790536 :         BTInsertStateData insertstate;
     110                 :      790536 :         BTScanInsert itup_key;
     111                 :      790536 :         BTStack         stack;
     112                 :      790536 :         bool            checkingunique = (checkUnique != UNIQUE_CHECK_NO);
     113                 :             : 
     114                 :             :         /* we need an insertion scan key to do our search, so build one */
     115                 :      790536 :         itup_key = _bt_mkscankey(rel, itup);
     116                 :             : 
     117         [ +  + ]:      790536 :         if (checkingunique)
     118                 :             :         {
     119         [ +  + ]:      581791 :                 if (!itup_key->anynullkeys)
     120                 :             :                 {
     121                 :             :                         /* No (heapkeyspace) scantid until uniqueness established */
     122                 :      581762 :                         itup_key->scantid = NULL;
     123                 :      581762 :                 }
     124                 :             :                 else
     125                 :             :                 {
     126                 :             :                         /*
     127                 :             :                          * Scan key for new tuple contains NULL key values.  Bypass
     128                 :             :                          * checkingunique steps.  They are unnecessary because core code
     129                 :             :                          * considers NULL unequal to every value, including NULL.
     130                 :             :                          *
     131                 :             :                          * This optimization avoids O(N^2) behavior within the
     132                 :             :                          * _bt_findinsertloc() heapkeyspace path when a unique index has a
     133                 :             :                          * large number of "duplicates" with NULL key values.
     134                 :             :                          */
     135                 :          29 :                         checkingunique = false;
     136                 :             :                         /* Tuple is unique in the sense that core code cares about */
     137         [ +  - ]:          29 :                         Assert(checkUnique != UNIQUE_CHECK_EXISTING);
     138                 :          29 :                         is_unique = true;
     139                 :             :                 }
     140                 :      581791 :         }
     141                 :             : 
     142                 :             :         /*
     143                 :             :          * Fill in the BTInsertState working area, to track the current page and
     144                 :             :          * position within the page to insert on.
     145                 :             :          *
     146                 :             :          * Note that itemsz is passed down to lower level code that deals with
     147                 :             :          * inserting the item.  It must be MAXALIGN()'d.  This ensures that space
     148                 :             :          * accounting code consistently considers the alignment overhead that we
     149                 :             :          * expect PageAddItem() will add later.  (Actually, index_form_tuple() is
     150                 :             :          * already conservative about alignment, but we don't rely on that from
     151                 :             :          * this distance.  Besides, preserving the "true" tuple size in index
     152                 :             :          * tuple headers for the benefit of nbtsplitloc.c might happen someday.
     153                 :             :          * Note that heapam does not MAXALIGN() each heap tuple's lp_len field.)
     154                 :             :          */
     155                 :      790536 :         insertstate.itup = itup;
     156                 :      790536 :         insertstate.itemsz = MAXALIGN(IndexTupleSize(itup));
     157                 :      790536 :         insertstate.itup_key = itup_key;
     158                 :      790536 :         insertstate.bounds_valid = false;
     159                 :      790536 :         insertstate.buf = InvalidBuffer;
     160                 :      790536 :         insertstate.postingoff = 0;
     161                 :             : 
     162                 :             : search:
     163                 :             : 
     164                 :             :         /*
     165                 :             :          * Find and lock the leaf page that the tuple should be added to by
     166                 :             :          * searching from the root page.  insertstate.buf will hold a buffer that
     167                 :             :          * is locked in exclusive mode afterwards.
     168                 :             :          */
     169                 :      790536 :         stack = _bt_search_insert(rel, heapRel, &insertstate);
     170                 :             : 
     171                 :             :         /*
     172                 :             :          * checkingunique inserts are not allowed to go ahead when two tuples with
     173                 :             :          * equal key attribute values would be visible to new MVCC snapshots once
     174                 :             :          * the xact commits.  Check for conflicts in the locked page/buffer (if
     175                 :             :          * needed) here.
     176                 :             :          *
     177                 :             :          * It might be necessary to check a page to the right in _bt_check_unique,
     178                 :             :          * though that should be very rare.  In practice the first page the value
     179                 :             :          * could be on (with scantid omitted) is almost always also the only page
     180                 :             :          * that a matching tuple might be found on.  This is due to the behavior
     181                 :             :          * of _bt_findsplitloc with duplicate tuples -- a group of duplicates can
     182                 :             :          * only be allowed to cross a page boundary when there is no candidate
     183                 :             :          * leaf page split point that avoids it.  Also, _bt_check_unique can use
     184                 :             :          * the leaf page high key to determine that there will be no duplicates on
     185                 :             :          * the right sibling without actually visiting it (it uses the high key in
     186                 :             :          * cases where the new item happens to belong at the far right of the leaf
     187                 :             :          * page).
     188                 :             :          *
     189                 :             :          * NOTE: obviously, _bt_check_unique can only detect keys that are already
     190                 :             :          * in the index; so it cannot defend against concurrent insertions of the
     191                 :             :          * same key.  We protect against that by means of holding a write lock on
     192                 :             :          * the first page the value could be on, with omitted/-inf value for the
     193                 :             :          * implicit heap TID tiebreaker attribute.  Any other would-be inserter of
     194                 :             :          * the same key must acquire a write lock on the same page, so only one
     195                 :             :          * would-be inserter can be making the check at one time.  Furthermore,
     196                 :             :          * once we are past the check we hold write locks continuously until we
     197                 :             :          * have performed our insertion, so no later inserter can fail to see our
     198                 :             :          * insertion.  (This requires some care in _bt_findinsertloc.)
     199                 :             :          *
     200                 :             :          * If we must wait for another xact, we release the lock while waiting,
     201                 :             :          * and then must perform a new search.
     202                 :             :          *
     203                 :             :          * For a partial uniqueness check, we don't wait for the other xact. Just
     204                 :             :          * let the tuple in and return false for possibly non-unique, or true for
     205                 :             :          * definitely unique.
     206                 :             :          */
     207         [ +  + ]:      790536 :         if (checkingunique)
     208                 :             :         {
     209                 :      581705 :                 TransactionId xwait;
     210                 :      581705 :                 uint32          speculativeToken;
     211                 :             : 
     212                 :      581705 :                 xwait = _bt_check_unique(rel, &insertstate, heapRel, checkUnique,
     213                 :             :                                                                  &is_unique, &speculativeToken);
     214                 :             : 
     215         [ -  + ]:      581705 :                 if (unlikely(TransactionIdIsValid(xwait)))
     216                 :             :                 {
     217                 :             :                         /* Have to wait for the other guy ... */
     218                 :           0 :                         _bt_relbuf(rel, insertstate.buf);
     219                 :           0 :                         insertstate.buf = InvalidBuffer;
     220                 :             : 
     221                 :             :                         /*
     222                 :             :                          * If it's a speculative insertion, wait for it to finish (ie. to
     223                 :             :                          * go ahead with the insertion, or kill the tuple).  Otherwise
     224                 :             :                          * wait for the transaction to finish as usual.
     225                 :             :                          */
     226         [ #  # ]:           0 :                         if (speculativeToken)
     227                 :           0 :                                 SpeculativeInsertionWait(xwait, speculativeToken);
     228                 :             :                         else
     229                 :           0 :                                 XactLockTableWait(xwait, rel, &itup->t_tid, XLTW_InsertIndex);
     230                 :             : 
     231                 :             :                         /* start over... */
     232         [ #  # ]:           0 :                         if (stack)
     233                 :           0 :                                 _bt_freestack(stack);
     234                 :           0 :                         goto search;
     235                 :             :                 }
     236                 :             : 
     237                 :             :                 /* Uniqueness is established -- restore heap tid as scantid */
     238         [ -  + ]:      581705 :                 if (itup_key->heapkeyspace)
     239                 :      581705 :                         itup_key->scantid = &itup->t_tid;
     240      [ -  -  + ]:      581705 :         }
     241                 :             : 
     242         [ +  + ]:      790536 :         if (checkUnique != UNIQUE_CHECK_EXISTING)
     243                 :             :         {
     244                 :      790527 :                 OffsetNumber newitemoff;
     245                 :             : 
     246                 :             :                 /*
     247                 :             :                  * The only conflict predicate locking cares about for indexes is when
     248                 :             :                  * an index tuple insert conflicts with an existing lock.  We don't
     249                 :             :                  * know the actual page we're going to insert on for sure just yet in
     250                 :             :                  * checkingunique and !heapkeyspace cases, but it's okay to use the
     251                 :             :                  * first page the value could be on (with scantid omitted) instead.
     252                 :             :                  */
     253                 :      790527 :                 CheckForSerializableConflictIn(rel, NULL, BufferGetBlockNumber(insertstate.buf));
     254                 :             : 
     255                 :             :                 /*
     256                 :             :                  * Do the insertion.  Note that insertstate contains cached binary
     257                 :             :                  * search bounds established within _bt_check_unique when insertion is
     258                 :             :                  * checkingunique.
     259                 :             :                  */
     260                 :     1581054 :                 newitemoff = _bt_findinsertloc(rel, &insertstate, checkingunique,
     261                 :      790527 :                                                                            indexUnchanged, stack, heapRel);
     262                 :     1581054 :                 _bt_insertonpg(rel, heapRel, itup_key, insertstate.buf, InvalidBuffer,
     263                 :      790527 :                                            stack, itup, insertstate.itemsz, newitemoff,
     264                 :      790527 :                                            insertstate.postingoff, false);
     265                 :      790527 :         }
     266                 :             :         else
     267                 :             :         {
     268                 :             :                 /* just release the buffer */
     269                 :           9 :                 _bt_relbuf(rel, insertstate.buf);
     270                 :             :         }
     271                 :             : 
     272                 :             :         /* be tidy */
     273         [ +  + ]:      790536 :         if (stack)
     274                 :      726913 :                 _bt_freestack(stack);
     275                 :      790536 :         pfree(itup_key);
     276                 :             : 
     277                 :     1581072 :         return is_unique;
     278                 :      790536 : }
     279                 :             : 
     280                 :             : /*
     281                 :             :  *      _bt_search_insert() -- _bt_search() wrapper for inserts
     282                 :             :  *
     283                 :             :  * Search the tree for a particular scankey, or more precisely for the first
     284                 :             :  * leaf page it could be on.  Try to make use of the fastpath optimization's
     285                 :             :  * rightmost leaf page cache before actually searching the tree from the root
     286                 :             :  * page, though.
     287                 :             :  *
     288                 :             :  * Return value is a stack of parent-page pointers (though see notes about
     289                 :             :  * fastpath optimization and page splits below).  insertstate->buf is set to
     290                 :             :  * the address of the leaf-page buffer, which is write-locked and pinned in
     291                 :             :  * all cases (if necessary by creating a new empty root page for caller).
     292                 :             :  *
     293                 :             :  * The fastpath optimization avoids most of the work of searching the tree
     294                 :             :  * repeatedly when a single backend inserts successive new tuples on the
     295                 :             :  * rightmost leaf page of an index.  A backend cache of the rightmost leaf
     296                 :             :  * page is maintained within _bt_insertonpg(), and used here.  The cache is
     297                 :             :  * invalidated here when an insert of a non-pivot tuple must take place on a
     298                 :             :  * non-rightmost leaf page.
     299                 :             :  *
     300                 :             :  * The optimization helps with indexes on an auto-incremented field.  It also
     301                 :             :  * helps with indexes on datetime columns, as well as indexes with lots of
     302                 :             :  * NULL values.  (NULLs usually get inserted in the rightmost page for single
     303                 :             :  * column indexes, since they usually get treated as coming after everything
     304                 :             :  * else in the key space.  Individual NULL tuples will generally be placed on
     305                 :             :  * the rightmost leaf page due to the influence of the heap TID column.)
     306                 :             :  *
     307                 :             :  * Note that we avoid applying the optimization when there is insufficient
     308                 :             :  * space on the rightmost page to fit caller's new item.  This is necessary
     309                 :             :  * because we'll need to return a real descent stack when a page split is
     310                 :             :  * expected (actually, caller can cope with a leaf page split that uses a NULL
     311                 :             :  * stack, but that's very slow and so must be avoided).  Note also that the
     312                 :             :  * fastpath optimization acquires the lock on the page conditionally as a way
     313                 :             :  * of reducing extra contention when there are concurrent insertions into the
     314                 :             :  * rightmost page (we give up if we'd have to wait for the lock).  We assume
     315                 :             :  * that it isn't useful to apply the optimization when there is contention,
     316                 :             :  * since each per-backend cache won't stay valid for long.
     317                 :             :  */
     318                 :             : static BTStack
     319                 :      790593 : _bt_search_insert(Relation rel, Relation heaprel, BTInsertState insertstate)
     320                 :             : {
     321         [ +  - ]:      790593 :         Assert(insertstate->buf == InvalidBuffer);
     322         [ +  - ]:      790593 :         Assert(!insertstate->bounds_valid);
     323         [ +  - ]:      790593 :         Assert(insertstate->postingoff == 0);
     324                 :             : 
     325   [ +  -  +  + ]:      790593 :         if (RelationGetTargetBlock(rel) != InvalidBlockNumber)
     326                 :             :         {
     327                 :             :                 /* Simulate a _bt_getbuf() call with conditional locking */
     328         [ +  - ]:          38 :                 insertstate->buf = ReadBuffer(rel, RelationGetTargetBlock(rel));
     329         [ +  - ]:          38 :                 if (_bt_conditionallockbuf(rel, insertstate->buf))
     330                 :             :                 {
     331                 :          38 :                         Page            page;
     332                 :          38 :                         BTPageOpaque opaque;
     333                 :             : 
     334                 :          38 :                         _bt_checkpage(rel, insertstate->buf);
     335                 :          38 :                         page = BufferGetPage(insertstate->buf);
     336                 :          38 :                         opaque = BTPageGetOpaque(page);
     337                 :             : 
     338                 :             :                         /*
     339                 :             :                          * Check if the page is still the rightmost leaf page and has
     340                 :             :                          * enough free space to accommodate the new tuple.  Also check
     341                 :             :                          * that the insertion scan key is strictly greater than the first
     342                 :             :                          * non-pivot tuple on the page.  (Note that we expect itup_key's
     343                 :             :                          * scantid to be unset when our caller is a checkingunique
     344                 :             :                          * inserter.)
     345                 :             :                          */
     346         [ +  - ]:          38 :                         if (P_RIGHTMOST(opaque) &&
     347         [ +  - ]:          38 :                                 P_ISLEAF(opaque) &&
     348         [ +  - ]:          38 :                                 !P_IGNORE(opaque) &&
     349         [ +  + ]:          38 :                                 PageGetFreeSpace(page) > insertstate->itemsz &&
     350   [ +  -  -  + ]:          15 :                                 PageGetMaxOffsetNumber(page) >= P_HIKEY &&
     351                 :          15 :                                 _bt_compare(rel, insertstate->itup_key, page, P_HIKEY) > 0)
     352                 :             :                         {
     353                 :             :                                 /*
     354                 :             :                                  * Caller can use the fastpath optimization because cached
     355                 :             :                                  * block is still rightmost leaf page, which can fit caller's
     356                 :             :                                  * new tuple without splitting.  Keep block in local cache for
     357                 :             :                                  * next insert, and have caller use NULL stack.
     358                 :             :                                  *
     359                 :             :                                  * Note that _bt_insert_parent() has an assertion that catches
     360                 :             :                                  * leaf page splits that somehow follow from a fastpath insert
     361                 :             :                                  * (it should only be passed a NULL stack when it must deal
     362                 :             :                                  * with a concurrent root page split, and never because a NULL
     363                 :             :                                  * stack was returned here).
     364                 :             :                                  */
     365                 :          15 :                                 return NULL;
     366                 :             :                         }
     367                 :             : 
     368                 :             :                         /* Page unsuitable for caller, drop lock and pin */
     369                 :          23 :                         _bt_relbuf(rel, insertstate->buf);
     370      [ -  +  + ]:          38 :                 }
     371                 :             :                 else
     372                 :             :                 {
     373                 :             :                         /* Lock unavailable, drop pin */
     374                 :           0 :                         ReleaseBuffer(insertstate->buf);
     375                 :             :                 }
     376                 :             : 
     377                 :             :                 /* Forget block, since cache doesn't appear to be useful */
     378                 :          23 :                 RelationSetTargetBlock(rel, InvalidBlockNumber);
     379                 :          23 :         }
     380                 :             : 
     381                 :             :         /* Cannot use optimization -- descend tree, return proper descent stack */
     382                 :      790578 :         return _bt_search(rel, heaprel, insertstate->itup_key, &insertstate->buf,
     383                 :             :                                           BT_WRITE);
     384                 :      790593 : }
     385                 :             : 
     386                 :             : /*
     387                 :             :  *      _bt_check_unique() -- Check for violation of unique index constraint
     388                 :             :  *
     389                 :             :  * Returns InvalidTransactionId if there is no conflict, else an xact ID
     390                 :             :  * we must wait for to see if it commits a conflicting tuple.   If an actual
     391                 :             :  * conflict is detected, no return --- just ereport().  If an xact ID is
     392                 :             :  * returned, and the conflicting tuple still has a speculative insertion in
     393                 :             :  * progress, *speculativeToken is set to non-zero, and the caller can wait for
     394                 :             :  * the verdict on the insertion using SpeculativeInsertionWait().
     395                 :             :  *
     396                 :             :  * However, if checkUnique == UNIQUE_CHECK_PARTIAL, we always return
     397                 :             :  * InvalidTransactionId because we don't want to wait.  In this case we
     398                 :             :  * set *is_unique to false if there is a potential conflict, and the
     399                 :             :  * core code must redo the uniqueness check later.
     400                 :             :  *
     401                 :             :  * As a side-effect, sets state in insertstate that can later be used by
     402                 :             :  * _bt_findinsertloc() to reuse most of the binary search work we do
     403                 :             :  * here.
     404                 :             :  *
     405                 :             :  * This code treats NULLs as equal, unlike the default semantics for unique
     406                 :             :  * indexes.  So do not call here when there are NULL values in scan key and
     407                 :             :  * the index uses the default NULLS DISTINCT mode.
     408                 :             :  */
     409                 :             : static TransactionId
     410                 :      581762 : _bt_check_unique(Relation rel, BTInsertState insertstate, Relation heapRel,
     411                 :             :                                  IndexUniqueCheck checkUnique, bool *is_unique,
     412                 :             :                                  uint32 *speculativeToken)
     413                 :             : {
     414                 :      581762 :         IndexTuple      itup = insertstate->itup;
     415                 :      581762 :         IndexTuple      curitup = NULL;
     416                 :      581762 :         ItemId          curitemid = NULL;
     417                 :      581762 :         BTScanInsert itup_key = insertstate->itup_key;
     418                 :      581762 :         SnapshotData SnapshotDirty;
     419                 :      581762 :         OffsetNumber offset;
     420                 :      581762 :         OffsetNumber maxoff;
     421                 :      581762 :         Page            page;
     422                 :      581762 :         BTPageOpaque opaque;
     423                 :      581762 :         Buffer          nbuf = InvalidBuffer;
     424                 :      581762 :         bool            found = false;
     425                 :      581762 :         bool            inposting = false;
     426                 :      581762 :         bool            prevalldead = true;
     427                 :      581762 :         int                     curposti = 0;
     428                 :             : 
     429                 :             :         /* Assume unique until we find a duplicate */
     430                 :      581762 :         *is_unique = true;
     431                 :             : 
     432                 :      581762 :         InitDirtySnapshot(SnapshotDirty);
     433                 :             : 
     434                 :      581762 :         page = BufferGetPage(insertstate->buf);
     435                 :      581762 :         opaque = BTPageGetOpaque(page);
     436                 :      581762 :         maxoff = PageGetMaxOffsetNumber(page);
     437                 :             : 
     438                 :             :         /*
     439                 :             :          * Find the first tuple with the same key.
     440                 :             :          *
     441                 :             :          * This also saves the binary search bounds in insertstate.  We use them
     442                 :             :          * in the fastpath below, but also in the _bt_findinsertloc() call later.
     443                 :             :          */
     444         [ +  - ]:      581762 :         Assert(!insertstate->bounds_valid);
     445                 :      581762 :         offset = _bt_binsrch_insert(rel, insertstate);
     446                 :             : 
     447                 :             :         /*
     448                 :             :          * Scan over all equal tuples, looking for live conflicts.
     449                 :             :          */
     450   [ +  +  +  - ]:      581762 :         Assert(!insertstate->bounds_valid || insertstate->low == offset);
     451         [ +  - ]:      581762 :         Assert(!itup_key->anynullkeys);
     452         [ +  - ]:      581762 :         Assert(itup_key->scantid == NULL);
     453                 :     1694481 :         for (;;)
     454                 :             :         {
     455                 :             :                 /*
     456                 :             :                  * Each iteration of the loop processes one heap TID, not one index
     457                 :             :                  * tuple.  Current offset number for page isn't usually advanced on
     458                 :             :                  * iterations that process heap TIDs from posting list tuples.
     459                 :             :                  *
     460                 :             :                  * "inposting" state is set when _inside_ a posting list --- not when
     461                 :             :                  * we're at the start (or end) of a posting list.  We advance curposti
     462                 :             :                  * at the end of the iteration when inside a posting list tuple.  In
     463                 :             :                  * general, every loop iteration either advances the page offset or
     464                 :             :                  * advances curposti --- an iteration that handles the rightmost/max
     465                 :             :                  * heap TID in a posting list finally advances the page offset (and
     466                 :             :                  * unsets "inposting").
     467                 :             :                  *
     468                 :             :                  * Make sure the offset points to an actual index tuple before trying
     469                 :             :                  * to examine it...
     470                 :             :                  */
     471         [ +  + ]:     2444582 :                 if (offset <= maxoff)
     472                 :             :                 {
     473                 :             :                         /*
     474                 :             :                          * Fastpath: In most cases, we can use cached search bounds to
     475                 :             :                          * limit our consideration to items that are definitely
     476                 :             :                          * duplicates.  This fastpath doesn't apply when the original page
     477                 :             :                          * is empty, or when initial offset is past the end of the
     478                 :             :                          * original page, which may indicate that we need to examine a
     479                 :             :                          * second or subsequent page.
     480                 :             :                          *
     481                 :             :                          * Note that this optimization allows us to avoid calling
     482                 :             :                          * _bt_compare() directly when there are no duplicates, as long as
     483                 :             :                          * the offset where the key will go is not at the end of the page.
     484                 :             :                          */
     485   [ +  +  +  + ]:     1991441 :                         if (nbuf == InvalidBuffer && offset == insertstate->stricthigh)
     486                 :             :                         {
     487         [ +  - ]:      117144 :                                 Assert(insertstate->bounds_valid);
     488         [ +  - ]:      117144 :                                 Assert(insertstate->low >= P_FIRSTDATAKEY(opaque));
     489         [ +  - ]:      117144 :                                 Assert(insertstate->low <= insertstate->stricthigh);
     490         [ +  - ]:      117144 :                                 Assert(_bt_compare(rel, itup_key, page, offset) < 0);
     491                 :      117144 :                                 break;
     492                 :             :                         }
     493                 :             : 
     494                 :             :                         /*
     495                 :             :                          * We can skip items that are already marked killed.
     496                 :             :                          *
     497                 :             :                          * In the presence of heavy update activity an index may contain
     498                 :             :                          * many killed items with the same key; running _bt_compare() on
     499                 :             :                          * each killed item gets expensive.  Just advance over killed
     500                 :             :                          * items as quickly as we can.  We only apply _bt_compare() when
     501                 :             :                          * we get to a non-killed item.  We could reuse the bounds to
     502                 :             :                          * avoid _bt_compare() calls for known equal tuples, but it
     503                 :             :                          * doesn't seem worth it.
     504                 :             :                          */
     505         [ +  + ]:     1874297 :                         if (!inposting)
     506                 :     1124196 :                                 curitemid = PageGetItemId(page, offset);
     507   [ +  +  +  + ]:     1874297 :                         if (inposting || !ItemIdIsDead(curitemid))
     508                 :             :                         {
     509                 :     1855617 :                                 ItemPointerData htid;
     510                 :     1855617 :                                 bool            all_dead = false;
     511                 :             : 
     512         [ +  + ]:     1855617 :                                 if (!inposting)
     513                 :             :                                 {
     514                 :             :                                         /* Plain tuple, or first TID in posting list tuple */
     515         [ +  + ]:     1105516 :                                         if (_bt_compare(rel, itup_key, page, offset) != 0)
     516                 :        7805 :                                                 break;  /* we're past all the equal tuples */
     517                 :             : 
     518                 :             :                                         /* Advanced curitup */
     519                 :     1097711 :                                         curitup = (IndexTuple) PageGetItem(page, curitemid);
     520         [ -  + ]:     1097711 :                                         Assert(!BTreeTupleIsPivot(curitup));
     521                 :     1097711 :                                 }
     522                 :             : 
     523                 :             :                                 /* okay, we gotta fetch the heap tuple using htid ... */
     524         [ +  + ]:     1847812 :                                 if (!BTreeTupleIsPosting(curitup))
     525                 :             :                                 {
     526                 :             :                                         /* ... htid is from simple non-pivot tuple */
     527         [ +  - ]:     1093490 :                                         Assert(!inposting);
     528                 :     1093490 :                                         htid = curitup->t_tid;
     529                 :     1093490 :                                 }
     530         [ +  + ]:      754322 :                                 else if (!inposting)
     531                 :             :                                 {
     532                 :             :                                         /* ... htid is first TID in new posting list */
     533                 :        4221 :                                         inposting = true;
     534                 :        4221 :                                         prevalldead = true;
     535                 :        4221 :                                         curposti = 0;
     536                 :        4221 :                                         htid = *BTreeTupleGetPostingN(curitup, 0);
     537                 :        4221 :                                 }
     538                 :             :                                 else
     539                 :             :                                 {
     540                 :             :                                         /* ... htid is second or subsequent TID in posting list */
     541         [ +  - ]:      750101 :                                         Assert(curposti > 0);
     542                 :      750101 :                                         htid = *BTreeTupleGetPostingN(curitup, curposti);
     543                 :             :                                 }
     544                 :             : 
     545                 :             :                                 /*
     546                 :             :                                  * If we are doing a recheck, we expect to find the tuple we
     547                 :             :                                  * are rechecking.  It's not a duplicate, but we have to keep
     548                 :             :                                  * scanning.
     549                 :             :                                  */
     550   [ +  +  +  + ]:     1847812 :                                 if (checkUnique == UNIQUE_CHECK_EXISTING &&
     551                 :          39 :                                         ItemPointerCompare(&htid, &itup->t_tid) == 0)
     552                 :             :                                 {
     553                 :           9 :                                         found = true;
     554                 :           9 :                                 }
     555                 :             : 
     556                 :             :                                 /*
     557                 :             :                                  * Check if there's any table tuples for this index entry
     558                 :             :                                  * satisfying SnapshotDirty. This is necessary because for AMs
     559                 :             :                                  * with optimizations like heap's HOT, we have just a single
     560                 :             :                                  * index entry for the entire chain.
     561                 :             :                                  */
     562         [ +  + ]:     1847803 :                                 else if (table_index_fetch_tuple_check(heapRel, &htid,
     563                 :             :                                                                                                            &SnapshotDirty,
     564                 :             :                                                                                                            &all_dead))
     565                 :             :                                 {
     566                 :          73 :                                         TransactionId xwait;
     567                 :             : 
     568                 :             :                                         /*
     569                 :             :                                          * It is a duplicate. If we are only doing a partial
     570                 :             :                                          * check, then don't bother checking if the tuple is being
     571                 :             :                                          * updated in another transaction. Just return the fact
     572                 :             :                                          * that it is a potential conflict and leave the full
     573                 :             :                                          * check till later. Don't invalidate binary search
     574                 :             :                                          * bounds.
     575                 :             :                                          */
     576         [ +  + ]:          73 :                                         if (checkUnique == UNIQUE_CHECK_PARTIAL)
     577                 :             :                                         {
     578         [ +  - ]:          16 :                                                 if (nbuf != InvalidBuffer)
     579                 :           0 :                                                         _bt_relbuf(rel, nbuf);
     580                 :          16 :                                                 *is_unique = false;
     581                 :          16 :                                                 return InvalidTransactionId;
     582                 :             :                                         }
     583                 :             : 
     584                 :             :                                         /*
     585                 :             :                                          * If this tuple is being updated by other transaction
     586                 :             :                                          * then we have to wait for its commit/abort.
     587                 :             :                                          */
     588         [ -  + ]:          57 :                                         xwait = (TransactionIdIsValid(SnapshotDirty.xmin)) ?
     589                 :          57 :                                                 SnapshotDirty.xmin : SnapshotDirty.xmax;
     590                 :             : 
     591         [ -  + ]:          57 :                                         if (TransactionIdIsValid(xwait))
     592                 :             :                                         {
     593         [ #  # ]:           0 :                                                 if (nbuf != InvalidBuffer)
     594                 :           0 :                                                         _bt_relbuf(rel, nbuf);
     595                 :             :                                                 /* Tell _bt_doinsert to wait... */
     596                 :           0 :                                                 *speculativeToken = SnapshotDirty.speculativeToken;
     597                 :             :                                                 /* Caller releases lock on buf immediately */
     598                 :           0 :                                                 insertstate->bounds_valid = false;
     599                 :           0 :                                                 return xwait;
     600                 :             :                                         }
     601                 :             : 
     602                 :             :                                         /*
     603                 :             :                                          * Otherwise we have a definite conflict.  But before
     604                 :             :                                          * complaining, look to see if the tuple we want to insert
     605                 :             :                                          * is itself now committed dead --- if so, don't complain.
     606                 :             :                                          * This is a waste of time in normal scenarios but we must
     607                 :             :                                          * do it to support CREATE INDEX CONCURRENTLY.
     608                 :             :                                          *
     609                 :             :                                          * We must follow HOT-chains here because during
     610                 :             :                                          * concurrent index build, we insert the root TID though
     611                 :             :                                          * the actual tuple may be somewhere in the HOT-chain.
     612                 :             :                                          * While following the chain we might not stop at the
     613                 :             :                                          * exact tuple which triggered the insert, but that's OK
     614                 :             :                                          * because if we find a live tuple anywhere in this chain,
     615                 :             :                                          * we have a unique key conflict.  The other live tuple is
     616                 :             :                                          * not part of this chain because it had a different index
     617                 :             :                                          * entry.
     618                 :             :                                          */
     619                 :          57 :                                         htid = itup->t_tid;
     620         [ +  - ]:          57 :                                         if (table_index_fetch_tuple_check(heapRel, &htid,
     621                 :             :                                                                                                           SnapshotSelf, NULL))
     622                 :             :                                         {
     623                 :             :                                                 /* Normal case --- it's still live */
     624                 :          57 :                                         }
     625                 :             :                                         else
     626                 :             :                                         {
     627                 :             :                                                 /*
     628                 :             :                                                  * It's been deleted, so no error, and no need to
     629                 :             :                                                  * continue searching
     630                 :             :                                                  */
     631                 :           0 :                                                 break;
     632                 :             :                                         }
     633                 :             : 
     634                 :             :                                         /*
     635                 :             :                                          * Check for a conflict-in as we would if we were going to
     636                 :             :                                          * write to this page.  We aren't actually going to write,
     637                 :             :                                          * but we want a chance to report SSI conflicts that would
     638                 :             :                                          * otherwise be masked by this unique constraint
     639                 :             :                                          * violation.
     640                 :             :                                          */
     641                 :          57 :                                         CheckForSerializableConflictIn(rel, NULL, BufferGetBlockNumber(insertstate->buf));
     642                 :             : 
     643                 :             :                                         /*
     644                 :             :                                          * This is a definite conflict.  Break the tuple down into
     645                 :             :                                          * datums and report the error.  But first, make sure we
     646                 :             :                                          * release the buffer locks we're holding ---
     647                 :             :                                          * BuildIndexValueDescription could make catalog accesses,
     648                 :             :                                          * which in the worst case might touch this same index and
     649                 :             :                                          * cause deadlocks.
     650                 :             :                                          */
     651         [ +  - ]:          57 :                                         if (nbuf != InvalidBuffer)
     652                 :           0 :                                                 _bt_relbuf(rel, nbuf);
     653                 :          57 :                                         _bt_relbuf(rel, insertstate->buf);
     654                 :          57 :                                         insertstate->buf = InvalidBuffer;
     655                 :          57 :                                         insertstate->bounds_valid = false;
     656                 :             : 
     657                 :             :                                         {
     658                 :          57 :                                                 Datum           values[INDEX_MAX_KEYS];
     659                 :          57 :                                                 bool            isnull[INDEX_MAX_KEYS];
     660                 :          57 :                                                 char       *key_desc;
     661                 :             : 
     662                 :         114 :                                                 index_deform_tuple(itup, RelationGetDescr(rel),
     663                 :          57 :                                                                                    values, isnull);
     664                 :             : 
     665                 :         114 :                                                 key_desc = BuildIndexValueDescription(rel, values,
     666                 :          57 :                                                                                                                           isnull);
     667                 :             : 
     668   [ -  +  +  -  :          57 :                                                 ereport(ERROR,
                   +  + ]
     669                 :             :                                                                 (errcode(ERRCODE_UNIQUE_VIOLATION),
     670                 :             :                                                                  errmsg("duplicate key value violates unique constraint \"%s\"",
     671                 :             :                                                                                 RelationGetRelationName(rel)),
     672                 :             :                                                                  key_desc ? errdetail("Key %s already exists.",
     673                 :             :                                                                                                           key_desc) : 0,
     674                 :             :                                                                  errtableconstraint(heapRel,
     675                 :             :                                                                                                         RelationGetRelationName(rel))));
     676                 :           0 :                                         }
     677         [ +  - ]:          16 :                                 }
     678   [ +  +  +  +  :     1848285 :                                 else if (all_dead && (!inposting ||
                   +  + ]
     679         [ +  + ]:        1069 :                                                                           (prevalldead &&
     680                 :         555 :                                                                            curposti == BTreeTupleGetNPosting(curitup) - 1)))
     681                 :             :                                 {
     682                 :             :                                         /*
     683                 :             :                                          * The conflicting tuple (or all HOT chains pointed to by
     684                 :             :                                          * all posting list TIDs) is dead to everyone, so mark the
     685                 :             :                                          * index entry killed.
     686                 :             :                                          */
     687                 :        1082 :                                         ItemIdMarkDead(curitemid);
     688                 :        1082 :                                         opaque->btpo_flags |= BTP_HAS_GARBAGE;
     689                 :             : 
     690                 :             :                                         /*
     691                 :             :                                          * Mark buffer with a dirty hint, since state is not
     692                 :             :                                          * crucial. Be sure to mark the proper buffer dirty.
     693                 :             :                                          */
     694         [ -  + ]:        1082 :                                         if (nbuf != InvalidBuffer)
     695                 :           0 :                                                 MarkBufferDirtyHint(nbuf, true);
     696                 :             :                                         else
     697                 :        1082 :                                                 MarkBufferDirtyHint(insertstate->buf, true);
     698                 :        1082 :                                 }
     699                 :             : 
     700                 :             :                                 /*
     701                 :             :                                  * Remember if posting list tuple has even a single HOT chain
     702                 :             :                                  * whose members are not all dead
     703                 :             :                                  */
     704   [ +  +  +  + ]:     1847739 :                                 if (!all_dead && inposting)
     705                 :      753253 :                                         prevalldead = false;
     706      [ +  +  + ]:     1855560 :                         }
     707                 :     1866419 :                 }
     708                 :             : 
     709   [ +  +  +  + ]:     2319560 :                 if (inposting && curposti < BTreeTupleGetNPosting(curitup) - 1)
     710                 :             :                 {
     711                 :             :                         /* Advance to next TID in same posting list */
     712                 :      750101 :                         curposti++;
     713                 :      750101 :                         continue;
     714                 :             :                 }
     715         [ +  + ]:     1569459 :                 else if (offset < maxoff)
     716                 :             :                 {
     717                 :             :                         /* Advance to next tuple */
     718                 :     1111045 :                         curposti = 0;
     719                 :     1111045 :                         inposting = false;
     720                 :     1111045 :                         offset = OffsetNumberNext(offset);
     721                 :     1111045 :                 }
     722                 :             :                 else
     723                 :             :                 {
     724                 :      458414 :                         int                     highkeycmp;
     725                 :             : 
     726                 :             :                         /* If scankey == hikey we gotta check the next page too */
     727         [ +  + ]:      458414 :                         if (P_RIGHTMOST(opaque))
     728                 :      454868 :                                 break;
     729                 :        3546 :                         highkeycmp = _bt_compare(rel, itup_key, page, P_HIKEY);
     730         [ +  - ]:        3546 :                         Assert(highkeycmp <= 0);
     731         [ +  + ]:        3546 :                         if (highkeycmp != 0)
     732                 :        1872 :                                 break;
     733                 :             :                         /* Advance to next non-dead page --- there must be one */
     734                 :        1674 :                         for (;;)
     735                 :             :                         {
     736                 :        1674 :                                 BlockNumber nblkno = opaque->btpo_next;
     737                 :             : 
     738                 :        1674 :                                 nbuf = _bt_relandgetbuf(rel, nbuf, nblkno, BT_READ);
     739                 :        1674 :                                 page = BufferGetPage(nbuf);
     740                 :        1674 :                                 opaque = BTPageGetOpaque(page);
     741         [ -  + ]:        1674 :                                 if (!P_IGNORE(opaque))
     742                 :        1674 :                                         break;
     743         [ #  # ]:           0 :                                 if (P_RIGHTMOST(opaque))
     744   [ #  #  #  # ]:           0 :                                         elog(ERROR, "fell off the end of index \"%s\"",
     745                 :             :                                                  RelationGetRelationName(rel));
     746         [ +  - ]:        1674 :                         }
     747                 :             :                         /* Will also advance to next tuple */
     748                 :        1674 :                         curposti = 0;
     749                 :        1674 :                         inposting = false;
     750                 :        1674 :                         maxoff = PageGetMaxOffsetNumber(page);
     751                 :        1674 :                         offset = P_FIRSTDATAKEY(opaque);
     752                 :             :                         /* Don't invalidate binary search bounds */
     753         [ +  + ]:      458414 :                 }
     754                 :             :         }
     755                 :             : 
     756                 :             :         /*
     757                 :             :          * If we are doing a recheck then we should have found the tuple we are
     758                 :             :          * checking.  Otherwise there's something very wrong --- probably, the
     759                 :             :          * index is on a non-immutable expression.
     760                 :             :          */
     761   [ +  +  +  - ]:      581689 :         if (checkUnique == UNIQUE_CHECK_EXISTING && !found)
     762   [ #  #  #  # ]:           0 :                 ereport(ERROR,
     763                 :             :                                 (errcode(ERRCODE_INTERNAL_ERROR),
     764                 :             :                                  errmsg("failed to re-find tuple within index \"%s\"",
     765                 :             :                                                 RelationGetRelationName(rel)),
     766                 :             :                                  errhint("This may be because of a non-immutable index expression."),
     767                 :             :                                  errtableconstraint(heapRel,
     768                 :             :                                                                         RelationGetRelationName(rel))));
     769                 :             : 
     770         [ +  + ]:      581689 :         if (nbuf != InvalidBuffer)
     771                 :         960 :                 _bt_relbuf(rel, nbuf);
     772                 :             : 
     773                 :      581689 :         return InvalidTransactionId;
     774                 :      581705 : }
     775                 :             : 
     776                 :             : 
     777                 :             : /*
     778                 :             :  *      _bt_findinsertloc() -- Finds an insert location for a tuple
     779                 :             :  *
     780                 :             :  *              On entry, insertstate buffer contains the page the new tuple belongs
     781                 :             :  *              on.  It is exclusive-locked and pinned by the caller.
     782                 :             :  *
     783                 :             :  *              If 'checkingunique' is true, the buffer on entry is the first page
     784                 :             :  *              that contains duplicates of the new key.  If there are duplicates on
     785                 :             :  *              multiple pages, the correct insertion position might be some page to
     786                 :             :  *              the right, rather than the first page.  In that case, this function
     787                 :             :  *              moves right to the correct target page.
     788                 :             :  *
     789                 :             :  *              (In a !heapkeyspace index, there can be multiple pages with the same
     790                 :             :  *              high key, where the new tuple could legitimately be placed on.  In
     791                 :             :  *              that case, the caller passes the first page containing duplicates,
     792                 :             :  *              just like when checkingunique=true.  If that page doesn't have enough
     793                 :             :  *              room for the new tuple, this function moves right, trying to find a
     794                 :             :  *              legal page that does.)
     795                 :             :  *
     796                 :             :  *              If 'indexUnchanged' is true, this is for an UPDATE that didn't
     797                 :             :  *              logically change the indexed value, but must nevertheless have a new
     798                 :             :  *              entry to point to a successor version.  This hint from the executor
     799                 :             :  *              will influence our behavior when the page might have to be split and
     800                 :             :  *              we must consider our options.  Bottom-up index deletion can avoid
     801                 :             :  *              pathological version-driven page splits, but we only want to go to the
     802                 :             :  *              trouble of trying it when we already have moderate confidence that
     803                 :             :  *              it's appropriate.  The hint should not significantly affect our
     804                 :             :  *              behavior over time unless practically all inserts on to the leaf page
     805                 :             :  *              get the hint.
     806                 :             :  *
     807                 :             :  *              On exit, insertstate buffer contains the chosen insertion page, and
     808                 :             :  *              the offset within that page is returned.  If _bt_findinsertloc needed
     809                 :             :  *              to move right, the lock and pin on the original page are released, and
     810                 :             :  *              the new buffer is exclusively locked and pinned instead.
     811                 :             :  *
     812                 :             :  *              If insertstate contains cached binary search bounds, we will take
     813                 :             :  *              advantage of them.  This avoids repeating comparisons that we made in
     814                 :             :  *              _bt_check_unique() already.
     815                 :             :  */
     816                 :             : static OffsetNumber
     817                 :      790527 : _bt_findinsertloc(Relation rel,
     818                 :             :                                   BTInsertState insertstate,
     819                 :             :                                   bool checkingunique,
     820                 :             :                                   bool indexUnchanged,
     821                 :             :                                   BTStack stack,
     822                 :             :                                   Relation heapRel)
     823                 :             : {
     824                 :      790527 :         BTScanInsert itup_key = insertstate->itup_key;
     825                 :      790527 :         Page            page = BufferGetPage(insertstate->buf);
     826                 :      790527 :         BTPageOpaque opaque;
     827                 :      790527 :         OffsetNumber newitemoff;
     828                 :             : 
     829                 :      790527 :         opaque = BTPageGetOpaque(page);
     830                 :             : 
     831                 :             :         /* Check 1/3 of a page restriction */
     832         [ +  - ]:      790527 :         if (unlikely(insertstate->itemsz > BTMaxItemSize))
     833                 :           0 :                 _bt_check_third_page(rel, heapRel, itup_key->heapkeyspace, page,
     834                 :           0 :                                                          insertstate->itup);
     835                 :             : 
     836         [ +  - ]:      790527 :         Assert(P_ISLEAF(opaque) && !P_INCOMPLETE_SPLIT(opaque));
     837   [ +  +  +  - ]:      790527 :         Assert(!insertstate->bounds_valid || checkingunique);
     838   [ +  -  +  - ]:      790527 :         Assert(!itup_key->heapkeyspace || itup_key->scantid != NULL);
     839   [ -  +  #  # ]:      790527 :         Assert(itup_key->heapkeyspace || itup_key->scantid == NULL);
     840   [ +  +  +  - ]:      790527 :         Assert(!itup_key->allequalimage || itup_key->heapkeyspace);
     841                 :             : 
     842         [ +  - ]:      790527 :         if (itup_key->heapkeyspace)
     843                 :             :         {
     844                 :             :                 /* Keep track of whether checkingunique duplicate seen */
     845                 :      790527 :                 bool            uniquedup = indexUnchanged;
     846                 :             : 
     847                 :             :                 /*
     848                 :             :                  * If we're inserting into a unique index, we may have to walk right
     849                 :             :                  * through leaf pages to find the one leaf page that we must insert on
     850                 :             :                  * to.
     851                 :             :                  *
     852                 :             :                  * This is needed for checkingunique callers because a scantid was not
     853                 :             :                  * used when we called _bt_search().  scantid can only be set after
     854                 :             :                  * _bt_check_unique() has checked for duplicates.  The buffer
     855                 :             :                  * initially stored in insertstate->buf has the page where the first
     856                 :             :                  * duplicate key might be found, which isn't always the page that new
     857                 :             :                  * tuple belongs on.  The heap TID attribute for new tuple (scantid)
     858                 :             :                  * could force us to insert on a sibling page, though that should be
     859                 :             :                  * very rare in practice.
     860                 :             :                  */
     861         [ +  + ]:      790527 :                 if (checkingunique)
     862                 :             :                 {
     863         [ +  + ]:      581696 :                         if (insertstate->low < insertstate->stricthigh)
     864                 :             :                         {
     865                 :             :                                 /* Encountered a duplicate in _bt_check_unique() */
     866         [ +  - ]:       17633 :                                 Assert(insertstate->bounds_valid);
     867                 :       17633 :                                 uniquedup = true;
     868                 :       17633 :                         }
     869                 :             : 
     870                 :      583370 :                         for (;;)
     871                 :             :                         {
     872                 :             :                                 /*
     873                 :             :                                  * Does the new tuple belong on this page?
     874                 :             :                                  *
     875                 :             :                                  * The earlier _bt_check_unique() call may well have
     876                 :             :                                  * established a strict upper bound on the offset for the new
     877                 :             :                                  * item.  If it's not the last item of the page (i.e. if there
     878                 :             :                                  * is at least one tuple on the page that goes after the tuple
     879                 :             :                                  * we're inserting) then we know that the tuple belongs on
     880                 :             :                                  * this page.  We can skip the high key check.
     881                 :             :                                  */
     882         [ +  + ]:      583370 :                                 if (insertstate->bounds_valid &&
     883   [ +  -  +  + ]:      581132 :                                         insertstate->low <= insertstate->stricthigh &&
     884                 :      581132 :                                         insertstate->stricthigh <= PageGetMaxOffsetNumber(page))
     885                 :      123211 :                                         break;
     886                 :             : 
     887                 :             :                                 /* Test '<=', not '!=', since scantid is set now */
     888   [ +  +  +  + ]:      460159 :                                 if (P_RIGHTMOST(opaque) ||
     889                 :        4243 :                                         _bt_compare(rel, itup_key, page, P_HIKEY) <= 0)
     890                 :      458485 :                                         break;
     891                 :             : 
     892                 :        1674 :                                 _bt_stepright(rel, heapRel, insertstate, stack);
     893                 :             :                                 /* Update local state after stepping right */
     894                 :        1674 :                                 page = BufferGetPage(insertstate->buf);
     895                 :        1674 :                                 opaque = BTPageGetOpaque(page);
     896                 :             :                                 /* Assume duplicates (if checkingunique) */
     897                 :        1674 :                                 uniquedup = true;
     898                 :             :                         }
     899                 :      581696 :                 }
     900                 :             : 
     901                 :             :                 /*
     902                 :             :                  * If the target page cannot fit newitem, try to avoid splitting the
     903                 :             :                  * page on insert by performing deletion or deduplication now
     904                 :             :                  */
     905         [ +  + ]:      790527 :                 if (PageGetFreeSpace(page) < insertstate->itemsz)
     906                 :       10044 :                         _bt_delete_or_dedup_one_page(rel, heapRel, insertstate, false,
     907                 :        5022 :                                                                                  checkingunique, uniquedup,
     908                 :        5022 :                                                                                  indexUnchanged);
     909                 :      790527 :         }
     910                 :             :         else
     911                 :             :         {
     912                 :             :                 /*----------
     913                 :             :                  * This is a !heapkeyspace (version 2 or 3) index.  The current page
     914                 :             :                  * is the first page that we could insert the new tuple to, but there
     915                 :             :                  * may be other pages to the right that we could opt to use instead.
     916                 :             :                  *
     917                 :             :                  * If the new key is equal to one or more existing keys, we can
     918                 :             :                  * legitimately place it anywhere in the series of equal keys.  In
     919                 :             :                  * fact, if the new key is equal to the page's "high key" we can place
     920                 :             :                  * it on the next page.  If it is equal to the high key, and there's
     921                 :             :                  * not room to insert the new tuple on the current page without
     922                 :             :                  * splitting, then we move right hoping to find more free space and
     923                 :             :                  * avoid a split.
     924                 :             :                  *
     925                 :             :                  * Keep scanning right until we
     926                 :             :                  *              (a) find a page with enough free space,
     927                 :             :                  *              (b) reach the last page where the tuple can legally go, or
     928                 :             :                  *              (c) get tired of searching.
     929                 :             :                  * (c) is not flippant; it is important because if there are many
     930                 :             :                  * pages' worth of equal keys, it's better to split one of the early
     931                 :             :                  * pages than to scan all the way to the end of the run of equal keys
     932                 :             :                  * on every insert.  We implement "get tired" as a random choice,
     933                 :             :                  * since stopping after scanning a fixed number of pages wouldn't work
     934                 :             :                  * well (we'd never reach the right-hand side of previously split
     935                 :             :                  * pages).  The probability of moving right is set at 0.99, which may
     936                 :             :                  * seem too high to change the behavior much, but it does an excellent
     937                 :             :                  * job of preventing O(N^2) behavior with many equal keys.
     938                 :             :                  *----------
     939                 :             :                  */
     940         [ #  # ]:           0 :                 while (PageGetFreeSpace(page) < insertstate->itemsz)
     941                 :             :                 {
     942                 :             :                         /*
     943                 :             :                          * Before considering moving right, see if we can obtain enough
     944                 :             :                          * space by erasing LP_DEAD items
     945                 :             :                          */
     946         [ #  # ]:           0 :                         if (P_HAS_GARBAGE(opaque))
     947                 :             :                         {
     948                 :             :                                 /* Perform simple deletion */
     949                 :           0 :                                 _bt_delete_or_dedup_one_page(rel, heapRel, insertstate, true,
     950                 :             :                                                                                          false, false, false);
     951                 :             : 
     952         [ #  # ]:           0 :                                 if (PageGetFreeSpace(page) >= insertstate->itemsz)
     953                 :           0 :                                         break;          /* OK, now we have enough space */
     954                 :           0 :                         }
     955                 :             : 
     956                 :             :                         /*
     957                 :             :                          * Nope, so check conditions (b) and (c) enumerated above
     958                 :             :                          *
     959                 :             :                          * The earlier _bt_check_unique() call may well have established a
     960                 :             :                          * strict upper bound on the offset for the new item.  If it's not
     961                 :             :                          * the last item of the page (i.e. if there is at least one tuple
     962                 :             :                          * on the page that's greater than the tuple we're inserting to)
     963                 :             :                          * then we know that the tuple belongs on this page.  We can skip
     964                 :             :                          * the high key check.
     965                 :             :                          */
     966         [ #  # ]:           0 :                         if (insertstate->bounds_valid &&
     967   [ #  #  #  # ]:           0 :                                 insertstate->low <= insertstate->stricthigh &&
     968                 :           0 :                                 insertstate->stricthigh <= PageGetMaxOffsetNumber(page))
     969                 :           0 :                                 break;
     970                 :             : 
     971         [ #  # ]:           0 :                         if (P_RIGHTMOST(opaque) ||
     972   [ #  #  #  # ]:           0 :                                 _bt_compare(rel, itup_key, page, P_HIKEY) != 0 ||
     973                 :           0 :                                 pg_prng_uint32(&pg_global_prng_state) <= (PG_UINT32_MAX / 100))
     974                 :           0 :                                 break;
     975                 :             : 
     976                 :           0 :                         _bt_stepright(rel, heapRel, insertstate, stack);
     977                 :             :                         /* Update local state after stepping right */
     978                 :           0 :                         page = BufferGetPage(insertstate->buf);
     979                 :           0 :                         opaque = BTPageGetOpaque(page);
     980                 :             :                 }
     981                 :             :         }
     982                 :             : 
     983                 :             :         /*
     984                 :             :          * We should now be on the correct page.  Find the offset within the page
     985                 :             :          * for the new tuple. (Possibly reusing earlier search bounds.)
     986                 :             :          */
     987   [ +  +  +  - ]:      790527 :         Assert(P_RIGHTMOST(opaque) ||
     988                 :             :                    _bt_compare(rel, itup_key, page, P_HIKEY) <= 0);
     989                 :             : 
     990                 :      790527 :         newitemoff = _bt_binsrch_insert(rel, insertstate);
     991                 :             : 
     992         [ +  - ]:      790527 :         if (insertstate->postingoff == -1)
     993                 :             :         {
     994                 :             :                 /*
     995                 :             :                  * There is an overlapping posting list tuple with its LP_DEAD bit
     996                 :             :                  * set.  We don't want to unnecessarily unset its LP_DEAD bit while
     997                 :             :                  * performing a posting list split, so perform simple index tuple
     998                 :             :                  * deletion early.
     999                 :             :                  */
    1000                 :           0 :                 _bt_delete_or_dedup_one_page(rel, heapRel, insertstate, true,
    1001                 :             :                                                                          false, false, false);
    1002                 :             : 
    1003                 :             :                 /*
    1004                 :             :                  * Do new binary search.  New insert location cannot overlap with any
    1005                 :             :                  * posting list now.
    1006                 :             :                  */
    1007         [ #  # ]:           0 :                 Assert(!insertstate->bounds_valid);
    1008                 :           0 :                 insertstate->postingoff = 0;
    1009                 :           0 :                 newitemoff = _bt_binsrch_insert(rel, insertstate);
    1010         [ #  # ]:           0 :                 Assert(insertstate->postingoff == 0);
    1011                 :           0 :         }
    1012                 :             : 
    1013                 :     1581054 :         return newitemoff;
    1014                 :      790527 : }
    1015                 :             : 
    1016                 :             : /*
    1017                 :             :  * Step right to next non-dead page, during insertion.
    1018                 :             :  *
    1019                 :             :  * This is a bit more complicated than moving right in a search.  We must
    1020                 :             :  * write-lock the target page before releasing write lock on current page;
    1021                 :             :  * else someone else's _bt_check_unique scan could fail to see our insertion.
    1022                 :             :  * Write locks on intermediate dead pages won't do because we don't know when
    1023                 :             :  * they will get de-linked from the tree.
    1024                 :             :  *
    1025                 :             :  * This is more aggressive than it needs to be for non-unique !heapkeyspace
    1026                 :             :  * indexes.
    1027                 :             :  */
    1028                 :             : static void
    1029                 :        1674 : _bt_stepright(Relation rel, Relation heaprel, BTInsertState insertstate,
    1030                 :             :                           BTStack stack)
    1031                 :             : {
    1032                 :        1674 :         Page            page;
    1033                 :        1674 :         BTPageOpaque opaque;
    1034                 :        1674 :         Buffer          rbuf;
    1035                 :        1674 :         BlockNumber rblkno;
    1036                 :             : 
    1037         [ +  - ]:        1674 :         Assert(heaprel != NULL);
    1038                 :        1674 :         page = BufferGetPage(insertstate->buf);
    1039                 :        1674 :         opaque = BTPageGetOpaque(page);
    1040                 :             : 
    1041                 :        1674 :         rbuf = InvalidBuffer;
    1042                 :        1674 :         rblkno = opaque->btpo_next;
    1043                 :        1674 :         for (;;)
    1044                 :             :         {
    1045                 :        1674 :                 rbuf = _bt_relandgetbuf(rel, rbuf, rblkno, BT_WRITE);
    1046                 :        1674 :                 page = BufferGetPage(rbuf);
    1047                 :        1674 :                 opaque = BTPageGetOpaque(page);
    1048                 :             : 
    1049                 :             :                 /*
    1050                 :             :                  * If this page was incompletely split, finish the split now.  We do
    1051                 :             :                  * this while holding a lock on the left sibling, which is not good
    1052                 :             :                  * because finishing the split could be a fairly lengthy operation.
    1053                 :             :                  * But this should happen very seldom.
    1054                 :             :                  */
    1055         [ -  + ]:        1674 :                 if (P_INCOMPLETE_SPLIT(opaque))
    1056                 :             :                 {
    1057                 :           0 :                         _bt_finish_split(rel, heaprel, rbuf, stack);
    1058                 :           0 :                         rbuf = InvalidBuffer;
    1059                 :           0 :                         continue;
    1060                 :             :                 }
    1061                 :             : 
    1062         [ +  - ]:        1674 :                 if (!P_IGNORE(opaque))
    1063                 :        1674 :                         break;
    1064         [ #  # ]:           0 :                 if (P_RIGHTMOST(opaque))
    1065   [ #  #  #  # ]:           0 :                         elog(ERROR, "fell off the end of index \"%s\"",
    1066                 :             :                                  RelationGetRelationName(rel));
    1067                 :             : 
    1068                 :           0 :                 rblkno = opaque->btpo_next;
    1069                 :             :         }
    1070                 :             :         /* rbuf locked; unlock buf, update state for caller */
    1071                 :        1674 :         _bt_relbuf(rel, insertstate->buf);
    1072                 :        1674 :         insertstate->buf = rbuf;
    1073                 :        1674 :         insertstate->bounds_valid = false;
    1074                 :        1674 : }
    1075                 :             : 
    1076                 :             : /*----------
    1077                 :             :  *      _bt_insertonpg() -- Insert a tuple on a particular page in the index.
    1078                 :             :  *
    1079                 :             :  *              This recursive procedure does the following things:
    1080                 :             :  *
    1081                 :             :  *                      +  if postingoff != 0, splits existing posting list tuple
    1082                 :             :  *                         (since it overlaps with new 'itup' tuple).
    1083                 :             :  *                      +  if necessary, splits the target page, using 'itup_key' for
    1084                 :             :  *                         suffix truncation on leaf pages (caller passes NULL for
    1085                 :             :  *                         non-leaf pages).
    1086                 :             :  *                      +  inserts the new tuple (might be split from posting list).
    1087                 :             :  *                      +  if the page was split, pops the parent stack, and finds the
    1088                 :             :  *                         right place to insert the new child pointer (by walking
    1089                 :             :  *                         right using information stored in the parent stack).
    1090                 :             :  *                      +  invokes itself with the appropriate tuple for the right
    1091                 :             :  *                         child page on the parent.
    1092                 :             :  *                      +  updates the metapage if a true root or fast root is split.
    1093                 :             :  *
    1094                 :             :  *              On entry, we must have the correct buffer in which to do the
    1095                 :             :  *              insertion, and the buffer must be pinned and write-locked.  On return,
    1096                 :             :  *              we will have dropped both the pin and the lock on the buffer.
    1097                 :             :  *
    1098                 :             :  *              This routine only performs retail tuple insertions.  'itup' should
    1099                 :             :  *              always be either a non-highkey leaf item, or a downlink (new high
    1100                 :             :  *              key items are created indirectly, when a page is split).  When
    1101                 :             :  *              inserting to a non-leaf page, 'cbuf' is the left-sibling of the page
    1102                 :             :  *              we're inserting the downlink for.  This function will clear the
    1103                 :             :  *              INCOMPLETE_SPLIT flag on it, and release the buffer.
    1104                 :             :  *----------
    1105                 :             :  */
    1106                 :             : static void
    1107                 :      792765 : _bt_insertonpg(Relation rel,
    1108                 :             :                            Relation heaprel,
    1109                 :             :                            BTScanInsert itup_key,
    1110                 :             :                            Buffer buf,
    1111                 :             :                            Buffer cbuf,
    1112                 :             :                            BTStack stack,
    1113                 :             :                            IndexTuple itup,
    1114                 :             :                            Size itemsz,
    1115                 :             :                            OffsetNumber newitemoff,
    1116                 :             :                            int postingoff,
    1117                 :             :                            bool split_only_page)
    1118                 :             : {
    1119                 :      792765 :         Page            page;
    1120                 :      792765 :         BTPageOpaque opaque;
    1121                 :      792765 :         bool            isleaf,
    1122                 :             :                                 isroot,
    1123                 :             :                                 isrightmost,
    1124                 :             :                                 isonly;
    1125                 :      792765 :         IndexTuple      oposting = NULL;
    1126                 :      792765 :         IndexTuple      origitup = NULL;
    1127                 :      792765 :         IndexTuple      nposting = NULL;
    1128                 :             : 
    1129                 :      792765 :         page = BufferGetPage(buf);
    1130                 :      792765 :         opaque = BTPageGetOpaque(page);
    1131                 :      792765 :         isleaf = P_ISLEAF(opaque);
    1132                 :      792765 :         isroot = P_ISROOT(opaque);
    1133                 :      792765 :         isrightmost = P_RIGHTMOST(opaque);
    1134         [ +  + ]:      792765 :         isonly = P_LEFTMOST(opaque) && P_RIGHTMOST(opaque);
    1135                 :             : 
    1136                 :             :         /* child buffer must be given iff inserting on an internal page */
    1137         [ +  - ]:      792765 :         Assert(isleaf == !BufferIsValid(cbuf));
    1138                 :             :         /* tuple must have appropriate number of attributes */
    1139   [ +  +  -  +  :      792765 :         Assert(!isleaf ||
                   +  - ]
    1140                 :             :                    BTreeTupleGetNAtts(itup, rel) ==
    1141                 :             :                    IndexRelationGetNumberOfAttributes(rel));
    1142   [ +  +  +  -  :      792765 :         Assert(isleaf ||
                   +  - ]
    1143                 :             :                    BTreeTupleGetNAtts(itup, rel) <=
    1144                 :             :                    IndexRelationGetNumberOfKeyAttributes(rel));
    1145         [ +  - ]:      792765 :         Assert(!BTreeTupleIsPosting(itup));
    1146         [ +  - ]:      792765 :         Assert(MAXALIGN(IndexTupleSize(itup)) == itemsz);
    1147                 :             :         /* Caller must always finish incomplete split for us */
    1148         [ +  - ]:      792765 :         Assert(!P_INCOMPLETE_SPLIT(opaque));
    1149                 :             : 
    1150                 :             :         /*
    1151                 :             :          * Every internal page should have exactly one negative infinity item at
    1152                 :             :          * all times.  Only _bt_split() and _bt_newlevel() should add items that
    1153                 :             :          * become negative infinity items through truncation, since they're the
    1154                 :             :          * only routines that allocate new internal pages.
    1155                 :             :          */
    1156   [ +  +  +  - ]:      792765 :         Assert(isleaf || newitemoff > P_FIRSTDATAKEY(opaque));
    1157                 :             : 
    1158                 :             :         /*
    1159                 :             :          * Do we need to split an existing posting list item?
    1160                 :             :          */
    1161         [ +  + ]:      792765 :         if (postingoff != 0)
    1162                 :             :         {
    1163                 :        2766 :                 ItemId          itemid = PageGetItemId(page, newitemoff);
    1164                 :             : 
    1165                 :             :                 /*
    1166                 :             :                  * The new tuple is a duplicate with a heap TID that falls inside the
    1167                 :             :                  * range of an existing posting list tuple on a leaf page.  Prepare to
    1168                 :             :                  * split an existing posting list.  Overwriting the posting list with
    1169                 :             :                  * its post-split version is treated as an extra step in either the
    1170                 :             :                  * insert or page split critical section.
    1171                 :             :                  */
    1172         [ +  - ]:        2766 :                 Assert(isleaf && itup_key->heapkeyspace && itup_key->allequalimage);
    1173                 :        2766 :                 oposting = (IndexTuple) PageGetItem(page, itemid);
    1174                 :             : 
    1175                 :             :                 /*
    1176                 :             :                  * postingoff value comes from earlier call to _bt_binsrch_posting().
    1177                 :             :                  * Its binary search might think that a plain tuple must be a posting
    1178                 :             :                  * list tuple that needs to be split.  This can happen with corruption
    1179                 :             :                  * involving an existing plain tuple that is a duplicate of the new
    1180                 :             :                  * item, up to and including its table TID.  Check for that here in
    1181                 :             :                  * passing.
    1182                 :             :                  *
    1183                 :             :                  * Also verify that our caller has made sure that the existing posting
    1184                 :             :                  * list tuple does not have its LP_DEAD bit set.
    1185                 :             :                  */
    1186         [ +  - ]:        2766 :                 if (!BTreeTupleIsPosting(oposting) || ItemIdIsDead(itemid))
    1187   [ #  #  #  # ]:           0 :                         ereport(ERROR,
    1188                 :             :                                         (errcode(ERRCODE_INDEX_CORRUPTED),
    1189                 :             :                                          errmsg_internal("table tid from new index tuple (%u,%u) overlaps with invalid duplicate tuple at offset %u of block %u in index \"%s\"",
    1190                 :             :                                                                          ItemPointerGetBlockNumber(&itup->t_tid),
    1191                 :             :                                                                          ItemPointerGetOffsetNumber(&itup->t_tid),
    1192                 :             :                                                                          newitemoff, BufferGetBlockNumber(buf),
    1193                 :             :                                                                          RelationGetRelationName(rel))));
    1194                 :             : 
    1195                 :             :                 /* use a mutable copy of itup as our itup from here on */
    1196                 :        2766 :                 origitup = itup;
    1197                 :        2766 :                 itup = CopyIndexTuple(origitup);
    1198                 :        2766 :                 nposting = _bt_swap_posting(itup, oposting, postingoff);
    1199                 :             :                 /* itup now contains rightmost/max TID from oposting */
    1200                 :             : 
    1201                 :             :                 /* Alter offset so that newitem goes after posting list */
    1202                 :        2766 :                 newitemoff = OffsetNumberNext(newitemoff);
    1203                 :        2766 :         }
    1204                 :             : 
    1205                 :             :         /*
    1206                 :             :          * Do we need to split the page to fit the item on it?
    1207                 :             :          *
    1208                 :             :          * Note: PageGetFreeSpace() subtracts sizeof(ItemIdData) from its result,
    1209                 :             :          * so this comparison is correct even though we appear to be accounting
    1210                 :             :          * only for the item and not for its line pointer.
    1211                 :             :          */
    1212         [ +  + ]:      792765 :         if (PageGetFreeSpace(page) < itemsz)
    1213                 :             :         {
    1214                 :        2309 :                 Buffer          rbuf;
    1215                 :             : 
    1216         [ +  - ]:        2309 :                 Assert(!split_only_page);
    1217                 :             : 
    1218                 :             :                 /* split the buffer into left and right halves */
    1219                 :        4618 :                 rbuf = _bt_split(rel, heaprel, itup_key, buf, cbuf, newitemoff, itemsz,
    1220                 :        2309 :                                                  itup, origitup, nposting, postingoff);
    1221                 :        4618 :                 PredicateLockPageSplit(rel,
    1222                 :        2309 :                                                            BufferGetBlockNumber(buf),
    1223                 :        2309 :                                                            BufferGetBlockNumber(rbuf));
    1224                 :             : 
    1225                 :             :                 /*----------
    1226                 :             :                  * By here,
    1227                 :             :                  *
    1228                 :             :                  *              +  our target page has been split;
    1229                 :             :                  *              +  the original tuple has been inserted;
    1230                 :             :                  *              +  we have write locks on both the old (left half)
    1231                 :             :                  *                 and new (right half) buffers, after the split; and
    1232                 :             :                  *              +  we know the key we want to insert into the parent
    1233                 :             :                  *                 (it's the "high key" on the left child page).
    1234                 :             :                  *
    1235                 :             :                  * We're ready to do the parent insertion.  We need to hold onto the
    1236                 :             :                  * locks for the child pages until we locate the parent, but we can
    1237                 :             :                  * at least release the lock on the right child before doing the
    1238                 :             :                  * actual insertion.  The lock on the left child will be released
    1239                 :             :                  * last of all by parent insertion, where it is the 'cbuf' of parent
    1240                 :             :                  * page.
    1241                 :             :                  *----------
    1242                 :             :                  */
    1243                 :             : #ifdef USE_INJECTION_POINTS
    1244                 :             :                 if (P_ISLEAF(opaque))
    1245                 :             :                         INJECTION_POINT("nbtree-leave-leaf-split-incomplete", NULL);
    1246                 :             :                 else
    1247                 :             :                         INJECTION_POINT("nbtree-leave-internal-split-incomplete", NULL);
    1248                 :             : #endif
    1249                 :             : 
    1250                 :        2309 :                 _bt_insert_parent(rel, heaprel, buf, rbuf, stack, isroot, isonly);
    1251                 :        2309 :         }
    1252                 :             :         else
    1253                 :             :         {
    1254                 :      790456 :                 Buffer          metabuf = InvalidBuffer;
    1255                 :      790456 :                 Page            metapg = NULL;
    1256                 :      790456 :                 BTMetaPageData *metad = NULL;
    1257                 :      790456 :                 BlockNumber blockcache;
    1258                 :             : 
    1259                 :             :                 /*
    1260                 :             :                  * If we are doing this insert because we split a page that was the
    1261                 :             :                  * only one on its tree level, but was not the root, it may have been
    1262                 :             :                  * the "fast root".  We need to ensure that the fast root link points
    1263                 :             :                  * at or above the current page.  We can safely acquire a lock on the
    1264                 :             :                  * metapage here --- see comments for _bt_newlevel().
    1265                 :             :                  */
    1266         [ +  + ]:      790456 :                 if (unlikely(split_only_page))
    1267                 :             :                 {
    1268         [ +  - ]:           3 :                         Assert(!isleaf);
    1269         [ +  - ]:           3 :                         Assert(BufferIsValid(cbuf));
    1270                 :             : 
    1271                 :           3 :                         metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_WRITE);
    1272                 :           3 :                         metapg = BufferGetPage(metabuf);
    1273                 :           3 :                         metad = BTPageGetMeta(metapg);
    1274                 :             : 
    1275         [ +  - ]:           3 :                         if (metad->btm_fastlevel >= opaque->btpo_level)
    1276                 :             :                         {
    1277                 :             :                                 /* no update wanted */
    1278                 :           0 :                                 _bt_relbuf(rel, metabuf);
    1279                 :           0 :                                 metabuf = InvalidBuffer;
    1280                 :           0 :                         }
    1281                 :           3 :                 }
    1282                 :             : 
    1283                 :             :                 /* Do the update.  No ereport(ERROR) until changes are logged */
    1284                 :      790456 :                 START_CRIT_SECTION();
    1285                 :             : 
    1286         [ +  + ]:      790456 :                 if (postingoff != 0)
    1287                 :        2756 :                         memcpy(oposting, nposting, MAXALIGN(IndexTupleSize(nposting)));
    1288                 :             : 
    1289         [ +  - ]:      790456 :                 if (PageAddItem(page, itup, itemsz, newitemoff, false, false) == InvalidOffsetNumber)
    1290   [ #  #  #  # ]:           0 :                         elog(PANIC, "failed to add new item to block %u in index \"%s\"",
    1291                 :             :                                  BufferGetBlockNumber(buf), RelationGetRelationName(rel));
    1292                 :             : 
    1293                 :      790456 :                 MarkBufferDirty(buf);
    1294                 :             : 
    1295         [ +  + ]:      790456 :                 if (BufferIsValid(metabuf))
    1296                 :             :                 {
    1297                 :             :                         /* upgrade meta-page if needed */
    1298         [ +  - ]:           3 :                         if (metad->btm_version < BTREE_NOVAC_VERSION)
    1299                 :           0 :                                 _bt_upgrademetapage(metapg);
    1300                 :           3 :                         metad->btm_fastroot = BufferGetBlockNumber(buf);
    1301                 :           3 :                         metad->btm_fastlevel = opaque->btpo_level;
    1302                 :           3 :                         MarkBufferDirty(metabuf);
    1303                 :           3 :                 }
    1304                 :             : 
    1305                 :             :                 /*
    1306                 :             :                  * Clear INCOMPLETE_SPLIT flag on child if inserting the new item
    1307                 :             :                  * finishes a split
    1308                 :             :                  */
    1309         [ +  + ]:      790456 :                 if (!isleaf)
    1310                 :             :                 {
    1311                 :        2187 :                         Page            cpage = BufferGetPage(cbuf);
    1312                 :        2187 :                         BTPageOpaque cpageop = BTPageGetOpaque(cpage);
    1313                 :             : 
    1314         [ +  - ]:        2187 :                         Assert(P_INCOMPLETE_SPLIT(cpageop));
    1315                 :        2187 :                         cpageop->btpo_flags &= ~BTP_INCOMPLETE_SPLIT;
    1316                 :        2187 :                         MarkBufferDirty(cbuf);
    1317                 :        2187 :                 }
    1318                 :             : 
    1319                 :             :                 /* XLOG stuff */
    1320   [ +  +  +  +  :      790456 :                 if (RelationNeedsWAL(rel))
             +  +  +  + ]
    1321                 :             :                 {
    1322                 :      764415 :                         xl_btree_insert xlrec;
    1323                 :      764415 :                         xl_btree_metadata xlmeta;
    1324                 :      764415 :                         uint8           xlinfo;
    1325                 :      764415 :                         XLogRecPtr      recptr;
    1326                 :      764415 :                         uint16          upostingoff;
    1327                 :             : 
    1328                 :      764415 :                         xlrec.offnum = newitemoff;
    1329                 :             : 
    1330                 :      764415 :                         XLogBeginInsert();
    1331                 :      764415 :                         XLogRegisterData(&xlrec, SizeOfBtreeInsert);
    1332                 :             : 
    1333   [ +  +  +  + ]:      764415 :                         if (isleaf && postingoff == 0)
    1334                 :             :                         {
    1335                 :             :                                 /* Simple leaf insert */
    1336                 :      759504 :                                 xlinfo = XLOG_BTREE_INSERT_LEAF;
    1337                 :      759504 :                         }
    1338         [ +  + ]:        4911 :                         else if (postingoff != 0)
    1339                 :             :                         {
    1340                 :             :                                 /*
    1341                 :             :                                  * Leaf insert with posting list split.  Must include
    1342                 :             :                                  * postingoff field before newitem/orignewitem.
    1343                 :             :                                  */
    1344         [ +  - ]:        2756 :                                 Assert(isleaf);
    1345                 :        2756 :                                 xlinfo = XLOG_BTREE_INSERT_POST;
    1346                 :        2756 :                         }
    1347                 :             :                         else
    1348                 :             :                         {
    1349                 :             :                                 /* Internal page insert, which finishes a split on cbuf */
    1350                 :        2155 :                                 xlinfo = XLOG_BTREE_INSERT_UPPER;
    1351                 :        2155 :                                 XLogRegisterBuffer(1, cbuf, REGBUF_STANDARD);
    1352                 :             : 
    1353         [ +  + ]:        2155 :                                 if (BufferIsValid(metabuf))
    1354                 :             :                                 {
    1355                 :             :                                         /* Actually, it's an internal page insert + meta update */
    1356                 :           3 :                                         xlinfo = XLOG_BTREE_INSERT_META;
    1357                 :             : 
    1358         [ +  - ]:           3 :                                         Assert(metad->btm_version >= BTREE_NOVAC_VERSION);
    1359                 :           3 :                                         xlmeta.version = metad->btm_version;
    1360                 :           3 :                                         xlmeta.root = metad->btm_root;
    1361                 :           3 :                                         xlmeta.level = metad->btm_level;
    1362                 :           3 :                                         xlmeta.fastroot = metad->btm_fastroot;
    1363                 :           3 :                                         xlmeta.fastlevel = metad->btm_fastlevel;
    1364                 :           3 :                                         xlmeta.last_cleanup_num_delpages = metad->btm_last_cleanup_num_delpages;
    1365                 :           3 :                                         xlmeta.allequalimage = metad->btm_allequalimage;
    1366                 :             : 
    1367                 :           3 :                                         XLogRegisterBuffer(2, metabuf,
    1368                 :             :                                                                            REGBUF_WILL_INIT | REGBUF_STANDARD);
    1369                 :           3 :                                         XLogRegisterBufData(2, &xlmeta,
    1370                 :             :                                                                                 sizeof(xl_btree_metadata));
    1371                 :           3 :                                 }
    1372                 :             :                         }
    1373                 :             : 
    1374                 :      764415 :                         XLogRegisterBuffer(0, buf, REGBUF_STANDARD);
    1375         [ +  + ]:      764415 :                         if (postingoff == 0)
    1376                 :             :                         {
    1377                 :             :                                 /* Just log itup from caller */
    1378                 :      761659 :                                 XLogRegisterBufData(0, itup, IndexTupleSize(itup));
    1379                 :      761659 :                         }
    1380                 :             :                         else
    1381                 :             :                         {
    1382                 :             :                                 /*
    1383                 :             :                                  * Insert with posting list split (XLOG_BTREE_INSERT_POST
    1384                 :             :                                  * record) case.
    1385                 :             :                                  *
    1386                 :             :                                  * Log postingoff.  Also log origitup, not itup.  REDO routine
    1387                 :             :                                  * must reconstruct final itup (as well as nposting) using
    1388                 :             :                                  * _bt_swap_posting().
    1389                 :             :                                  */
    1390                 :        2756 :                                 upostingoff = postingoff;
    1391                 :             : 
    1392                 :        2756 :                                 XLogRegisterBufData(0, &upostingoff, sizeof(uint16));
    1393                 :        5512 :                                 XLogRegisterBufData(0, origitup,
    1394                 :        2756 :                                                                         IndexTupleSize(origitup));
    1395                 :             :                         }
    1396                 :             : 
    1397                 :      764415 :                         recptr = XLogInsert(RM_BTREE_ID, xlinfo);
    1398                 :             : 
    1399         [ +  + ]:      764415 :                         if (BufferIsValid(metabuf))
    1400                 :           3 :                                 PageSetLSN(metapg, recptr);
    1401         [ +  + ]:      764415 :                         if (!isleaf)
    1402                 :        2155 :                                 PageSetLSN(BufferGetPage(cbuf), recptr);
    1403                 :             : 
    1404                 :      764415 :                         PageSetLSN(page, recptr);
    1405                 :      764415 :                 }
    1406                 :             : 
    1407         [ +  - ]:      790456 :                 END_CRIT_SECTION();
    1408                 :             : 
    1409                 :             :                 /* Release subsidiary buffers */
    1410         [ +  + ]:      790456 :                 if (BufferIsValid(metabuf))
    1411                 :           3 :                         _bt_relbuf(rel, metabuf);
    1412         [ +  + ]:      790456 :                 if (!isleaf)
    1413                 :        2187 :                         _bt_relbuf(rel, cbuf);
    1414                 :             : 
    1415                 :             :                 /*
    1416                 :             :                  * Cache the block number if this is the rightmost leaf page.  Cache
    1417                 :             :                  * may be used by a future inserter within _bt_search_insert().
    1418                 :             :                  */
    1419                 :      790456 :                 blockcache = InvalidBlockNumber;
    1420   [ +  +  +  +  :      790456 :                 if (isrightmost && isleaf && !isroot)
                   +  + ]
    1421                 :      575750 :                         blockcache = BufferGetBlockNumber(buf);
    1422                 :             : 
    1423                 :             :                 /* Release buffer for insertion target block */
    1424                 :      790456 :                 _bt_relbuf(rel, buf);
    1425                 :             : 
    1426                 :             :                 /*
    1427                 :             :                  * If we decided to cache the insertion target block before releasing
    1428                 :             :                  * its buffer lock, then cache it now.  Check the height of the tree
    1429                 :             :                  * first, though.  We don't go for the optimization with small
    1430                 :             :                  * indexes.  Defer final check to this point to ensure that we don't
    1431                 :             :                  * call _bt_getrootheight while holding a buffer lock.
    1432                 :             :                  */
    1433   [ +  +  +  + ]:      790456 :                 if (BlockNumberIsValid(blockcache) &&
    1434                 :      575750 :                         _bt_getrootheight(rel) >= BTREE_FASTPATH_MIN_LEVEL)
    1435                 :          39 :                         RelationSetTargetBlock(rel, blockcache);
    1436                 :      790456 :         }
    1437                 :             : 
    1438                 :             :         /* be tidy */
    1439         [ +  + ]:      792765 :         if (postingoff != 0)
    1440                 :             :         {
    1441                 :             :                 /* itup is actually a modified copy of caller's original */
    1442                 :        2766 :                 pfree(nposting);
    1443                 :        2766 :                 pfree(itup);
    1444                 :        2766 :         }
    1445                 :      792765 : }
    1446                 :             : 
    1447                 :             : /*
    1448                 :             :  *      _bt_split() -- split a page in the btree.
    1449                 :             :  *
    1450                 :             :  *              On entry, buf is the page to split, and is pinned and write-locked.
    1451                 :             :  *              newitemoff etc. tell us about the new item that must be inserted
    1452                 :             :  *              along with the data from the original page.
    1453                 :             :  *
    1454                 :             :  *              itup_key is used for suffix truncation on leaf pages (internal
    1455                 :             :  *              page callers pass NULL).  When splitting a non-leaf page, 'cbuf'
    1456                 :             :  *              is the left-sibling of the page we're inserting the downlink for.
    1457                 :             :  *              This function will clear the INCOMPLETE_SPLIT flag on it, and
    1458                 :             :  *              release the buffer.
    1459                 :             :  *
    1460                 :             :  *              orignewitem, nposting, and postingoff are needed when an insert of
    1461                 :             :  *              orignewitem results in both a posting list split and a page split.
    1462                 :             :  *              These extra posting list split details are used here in the same
    1463                 :             :  *              way as they are used in the more common case where a posting list
    1464                 :             :  *              split does not coincide with a page split.  We need to deal with
    1465                 :             :  *              posting list splits directly in order to ensure that everything
    1466                 :             :  *              that follows from the insert of orignewitem is handled as a single
    1467                 :             :  *              atomic operation (though caller's insert of a new pivot/downlink
    1468                 :             :  *              into parent page will still be a separate operation).  See
    1469                 :             :  *              nbtree/README for details on the design of posting list splits.
    1470                 :             :  *
    1471                 :             :  *              Returns the new right sibling of buf, pinned and write-locked.
    1472                 :             :  *              The pin and lock on buf are maintained.
    1473                 :             :  */
    1474                 :             : static Buffer
    1475                 :        2309 : _bt_split(Relation rel, Relation heaprel, BTScanInsert itup_key, Buffer buf,
    1476                 :             :                   Buffer cbuf, OffsetNumber newitemoff, Size newitemsz, IndexTuple newitem,
    1477                 :             :                   IndexTuple orignewitem, IndexTuple nposting, uint16 postingoff)
    1478                 :             : {
    1479                 :        2309 :         Buffer          rbuf;
    1480                 :        2309 :         Page            origpage;
    1481                 :        2309 :         Page            leftpage,
    1482                 :             :                                 rightpage;
    1483                 :        2309 :         PGAlignedBlock leftpage_buf,
    1484                 :             :                                 rightpage_buf;
    1485                 :        2309 :         BlockNumber origpagenumber,
    1486                 :             :                                 rightpagenumber;
    1487                 :        2309 :         BTPageOpaque ropaque,
    1488                 :             :                                 lopaque,
    1489                 :             :                                 oopaque;
    1490                 :        2309 :         Buffer          sbuf = InvalidBuffer;
    1491                 :        2309 :         Page            spage = NULL;
    1492                 :        2309 :         BTPageOpaque sopaque = NULL;
    1493                 :        2309 :         Size            itemsz;
    1494                 :        2309 :         ItemId          itemid;
    1495                 :        2309 :         IndexTuple      firstright,
    1496                 :             :                                 lefthighkey;
    1497                 :        2309 :         OffsetNumber firstrightoff;
    1498                 :        2309 :         OffsetNumber afterleftoff,
    1499                 :             :                                 afterrightoff,
    1500                 :             :                                 minusinfoff;
    1501                 :        2309 :         OffsetNumber origpagepostingoff;
    1502                 :        2309 :         OffsetNumber maxoff;
    1503                 :        2309 :         OffsetNumber i;
    1504                 :        2309 :         bool            newitemonleft,
    1505                 :             :                                 isleaf,
    1506                 :             :                                 isrightmost;
    1507                 :             : 
    1508                 :             :         /*
    1509                 :             :          * origpage is the original page to be split.  leftpage is a temporary
    1510                 :             :          * buffer that receives the left-sibling data, which will be copied back
    1511                 :             :          * into origpage on success.  rightpage is the new page that will receive
    1512                 :             :          * the right-sibling data.
    1513                 :             :          *
    1514                 :             :          * leftpage is allocated after choosing a split point.  rightpage's new
    1515                 :             :          * buffer isn't acquired until after leftpage is initialized and has new
    1516                 :             :          * high key, the last point where splitting the page may fail (barring
    1517                 :             :          * corruption).  Failing before acquiring new buffer won't have lasting
    1518                 :             :          * consequences, since origpage won't have been modified and leftpage is
    1519                 :             :          * only workspace.
    1520                 :             :          */
    1521                 :        2309 :         origpage = BufferGetPage(buf);
    1522                 :        2309 :         oopaque = BTPageGetOpaque(origpage);
    1523                 :        2309 :         isleaf = P_ISLEAF(oopaque);
    1524                 :        2309 :         isrightmost = P_RIGHTMOST(oopaque);
    1525                 :        2309 :         maxoff = PageGetMaxOffsetNumber(origpage);
    1526                 :        2309 :         origpagenumber = BufferGetBlockNumber(buf);
    1527                 :             : 
    1528                 :             :         /*
    1529                 :             :          * Choose a point to split origpage at.
    1530                 :             :          *
    1531                 :             :          * A split point can be thought of as a point _between_ two existing data
    1532                 :             :          * items on origpage (the lastleft and firstright tuples), provided you
    1533                 :             :          * pretend that the new item that didn't fit is already on origpage.
    1534                 :             :          *
    1535                 :             :          * Since origpage does not actually contain newitem, the representation of
    1536                 :             :          * split points needs to work with two boundary cases: splits where
    1537                 :             :          * newitem is lastleft, and splits where newitem is firstright.
    1538                 :             :          * newitemonleft resolves the ambiguity that would otherwise exist when
    1539                 :             :          * newitemoff == firstrightoff.  In all other cases it's clear which side
    1540                 :             :          * of the split every tuple goes on from context.  newitemonleft is
    1541                 :             :          * usually (but not always) redundant information.
    1542                 :             :          *
    1543                 :             :          * firstrightoff is supposed to be an origpage offset number, but it's
    1544                 :             :          * possible that its value will be maxoff+1, which is "past the end" of
    1545                 :             :          * origpage.  This happens in the rare case where newitem goes after all
    1546                 :             :          * existing items (i.e. newitemoff is maxoff+1) and we end up splitting
    1547                 :             :          * origpage at the point that leaves newitem alone on new right page.  Any
    1548                 :             :          * "!newitemonleft && newitemoff == firstrightoff" split point makes
    1549                 :             :          * newitem the firstright tuple, though, so this case isn't a special
    1550                 :             :          * case.
    1551                 :             :          */
    1552                 :        4618 :         firstrightoff = _bt_findsplitloc(rel, origpage, newitemoff, newitemsz,
    1553                 :        2309 :                                                                          newitem, &newitemonleft);
    1554                 :             : 
    1555                 :             :         /* Use temporary buffer for leftpage */
    1556                 :        2309 :         leftpage = leftpage_buf.data;
    1557                 :        2309 :         _bt_pageinit(leftpage, BufferGetPageSize(buf));
    1558                 :        2309 :         lopaque = BTPageGetOpaque(leftpage);
    1559                 :             : 
    1560                 :             :         /*
    1561                 :             :          * leftpage won't be the root when we're done.  Also, clear the SPLIT_END
    1562                 :             :          * and HAS_GARBAGE flags.
    1563                 :             :          */
    1564                 :        2309 :         lopaque->btpo_flags = oopaque->btpo_flags;
    1565                 :        2309 :         lopaque->btpo_flags &= ~(BTP_ROOT | BTP_SPLIT_END | BTP_HAS_GARBAGE);
    1566                 :             :         /* set flag in leftpage indicating that rightpage has no downlink yet */
    1567                 :        2309 :         lopaque->btpo_flags |= BTP_INCOMPLETE_SPLIT;
    1568                 :        2309 :         lopaque->btpo_prev = oopaque->btpo_prev;
    1569                 :             :         /* handle btpo_next after rightpage buffer acquired */
    1570                 :        2309 :         lopaque->btpo_level = oopaque->btpo_level;
    1571                 :             :         /* handle btpo_cycleid after rightpage buffer acquired */
    1572                 :             : 
    1573                 :             :         /*
    1574                 :             :          * Copy the original page's LSN into leftpage, which will become the
    1575                 :             :          * updated version of the page.  We need this because XLogInsert will
    1576                 :             :          * examine the LSN and possibly dump it in a page image.
    1577                 :             :          */
    1578                 :        2309 :         PageSetLSN(leftpage, PageGetLSN(origpage));
    1579                 :             : 
    1580                 :             :         /*
    1581                 :             :          * Determine page offset number of existing overlapped-with-orignewitem
    1582                 :             :          * posting list when it is necessary to perform a posting list split in
    1583                 :             :          * passing.  Note that newitem was already changed by caller (newitem no
    1584                 :             :          * longer has the orignewitem TID).
    1585                 :             :          *
    1586                 :             :          * This page offset number (origpagepostingoff) will be used to pretend
    1587                 :             :          * that the posting split has already taken place, even though the
    1588                 :             :          * required modifications to origpage won't occur until we reach the
    1589                 :             :          * critical section.  The lastleft and firstright tuples of our page split
    1590                 :             :          * point should, in effect, come from an imaginary version of origpage
    1591                 :             :          * that has the nposting tuple instead of the original posting list tuple.
    1592                 :             :          *
    1593                 :             :          * Note: _bt_findsplitloc() should have compensated for coinciding posting
    1594                 :             :          * list splits in just the same way, at least in theory.  It doesn't
    1595                 :             :          * bother with that, though.  In practice it won't affect its choice of
    1596                 :             :          * split point.
    1597                 :             :          */
    1598                 :        2309 :         origpagepostingoff = InvalidOffsetNumber;
    1599         [ +  + ]:        2309 :         if (postingoff != 0)
    1600                 :             :         {
    1601         [ +  - ]:          10 :                 Assert(isleaf);
    1602         [ +  - ]:          10 :                 Assert(ItemPointerCompare(&orignewitem->t_tid,
    1603                 :             :                                                                   &newitem->t_tid) < 0);
    1604         [ +  - ]:          10 :                 Assert(BTreeTupleIsPosting(nposting));
    1605                 :          10 :                 origpagepostingoff = OffsetNumberPrev(newitemoff);
    1606                 :          10 :         }
    1607                 :             : 
    1608                 :             :         /*
    1609                 :             :          * The high key for the new left page is a possibly-truncated copy of
    1610                 :             :          * firstright on the leaf level (it's "firstright itself" on internal
    1611                 :             :          * pages; see !isleaf comments below).  This may seem to be contrary to
    1612                 :             :          * Lehman & Yao's approach of using a copy of lastleft as the new high key
    1613                 :             :          * when splitting on the leaf level.  It isn't, though.
    1614                 :             :          *
    1615                 :             :          * Suffix truncation will leave the left page's high key fully equal to
    1616                 :             :          * lastleft when lastleft and firstright are equal prior to heap TID (that
    1617                 :             :          * is, the tiebreaker TID value comes from lastleft).  It isn't actually
    1618                 :             :          * necessary for a new leaf high key to be a copy of lastleft for the L&Y
    1619                 :             :          * "subtree" invariant to hold.  It's sufficient to make sure that the new
    1620                 :             :          * leaf high key is strictly less than firstright, and greater than or
    1621                 :             :          * equal to (not necessarily equal to) lastleft.  In other words, when
    1622                 :             :          * suffix truncation isn't possible during a leaf page split, we take
    1623                 :             :          * L&Y's exact approach to generating a new high key for the left page.
    1624                 :             :          * (Actually, that is slightly inaccurate.  We don't just use a copy of
    1625                 :             :          * lastleft.  A tuple with all the keys from firstright but the max heap
    1626                 :             :          * TID from lastleft is used, to avoid introducing a special case.)
    1627                 :             :          */
    1628   [ +  +  +  + ]:        2309 :         if (!newitemonleft && newitemoff == firstrightoff)
    1629                 :             :         {
    1630                 :             :                 /* incoming tuple becomes firstright */
    1631                 :           8 :                 itemsz = newitemsz;
    1632                 :           8 :                 firstright = newitem;
    1633                 :           8 :         }
    1634                 :             :         else
    1635                 :             :         {
    1636                 :             :                 /* existing item at firstrightoff becomes firstright */
    1637                 :        2301 :                 itemid = PageGetItemId(origpage, firstrightoff);
    1638                 :        2301 :                 itemsz = ItemIdGetLength(itemid);
    1639                 :        2301 :                 firstright = (IndexTuple) PageGetItem(origpage, itemid);
    1640         [ +  - ]:        2301 :                 if (firstrightoff == origpagepostingoff)
    1641                 :           0 :                         firstright = nposting;
    1642                 :             :         }
    1643                 :             : 
    1644         [ +  + ]:        2309 :         if (isleaf)
    1645                 :             :         {
    1646                 :        2258 :                 IndexTuple      lastleft;
    1647                 :             : 
    1648                 :             :                 /* Attempt suffix truncation for leaf page splits */
    1649   [ +  +  +  + ]:        2258 :                 if (newitemonleft && newitemoff == firstrightoff)
    1650                 :             :                 {
    1651                 :             :                         /* incoming tuple becomes lastleft */
    1652                 :          24 :                         lastleft = newitem;
    1653                 :          24 :                 }
    1654                 :             :                 else
    1655                 :             :                 {
    1656                 :        2234 :                         OffsetNumber lastleftoff;
    1657                 :             : 
    1658                 :             :                         /* existing item before firstrightoff becomes lastleft */
    1659                 :        2234 :                         lastleftoff = OffsetNumberPrev(firstrightoff);
    1660         [ +  - ]:        2234 :                         Assert(lastleftoff >= P_FIRSTDATAKEY(oopaque));
    1661                 :        2234 :                         itemid = PageGetItemId(origpage, lastleftoff);
    1662                 :        2234 :                         lastleft = (IndexTuple) PageGetItem(origpage, itemid);
    1663         [ +  + ]:        2234 :                         if (lastleftoff == origpagepostingoff)
    1664                 :           1 :                                 lastleft = nposting;
    1665                 :        2234 :                 }
    1666                 :             : 
    1667                 :        2258 :                 lefthighkey = _bt_truncate(rel, lastleft, firstright, itup_key);
    1668                 :        2258 :                 itemsz = IndexTupleSize(lefthighkey);
    1669                 :        2258 :         }
    1670                 :             :         else
    1671                 :             :         {
    1672                 :             :                 /*
    1673                 :             :                  * Don't perform suffix truncation on a copy of firstright to make
    1674                 :             :                  * left page high key for internal page splits.  Must use firstright
    1675                 :             :                  * as new high key directly.
    1676                 :             :                  *
    1677                 :             :                  * Each distinct separator key value originates as a leaf level high
    1678                 :             :                  * key; all other separator keys/pivot tuples are copied from one
    1679                 :             :                  * level down.  A separator key in a grandparent page must be
    1680                 :             :                  * identical to high key in rightmost parent page of the subtree to
    1681                 :             :                  * its left, which must itself be identical to high key in rightmost
    1682                 :             :                  * child page of that same subtree (this even applies to separator
    1683                 :             :                  * from grandparent's high key).  There must always be an unbroken
    1684                 :             :                  * "seam" of identical separator keys that guide index scans at every
    1685                 :             :                  * level, starting from the grandparent.  That's why suffix truncation
    1686                 :             :                  * is unsafe here.
    1687                 :             :                  *
    1688                 :             :                  * Internal page splits will truncate firstright into a "negative
    1689                 :             :                  * infinity" data item when it gets inserted on the new right page
    1690                 :             :                  * below, though.  This happens during the call to _bt_pgaddtup() for
    1691                 :             :                  * the new first data item for right page.  Do not confuse this
    1692                 :             :                  * mechanism with suffix truncation.  It is just a convenient way of
    1693                 :             :                  * implementing page splits that split the internal page "inside"
    1694                 :             :                  * firstright.  The lefthighkey separator key cannot appear a second
    1695                 :             :                  * time in the right page (only firstright's downlink goes in right
    1696                 :             :                  * page).
    1697                 :             :                  */
    1698                 :          51 :                 lefthighkey = firstright;
    1699                 :             :         }
    1700                 :             : 
    1701                 :             :         /*
    1702                 :             :          * Add new high key to leftpage
    1703                 :             :          */
    1704                 :        2309 :         afterleftoff = P_HIKEY;
    1705                 :             : 
    1706   [ +  -  +  - ]:        2309 :         Assert(BTreeTupleGetNAtts(lefthighkey, rel) > 0);
    1707   [ +  -  +  - ]:        2309 :         Assert(BTreeTupleGetNAtts(lefthighkey, rel) <=
    1708                 :             :                    IndexRelationGetNumberOfKeyAttributes(rel));
    1709         [ +  - ]:        2309 :         Assert(itemsz == MAXALIGN(IndexTupleSize(lefthighkey)));
    1710         [ +  - ]:        2309 :         if (PageAddItem(leftpage, lefthighkey, itemsz, afterleftoff, false, false) == InvalidOffsetNumber)
    1711   [ #  #  #  # ]:           0 :                 elog(ERROR, "failed to add high key to the left sibling"
    1712                 :             :                          " while splitting block %u of index \"%s\"",
    1713                 :             :                          origpagenumber, RelationGetRelationName(rel));
    1714                 :        2309 :         afterleftoff = OffsetNumberNext(afterleftoff);
    1715                 :             : 
    1716                 :             :         /*
    1717                 :             :          * Acquire a new right page to split into, now that left page has a new
    1718                 :             :          * high key.
    1719                 :             :          *
    1720                 :             :          * To not confuse future VACUUM operations, we zero the right page and
    1721                 :             :          * work on an in-memory copy of it before writing WAL, then copy its
    1722                 :             :          * contents back to the actual page once we start the critical section
    1723                 :             :          * work.  This simplifies the split work, so as there is no need to zero
    1724                 :             :          * the right page before throwing an error.
    1725                 :             :          */
    1726                 :        2309 :         rbuf = _bt_allocbuf(rel, heaprel);
    1727                 :        2309 :         rightpage = rightpage_buf.data;
    1728                 :             : 
    1729                 :             :         /*
    1730                 :             :          * Copy the contents of the right page into its temporary location, and
    1731                 :             :          * zero the original space.
    1732                 :             :          */
    1733                 :        2309 :         memcpy(rightpage, BufferGetPage(rbuf), BLCKSZ);
    1734                 :        2309 :         memset(BufferGetPage(rbuf), 0, BLCKSZ);
    1735                 :        2309 :         rightpagenumber = BufferGetBlockNumber(rbuf);
    1736                 :             :         /* rightpage was initialized by _bt_allocbuf */
    1737                 :        2309 :         ropaque = BTPageGetOpaque(rightpage);
    1738                 :             : 
    1739                 :             :         /*
    1740                 :             :          * Finish off remaining leftpage special area fields.  They cannot be set
    1741                 :             :          * before both origpage (leftpage) and rightpage buffers are acquired and
    1742                 :             :          * locked.
    1743                 :             :          *
    1744                 :             :          * btpo_cycleid is only used with leaf pages, though we set it here in all
    1745                 :             :          * cases just to be consistent.
    1746                 :             :          */
    1747                 :        2309 :         lopaque->btpo_next = rightpagenumber;
    1748                 :        2309 :         lopaque->btpo_cycleid = _bt_vacuum_cycleid(rel);
    1749                 :             : 
    1750                 :             :         /*
    1751                 :             :          * rightpage won't be the root when we're done.  Also, clear the SPLIT_END
    1752                 :             :          * and HAS_GARBAGE flags.
    1753                 :             :          */
    1754                 :        2309 :         ropaque->btpo_flags = oopaque->btpo_flags;
    1755                 :        2309 :         ropaque->btpo_flags &= ~(BTP_ROOT | BTP_SPLIT_END | BTP_HAS_GARBAGE);
    1756                 :        2309 :         ropaque->btpo_prev = origpagenumber;
    1757                 :        2309 :         ropaque->btpo_next = oopaque->btpo_next;
    1758                 :        2309 :         ropaque->btpo_level = oopaque->btpo_level;
    1759                 :        2309 :         ropaque->btpo_cycleid = lopaque->btpo_cycleid;
    1760                 :             : 
    1761                 :             :         /*
    1762                 :             :          * Add new high key to rightpage where necessary.
    1763                 :             :          *
    1764                 :             :          * If the page we're splitting is not the rightmost page at its level in
    1765                 :             :          * the tree, then the first entry on the page is the high key from
    1766                 :             :          * origpage.
    1767                 :             :          */
    1768                 :        2309 :         afterrightoff = P_HIKEY;
    1769                 :             : 
    1770         [ +  + ]:        2309 :         if (!isrightmost)
    1771                 :             :         {
    1772                 :         613 :                 IndexTuple      righthighkey;
    1773                 :             : 
    1774                 :         613 :                 itemid = PageGetItemId(origpage, P_HIKEY);
    1775                 :         613 :                 itemsz = ItemIdGetLength(itemid);
    1776                 :         613 :                 righthighkey = (IndexTuple) PageGetItem(origpage, itemid);
    1777   [ +  -  +  - ]:         613 :                 Assert(BTreeTupleGetNAtts(righthighkey, rel) > 0);
    1778   [ +  -  +  - ]:         613 :                 Assert(BTreeTupleGetNAtts(righthighkey, rel) <=
    1779                 :             :                            IndexRelationGetNumberOfKeyAttributes(rel));
    1780         [ +  - ]:         613 :                 if (PageAddItem(rightpage, righthighkey, itemsz, afterrightoff, false, false) == InvalidOffsetNumber)
    1781                 :             :                 {
    1782   [ #  #  #  # ]:           0 :                         elog(ERROR, "failed to add high key to the right sibling"
    1783                 :             :                                  " while splitting block %u of index \"%s\"",
    1784                 :             :                                  origpagenumber, RelationGetRelationName(rel));
    1785                 :           0 :                 }
    1786                 :         613 :                 afterrightoff = OffsetNumberNext(afterrightoff);
    1787                 :         613 :         }
    1788                 :             : 
    1789                 :             :         /*
    1790                 :             :          * Internal page splits truncate first data item on right page -- it
    1791                 :             :          * becomes "minus infinity" item for the page.  Set this up here.
    1792                 :             :          */
    1793                 :        2309 :         minusinfoff = InvalidOffsetNumber;
    1794         [ +  + ]:        2309 :         if (!isleaf)
    1795                 :          51 :                 minusinfoff = afterrightoff;
    1796                 :             : 
    1797                 :             :         /*
    1798                 :             :          * Now transfer all the data items (non-pivot tuples in isleaf case, or
    1799                 :             :          * additional pivot tuples in !isleaf case) to the appropriate page.
    1800                 :             :          *
    1801                 :             :          * Note: we *must* insert at least the right page's items in item-number
    1802                 :             :          * order, for the benefit of _bt_restore_page().
    1803                 :             :          */
    1804         [ +  + ]:      746411 :         for (i = P_FIRSTDATAKEY(oopaque); i <= maxoff; i = OffsetNumberNext(i))
    1805                 :             :         {
    1806                 :      744102 :                 IndexTuple      dataitem;
    1807                 :             : 
    1808                 :      744102 :                 itemid = PageGetItemId(origpage, i);
    1809                 :      744102 :                 itemsz = ItemIdGetLength(itemid);
    1810                 :      744102 :                 dataitem = (IndexTuple) PageGetItem(origpage, itemid);
    1811                 :             : 
    1812                 :             :                 /* replace original item with nposting due to posting split? */
    1813         [ +  + ]:      744102 :                 if (i == origpagepostingoff)
    1814                 :             :                 {
    1815         [ +  - ]:          10 :                         Assert(BTreeTupleIsPosting(dataitem));
    1816         [ +  - ]:          10 :                         Assert(itemsz == MAXALIGN(IndexTupleSize(nposting)));
    1817                 :          10 :                         dataitem = nposting;
    1818                 :          10 :                 }
    1819                 :             : 
    1820                 :             :                 /* does new item belong before this one? */
    1821         [ +  + ]:      744092 :                 else if (i == newitemoff)
    1822                 :             :                 {
    1823         [ +  + ]:         834 :                         if (newitemonleft)
    1824                 :             :                         {
    1825         [ +  - ]:         124 :                                 Assert(newitemoff <= firstrightoff);
    1826         [ +  - ]:         124 :                                 if (!_bt_pgaddtup(leftpage, newitemsz, newitem, afterleftoff,
    1827                 :             :                                                                   false))
    1828                 :             :                                 {
    1829   [ #  #  #  # ]:           0 :                                         elog(ERROR, "failed to add new item to the left sibling"
    1830                 :             :                                                  " while splitting block %u of index \"%s\"",
    1831                 :             :                                                  origpagenumber, RelationGetRelationName(rel));
    1832                 :           0 :                                 }
    1833                 :         124 :                                 afterleftoff = OffsetNumberNext(afterleftoff);
    1834                 :         124 :                         }
    1835                 :             :                         else
    1836                 :             :                         {
    1837         [ +  - ]:         710 :                                 Assert(newitemoff >= firstrightoff);
    1838   [ +  -  +  - ]:        1420 :                                 if (!_bt_pgaddtup(rightpage, newitemsz, newitem, afterrightoff,
    1839                 :         710 :                                                                   afterrightoff == minusinfoff))
    1840                 :             :                                 {
    1841   [ #  #  #  # ]:           0 :                                         elog(ERROR, "failed to add new item to the right sibling"
    1842                 :             :                                                  " while splitting block %u of index \"%s\"",
    1843                 :             :                                                  origpagenumber, RelationGetRelationName(rel));
    1844                 :           0 :                                 }
    1845                 :         710 :                                 afterrightoff = OffsetNumberNext(afterrightoff);
    1846                 :             :                         }
    1847                 :         834 :                 }
    1848                 :             : 
    1849                 :             :                 /* decide which page to put it on */
    1850         [ +  + ]:      744102 :                 if (i < firstrightoff)
    1851                 :             :                 {
    1852         [ +  - ]:      616809 :                         if (!_bt_pgaddtup(leftpage, itemsz, dataitem, afterleftoff, false))
    1853                 :             :                         {
    1854   [ #  #  #  # ]:           0 :                                 elog(ERROR, "failed to add old item to the left sibling"
    1855                 :             :                                          " while splitting block %u of index \"%s\"",
    1856                 :             :                                          origpagenumber, RelationGetRelationName(rel));
    1857                 :           0 :                         }
    1858                 :      616809 :                         afterleftoff = OffsetNumberNext(afterleftoff);
    1859                 :      616809 :                 }
    1860                 :             :                 else
    1861                 :             :                 {
    1862   [ +  -  +  - ]:      254586 :                         if (!_bt_pgaddtup(rightpage, itemsz, dataitem, afterrightoff,
    1863                 :      127293 :                                                           afterrightoff == minusinfoff))
    1864                 :             :                         {
    1865   [ #  #  #  # ]:           0 :                                 elog(ERROR, "failed to add old item to the right sibling"
    1866                 :             :                                          " while splitting block %u of index \"%s\"",
    1867                 :             :                                          origpagenumber, RelationGetRelationName(rel));
    1868                 :           0 :                         }
    1869                 :      127293 :                         afterrightoff = OffsetNumberNext(afterrightoff);
    1870                 :             :                 }
    1871                 :      744102 :         }
    1872                 :             : 
    1873                 :             :         /* Handle case where newitem goes at the end of rightpage */
    1874         [ +  + ]:        2309 :         if (i <= newitemoff)
    1875                 :             :         {
    1876                 :             :                 /*
    1877                 :             :                  * Can't have newitemonleft here; that would imply we were told to put
    1878                 :             :                  * *everything* on the left page, which cannot fit (if it could, we'd
    1879                 :             :                  * not be splitting the page).
    1880                 :             :                  */
    1881         [ +  - ]:        1475 :                 Assert(!newitemonleft && newitemoff == maxoff + 1);
    1882   [ +  -  +  - ]:        2950 :                 if (!_bt_pgaddtup(rightpage, newitemsz, newitem, afterrightoff,
    1883                 :        1475 :                                                   afterrightoff == minusinfoff))
    1884                 :             :                 {
    1885   [ #  #  #  # ]:           0 :                         elog(ERROR, "failed to add new item to the right sibling"
    1886                 :             :                                  " while splitting block %u of index \"%s\"",
    1887                 :             :                                  origpagenumber, RelationGetRelationName(rel));
    1888                 :           0 :                 }
    1889                 :        1475 :                 afterrightoff = OffsetNumberNext(afterrightoff);
    1890                 :        1475 :         }
    1891                 :             : 
    1892                 :             :         /*
    1893                 :             :          * We have to grab the original right sibling (if any) and update its prev
    1894                 :             :          * link.  We are guaranteed that this is deadlock-free, since we couple
    1895                 :             :          * the locks in the standard order: left to right.
    1896                 :             :          */
    1897         [ +  + ]:        2309 :         if (!isrightmost)
    1898                 :             :         {
    1899                 :         613 :                 sbuf = _bt_getbuf(rel, oopaque->btpo_next, BT_WRITE);
    1900                 :         613 :                 spage = BufferGetPage(sbuf);
    1901                 :         613 :                 sopaque = BTPageGetOpaque(spage);
    1902         [ +  - ]:         613 :                 if (sopaque->btpo_prev != origpagenumber)
    1903                 :             :                 {
    1904   [ #  #  #  # ]:           0 :                         ereport(ERROR,
    1905                 :             :                                         (errcode(ERRCODE_INDEX_CORRUPTED),
    1906                 :             :                                          errmsg_internal("right sibling's left-link doesn't match: "
    1907                 :             :                                                                          "block %u links to %u instead of expected %u in index \"%s\"",
    1908                 :             :                                                                          oopaque->btpo_next, sopaque->btpo_prev, origpagenumber,
    1909                 :             :                                                                          RelationGetRelationName(rel))));
    1910                 :           0 :                 }
    1911                 :             : 
    1912                 :             :                 /*
    1913                 :             :                  * Check to see if we can set the SPLIT_END flag in the right-hand
    1914                 :             :                  * split page; this can save some I/O for vacuum since it need not
    1915                 :             :                  * proceed to the right sibling.  We can set the flag if the right
    1916                 :             :                  * sibling has a different cycleid: that means it could not be part of
    1917                 :             :                  * a group of pages that were all split off from the same ancestor
    1918                 :             :                  * page.  If you're confused, imagine that page A splits to A B and
    1919                 :             :                  * then again, yielding A C B, while vacuum is in progress.  Tuples
    1920                 :             :                  * originally in A could now be in either B or C, hence vacuum must
    1921                 :             :                  * examine both pages.  But if D, our right sibling, has a different
    1922                 :             :                  * cycleid then it could not contain any tuples that were in A when
    1923                 :             :                  * the vacuum started.
    1924                 :             :                  */
    1925         [ +  - ]:         613 :                 if (sopaque->btpo_cycleid != ropaque->btpo_cycleid)
    1926                 :           0 :                         ropaque->btpo_flags |= BTP_SPLIT_END;
    1927                 :         613 :         }
    1928                 :             : 
    1929                 :             :         /*
    1930                 :             :          * Right sibling is locked, new siblings are prepared, but original page
    1931                 :             :          * is not updated yet.
    1932                 :             :          *
    1933                 :             :          * NO EREPORT(ERROR) till right sibling is updated.  We can get away with
    1934                 :             :          * not starting the critical section till here because we haven't been
    1935                 :             :          * scribbling on the original page yet; see comments above.
    1936                 :             :          */
    1937                 :        2309 :         START_CRIT_SECTION();
    1938                 :             : 
    1939                 :             :         /*
    1940                 :             :          * By here, the original data page has been split into two new halves, and
    1941                 :             :          * these are correct.  The algorithm requires that the left page never
    1942                 :             :          * move during a split, so we copy the new left page back on top of the
    1943                 :             :          * original.  We need to do this before writing the WAL record, so that
    1944                 :             :          * XLogInsert can WAL log an image of the page if necessary.
    1945                 :             :          */
    1946                 :        2309 :         memcpy(origpage, leftpage, BLCKSZ);
    1947                 :             :         /* leftpage, lopaque must not be used below here */
    1948                 :             : 
    1949                 :             :         /*
    1950                 :             :          * Move the contents of the right page from its temporary location to the
    1951                 :             :          * destination buffer, before writing the WAL record.  Unlike the left
    1952                 :             :          * page, the right page and its opaque area are still needed to complete
    1953                 :             :          * the update of the page, so reinitialize them.
    1954                 :             :          */
    1955                 :        2309 :         rightpage = BufferGetPage(rbuf);
    1956                 :        2309 :         memcpy(rightpage, rightpage_buf.data, BLCKSZ);
    1957                 :        2309 :         ropaque = BTPageGetOpaque(rightpage);
    1958                 :             : 
    1959                 :        2309 :         MarkBufferDirty(buf);
    1960                 :        2309 :         MarkBufferDirty(rbuf);
    1961                 :             : 
    1962         [ +  + ]:        2309 :         if (!isrightmost)
    1963                 :             :         {
    1964                 :         613 :                 sopaque->btpo_prev = rightpagenumber;
    1965                 :         613 :                 MarkBufferDirty(sbuf);
    1966                 :         613 :         }
    1967                 :             : 
    1968                 :             :         /*
    1969                 :             :          * Clear INCOMPLETE_SPLIT flag on child if inserting the new item finishes
    1970                 :             :          * a split
    1971                 :             :          */
    1972         [ +  + ]:        2309 :         if (!isleaf)
    1973                 :             :         {
    1974                 :          51 :                 Page            cpage = BufferGetPage(cbuf);
    1975                 :          51 :                 BTPageOpaque cpageop = BTPageGetOpaque(cpage);
    1976                 :             : 
    1977                 :          51 :                 cpageop->btpo_flags &= ~BTP_INCOMPLETE_SPLIT;
    1978                 :          51 :                 MarkBufferDirty(cbuf);
    1979                 :          51 :         }
    1980                 :             : 
    1981                 :             :         /* XLOG stuff */
    1982   [ +  +  +  +  :        2309 :         if (RelationNeedsWAL(rel))
             +  +  -  + ]
    1983                 :             :         {
    1984                 :        2271 :                 xl_btree_split xlrec;
    1985                 :        2271 :                 uint8           xlinfo;
    1986                 :        2271 :                 XLogRecPtr      recptr;
    1987                 :             : 
    1988                 :        2271 :                 xlrec.level = ropaque->btpo_level;
    1989                 :             :                 /* See comments below on newitem, orignewitem, and posting lists */
    1990                 :        2271 :                 xlrec.firstrightoff = firstrightoff;
    1991                 :        2271 :                 xlrec.newitemoff = newitemoff;
    1992                 :        2271 :                 xlrec.postingoff = 0;
    1993   [ +  +  +  + ]:        2271 :                 if (postingoff != 0 && origpagepostingoff < firstrightoff)
    1994                 :           5 :                         xlrec.postingoff = postingoff;
    1995                 :             : 
    1996                 :        2271 :                 XLogBeginInsert();
    1997                 :        2271 :                 XLogRegisterData(&xlrec, SizeOfBtreeSplit);
    1998                 :             : 
    1999                 :        2271 :                 XLogRegisterBuffer(0, buf, REGBUF_STANDARD);
    2000                 :        2271 :                 XLogRegisterBuffer(1, rbuf, REGBUF_WILL_INIT);
    2001                 :             :                 /* Log original right sibling, since we've changed its prev-pointer */
    2002         [ +  + ]:        2271 :                 if (!isrightmost)
    2003                 :         611 :                         XLogRegisterBuffer(2, sbuf, REGBUF_STANDARD);
    2004         [ +  + ]:        2271 :                 if (!isleaf)
    2005                 :          51 :                         XLogRegisterBuffer(3, cbuf, REGBUF_STANDARD);
    2006                 :             : 
    2007                 :             :                 /*
    2008                 :             :                  * Log the new item, if it was inserted on the left page. (If it was
    2009                 :             :                  * put on the right page, we don't need to explicitly WAL log it
    2010                 :             :                  * because it's included with all the other items on the right page.)
    2011                 :             :                  * Show the new item as belonging to the left page buffer, so that it
    2012                 :             :                  * is not stored if XLogInsert decides it needs a full-page image of
    2013                 :             :                  * the left page.  We always store newitemoff in the record, though.
    2014                 :             :                  *
    2015                 :             :                  * The details are sometimes slightly different for page splits that
    2016                 :             :                  * coincide with a posting list split.  If both the replacement
    2017                 :             :                  * posting list and newitem go on the right page, then we don't need
    2018                 :             :                  * to log anything extra, just like the simple !newitemonleft
    2019                 :             :                  * no-posting-split case (postingoff is set to zero in the WAL record,
    2020                 :             :                  * so recovery doesn't need to process a posting list split at all).
    2021                 :             :                  * Otherwise, we set postingoff and log orignewitem instead of
    2022                 :             :                  * newitem, despite having actually inserted newitem.  REDO routine
    2023                 :             :                  * must reconstruct nposting and newitem using _bt_swap_posting().
    2024                 :             :                  *
    2025                 :             :                  * Note: It's possible that our page split point is the point that
    2026                 :             :                  * makes the posting list lastleft and newitem firstright.  This is
    2027                 :             :                  * the only case where we log orignewitem/newitem despite newitem
    2028                 :             :                  * going on the right page.  If XLogInsert decides that it can omit
    2029                 :             :                  * orignewitem due to logging a full-page image of the left page,
    2030                 :             :                  * everything still works out, since recovery only needs to log
    2031                 :             :                  * orignewitem for items on the left page (just like the regular
    2032                 :             :                  * newitem-logged case).
    2033                 :             :                  */
    2034   [ +  +  +  + ]:        2271 :                 if (newitemonleft && xlrec.postingoff == 0)
    2035                 :         119 :                         XLogRegisterBufData(0, newitem, newitemsz);
    2036         [ +  + ]:        2152 :                 else if (xlrec.postingoff != 0)
    2037                 :             :                 {
    2038         [ +  - ]:           5 :                         Assert(isleaf);
    2039   [ +  +  +  - ]:           5 :                         Assert(newitemonleft || firstrightoff == newitemoff);
    2040         [ +  - ]:           5 :                         Assert(newitemsz == IndexTupleSize(orignewitem));
    2041                 :           5 :                         XLogRegisterBufData(0, orignewitem, newitemsz);
    2042                 :           5 :                 }
    2043                 :             : 
    2044                 :             :                 /* Log the left page's new high key */
    2045         [ +  + ]:        2271 :                 if (!isleaf)
    2046                 :             :                 {
    2047                 :             :                         /* lefthighkey isn't local copy, get current pointer */
    2048                 :          51 :                         itemid = PageGetItemId(origpage, P_HIKEY);
    2049                 :          51 :                         lefthighkey = (IndexTuple) PageGetItem(origpage, itemid);
    2050                 :          51 :                 }
    2051                 :        4542 :                 XLogRegisterBufData(0, lefthighkey,
    2052                 :        2271 :                                                         MAXALIGN(IndexTupleSize(lefthighkey)));
    2053                 :             : 
    2054                 :             :                 /*
    2055                 :             :                  * Log the contents of the right page in the format understood by
    2056                 :             :                  * _bt_restore_page().  The whole right page will be recreated.
    2057                 :             :                  *
    2058                 :             :                  * Direct access to page is not good but faster - we should implement
    2059                 :             :                  * some new func in page API.  Note we only store the tuples
    2060                 :             :                  * themselves, knowing that they were inserted in item-number order
    2061                 :             :                  * and so the line pointers can be reconstructed.  See comments for
    2062                 :             :                  * _bt_restore_page().
    2063                 :             :                  */
    2064                 :        2271 :                 XLogRegisterBufData(1,
    2065                 :        2271 :                                                         (char *) rightpage + ((PageHeader) rightpage)->pd_upper,
    2066                 :        2271 :                                                         ((PageHeader) rightpage)->pd_special - ((PageHeader) rightpage)->pd_upper);
    2067                 :             : 
    2068                 :        2271 :                 xlinfo = newitemonleft ? XLOG_BTREE_SPLIT_L : XLOG_BTREE_SPLIT_R;
    2069                 :        2271 :                 recptr = XLogInsert(RM_BTREE_ID, xlinfo);
    2070                 :             : 
    2071                 :        2271 :                 PageSetLSN(origpage, recptr);
    2072                 :        2271 :                 PageSetLSN(rightpage, recptr);
    2073         [ +  + ]:        2271 :                 if (!isrightmost)
    2074                 :         611 :                         PageSetLSN(spage, recptr);
    2075         [ +  + ]:        2271 :                 if (!isleaf)
    2076                 :          51 :                         PageSetLSN(BufferGetPage(cbuf), recptr);
    2077                 :        2271 :         }
    2078                 :             : 
    2079         [ +  - ]:        2309 :         END_CRIT_SECTION();
    2080                 :             : 
    2081                 :             :         /* release the old right sibling */
    2082         [ +  + ]:        2309 :         if (!isrightmost)
    2083                 :         613 :                 _bt_relbuf(rel, sbuf);
    2084                 :             : 
    2085                 :             :         /* release the child */
    2086         [ +  + ]:        2309 :         if (!isleaf)
    2087                 :          51 :                 _bt_relbuf(rel, cbuf);
    2088                 :             : 
    2089                 :             :         /* be tidy */
    2090         [ +  + ]:        2309 :         if (isleaf)
    2091                 :        2258 :                 pfree(lefthighkey);
    2092                 :             : 
    2093                 :             :         /* split's done */
    2094                 :        4618 :         return rbuf;
    2095                 :        2309 : }
    2096                 :             : 
    2097                 :             : /*
    2098                 :             :  * _bt_insert_parent() -- Insert downlink into parent, completing split.
    2099                 :             :  *
    2100                 :             :  * On entry, buf and rbuf are the left and right split pages, which we
    2101                 :             :  * still hold write locks on.  Both locks will be released here.  We
    2102                 :             :  * release the rbuf lock once we have a write lock on the page that we
    2103                 :             :  * intend to insert a downlink to rbuf on (i.e. buf's current parent page).
    2104                 :             :  * The lock on buf is released at the same point as the lock on the parent
    2105                 :             :  * page, since buf's INCOMPLETE_SPLIT flag must be cleared by the same
    2106                 :             :  * atomic operation that completes the split by inserting a new downlink.
    2107                 :             :  *
    2108                 :             :  * stack - stack showing how we got here.  Will be NULL when splitting true
    2109                 :             :  *                      root, or during concurrent root split, where we can be inefficient
    2110                 :             :  * isroot - we split the true root
    2111                 :             :  * isonly - we split a page alone on its level (might have been fast root)
    2112                 :             :  */
    2113                 :             : static void
    2114                 :        2309 : _bt_insert_parent(Relation rel,
    2115                 :             :                                   Relation heaprel,
    2116                 :             :                                   Buffer buf,
    2117                 :             :                                   Buffer rbuf,
    2118                 :             :                                   BTStack stack,
    2119                 :             :                                   bool isroot,
    2120                 :             :                                   bool isonly)
    2121                 :             : {
    2122         [ +  - ]:        2309 :         Assert(heaprel != NULL);
    2123                 :             : 
    2124                 :             :         /*
    2125                 :             :          * Here we have to do something Lehman and Yao don't talk about: deal with
    2126                 :             :          * a root split and construction of a new root.  If our stack is empty
    2127                 :             :          * then we have just split a node on what had been the root level when we
    2128                 :             :          * descended the tree.  If it was still the root then we perform a
    2129                 :             :          * new-root construction.  If it *wasn't* the root anymore, search to find
    2130                 :             :          * the next higher level that someone constructed meanwhile, and find the
    2131                 :             :          * right place to insert as for the normal case.
    2132                 :             :          *
    2133                 :             :          * If we have to search for the parent level, we do so by re-descending
    2134                 :             :          * from the root.  This is not super-efficient, but it's rare enough not
    2135                 :             :          * to matter.
    2136                 :             :          */
    2137         [ +  + ]:        2309 :         if (isroot)
    2138                 :             :         {
    2139                 :          71 :                 Buffer          rootbuf;
    2140                 :             : 
    2141         [ +  - ]:          71 :                 Assert(stack == NULL);
    2142         [ +  - ]:          71 :                 Assert(isonly);
    2143                 :             :                 /* create a new root node one level up and update the metapage */
    2144                 :          71 :                 rootbuf = _bt_newlevel(rel, heaprel, buf, rbuf);
    2145                 :             :                 /* release the split buffers */
    2146                 :          71 :                 _bt_relbuf(rel, rootbuf);
    2147                 :          71 :                 _bt_relbuf(rel, rbuf);
    2148                 :          71 :                 _bt_relbuf(rel, buf);
    2149                 :          71 :         }
    2150                 :             :         else
    2151                 :             :         {
    2152                 :        2238 :                 BlockNumber bknum = BufferGetBlockNumber(buf);
    2153                 :        2238 :                 BlockNumber rbknum = BufferGetBlockNumber(rbuf);
    2154                 :        2238 :                 Page            page = BufferGetPage(buf);
    2155                 :        2238 :                 IndexTuple      new_item;
    2156                 :        2238 :                 BTStackData fakestack;
    2157                 :        2238 :                 IndexTuple      ritem;
    2158                 :        2238 :                 Buffer          pbuf;
    2159                 :             : 
    2160         [ +  + ]:        2238 :                 if (stack == NULL)
    2161                 :             :                 {
    2162                 :           3 :                         BTPageOpaque opaque;
    2163                 :             : 
    2164   [ -  +  -  + ]:           3 :                         elog(DEBUG2, "concurrent ROOT page split");
    2165                 :           3 :                         opaque = BTPageGetOpaque(page);
    2166                 :             : 
    2167                 :             :                         /*
    2168                 :             :                          * We should never reach here when a leaf page split takes place
    2169                 :             :                          * despite the insert of newitem being able to apply the fastpath
    2170                 :             :                          * optimization.  Make sure of that with an assertion.
    2171                 :             :                          *
    2172                 :             :                          * This is more of a performance issue than a correctness issue.
    2173                 :             :                          * The fastpath won't have a descent stack.  Using a phony stack
    2174                 :             :                          * here works, but never rely on that.  The fastpath should be
    2175                 :             :                          * rejected within _bt_search_insert() when the rightmost leaf
    2176                 :             :                          * page will split, since it's faster to go through _bt_search()
    2177                 :             :                          * and get a stack in the usual way.
    2178                 :             :                          */
    2179   [ -  +  +  -  :           3 :                         Assert(!(P_ISLEAF(opaque) &&
                   +  - ]
    2180                 :             :                                          BlockNumberIsValid(RelationGetTargetBlock(rel))));
    2181                 :             : 
    2182                 :             :                         /* Find the leftmost page at the next level up */
    2183                 :           3 :                         pbuf = _bt_get_endpoint(rel, opaque->btpo_level + 1, false);
    2184                 :             :                         /* Set up a phony stack entry pointing there */
    2185                 :           3 :                         stack = &fakestack;
    2186                 :           3 :                         stack->bts_blkno = BufferGetBlockNumber(pbuf);
    2187                 :           3 :                         stack->bts_offset = InvalidOffsetNumber;
    2188                 :           3 :                         stack->bts_parent = NULL;
    2189                 :           3 :                         _bt_relbuf(rel, pbuf);
    2190                 :           3 :                 }
    2191                 :             : 
    2192                 :             :                 /* get high key from left, a strict lower bound for new right page */
    2193                 :        4476 :                 ritem = (IndexTuple) PageGetItem(page,
    2194                 :        2238 :                                                                                  PageGetItemId(page, P_HIKEY));
    2195                 :             : 
    2196                 :             :                 /* form an index tuple that points at the new right page */
    2197                 :        2238 :                 new_item = CopyIndexTuple(ritem);
    2198                 :        2238 :                 BTreeTupleSetDownLink(new_item, rbknum);
    2199                 :             : 
    2200                 :             :                 /*
    2201                 :             :                  * Re-find and write lock the parent of buf.
    2202                 :             :                  *
    2203                 :             :                  * It's possible that the location of buf's downlink has changed since
    2204                 :             :                  * our initial _bt_search() descent.  _bt_getstackbuf() will detect
    2205                 :             :                  * and recover from this, updating the stack, which ensures that the
    2206                 :             :                  * new downlink will be inserted at the correct offset. Even buf's
    2207                 :             :                  * parent may have changed.
    2208                 :             :                  */
    2209                 :        2238 :                 pbuf = _bt_getstackbuf(rel, heaprel, stack, bknum);
    2210                 :             : 
    2211                 :             :                 /*
    2212                 :             :                  * Unlock the right child.  The left child will be unlocked in
    2213                 :             :                  * _bt_insertonpg().
    2214                 :             :                  *
    2215                 :             :                  * Unlocking the right child must be delayed until here to ensure that
    2216                 :             :                  * no concurrent VACUUM operation can become confused.  Page deletion
    2217                 :             :                  * cannot be allowed to fail to re-find a downlink for the rbuf page.
    2218                 :             :                  * (Actually, this is just a vestige of how things used to work.  The
    2219                 :             :                  * page deletion code is expected to check for the INCOMPLETE_SPLIT
    2220                 :             :                  * flag on the left child.  It won't attempt deletion of the right
    2221                 :             :                  * child until the split is complete.  Despite all this, we opt to
    2222                 :             :                  * conservatively delay unlocking the right child until here.)
    2223                 :             :                  */
    2224                 :        2238 :                 _bt_relbuf(rel, rbuf);
    2225                 :             : 
    2226         [ +  - ]:        2238 :                 if (pbuf == InvalidBuffer)
    2227   [ #  #  #  # ]:           0 :                         ereport(ERROR,
    2228                 :             :                                         (errcode(ERRCODE_INDEX_CORRUPTED),
    2229                 :             :                                          errmsg_internal("failed to re-find parent key in index \"%s\" for split pages %u/%u",
    2230                 :             :                                                                          RelationGetRelationName(rel), bknum, rbknum)));
    2231                 :             : 
    2232                 :             :                 /* Recursively insert into the parent */
    2233                 :        4476 :                 _bt_insertonpg(rel, heaprel, NULL, pbuf, buf, stack->bts_parent,
    2234                 :        2238 :                                            new_item, MAXALIGN(IndexTupleSize(new_item)),
    2235                 :        2238 :                                            stack->bts_offset + 1, 0, isonly);
    2236                 :             : 
    2237                 :             :                 /* be tidy */
    2238                 :        2238 :                 pfree(new_item);
    2239                 :        2238 :         }
    2240                 :        2309 : }
    2241                 :             : 
    2242                 :             : /*
    2243                 :             :  * _bt_finish_split() -- Finish an incomplete split
    2244                 :             :  *
    2245                 :             :  * A crash or other failure can leave a split incomplete.  The insertion
    2246                 :             :  * routines won't allow to insert on a page that is incompletely split.
    2247                 :             :  * Before inserting on such a page, call _bt_finish_split().
    2248                 :             :  *
    2249                 :             :  * On entry, 'lbuf' must be locked in write-mode.  On exit, it is unlocked
    2250                 :             :  * and unpinned.
    2251                 :             :  *
    2252                 :             :  * Caller must provide a valid heaprel, since finishing a page split requires
    2253                 :             :  * allocating a new page if and when the parent page splits in turn.
    2254                 :             :  */
    2255                 :             : void
    2256                 :           0 : _bt_finish_split(Relation rel, Relation heaprel, Buffer lbuf, BTStack stack)
    2257                 :             : {
    2258                 :           0 :         Page            lpage = BufferGetPage(lbuf);
    2259                 :           0 :         BTPageOpaque lpageop = BTPageGetOpaque(lpage);
    2260                 :           0 :         Buffer          rbuf;
    2261                 :           0 :         Page            rpage;
    2262                 :           0 :         BTPageOpaque rpageop;
    2263                 :           0 :         bool            wasroot;
    2264                 :           0 :         bool            wasonly;
    2265                 :             : 
    2266         [ #  # ]:           0 :         Assert(P_INCOMPLETE_SPLIT(lpageop));
    2267         [ #  # ]:           0 :         Assert(heaprel != NULL);
    2268                 :             : 
    2269                 :             :         /* Lock right sibling, the one missing the downlink */
    2270                 :           0 :         rbuf = _bt_getbuf(rel, lpageop->btpo_next, BT_WRITE);
    2271                 :           0 :         rpage = BufferGetPage(rbuf);
    2272                 :           0 :         rpageop = BTPageGetOpaque(rpage);
    2273                 :             : 
    2274                 :             :         /* Could this be a root split? */
    2275         [ #  # ]:           0 :         if (!stack)
    2276                 :             :         {
    2277                 :           0 :                 Buffer          metabuf;
    2278                 :           0 :                 Page            metapg;
    2279                 :           0 :                 BTMetaPageData *metad;
    2280                 :             : 
    2281                 :             :                 /* acquire lock on the metapage */
    2282                 :           0 :                 metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_WRITE);
    2283                 :           0 :                 metapg = BufferGetPage(metabuf);
    2284                 :           0 :                 metad = BTPageGetMeta(metapg);
    2285                 :             : 
    2286                 :           0 :                 wasroot = (metad->btm_root == BufferGetBlockNumber(lbuf));
    2287                 :             : 
    2288                 :           0 :                 _bt_relbuf(rel, metabuf);
    2289                 :           0 :         }
    2290                 :             :         else
    2291                 :           0 :                 wasroot = false;
    2292                 :             : 
    2293                 :             :         /* Was this the only page on the level before split? */
    2294         [ #  # ]:           0 :         wasonly = (P_LEFTMOST(lpageop) && P_RIGHTMOST(rpageop));
    2295                 :             : 
    2296                 :             :         INJECTION_POINT("nbtree-finish-incomplete-split", NULL);
    2297   [ #  #  #  # ]:           0 :         elog(DEBUG1, "finishing incomplete split of %u/%u",
    2298                 :             :                  BufferGetBlockNumber(lbuf), BufferGetBlockNumber(rbuf));
    2299                 :             : 
    2300                 :           0 :         _bt_insert_parent(rel, heaprel, lbuf, rbuf, stack, wasroot, wasonly);
    2301                 :           0 : }
    2302                 :             : 
    2303                 :             : /*
    2304                 :             :  *      _bt_getstackbuf() -- Walk back up the tree one step, and find the pivot
    2305                 :             :  *                                               tuple whose downlink points to child page.
    2306                 :             :  *
    2307                 :             :  *              Caller passes child's block number, which is used to identify
    2308                 :             :  *              associated pivot tuple in parent page using a linear search that
    2309                 :             :  *              matches on pivot's downlink/block number.  The expected location of
    2310                 :             :  *              the pivot tuple is taken from the stack one level above the child
    2311                 :             :  *              page.  This is used as a starting point.  Insertions into the
    2312                 :             :  *              parent level could cause the pivot tuple to move right; deletions
    2313                 :             :  *              could cause it to move left, but not left of the page we previously
    2314                 :             :  *              found it on.
    2315                 :             :  *
    2316                 :             :  *              Caller can use its stack to relocate the pivot tuple/downlink for
    2317                 :             :  *              any same-level page to the right of the page found by its initial
    2318                 :             :  *              descent.  This is necessary because of the possibility that caller
    2319                 :             :  *              moved right to recover from a concurrent page split.  It's also
    2320                 :             :  *              convenient for certain callers to be able to step right when there
    2321                 :             :  *              wasn't a concurrent page split, while still using their original
    2322                 :             :  *              stack.  For example, the checkingunique _bt_doinsert() case may
    2323                 :             :  *              have to step right when there are many physical duplicates, and its
    2324                 :             :  *              scantid forces an insertion to the right of the "first page the
    2325                 :             :  *              value could be on".  (This is also relied on by all of our callers
    2326                 :             :  *              when dealing with !heapkeyspace indexes.)
    2327                 :             :  *
    2328                 :             :  *              Returns write-locked parent page buffer, or InvalidBuffer if pivot
    2329                 :             :  *              tuple not found (should not happen).  Adjusts bts_blkno &
    2330                 :             :  *              bts_offset if changed.  Page split caller should insert its new
    2331                 :             :  *              pivot tuple for its new right sibling page on parent page, at the
    2332                 :             :  *              offset number bts_offset + 1.
    2333                 :             :  */
    2334                 :             : Buffer
    2335                 :        2442 : _bt_getstackbuf(Relation rel, Relation heaprel, BTStack stack, BlockNumber child)
    2336                 :             : {
    2337                 :        2442 :         BlockNumber blkno;
    2338                 :        2442 :         OffsetNumber start;
    2339                 :             : 
    2340                 :        2442 :         blkno = stack->bts_blkno;
    2341                 :        2442 :         start = stack->bts_offset;
    2342                 :             : 
    2343                 :        2449 :         for (;;)
    2344                 :             :         {
    2345                 :        2449 :                 Buffer          buf;
    2346                 :        2449 :                 Page            page;
    2347                 :        2449 :                 BTPageOpaque opaque;
    2348                 :             : 
    2349                 :        2449 :                 buf = _bt_getbuf(rel, blkno, BT_WRITE);
    2350                 :        2449 :                 page = BufferGetPage(buf);
    2351                 :        2449 :                 opaque = BTPageGetOpaque(page);
    2352                 :             : 
    2353         [ +  - ]:        2449 :                 Assert(heaprel != NULL);
    2354         [ -  + ]:        2449 :                 if (P_INCOMPLETE_SPLIT(opaque))
    2355                 :             :                 {
    2356                 :           0 :                         _bt_finish_split(rel, heaprel, buf, stack->bts_parent);
    2357                 :           0 :                         continue;
    2358                 :             :                 }
    2359                 :             : 
    2360         [ +  + ]:        2449 :                 if (!P_IGNORE(opaque))
    2361                 :             :                 {
    2362                 :        2442 :                         OffsetNumber offnum,
    2363                 :             :                                                 minoff,
    2364                 :             :                                                 maxoff;
    2365                 :        2442 :                         ItemId          itemid;
    2366                 :        2442 :                         IndexTuple      item;
    2367                 :             : 
    2368                 :        2442 :                         minoff = P_FIRSTDATAKEY(opaque);
    2369                 :        2442 :                         maxoff = PageGetMaxOffsetNumber(page);
    2370                 :             : 
    2371                 :             :                         /*
    2372                 :             :                          * start = InvalidOffsetNumber means "search the whole page". We
    2373                 :             :                          * need this test anyway due to possibility that page has a high
    2374                 :             :                          * key now when it didn't before.
    2375                 :             :                          */
    2376         [ +  + ]:        2442 :                         if (start < minoff)
    2377                 :          10 :                                 start = minoff;
    2378                 :             : 
    2379                 :             :                         /*
    2380                 :             :                          * Need this check too, to guard against possibility that page
    2381                 :             :                          * split since we visited it originally.
    2382                 :             :                          */
    2383         [ +  - ]:        2442 :                         if (start > maxoff)
    2384                 :           0 :                                 start = OffsetNumberNext(maxoff);
    2385                 :             : 
    2386                 :             :                         /*
    2387                 :             :                          * These loops will check every item on the page --- but in an
    2388                 :             :                          * order that's attuned to the probability of where it actually
    2389                 :             :                          * is.  Scan to the right first, then to the left.
    2390                 :             :                          */
    2391         [ +  - ]:        2445 :                         for (offnum = start;
    2392                 :        2445 :                                  offnum <= maxoff;
    2393                 :           3 :                                  offnum = OffsetNumberNext(offnum))
    2394                 :             :                         {
    2395                 :        2445 :                                 itemid = PageGetItemId(page, offnum);
    2396                 :        2445 :                                 item = (IndexTuple) PageGetItem(page, itemid);
    2397                 :             : 
    2398         [ +  + ]:        2445 :                                 if (BTreeTupleGetDownLink(item) == child)
    2399                 :             :                                 {
    2400                 :             :                                         /* Return accurate pointer to where link is now */
    2401                 :        2442 :                                         stack->bts_blkno = blkno;
    2402                 :        2442 :                                         stack->bts_offset = offnum;
    2403                 :        2442 :                                         return buf;
    2404                 :             :                                 }
    2405                 :           3 :                         }
    2406                 :             : 
    2407         [ #  # ]:           0 :                         for (offnum = OffsetNumberPrev(start);
    2408                 :           0 :                                  offnum >= minoff;
    2409                 :           0 :                                  offnum = OffsetNumberPrev(offnum))
    2410                 :             :                         {
    2411                 :           0 :                                 itemid = PageGetItemId(page, offnum);
    2412                 :           0 :                                 item = (IndexTuple) PageGetItem(page, itemid);
    2413                 :             : 
    2414         [ #  # ]:           0 :                                 if (BTreeTupleGetDownLink(item) == child)
    2415                 :             :                                 {
    2416                 :             :                                         /* Return accurate pointer to where link is now */
    2417                 :           0 :                                         stack->bts_blkno = blkno;
    2418                 :           0 :                                         stack->bts_offset = offnum;
    2419                 :           0 :                                         return buf;
    2420                 :             :                                 }
    2421                 :           0 :                         }
    2422         [ +  - ]:        2442 :                 }
    2423                 :             : 
    2424                 :             :                 /*
    2425                 :             :                  * The item we're looking for moved right at least one page.
    2426                 :             :                  *
    2427                 :             :                  * Lehman and Yao couple/chain locks when moving right here, which we
    2428                 :             :                  * can avoid.  See nbtree/README.
    2429                 :             :                  */
    2430         [ -  + ]:           7 :                 if (P_RIGHTMOST(opaque))
    2431                 :             :                 {
    2432                 :           0 :                         _bt_relbuf(rel, buf);
    2433                 :           0 :                         return InvalidBuffer;
    2434                 :             :                 }
    2435                 :           7 :                 blkno = opaque->btpo_next;
    2436                 :           7 :                 start = InvalidOffsetNumber;
    2437                 :           7 :                 _bt_relbuf(rel, buf);
    2438      [ -  +  + ]:        2449 :         }
    2439                 :        2442 : }
    2440                 :             : 
    2441                 :             : /*
    2442                 :             :  *      _bt_newlevel() -- Create a new level above root page.
    2443                 :             :  *
    2444                 :             :  *              We've just split the old root page and need to create a new one.
    2445                 :             :  *              In order to do this, we add a new root page to the file, then lock
    2446                 :             :  *              the metadata page and update it.  This is guaranteed to be deadlock-
    2447                 :             :  *              free, because all readers release their locks on the metadata page
    2448                 :             :  *              before trying to lock the root, and all writers lock the root before
    2449                 :             :  *              trying to lock the metadata page.  We have a write lock on the old
    2450                 :             :  *              root page, so we have not introduced any cycles into the waits-for
    2451                 :             :  *              graph.
    2452                 :             :  *
    2453                 :             :  *              On entry, lbuf (the old root) and rbuf (its new peer) are write-
    2454                 :             :  *              locked. On exit, a new root page exists with entries for the
    2455                 :             :  *              two new children, metapage is updated and unlocked/unpinned.
    2456                 :             :  *              The new root buffer is returned to caller which has to unlock/unpin
    2457                 :             :  *              lbuf, rbuf & rootbuf.
    2458                 :             :  */
    2459                 :             : static Buffer
    2460                 :          71 : _bt_newlevel(Relation rel, Relation heaprel, Buffer lbuf, Buffer rbuf)
    2461                 :             : {
    2462                 :          71 :         Buffer          rootbuf;
    2463                 :          71 :         Page            lpage,
    2464                 :             :                                 rootpage;
    2465                 :          71 :         BlockNumber lbkno,
    2466                 :             :                                 rbkno;
    2467                 :          71 :         BlockNumber rootblknum;
    2468                 :          71 :         BTPageOpaque rootopaque;
    2469                 :          71 :         BTPageOpaque lopaque;
    2470                 :          71 :         ItemId          itemid;
    2471                 :          71 :         IndexTuple      item;
    2472                 :          71 :         IndexTuple      left_item;
    2473                 :          71 :         Size            left_item_sz;
    2474                 :          71 :         IndexTuple      right_item;
    2475                 :          71 :         Size            right_item_sz;
    2476                 :          71 :         Buffer          metabuf;
    2477                 :          71 :         Page            metapg;
    2478                 :          71 :         BTMetaPageData *metad;
    2479                 :             : 
    2480                 :          71 :         lbkno = BufferGetBlockNumber(lbuf);
    2481                 :          71 :         rbkno = BufferGetBlockNumber(rbuf);
    2482                 :          71 :         lpage = BufferGetPage(lbuf);
    2483                 :          71 :         lopaque = BTPageGetOpaque(lpage);
    2484                 :             : 
    2485                 :             :         /* get a new root page */
    2486                 :          71 :         rootbuf = _bt_allocbuf(rel, heaprel);
    2487                 :          71 :         rootpage = BufferGetPage(rootbuf);
    2488                 :          71 :         rootblknum = BufferGetBlockNumber(rootbuf);
    2489                 :             : 
    2490                 :             :         /* acquire lock on the metapage */
    2491                 :          71 :         metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_WRITE);
    2492                 :          71 :         metapg = BufferGetPage(metabuf);
    2493                 :          71 :         metad = BTPageGetMeta(metapg);
    2494                 :             : 
    2495                 :             :         /*
    2496                 :             :          * Create downlink item for left page (old root).  The key value used is
    2497                 :             :          * "minus infinity", a sentinel value that's reliably less than any real
    2498                 :             :          * key value that could appear in the left page.
    2499                 :             :          */
    2500                 :          71 :         left_item_sz = sizeof(IndexTupleData);
    2501                 :          71 :         left_item = (IndexTuple) palloc(left_item_sz);
    2502                 :          71 :         left_item->t_info = left_item_sz;
    2503                 :          71 :         BTreeTupleSetDownLink(left_item, lbkno);
    2504                 :          71 :         BTreeTupleSetNAtts(left_item, 0, false);
    2505                 :             : 
    2506                 :             :         /*
    2507                 :             :          * Create downlink item for right page.  The key for it is obtained from
    2508                 :             :          * the "high key" position in the left page.
    2509                 :             :          */
    2510                 :          71 :         itemid = PageGetItemId(lpage, P_HIKEY);
    2511                 :          71 :         right_item_sz = ItemIdGetLength(itemid);
    2512                 :          71 :         item = (IndexTuple) PageGetItem(lpage, itemid);
    2513                 :          71 :         right_item = CopyIndexTuple(item);
    2514                 :          71 :         BTreeTupleSetDownLink(right_item, rbkno);
    2515                 :             : 
    2516                 :             :         /* NO EREPORT(ERROR) from here till newroot op is logged */
    2517                 :          71 :         START_CRIT_SECTION();
    2518                 :             : 
    2519                 :             :         /* upgrade metapage if needed */
    2520         [ +  - ]:          71 :         if (metad->btm_version < BTREE_NOVAC_VERSION)
    2521                 :           0 :                 _bt_upgrademetapage(metapg);
    2522                 :             : 
    2523                 :             :         /* set btree special data */
    2524                 :          71 :         rootopaque = BTPageGetOpaque(rootpage);
    2525                 :          71 :         rootopaque->btpo_prev = rootopaque->btpo_next = P_NONE;
    2526                 :          71 :         rootopaque->btpo_flags = BTP_ROOT;
    2527                 :          71 :         rootopaque->btpo_level =
    2528                 :          71 :                 (BTPageGetOpaque(lpage))->btpo_level + 1;
    2529                 :          71 :         rootopaque->btpo_cycleid = 0;
    2530                 :             : 
    2531                 :             :         /* update metapage data */
    2532                 :          71 :         metad->btm_root = rootblknum;
    2533                 :          71 :         metad->btm_level = rootopaque->btpo_level;
    2534                 :          71 :         metad->btm_fastroot = rootblknum;
    2535                 :          71 :         metad->btm_fastlevel = rootopaque->btpo_level;
    2536                 :             : 
    2537                 :             :         /*
    2538                 :             :          * Insert the left page pointer into the new root page.  The root page is
    2539                 :             :          * the rightmost page on its level so there is no "high key" in it; the
    2540                 :             :          * two items will go into positions P_HIKEY and P_FIRSTKEY.
    2541                 :             :          *
    2542                 :             :          * Note: we *must* insert the two items in item-number order, for the
    2543                 :             :          * benefit of _bt_restore_page().
    2544                 :             :          */
    2545   [ +  -  +  - ]:          71 :         Assert(BTreeTupleGetNAtts(left_item, rel) == 0);
    2546         [ +  - ]:          71 :         if (PageAddItem(rootpage, left_item, left_item_sz, P_HIKEY, false, false) == InvalidOffsetNumber)
    2547   [ #  #  #  # ]:           0 :                 elog(PANIC, "failed to add leftkey to new root page"
    2548                 :             :                          " while splitting block %u of index \"%s\"",
    2549                 :             :                          BufferGetBlockNumber(lbuf), RelationGetRelationName(rel));
    2550                 :             : 
    2551                 :             :         /*
    2552                 :             :          * insert the right page pointer into the new root page.
    2553                 :             :          */
    2554   [ +  -  +  - ]:          71 :         Assert(BTreeTupleGetNAtts(right_item, rel) > 0);
    2555   [ +  -  +  - ]:          71 :         Assert(BTreeTupleGetNAtts(right_item, rel) <=
    2556                 :             :                    IndexRelationGetNumberOfKeyAttributes(rel));
    2557         [ +  - ]:          71 :         if (PageAddItem(rootpage, right_item, right_item_sz, P_FIRSTKEY, false, false) == InvalidOffsetNumber)
    2558   [ #  #  #  # ]:           0 :                 elog(PANIC, "failed to add rightkey to new root page"
    2559                 :             :                          " while splitting block %u of index \"%s\"",
    2560                 :             :                          BufferGetBlockNumber(lbuf), RelationGetRelationName(rel));
    2561                 :             : 
    2562                 :             :         /* Clear the incomplete-split flag in the left child */
    2563         [ +  - ]:          71 :         Assert(P_INCOMPLETE_SPLIT(lopaque));
    2564                 :          71 :         lopaque->btpo_flags &= ~BTP_INCOMPLETE_SPLIT;
    2565                 :          71 :         MarkBufferDirty(lbuf);
    2566                 :             : 
    2567                 :          71 :         MarkBufferDirty(rootbuf);
    2568                 :          71 :         MarkBufferDirty(metabuf);
    2569                 :             : 
    2570                 :             :         /* XLOG stuff */
    2571   [ +  +  +  +  :          71 :         if (RelationNeedsWAL(rel))
             +  +  -  + ]
    2572                 :             :         {
    2573                 :          65 :                 xl_btree_newroot xlrec;
    2574                 :          65 :                 XLogRecPtr      recptr;
    2575                 :          65 :                 xl_btree_metadata md;
    2576                 :             : 
    2577                 :          65 :                 xlrec.rootblk = rootblknum;
    2578                 :          65 :                 xlrec.level = metad->btm_level;
    2579                 :             : 
    2580                 :          65 :                 XLogBeginInsert();
    2581                 :          65 :                 XLogRegisterData(&xlrec, SizeOfBtreeNewroot);
    2582                 :             : 
    2583                 :          65 :                 XLogRegisterBuffer(0, rootbuf, REGBUF_WILL_INIT);
    2584                 :          65 :                 XLogRegisterBuffer(1, lbuf, REGBUF_STANDARD);
    2585                 :          65 :                 XLogRegisterBuffer(2, metabuf, REGBUF_WILL_INIT | REGBUF_STANDARD);
    2586                 :             : 
    2587         [ +  - ]:          65 :                 Assert(metad->btm_version >= BTREE_NOVAC_VERSION);
    2588                 :          65 :                 md.version = metad->btm_version;
    2589                 :          65 :                 md.root = rootblknum;
    2590                 :          65 :                 md.level = metad->btm_level;
    2591                 :          65 :                 md.fastroot = rootblknum;
    2592                 :          65 :                 md.fastlevel = metad->btm_level;
    2593                 :          65 :                 md.last_cleanup_num_delpages = metad->btm_last_cleanup_num_delpages;
    2594                 :          65 :                 md.allequalimage = metad->btm_allequalimage;
    2595                 :             : 
    2596                 :          65 :                 XLogRegisterBufData(2, &md, sizeof(xl_btree_metadata));
    2597                 :             : 
    2598                 :             :                 /*
    2599                 :             :                  * Direct access to page is not good but faster - we should implement
    2600                 :             :                  * some new func in page API.
    2601                 :             :                  */
    2602                 :          65 :                 XLogRegisterBufData(0,
    2603                 :          65 :                                                         (char *) rootpage + ((PageHeader) rootpage)->pd_upper,
    2604                 :         130 :                                                         ((PageHeader) rootpage)->pd_special -
    2605                 :          65 :                                                         ((PageHeader) rootpage)->pd_upper);
    2606                 :             : 
    2607                 :          65 :                 recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_NEWROOT);
    2608                 :             : 
    2609                 :          65 :                 PageSetLSN(lpage, recptr);
    2610                 :          65 :                 PageSetLSN(rootpage, recptr);
    2611                 :          65 :                 PageSetLSN(metapg, recptr);
    2612                 :          65 :         }
    2613                 :             : 
    2614         [ +  - ]:          71 :         END_CRIT_SECTION();
    2615                 :             : 
    2616                 :             :         /* done with metapage */
    2617                 :          71 :         _bt_relbuf(rel, metabuf);
    2618                 :             : 
    2619                 :          71 :         pfree(left_item);
    2620                 :          71 :         pfree(right_item);
    2621                 :             : 
    2622                 :         142 :         return rootbuf;
    2623                 :          71 : }
    2624                 :             : 
    2625                 :             : /*
    2626                 :             :  *      _bt_pgaddtup() -- add a data item to a particular page during split.
    2627                 :             :  *
    2628                 :             :  *              The difference between this routine and a bare PageAddItem call is
    2629                 :             :  *              that this code can deal with the first data item on an internal btree
    2630                 :             :  *              page in passing.  This data item (which is called "firstright" within
    2631                 :             :  *              _bt_split()) has a key that must be treated as minus infinity after
    2632                 :             :  *              the split.  Therefore, we truncate away all attributes when caller
    2633                 :             :  *              specifies it's the first data item on page (downlink is not changed,
    2634                 :             :  *              though).  This extra step is only needed for the right page of an
    2635                 :             :  *              internal page split.  There is no need to do this for the first data
    2636                 :             :  *              item on the existing/left page, since that will already have been
    2637                 :             :  *              truncated during an earlier page split.
    2638                 :             :  *
    2639                 :             :  *              See _bt_split() for a high level explanation of why we truncate here.
    2640                 :             :  *              Note that this routine has nothing to do with suffix truncation,
    2641                 :             :  *              despite using some of the same infrastructure.
    2642                 :             :  */
    2643                 :             : static inline bool
    2644                 :      746411 : _bt_pgaddtup(Page page,
    2645                 :             :                          Size itemsize,
    2646                 :             :                          const IndexTupleData *itup,
    2647                 :             :                          OffsetNumber itup_off,
    2648                 :             :                          bool newfirstdataitem)
    2649                 :             : {
    2650                 :      746411 :         IndexTupleData trunctuple;
    2651                 :             : 
    2652         [ +  + ]:      746411 :         if (newfirstdataitem)
    2653                 :             :         {
    2654                 :          51 :                 trunctuple = *itup;
    2655                 :          51 :                 trunctuple.t_info = sizeof(IndexTupleData);
    2656                 :          51 :                 BTreeTupleSetNAtts(&trunctuple, 0, false);
    2657                 :          51 :                 itup = &trunctuple;
    2658                 :          51 :                 itemsize = sizeof(IndexTupleData);
    2659                 :          51 :         }
    2660                 :             : 
    2661         [ -  + ]:      746411 :         if (unlikely(PageAddItem(page, itup, itemsize, itup_off, false, false) == InvalidOffsetNumber))
    2662                 :           0 :                 return false;
    2663                 :             : 
    2664                 :      746411 :         return true;
    2665                 :      746411 : }
    2666                 :             : 
    2667                 :             : /*
    2668                 :             :  * _bt_delete_or_dedup_one_page - Try to avoid a leaf page split.
    2669                 :             :  *
    2670                 :             :  * There are three operations performed here: simple index deletion, bottom-up
    2671                 :             :  * index deletion, and deduplication.  If all three operations fail to free
    2672                 :             :  * enough space for the incoming item then caller will go on to split the
    2673                 :             :  * page.  We always consider simple deletion first.  If that doesn't work out
    2674                 :             :  * we consider alternatives.  Callers that only want us to consider simple
    2675                 :             :  * deletion (without any fallback) ask for that using the 'simpleonly'
    2676                 :             :  * argument.
    2677                 :             :  *
    2678                 :             :  * We usually pick only one alternative "complex" operation when simple
    2679                 :             :  * deletion alone won't prevent a page split.  The 'checkingunique',
    2680                 :             :  * 'uniquedup', and 'indexUnchanged' arguments are used for that.
    2681                 :             :  *
    2682                 :             :  * Note: We used to only delete LP_DEAD items when the BTP_HAS_GARBAGE page
    2683                 :             :  * level flag was found set.  The flag was useful back when there wasn't
    2684                 :             :  * necessarily one single page for a duplicate tuple to go on (before heap TID
    2685                 :             :  * became a part of the key space in version 4 indexes).  But we don't
    2686                 :             :  * actually look at the flag anymore (it's not a gating condition for our
    2687                 :             :  * caller).  That would cause us to miss tuples that are safe to delete,
    2688                 :             :  * without getting any benefit in return.  We know that the alternative is to
    2689                 :             :  * split the page; scanning the line pointer array in passing won't have
    2690                 :             :  * noticeable overhead.  (We still maintain the BTP_HAS_GARBAGE flag despite
    2691                 :             :  * all this because !heapkeyspace indexes must still do a "getting tired"
    2692                 :             :  * linear search, and so are likely to get some benefit from using it as a
    2693                 :             :  * gating condition.)
    2694                 :             :  */
    2695                 :             : static void
    2696                 :        5022 : _bt_delete_or_dedup_one_page(Relation rel, Relation heapRel,
    2697                 :             :                                                          BTInsertState insertstate,
    2698                 :             :                                                          bool simpleonly, bool checkingunique,
    2699                 :             :                                                          bool uniquedup, bool indexUnchanged)
    2700                 :             : {
    2701                 :        5022 :         OffsetNumber deletable[MaxIndexTuplesPerPage];
    2702                 :        5022 :         int                     ndeletable = 0;
    2703                 :        5022 :         OffsetNumber offnum,
    2704                 :             :                                 minoff,
    2705                 :             :                                 maxoff;
    2706                 :        5022 :         Buffer          buffer = insertstate->buf;
    2707                 :        5022 :         BTScanInsert itup_key = insertstate->itup_key;
    2708                 :        5022 :         Page            page = BufferGetPage(buffer);
    2709                 :        5022 :         BTPageOpaque opaque = BTPageGetOpaque(page);
    2710                 :             : 
    2711         [ +  - ]:        5022 :         Assert(P_ISLEAF(opaque));
    2712   [ +  -  +  - ]:        5022 :         Assert(simpleonly || itup_key->heapkeyspace);
    2713   [ +  -  #  # ]:        5022 :         Assert(!simpleonly || (!checkingunique && !uniquedup && !indexUnchanged));
    2714                 :             : 
    2715                 :             :         /*
    2716                 :             :          * Scan over all items to see which ones need to be deleted according to
    2717                 :             :          * LP_DEAD flags.  We'll usually manage to delete a few extra items that
    2718                 :             :          * are not marked LP_DEAD in passing.  Often the extra items that actually
    2719                 :             :          * end up getting deleted are items that would have had their LP_DEAD bit
    2720                 :             :          * set before long anyway (if we opted not to include them as extras).
    2721                 :             :          */
    2722                 :        5022 :         minoff = P_FIRSTDATAKEY(opaque);
    2723                 :        5022 :         maxoff = PageGetMaxOffsetNumber(page);
    2724         [ +  + ]:     1382599 :         for (offnum = minoff;
    2725                 :     1382599 :                  offnum <= maxoff;
    2726                 :     1377577 :                  offnum = OffsetNumberNext(offnum))
    2727                 :             :         {
    2728                 :     1377577 :                 ItemId          itemId = PageGetItemId(page, offnum);
    2729                 :             : 
    2730         [ +  + ]:     1377577 :                 if (ItemIdIsDead(itemId))
    2731                 :       19644 :                         deletable[ndeletable++] = offnum;
    2732                 :     1377577 :         }
    2733                 :             : 
    2734         [ +  + ]:        5022 :         if (ndeletable > 0)
    2735                 :             :         {
    2736                 :        1708 :                 _bt_simpledel_pass(rel, buffer, heapRel, deletable, ndeletable,
    2737                 :         854 :                                                    insertstate->itup, minoff, maxoff);
    2738                 :         854 :                 insertstate->bounds_valid = false;
    2739                 :             : 
    2740                 :             :                 /* Return when a page split has already been avoided */
    2741         [ +  + ]:         854 :                 if (PageGetFreeSpace(page) >= insertstate->itemsz)
    2742                 :         852 :                         return;
    2743                 :             : 
    2744                 :             :                 /* Might as well assume duplicates (if checkingunique) */
    2745                 :           2 :                 uniquedup = true;
    2746                 :           2 :         }
    2747                 :             : 
    2748                 :             :         /*
    2749                 :             :          * We're done with simple deletion.  Return early with callers that only
    2750                 :             :          * call here so that simple deletion can be considered.  This includes
    2751                 :             :          * callers that explicitly ask for this and checkingunique callers that
    2752                 :             :          * probably don't have any version churn duplicates on the page.
    2753                 :             :          *
    2754                 :             :          * Note: The page's BTP_HAS_GARBAGE hint flag may still be set when we
    2755                 :             :          * return at this point (or when we go on the try either or both of our
    2756                 :             :          * other strategies and they also fail).  We do not bother expending a
    2757                 :             :          * separate write to clear it, however.  Caller will definitely clear it
    2758                 :             :          * when it goes on to split the page (note also that the deduplication
    2759                 :             :          * process will clear the flag in passing, just to keep things tidy).
    2760                 :             :          */
    2761   [ +  -  +  +  :        4170 :         if (simpleonly || (checkingunique && !uniquedup))
                   +  + ]
    2762                 :             :         {
    2763         [ +  - ]:        1587 :                 Assert(!indexUnchanged);
    2764                 :        1587 :                 return;
    2765                 :             :         }
    2766                 :             : 
    2767                 :             :         /* Assume bounds about to be invalidated (this is almost certain now) */
    2768                 :        2583 :         insertstate->bounds_valid = false;
    2769                 :             : 
    2770                 :             :         /*
    2771                 :             :          * Perform bottom-up index deletion pass when executor hint indicated that
    2772                 :             :          * incoming item is logically unchanged, or for a unique index that is
    2773                 :             :          * known to have physical duplicates for some other reason.  (There is a
    2774                 :             :          * large overlap between these two cases for a unique index.  It's worth
    2775                 :             :          * having both triggering conditions in order to apply the optimization in
    2776                 :             :          * the event of successive related INSERT and DELETE statements.)
    2777                 :             :          *
    2778                 :             :          * We'll go on to do a deduplication pass when a bottom-up pass fails to
    2779                 :             :          * delete an acceptable amount of free space (a significant fraction of
    2780                 :             :          * the page, or space for the new item, whichever is greater).
    2781                 :             :          *
    2782                 :             :          * Note: Bottom-up index deletion uses the same equality/equivalence
    2783                 :             :          * routines as deduplication internally.  However, it does not merge
    2784                 :             :          * together index tuples, so the same correctness considerations do not
    2785                 :             :          * apply.  We deliberately omit an index-is-allequalimage test here.
    2786                 :             :          */
    2787   [ +  +  +  + ]:        2583 :         if ((indexUnchanged || uniquedup) &&
    2788                 :        2583 :                 _bt_bottomupdel_pass(rel, buffer, heapRel, insertstate->itemsz))
    2789                 :          21 :                 return;
    2790                 :             : 
    2791                 :             :         /* Perform deduplication pass (when enabled and index-is-allequalimage) */
    2792   [ +  -  +  +  :        2562 :         if (BTGetDeduplicateItems(rel) && itup_key->allequalimage)
             +  +  -  + ]
    2793                 :        5004 :                 _bt_dedup_pass(rel, buffer, insertstate->itup, insertstate->itemsz,
    2794         [ +  + ]:        2559 :                                            (indexUnchanged || uniquedup));
    2795         [ -  + ]:        5022 : }
    2796                 :             : 
    2797                 :             : /*
    2798                 :             :  * _bt_simpledel_pass - Simple index tuple deletion pass.
    2799                 :             :  *
    2800                 :             :  * We delete all LP_DEAD-set index tuples on a leaf page.  The offset numbers
    2801                 :             :  * of all such tuples are determined by caller (caller passes these to us as
    2802                 :             :  * its 'deletable' argument).
    2803                 :             :  *
    2804                 :             :  * We might also delete extra index tuples that turn out to be safe to delete
    2805                 :             :  * in passing (though they must be cheap to check in passing to begin with).
    2806                 :             :  * There is no certainty that any extra tuples will be deleted, though.  The
    2807                 :             :  * high level goal of the approach we take is to get the most out of each call
    2808                 :             :  * here (without noticeably increasing the per-call overhead compared to what
    2809                 :             :  * we need to do just to be able to delete the page's LP_DEAD-marked index
    2810                 :             :  * tuples).
    2811                 :             :  *
    2812                 :             :  * The number of extra index tuples that turn out to be deletable might
    2813                 :             :  * greatly exceed the number of LP_DEAD-marked index tuples due to various
    2814                 :             :  * locality related effects.  For example, it's possible that the total number
    2815                 :             :  * of table blocks (pointed to by all TIDs on the leaf page) is naturally
    2816                 :             :  * quite low, in which case we might end up checking if it's possible to
    2817                 :             :  * delete _most_ index tuples on the page (without the tableam needing to
    2818                 :             :  * access additional table blocks).  The tableam will sometimes stumble upon
    2819                 :             :  * _many_ extra deletable index tuples in indexes where this pattern is
    2820                 :             :  * common.
    2821                 :             :  *
    2822                 :             :  * See nbtree/README for further details on simple index tuple deletion.
    2823                 :             :  */
    2824                 :             : static void
    2825                 :         854 : _bt_simpledel_pass(Relation rel, Buffer buffer, Relation heapRel,
    2826                 :             :                                    OffsetNumber *deletable, int ndeletable, IndexTuple newitem,
    2827                 :             :                                    OffsetNumber minoff, OffsetNumber maxoff)
    2828                 :             : {
    2829                 :         854 :         Page            page = BufferGetPage(buffer);
    2830                 :         854 :         BlockNumber *deadblocks;
    2831                 :         854 :         int                     ndeadblocks;
    2832                 :         854 :         TM_IndexDeleteOp delstate;
    2833                 :         854 :         OffsetNumber offnum;
    2834                 :             : 
    2835                 :             :         /* Get array of table blocks pointed to by LP_DEAD-set tuples */
    2836                 :         854 :         deadblocks = _bt_deadblocks(page, deletable, ndeletable, newitem,
    2837                 :             :                                                                 &ndeadblocks);
    2838                 :             : 
    2839                 :             :         /* Initialize tableam state that describes index deletion operation */
    2840                 :         854 :         delstate.irel = rel;
    2841                 :         854 :         delstate.iblknum = BufferGetBlockNumber(buffer);
    2842                 :         854 :         delstate.bottomup = false;
    2843                 :         854 :         delstate.bottomupfreespace = 0;
    2844                 :         854 :         delstate.ndeltids = 0;
    2845                 :         854 :         delstate.deltids = palloc(MaxTIDsPerBTreePage * sizeof(TM_IndexDelete));
    2846                 :         854 :         delstate.status = palloc(MaxTIDsPerBTreePage * sizeof(TM_IndexStatus));
    2847                 :             : 
    2848         [ +  + ]:      236629 :         for (offnum = minoff;
    2849                 :      236629 :                  offnum <= maxoff;
    2850                 :      235775 :                  offnum = OffsetNumberNext(offnum))
    2851                 :             :         {
    2852                 :      235775 :                 ItemId          itemid = PageGetItemId(page, offnum);
    2853                 :      235775 :                 IndexTuple      itup = (IndexTuple) PageGetItem(page, itemid);
    2854                 :      235775 :                 TM_IndexDelete *odeltid = &delstate.deltids[delstate.ndeltids];
    2855                 :      235775 :                 TM_IndexStatus *ostatus = &delstate.status[delstate.ndeltids];
    2856                 :      235775 :                 BlockNumber tidblock;
    2857                 :      235775 :                 void       *match;
    2858                 :             : 
    2859         [ +  + ]:      235775 :                 if (!BTreeTupleIsPosting(itup))
    2860                 :             :                 {
    2861                 :      222501 :                         tidblock = ItemPointerGetBlockNumber(&itup->t_tid);
    2862                 :      222501 :                         match = bsearch(&tidblock, deadblocks, ndeadblocks,
    2863                 :             :                                                         sizeof(BlockNumber), _bt_blk_cmp);
    2864                 :             : 
    2865         [ +  + ]:      222501 :                         if (!match)
    2866                 :             :                         {
    2867         [ +  - ]:      145495 :                                 Assert(!ItemIdIsDead(itemid));
    2868                 :      145495 :                                 continue;
    2869                 :             :                         }
    2870                 :             : 
    2871                 :             :                         /*
    2872                 :             :                          * TID's table block is among those pointed to by the TIDs from
    2873                 :             :                          * LP_DEAD-bit set tuples on page -- add TID to deltids
    2874                 :             :                          */
    2875                 :       77006 :                         odeltid->tid = itup->t_tid;
    2876                 :       77006 :                         odeltid->id = delstate.ndeltids;
    2877                 :       77006 :                         ostatus->idxoffnum = offnum;
    2878                 :       77006 :                         ostatus->knowndeletable = ItemIdIsDead(itemid);
    2879                 :       77006 :                         ostatus->promising = false; /* unused */
    2880                 :       77006 :                         ostatus->freespace = 0; /* unused */
    2881                 :             : 
    2882                 :       77006 :                         delstate.ndeltids++;
    2883                 :       77006 :                 }
    2884                 :             :                 else
    2885                 :             :                 {
    2886                 :       13274 :                         int                     nitem = BTreeTupleGetNPosting(itup);
    2887                 :             : 
    2888         [ +  + ]:       70353 :                         for (int p = 0; p < nitem; p++)
    2889                 :             :                         {
    2890                 :       57079 :                                 ItemPointer tid = BTreeTupleGetPostingN(itup, p);
    2891                 :             : 
    2892                 :       57079 :                                 tidblock = ItemPointerGetBlockNumber(tid);
    2893                 :       57079 :                                 match = bsearch(&tidblock, deadblocks, ndeadblocks,
    2894                 :             :                                                                 sizeof(BlockNumber), _bt_blk_cmp);
    2895                 :             : 
    2896         [ +  + ]:       57079 :                                 if (!match)
    2897                 :             :                                 {
    2898         [ -  + ]:       53364 :                                         Assert(!ItemIdIsDead(itemid));
    2899                 :       53364 :                                         continue;
    2900                 :             :                                 }
    2901                 :             : 
    2902                 :             :                                 /*
    2903                 :             :                                  * TID's table block is among those pointed to by the TIDs
    2904                 :             :                                  * from LP_DEAD-bit set tuples on page -- add TID to deltids
    2905                 :             :                                  */
    2906                 :        3715 :                                 odeltid->tid = *tid;
    2907                 :        3715 :                                 odeltid->id = delstate.ndeltids;
    2908                 :        3715 :                                 ostatus->idxoffnum = offnum;
    2909                 :        3715 :                                 ostatus->knowndeletable = ItemIdIsDead(itemid);
    2910                 :        3715 :                                 ostatus->promising = false; /* unused */
    2911                 :        3715 :                                 ostatus->freespace = 0; /* unused */
    2912                 :             : 
    2913                 :        3715 :                                 odeltid++;
    2914                 :        3715 :                                 ostatus++;
    2915                 :        3715 :                                 delstate.ndeltids++;
    2916         [ +  + ]:       57079 :                         }
    2917                 :       13274 :                 }
    2918         [ +  + ]:      235775 :         }
    2919                 :             : 
    2920                 :         854 :         pfree(deadblocks);
    2921                 :             : 
    2922         [ +  - ]:         854 :         Assert(delstate.ndeltids >= ndeletable);
    2923                 :             : 
    2924                 :             :         /* Physically delete LP_DEAD tuples (plus any delete-safe extra TIDs) */
    2925                 :         854 :         _bt_delitems_delete_check(rel, buffer, heapRel, &delstate);
    2926                 :             : 
    2927                 :         854 :         pfree(delstate.deltids);
    2928                 :         854 :         pfree(delstate.status);
    2929                 :         854 : }
    2930                 :             : 
    2931                 :             : /*
    2932                 :             :  * _bt_deadblocks() -- Get LP_DEAD related table blocks.
    2933                 :             :  *
    2934                 :             :  * Builds sorted and unique-ified array of table block numbers from index
    2935                 :             :  * tuple TIDs whose line pointers are marked LP_DEAD.  Also adds the table
    2936                 :             :  * block from incoming newitem just in case it isn't among the LP_DEAD-related
    2937                 :             :  * table blocks.
    2938                 :             :  *
    2939                 :             :  * Always counting the newitem's table block as an LP_DEAD related block makes
    2940                 :             :  * sense because the cost is consistently low; it is practically certain that
    2941                 :             :  * the table block will not incur a buffer miss in tableam.  On the other hand
    2942                 :             :  * the benefit is often quite high.  There is a decent chance that there will
    2943                 :             :  * be some deletable items from this block, since in general most garbage
    2944                 :             :  * tuples became garbage in the recent past (in many cases this won't be the
    2945                 :             :  * first logical row that core code added to/modified in table block
    2946                 :             :  * recently).
    2947                 :             :  *
    2948                 :             :  * Returns final array, and sets *nblocks to its final size for caller.
    2949                 :             :  */
    2950                 :             : static BlockNumber *
    2951                 :         854 : _bt_deadblocks(Page page, OffsetNumber *deletable, int ndeletable,
    2952                 :             :                            IndexTuple newitem, int *nblocks)
    2953                 :             : {
    2954                 :         854 :         int                     spacentids,
    2955                 :             :                                 ntids;
    2956                 :         854 :         BlockNumber *tidblocks;
    2957                 :             : 
    2958                 :             :         /*
    2959                 :             :          * Accumulate each TID's block in array whose initial size has space for
    2960                 :             :          * one table block per LP_DEAD-set tuple (plus space for the newitem table
    2961                 :             :          * block).  Array will only need to grow when there are LP_DEAD-marked
    2962                 :             :          * posting list tuples (which is not that common).
    2963                 :             :          */
    2964                 :         854 :         spacentids = ndeletable + 1;
    2965                 :         854 :         ntids = 0;
    2966                 :         854 :         tidblocks = palloc_array(BlockNumber, spacentids);
    2967                 :             : 
    2968                 :             :         /*
    2969                 :             :          * First add the table block for the incoming newitem.  This is the one
    2970                 :             :          * case where simple deletion can visit a table block that doesn't have
    2971                 :             :          * any known deletable items.
    2972                 :             :          */
    2973         [ +  - ]:         854 :         Assert(!BTreeTupleIsPosting(newitem) && !BTreeTupleIsPivot(newitem));
    2974                 :         854 :         tidblocks[ntids++] = ItemPointerGetBlockNumber(&newitem->t_tid);
    2975                 :             : 
    2976         [ +  + ]:       20498 :         for (int i = 0; i < ndeletable; i++)
    2977                 :             :         {
    2978                 :       19644 :                 ItemId          itemid = PageGetItemId(page, deletable[i]);
    2979                 :       19644 :                 IndexTuple      itup = (IndexTuple) PageGetItem(page, itemid);
    2980                 :             : 
    2981         [ +  - ]:       19644 :                 Assert(ItemIdIsDead(itemid));
    2982                 :             : 
    2983         [ +  + ]:       19644 :                 if (!BTreeTupleIsPosting(itup))
    2984                 :             :                 {
    2985         [ +  + ]:       19565 :                         if (ntids + 1 > spacentids)
    2986                 :             :                         {
    2987                 :          17 :                                 spacentids *= 2;
    2988                 :          17 :                                 tidblocks = (BlockNumber *)
    2989                 :          17 :                                         repalloc(tidblocks, sizeof(BlockNumber) * spacentids);
    2990                 :          17 :                         }
    2991                 :             : 
    2992                 :       19565 :                         tidblocks[ntids++] = ItemPointerGetBlockNumber(&itup->t_tid);
    2993                 :       19565 :                 }
    2994                 :             :                 else
    2995                 :             :                 {
    2996                 :          79 :                         int                     nposting = BTreeTupleGetNPosting(itup);
    2997                 :             : 
    2998         [ +  + ]:          79 :                         if (ntids + nposting > spacentids)
    2999                 :             :                         {
    3000         [ +  + ]:          17 :                                 spacentids = Max(spacentids * 2, ntids + nposting);
    3001                 :          17 :                                 tidblocks = (BlockNumber *)
    3002                 :          17 :                                         repalloc(tidblocks, sizeof(BlockNumber) * spacentids);
    3003                 :          17 :                         }
    3004                 :             : 
    3005         [ +  + ]:         514 :                         for (int j = 0; j < nposting; j++)
    3006                 :             :                         {
    3007                 :         435 :                                 ItemPointer tid = BTreeTupleGetPostingN(itup, j);
    3008                 :             : 
    3009                 :         435 :                                 tidblocks[ntids++] = ItemPointerGetBlockNumber(tid);
    3010                 :         435 :                         }
    3011                 :          79 :                 }
    3012                 :       19644 :         }
    3013                 :             : 
    3014                 :         854 :         qsort(tidblocks, ntids, sizeof(BlockNumber), _bt_blk_cmp);
    3015                 :         854 :         *nblocks = qunique(tidblocks, ntids, sizeof(BlockNumber), _bt_blk_cmp);
    3016                 :             : 
    3017                 :        1708 :         return tidblocks;
    3018                 :         854 : }
    3019                 :             : 
    3020                 :             : /*
    3021                 :             :  * _bt_blk_cmp() -- qsort comparison function for _bt_simpledel_pass
    3022                 :             :  */
    3023                 :             : static inline int
    3024                 :      588650 : _bt_blk_cmp(const void *arg1, const void *arg2)
    3025                 :             : {
    3026                 :      588650 :         BlockNumber b1 = *((BlockNumber *) arg1);
    3027                 :      588650 :         BlockNumber b2 = *((BlockNumber *) arg2);
    3028                 :             : 
    3029                 :     1177300 :         return pg_cmp_u32(b1, b2);
    3030                 :      588650 : }

Generated by: LCOV version 2.3.2-1