LCOV - code coverage report
Current view: top level - src/backend/access/nbtree - nbtpage.c (source / functions) Coverage Total Hit
Test: Code coverage Lines: 89.0 % 1122 999
Test Date: 2026-01-26 10:56:24 Functions: 97.0 % 33 32
Legend: Lines:     hit not hit
Branches: + taken - not taken # not executed
Branches: 52.8 % 725 383

             Branch data     Line data    Source code
       1                 :             : /*-------------------------------------------------------------------------
       2                 :             :  *
       3                 :             :  * nbtpage.c
       4                 :             :  *        BTree-specific page management code for the Postgres btree access
       5                 :             :  *        method.
       6                 :             :  *
       7                 :             :  * Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group
       8                 :             :  * Portions Copyright (c) 1994, Regents of the University of California
       9                 :             :  *
      10                 :             :  *
      11                 :             :  * IDENTIFICATION
      12                 :             :  *        src/backend/access/nbtree/nbtpage.c
      13                 :             :  *
      14                 :             :  *      NOTES
      15                 :             :  *         Postgres btree pages look like ordinary relation pages.  The opaque
      16                 :             :  *         data at high addresses includes pointers to left and right siblings
      17                 :             :  *         and flag data describing page state.  The first page in a btree, page
      18                 :             :  *         zero, is special -- it stores meta-information describing the tree.
      19                 :             :  *         Pages one and higher store the actual tree data.
      20                 :             :  *
      21                 :             :  *-------------------------------------------------------------------------
      22                 :             :  */
      23                 :             : #include "postgres.h"
      24                 :             : 
      25                 :             : #include "access/nbtree.h"
      26                 :             : #include "access/nbtxlog.h"
      27                 :             : #include "access/tableam.h"
      28                 :             : #include "access/transam.h"
      29                 :             : #include "access/xlog.h"
      30                 :             : #include "access/xloginsert.h"
      31                 :             : #include "common/int.h"
      32                 :             : #include "miscadmin.h"
      33                 :             : #include "storage/indexfsm.h"
      34                 :             : #include "storage/predicate.h"
      35                 :             : #include "storage/procarray.h"
      36                 :             : #include "utils/injection_point.h"
      37                 :             : #include "utils/memdebug.h"
      38                 :             : #include "utils/memutils.h"
      39                 :             : #include "utils/snapmgr.h"
      40                 :             : 
      41                 :             : static BTMetaPageData *_bt_getmeta(Relation rel, Buffer metabuf);
      42                 :             : static void _bt_delitems_delete(Relation rel, Buffer buf,
      43                 :             :                                                                 TransactionId snapshotConflictHorizon,
      44                 :             :                                                                 bool isCatalogRel,
      45                 :             :                                                                 OffsetNumber *deletable, int ndeletable,
      46                 :             :                                                                 BTVacuumPosting *updatable, int nupdatable);
      47                 :             : static char *_bt_delitems_update(BTVacuumPosting *updatable, int nupdatable,
      48                 :             :                                                                  OffsetNumber *updatedoffsets,
      49                 :             :                                                                  Size *updatedbuflen, bool needswal);
      50                 :             : static bool _bt_mark_page_halfdead(Relation rel, Relation heaprel,
      51                 :             :                                                                    Buffer leafbuf, BTStack stack);
      52                 :             : static bool _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf,
      53                 :             :                                                                          BlockNumber scanblkno,
      54                 :             :                                                                          bool *rightsib_empty,
      55                 :             :                                                                          BTVacState *vstate);
      56                 :             : static bool _bt_lock_subtree_parent(Relation rel, Relation heaprel,
      57                 :             :                                                                         BlockNumber child, BTStack stack,
      58                 :             :                                                                         Buffer *subtreeparent, OffsetNumber *poffset,
      59                 :             :                                                                         BlockNumber *topparent,
      60                 :             :                                                                         BlockNumber *topparentrightsib);
      61                 :             : static void _bt_pendingfsm_add(BTVacState *vstate, BlockNumber target,
      62                 :             :                                                            FullTransactionId safexid);
      63                 :             : 
      64                 :             : /*
      65                 :             :  *      _bt_initmetapage() -- Fill a page buffer with a correct metapage image
      66                 :             :  */
      67                 :             : void
      68                 :        3712 : _bt_initmetapage(Page page, BlockNumber rootbknum, uint32 level,
      69                 :             :                                  bool allequalimage)
      70                 :             : {
      71                 :        3712 :         BTMetaPageData *metad;
      72                 :        3712 :         BTPageOpaque metaopaque;
      73                 :             : 
      74                 :        3712 :         _bt_pageinit(page, BLCKSZ);
      75                 :             : 
      76                 :        3712 :         metad = BTPageGetMeta(page);
      77                 :        3712 :         metad->btm_magic = BTREE_MAGIC;
      78                 :        3712 :         metad->btm_version = BTREE_VERSION;
      79                 :        3712 :         metad->btm_root = rootbknum;
      80                 :        3712 :         metad->btm_level = level;
      81                 :        3712 :         metad->btm_fastroot = rootbknum;
      82                 :        3712 :         metad->btm_fastlevel = level;
      83                 :        3712 :         metad->btm_last_cleanup_num_delpages = 0;
      84                 :        3712 :         metad->btm_last_cleanup_num_heap_tuples = -1.0;
      85                 :        3712 :         metad->btm_allequalimage = allequalimage;
      86                 :             : 
      87                 :        3712 :         metaopaque = BTPageGetOpaque(page);
      88                 :        3712 :         metaopaque->btpo_flags = BTP_META;
      89                 :             : 
      90                 :             :         /*
      91                 :             :          * Set pd_lower just past the end of the metadata.  This is essential,
      92                 :             :          * because without doing so, metadata will be lost if xlog.c compresses
      93                 :             :          * the page.
      94                 :             :          */
      95                 :        3712 :         ((PageHeader) page)->pd_lower =
      96                 :        3712 :                 ((char *) metad + sizeof(BTMetaPageData)) - (char *) page;
      97                 :        3712 : }
      98                 :             : 
      99                 :             : /*
     100                 :             :  *      _bt_upgrademetapage() -- Upgrade a meta-page from an old format to version
     101                 :             :  *              3, the last version that can be updated without broadly affecting
     102                 :             :  *              on-disk compatibility.  (A REINDEX is required to upgrade to v4.)
     103                 :             :  *
     104                 :             :  *              This routine does purely in-memory image upgrade.  Caller is
     105                 :             :  *              responsible for locking, WAL-logging etc.
     106                 :             :  */
     107                 :             : void
     108                 :           0 : _bt_upgrademetapage(Page page)
     109                 :             : {
     110                 :           0 :         BTMetaPageData *metad;
     111                 :           0 :         BTPageOpaque metaopaque PG_USED_FOR_ASSERTS_ONLY;
     112                 :             : 
     113                 :           0 :         metad = BTPageGetMeta(page);
     114                 :           0 :         metaopaque = BTPageGetOpaque(page);
     115                 :             : 
     116                 :             :         /* It must be really a meta page of upgradable version */
     117         [ #  # ]:           0 :         Assert(metaopaque->btpo_flags & BTP_META);
     118         [ #  # ]:           0 :         Assert(metad->btm_version < BTREE_NOVAC_VERSION);
     119         [ #  # ]:           0 :         Assert(metad->btm_version >= BTREE_MIN_VERSION);
     120                 :             : 
     121                 :             :         /* Set version number and fill extra fields added into version 3 */
     122                 :           0 :         metad->btm_version = BTREE_NOVAC_VERSION;
     123                 :           0 :         metad->btm_last_cleanup_num_delpages = 0;
     124                 :           0 :         metad->btm_last_cleanup_num_heap_tuples = -1.0;
     125                 :             :         /* Only a REINDEX can set this field */
     126         [ #  # ]:           0 :         Assert(!metad->btm_allequalimage);
     127                 :           0 :         metad->btm_allequalimage = false;
     128                 :             : 
     129                 :             :         /* Adjust pd_lower (see _bt_initmetapage() for details) */
     130                 :           0 :         ((PageHeader) page)->pd_lower =
     131                 :           0 :                 ((char *) metad + sizeof(BTMetaPageData)) - (char *) page;
     132                 :           0 : }
     133                 :             : 
     134                 :             : /*
     135                 :             :  * Get metadata from share-locked buffer containing metapage, while performing
     136                 :             :  * standard sanity checks.
     137                 :             :  *
     138                 :             :  * Callers that cache data returned here in local cache should note that an
     139                 :             :  * on-the-fly upgrade using _bt_upgrademetapage() can change the version field
     140                 :             :  * and BTREE_NOVAC_VERSION specific fields without invalidating local cache.
     141                 :             :  */
     142                 :             : static BTMetaPageData *
     143                 :       88453 : _bt_getmeta(Relation rel, Buffer metabuf)
     144                 :             : {
     145                 :       88453 :         Page            metapg;
     146                 :       88453 :         BTPageOpaque metaopaque;
     147                 :       88453 :         BTMetaPageData *metad;
     148                 :             : 
     149                 :       88453 :         metapg = BufferGetPage(metabuf);
     150                 :       88453 :         metaopaque = BTPageGetOpaque(metapg);
     151                 :       88453 :         metad = BTPageGetMeta(metapg);
     152                 :             : 
     153                 :             :         /* sanity-check the metapage */
     154         [ +  - ]:       88453 :         if (!P_ISMETA(metaopaque) ||
     155                 :       88453 :                 metad->btm_magic != BTREE_MAGIC)
     156   [ #  #  #  # ]:           0 :                 ereport(ERROR,
     157                 :             :                                 (errcode(ERRCODE_INDEX_CORRUPTED),
     158                 :             :                                  errmsg("index \"%s\" is not a btree",
     159                 :             :                                                 RelationGetRelationName(rel))));
     160                 :             : 
     161         [ +  - ]:       88453 :         if (metad->btm_version < BTREE_MIN_VERSION ||
     162                 :       88453 :                 metad->btm_version > BTREE_VERSION)
     163   [ #  #  #  # ]:           0 :                 ereport(ERROR,
     164                 :             :                                 (errcode(ERRCODE_INDEX_CORRUPTED),
     165                 :             :                                  errmsg("version mismatch in index \"%s\": file version %d, "
     166                 :             :                                                 "current version %d, minimal supported version %d",
     167                 :             :                                                 RelationGetRelationName(rel),
     168                 :             :                                                 metad->btm_version, BTREE_VERSION, BTREE_MIN_VERSION)));
     169                 :             : 
     170                 :      176906 :         return metad;
     171                 :       88453 : }
     172                 :             : 
     173                 :             : /*
     174                 :             :  * _bt_vacuum_needs_cleanup() -- Checks if index needs cleanup
     175                 :             :  *
     176                 :             :  * Called by btvacuumcleanup when btbulkdelete was never called because no
     177                 :             :  * index tuples needed to be deleted.
     178                 :             :  */
     179                 :             : bool
     180                 :         469 : _bt_vacuum_needs_cleanup(Relation rel)
     181                 :             : {
     182                 :         469 :         Buffer          metabuf;
     183                 :         469 :         Page            metapg;
     184                 :         469 :         BTMetaPageData *metad;
     185                 :         469 :         uint32          btm_version;
     186                 :         469 :         BlockNumber prev_num_delpages;
     187                 :             : 
     188                 :             :         /*
     189                 :             :          * Copy details from metapage to local variables quickly.
     190                 :             :          *
     191                 :             :          * Note that we deliberately avoid using cached version of metapage here.
     192                 :             :          */
     193                 :         469 :         metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
     194                 :         469 :         metapg = BufferGetPage(metabuf);
     195                 :         469 :         metad = BTPageGetMeta(metapg);
     196                 :         469 :         btm_version = metad->btm_version;
     197                 :             : 
     198         [ -  + ]:         469 :         if (btm_version < BTREE_NOVAC_VERSION)
     199                 :             :         {
     200                 :             :                 /*
     201                 :             :                  * Metapage needs to be dynamically upgraded to store fields that are
     202                 :             :                  * only present when btm_version >= BTREE_NOVAC_VERSION
     203                 :             :                  */
     204                 :           0 :                 _bt_relbuf(rel, metabuf);
     205                 :           0 :                 return true;
     206                 :             :         }
     207                 :             : 
     208                 :         469 :         prev_num_delpages = metad->btm_last_cleanup_num_delpages;
     209                 :         469 :         _bt_relbuf(rel, metabuf);
     210                 :             : 
     211                 :             :         /*
     212                 :             :          * Trigger cleanup in rare cases where prev_num_delpages exceeds 5% of the
     213                 :             :          * total size of the index.  We can reasonably expect (though are not
     214                 :             :          * guaranteed) to be able to recycle this many pages if we decide to do a
     215                 :             :          * btvacuumscan call during the ongoing btvacuumcleanup.  For further
     216                 :             :          * details see the nbtree/README section on placing deleted pages in the
     217                 :             :          * FSM.
     218                 :             :          */
     219   [ +  +  -  + ]:         469 :         if (prev_num_delpages > 0 &&
     220                 :           1 :                 prev_num_delpages > RelationGetNumberOfBlocks(rel) / 20)
     221                 :           1 :                 return true;
     222                 :             : 
     223                 :         468 :         return false;
     224                 :         469 : }
     225                 :             : 
     226                 :             : /*
     227                 :             :  * _bt_set_cleanup_info() -- Update metapage for btvacuumcleanup.
     228                 :             :  *
     229                 :             :  * Called at the end of btvacuumcleanup, when num_delpages value has been
     230                 :             :  * finalized.
     231                 :             :  */
     232                 :             : void
     233                 :         107 : _bt_set_cleanup_info(Relation rel, BlockNumber num_delpages)
     234                 :             : {
     235                 :         107 :         Buffer          metabuf;
     236                 :         107 :         Page            metapg;
     237                 :         107 :         BTMetaPageData *metad;
     238                 :             : 
     239                 :             :         /*
     240                 :             :          * On-disk compatibility note: The btm_last_cleanup_num_delpages metapage
     241                 :             :          * field started out as a TransactionId field called btm_oldest_btpo_xact.
     242                 :             :          * Both "versions" are just uint32 fields.  It was convenient to repurpose
     243                 :             :          * the field when we began to use 64-bit XIDs in deleted pages.
     244                 :             :          *
     245                 :             :          * It's possible that a pg_upgrade'd database will contain an XID value in
     246                 :             :          * what is now recognized as the metapage's btm_last_cleanup_num_delpages
     247                 :             :          * field.  _bt_vacuum_needs_cleanup() may even believe that this value
     248                 :             :          * indicates that there are lots of pages that it needs to recycle, when
     249                 :             :          * in reality there are only one or two.  The worst that can happen is
     250                 :             :          * that there will be a call to btvacuumscan a little earlier, which will
     251                 :             :          * set btm_last_cleanup_num_delpages to a sane value when we're called.
     252                 :             :          *
     253                 :             :          * Note also that the metapage's btm_last_cleanup_num_heap_tuples field is
     254                 :             :          * no longer used as of PostgreSQL 14.  We set it to -1.0 on rewrite, just
     255                 :             :          * to be consistent.
     256                 :             :          */
     257                 :         107 :         metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
     258                 :         107 :         metapg = BufferGetPage(metabuf);
     259                 :         107 :         metad = BTPageGetMeta(metapg);
     260                 :             : 
     261                 :             :         /* Don't miss chance to upgrade index/metapage when BTREE_MIN_VERSION */
     262   [ +  -  +  + ]:         107 :         if (metad->btm_version >= BTREE_NOVAC_VERSION &&
     263                 :         107 :                 metad->btm_last_cleanup_num_delpages == num_delpages)
     264                 :             :         {
     265                 :             :                 /* Usually means index continues to have num_delpages of 0 */
     266                 :          94 :                 _bt_relbuf(rel, metabuf);
     267                 :          94 :                 return;
     268                 :             :         }
     269                 :             : 
     270                 :             :         /* trade in our read lock for a write lock */
     271                 :          13 :         _bt_unlockbuf(rel, metabuf);
     272                 :          13 :         _bt_lockbuf(rel, metabuf, BT_WRITE);
     273                 :             : 
     274                 :          13 :         START_CRIT_SECTION();
     275                 :             : 
     276                 :             :         /* upgrade meta-page if needed */
     277         [ +  - ]:          13 :         if (metad->btm_version < BTREE_NOVAC_VERSION)
     278                 :           0 :                 _bt_upgrademetapage(metapg);
     279                 :             : 
     280                 :             :         /* update cleanup-related information */
     281                 :          13 :         metad->btm_last_cleanup_num_delpages = num_delpages;
     282                 :          13 :         metad->btm_last_cleanup_num_heap_tuples = -1.0;
     283                 :          13 :         MarkBufferDirty(metabuf);
     284                 :             : 
     285                 :             :         /* write wal record if needed */
     286   [ +  -  +  -  :          13 :         if (RelationNeedsWAL(rel))
             +  -  -  + ]
     287                 :             :         {
     288                 :          13 :                 xl_btree_metadata md;
     289                 :          13 :                 XLogRecPtr      recptr;
     290                 :             : 
     291                 :          13 :                 XLogBeginInsert();
     292                 :          13 :                 XLogRegisterBuffer(0, metabuf, REGBUF_WILL_INIT | REGBUF_STANDARD);
     293                 :             : 
     294         [ +  - ]:          13 :                 Assert(metad->btm_version >= BTREE_NOVAC_VERSION);
     295                 :          13 :                 md.version = metad->btm_version;
     296                 :          13 :                 md.root = metad->btm_root;
     297                 :          13 :                 md.level = metad->btm_level;
     298                 :          13 :                 md.fastroot = metad->btm_fastroot;
     299                 :          13 :                 md.fastlevel = metad->btm_fastlevel;
     300                 :          13 :                 md.last_cleanup_num_delpages = num_delpages;
     301                 :          13 :                 md.allequalimage = metad->btm_allequalimage;
     302                 :             : 
     303                 :          13 :                 XLogRegisterBufData(0, &md, sizeof(xl_btree_metadata));
     304                 :             : 
     305                 :          13 :                 recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_META_CLEANUP);
     306                 :             : 
     307                 :          13 :                 PageSetLSN(metapg, recptr);
     308                 :          13 :         }
     309                 :             : 
     310         [ +  - ]:          13 :         END_CRIT_SECTION();
     311                 :             : 
     312                 :          13 :         _bt_relbuf(rel, metabuf);
     313         [ -  + ]:         107 : }
     314                 :             : 
     315                 :             : /*
     316                 :             :  *      _bt_getroot() -- Get the root page of the btree.
     317                 :             :  *
     318                 :             :  *              Since the root page can move around the btree file, we have to read
     319                 :             :  *              its location from the metadata page, and then read the root page
     320                 :             :  *              itself.  If no root page exists yet, we have to create one.
     321                 :             :  *
     322                 :             :  *              The access type parameter (BT_READ or BT_WRITE) controls whether
     323                 :             :  *              a new root page will be created or not.  If access = BT_READ,
     324                 :             :  *              and no root page exists, we just return InvalidBuffer.  For
     325                 :             :  *              BT_WRITE, we try to create the root page if it doesn't exist.
     326                 :             :  *              NOTE that the returned root page will have only a read lock set
     327                 :             :  *              on it even if access = BT_WRITE!
     328                 :             :  *
     329                 :             :  *              If access = BT_WRITE, heaprel must be set; otherwise caller can just
     330                 :             :  *              pass NULL.  See _bt_allocbuf for an explanation.
     331                 :             :  *
     332                 :             :  *              The returned page is not necessarily the true root --- it could be
     333                 :             :  *              a "fast root" (a page that is alone in its level due to deletions).
     334                 :             :  *              Also, if the root page is split while we are "in flight" to it,
     335                 :             :  *              what we will return is the old root, which is now just the leftmost
     336                 :             :  *              page on a probably-not-very-wide level.  For most purposes this is
     337                 :             :  *              as good as or better than the true root, so we do not bother to
     338                 :             :  *              insist on finding the true root.  We do, however, guarantee to
     339                 :             :  *              return a live (not deleted or half-dead) page.
     340                 :             :  *
     341                 :             :  *              On successful return, the root page is pinned and read-locked.
     342                 :             :  *              The metadata page is not locked or pinned on exit.
     343                 :             :  */
     344                 :             : Buffer
     345                 :     1845721 : _bt_getroot(Relation rel, Relation heaprel, int access)
     346                 :             : {
     347                 :     1845721 :         Buffer          metabuf;
     348                 :     1845721 :         Buffer          rootbuf;
     349                 :     1845721 :         Page            rootpage;
     350                 :     1845721 :         BTPageOpaque rootopaque;
     351                 :     1845721 :         BlockNumber rootblkno;
     352                 :     1845721 :         uint32          rootlevel;
     353                 :     1845721 :         BTMetaPageData *metad;
     354                 :             : 
     355   [ +  +  +  - ]:     1845721 :         Assert(access == BT_READ || heaprel != NULL);
     356                 :             : 
     357                 :             :         /*
     358                 :             :          * Try to use previously-cached metapage data to find the root.  This
     359                 :             :          * normally saves one buffer access per index search, which is a very
     360                 :             :          * helpful savings in bufmgr traffic and hence contention.
     361                 :             :          */
     362         [ +  + ]:     1845721 :         if (rel->rd_amcache != NULL)
     363                 :             :         {
     364                 :     1817469 :                 metad = (BTMetaPageData *) rel->rd_amcache;
     365                 :             :                 /* We shouldn't have cached it if any of these fail */
     366         [ +  - ]:     1817469 :                 Assert(metad->btm_magic == BTREE_MAGIC);
     367         [ +  - ]:     1817469 :                 Assert(metad->btm_version >= BTREE_MIN_VERSION);
     368         [ +  - ]:     1817469 :                 Assert(metad->btm_version <= BTREE_VERSION);
     369   [ +  +  +  - ]:     1817469 :                 Assert(!metad->btm_allequalimage ||
     370                 :             :                            metad->btm_version > BTREE_NOVAC_VERSION);
     371         [ +  - ]:     1817469 :                 Assert(metad->btm_root != P_NONE);
     372                 :             : 
     373                 :     1817469 :                 rootblkno = metad->btm_fastroot;
     374         [ +  - ]:     1817469 :                 Assert(rootblkno != P_NONE);
     375                 :     1817469 :                 rootlevel = metad->btm_fastlevel;
     376                 :             : 
     377                 :     1817469 :                 rootbuf = _bt_getbuf(rel, rootblkno, BT_READ);
     378                 :     1817469 :                 rootpage = BufferGetPage(rootbuf);
     379                 :     1817469 :                 rootopaque = BTPageGetOpaque(rootpage);
     380                 :             : 
     381                 :             :                 /*
     382                 :             :                  * Since the cache might be stale, we check the page more carefully
     383                 :             :                  * here than normal.  We *must* check that it's not deleted. If it's
     384                 :             :                  * not alone on its level, then we reject too --- this may be overly
     385                 :             :                  * paranoid but better safe than sorry.  Note we don't check P_ISROOT,
     386                 :             :                  * because that's not set in a "fast root".
     387                 :             :                  */
     388         [ +  - ]:     1817469 :                 if (!P_IGNORE(rootopaque) &&
     389         [ +  - ]:     1817469 :                         rootopaque->btpo_level == rootlevel &&
     390   [ +  -  +  + ]:     1817469 :                         P_LEFTMOST(rootopaque) &&
     391                 :     1817469 :                         P_RIGHTMOST(rootopaque))
     392                 :             :                 {
     393                 :             :                         /* OK, accept cached page as the root */
     394                 :     1817374 :                         return rootbuf;
     395                 :             :                 }
     396                 :          95 :                 _bt_relbuf(rel, rootbuf);
     397                 :             :                 /* Cache is stale, throw it away */
     398         [ -  + ]:          95 :                 if (rel->rd_amcache)
     399                 :          95 :                         pfree(rel->rd_amcache);
     400                 :          95 :                 rel->rd_amcache = NULL;
     401                 :          95 :         }
     402                 :             : 
     403                 :       28347 :         metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
     404                 :       28347 :         metad = _bt_getmeta(rel, metabuf);
     405                 :             : 
     406                 :             :         /* if no root page initialized yet, do it */
     407         [ +  + ]:       28347 :         if (metad->btm_root == P_NONE)
     408                 :             :         {
     409                 :       28242 :                 Page            metapg;
     410                 :             : 
     411                 :             :                 /* If access = BT_READ, caller doesn't want us to create root yet */
     412         [ +  + ]:       28242 :                 if (access == BT_READ)
     413                 :             :                 {
     414                 :       27545 :                         _bt_relbuf(rel, metabuf);
     415                 :       27545 :                         return InvalidBuffer;
     416                 :             :                 }
     417                 :             : 
     418                 :             :                 /* trade in our read lock for a write lock */
     419                 :         697 :                 _bt_unlockbuf(rel, metabuf);
     420                 :         697 :                 _bt_lockbuf(rel, metabuf, BT_WRITE);
     421                 :             : 
     422                 :             :                 /*
     423                 :             :                  * Race condition:      if someone else initialized the metadata between
     424                 :             :                  * the time we released the read lock and acquired the write lock, we
     425                 :             :                  * must avoid doing it again.
     426                 :             :                  */
     427         [ -  + ]:         697 :                 if (metad->btm_root != P_NONE)
     428                 :             :                 {
     429                 :             :                         /*
     430                 :             :                          * Metadata initialized by someone else.  In order to guarantee no
     431                 :             :                          * deadlocks, we have to release the metadata page and start all
     432                 :             :                          * over again.  (Is that really true? But it's hardly worth trying
     433                 :             :                          * to optimize this case.)
     434                 :             :                          */
     435                 :           0 :                         _bt_relbuf(rel, metabuf);
     436                 :           0 :                         return _bt_getroot(rel, heaprel, access);
     437                 :             :                 }
     438                 :             : 
     439                 :             :                 /*
     440                 :             :                  * Get, initialize, write, and leave a lock of the appropriate type on
     441                 :             :                  * the new root page.  Since this is the first page in the tree, it's
     442                 :             :                  * a leaf as well as the root.
     443                 :             :                  */
     444                 :         697 :                 rootbuf = _bt_allocbuf(rel, heaprel);
     445                 :         697 :                 rootblkno = BufferGetBlockNumber(rootbuf);
     446                 :         697 :                 rootpage = BufferGetPage(rootbuf);
     447                 :         697 :                 rootopaque = BTPageGetOpaque(rootpage);
     448                 :         697 :                 rootopaque->btpo_prev = rootopaque->btpo_next = P_NONE;
     449                 :         697 :                 rootopaque->btpo_flags = (BTP_LEAF | BTP_ROOT);
     450                 :         697 :                 rootopaque->btpo_level = 0;
     451                 :         697 :                 rootopaque->btpo_cycleid = 0;
     452                 :             :                 /* Get raw page pointer for metapage */
     453                 :         697 :                 metapg = BufferGetPage(metabuf);
     454                 :             : 
     455                 :             :                 /* NO ELOG(ERROR) till meta is updated */
     456                 :         697 :                 START_CRIT_SECTION();
     457                 :             : 
     458                 :             :                 /* upgrade metapage if needed */
     459         [ +  - ]:         697 :                 if (metad->btm_version < BTREE_NOVAC_VERSION)
     460                 :           0 :                         _bt_upgrademetapage(metapg);
     461                 :             : 
     462                 :         697 :                 metad->btm_root = rootblkno;
     463                 :         697 :                 metad->btm_level = 0;
     464                 :         697 :                 metad->btm_fastroot = rootblkno;
     465                 :         697 :                 metad->btm_fastlevel = 0;
     466                 :         697 :                 metad->btm_last_cleanup_num_delpages = 0;
     467                 :         697 :                 metad->btm_last_cleanup_num_heap_tuples = -1.0;
     468                 :             : 
     469                 :         697 :                 MarkBufferDirty(rootbuf);
     470                 :         697 :                 MarkBufferDirty(metabuf);
     471                 :             : 
     472                 :             :                 /* XLOG stuff */
     473   [ +  +  +  +  :         697 :                 if (RelationNeedsWAL(rel))
             +  +  -  + ]
     474                 :             :                 {
     475                 :         618 :                         xl_btree_newroot xlrec;
     476                 :         618 :                         XLogRecPtr      recptr;
     477                 :         618 :                         xl_btree_metadata md;
     478                 :             : 
     479                 :         618 :                         XLogBeginInsert();
     480                 :         618 :                         XLogRegisterBuffer(0, rootbuf, REGBUF_WILL_INIT);
     481                 :         618 :                         XLogRegisterBuffer(2, metabuf, REGBUF_WILL_INIT | REGBUF_STANDARD);
     482                 :             : 
     483         [ +  - ]:         618 :                         Assert(metad->btm_version >= BTREE_NOVAC_VERSION);
     484                 :         618 :                         md.version = metad->btm_version;
     485                 :         618 :                         md.root = rootblkno;
     486                 :         618 :                         md.level = 0;
     487                 :         618 :                         md.fastroot = rootblkno;
     488                 :         618 :                         md.fastlevel = 0;
     489                 :         618 :                         md.last_cleanup_num_delpages = 0;
     490                 :         618 :                         md.allequalimage = metad->btm_allequalimage;
     491                 :             : 
     492                 :         618 :                         XLogRegisterBufData(2, &md, sizeof(xl_btree_metadata));
     493                 :             : 
     494                 :         618 :                         xlrec.rootblk = rootblkno;
     495                 :         618 :                         xlrec.level = 0;
     496                 :             : 
     497                 :         618 :                         XLogRegisterData(&xlrec, SizeOfBtreeNewroot);
     498                 :             : 
     499                 :         618 :                         recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_NEWROOT);
     500                 :             : 
     501                 :         618 :                         PageSetLSN(rootpage, recptr);
     502                 :         618 :                         PageSetLSN(metapg, recptr);
     503                 :         618 :                 }
     504                 :             : 
     505         [ +  - ]:         697 :                 END_CRIT_SECTION();
     506                 :             : 
     507                 :             :                 /*
     508                 :             :                  * swap root write lock for read lock.  There is no danger of anyone
     509                 :             :                  * else accessing the new root page while it's unlocked, since no one
     510                 :             :                  * else knows where it is yet.
     511                 :             :                  */
     512                 :         697 :                 _bt_unlockbuf(rel, rootbuf);
     513                 :         697 :                 _bt_lockbuf(rel, rootbuf, BT_READ);
     514                 :             : 
     515                 :             :                 /* okay, metadata is correct, release lock on it without caching */
     516                 :         697 :                 _bt_relbuf(rel, metabuf);
     517         [ +  + ]:       28242 :         }
     518                 :             :         else
     519                 :             :         {
     520                 :         105 :                 rootblkno = metad->btm_fastroot;
     521         [ +  - ]:         105 :                 Assert(rootblkno != P_NONE);
     522                 :         105 :                 rootlevel = metad->btm_fastlevel;
     523                 :             : 
     524                 :             :                 /*
     525                 :             :                  * Cache the metapage data for next time
     526                 :             :                  */
     527                 :         105 :                 rel->rd_amcache = MemoryContextAlloc(rel->rd_indexcxt,
     528                 :             :                                                                                          sizeof(BTMetaPageData));
     529                 :         105 :                 memcpy(rel->rd_amcache, metad, sizeof(BTMetaPageData));
     530                 :             : 
     531                 :             :                 /*
     532                 :             :                  * We are done with the metapage; arrange to release it via first
     533                 :             :                  * _bt_relandgetbuf call
     534                 :             :                  */
     535                 :         105 :                 rootbuf = metabuf;
     536                 :             : 
     537                 :         105 :                 for (;;)
     538                 :             :                 {
     539                 :         105 :                         rootbuf = _bt_relandgetbuf(rel, rootbuf, rootblkno, BT_READ);
     540                 :         105 :                         rootpage = BufferGetPage(rootbuf);
     541                 :         105 :                         rootopaque = BTPageGetOpaque(rootpage);
     542                 :             : 
     543         [ -  + ]:         105 :                         if (!P_IGNORE(rootopaque))
     544                 :         105 :                                 break;
     545                 :             : 
     546                 :             :                         /* it's dead, Jim.  step right one page */
     547         [ #  # ]:           0 :                         if (P_RIGHTMOST(rootopaque))
     548   [ #  #  #  # ]:           0 :                                 elog(ERROR, "no live root page found in index \"%s\"",
     549                 :             :                                          RelationGetRelationName(rel));
     550                 :           0 :                         rootblkno = rootopaque->btpo_next;
     551                 :             :                 }
     552                 :             : 
     553         [ +  - ]:         105 :                 if (rootopaque->btpo_level != rootlevel)
     554   [ #  #  #  # ]:           0 :                         elog(ERROR, "root page %u of index \"%s\" has level %u, expected %u",
     555                 :             :                                  rootblkno, RelationGetRelationName(rel),
     556                 :             :                                  rootopaque->btpo_level, rootlevel);
     557                 :             :         }
     558                 :             : 
     559                 :             :         /*
     560                 :             :          * By here, we have a pin and read lock on the root page, and no lock set
     561                 :             :          * on the metadata page.  Return the root page's buffer.
     562                 :             :          */
     563                 :         802 :         return rootbuf;
     564                 :     1845721 : }
     565                 :             : 
     566                 :             : /*
     567                 :             :  *      _bt_gettrueroot() -- Get the true root page of the btree.
     568                 :             :  *
     569                 :             :  *              This is the same as the BT_READ case of _bt_getroot(), except
     570                 :             :  *              we follow the true-root link not the fast-root link.
     571                 :             :  *
     572                 :             :  * By the time we acquire lock on the root page, it might have been split and
     573                 :             :  * not be the true root anymore.  This is okay for the present uses of this
     574                 :             :  * routine; we only really need to be able to move up at least one tree level
     575                 :             :  * from whatever non-root page we were at.  If we ever do need to lock the
     576                 :             :  * one true root page, we could loop here, re-reading the metapage on each
     577                 :             :  * failure.  (Note that it wouldn't do to hold the lock on the metapage while
     578                 :             :  * moving to the root --- that'd deadlock against any concurrent root split.)
     579                 :             :  */
     580                 :             : Buffer
     581                 :           3 : _bt_gettrueroot(Relation rel)
     582                 :             : {
     583                 :           3 :         Buffer          metabuf;
     584                 :           3 :         Page            metapg;
     585                 :           3 :         BTPageOpaque metaopaque;
     586                 :           3 :         Buffer          rootbuf;
     587                 :           3 :         Page            rootpage;
     588                 :           3 :         BTPageOpaque rootopaque;
     589                 :           3 :         BlockNumber rootblkno;
     590                 :           3 :         uint32          rootlevel;
     591                 :           3 :         BTMetaPageData *metad;
     592                 :             : 
     593                 :             :         /*
     594                 :             :          * We don't try to use cached metapage data here, since (a) this path is
     595                 :             :          * not performance-critical, and (b) if we are here it suggests our cache
     596                 :             :          * is out-of-date anyway.  In light of point (b), it's probably safest to
     597                 :             :          * actively flush any cached metapage info.
     598                 :             :          */
     599         [ -  + ]:           3 :         if (rel->rd_amcache)
     600                 :           3 :                 pfree(rel->rd_amcache);
     601                 :           3 :         rel->rd_amcache = NULL;
     602                 :             : 
     603                 :           3 :         metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
     604                 :           3 :         metapg = BufferGetPage(metabuf);
     605                 :           3 :         metaopaque = BTPageGetOpaque(metapg);
     606                 :           3 :         metad = BTPageGetMeta(metapg);
     607                 :             : 
     608         [ +  - ]:           3 :         if (!P_ISMETA(metaopaque) ||
     609                 :           3 :                 metad->btm_magic != BTREE_MAGIC)
     610   [ #  #  #  # ]:           0 :                 ereport(ERROR,
     611                 :             :                                 (errcode(ERRCODE_INDEX_CORRUPTED),
     612                 :             :                                  errmsg("index \"%s\" is not a btree",
     613                 :             :                                                 RelationGetRelationName(rel))));
     614                 :             : 
     615         [ +  - ]:           3 :         if (metad->btm_version < BTREE_MIN_VERSION ||
     616                 :           3 :                 metad->btm_version > BTREE_VERSION)
     617   [ #  #  #  # ]:           0 :                 ereport(ERROR,
     618                 :             :                                 (errcode(ERRCODE_INDEX_CORRUPTED),
     619                 :             :                                  errmsg("version mismatch in index \"%s\": file version %d, "
     620                 :             :                                                 "current version %d, minimal supported version %d",
     621                 :             :                                                 RelationGetRelationName(rel),
     622                 :             :                                                 metad->btm_version, BTREE_VERSION, BTREE_MIN_VERSION)));
     623                 :             : 
     624                 :             :         /* if no root page initialized yet, fail */
     625         [ +  - ]:           3 :         if (metad->btm_root == P_NONE)
     626                 :             :         {
     627                 :           0 :                 _bt_relbuf(rel, metabuf);
     628                 :           0 :                 return InvalidBuffer;
     629                 :             :         }
     630                 :             : 
     631                 :           3 :         rootblkno = metad->btm_root;
     632                 :           3 :         rootlevel = metad->btm_level;
     633                 :             : 
     634                 :             :         /*
     635                 :             :          * We are done with the metapage; arrange to release it via first
     636                 :             :          * _bt_relandgetbuf call
     637                 :             :          */
     638                 :           3 :         rootbuf = metabuf;
     639                 :             : 
     640                 :           3 :         for (;;)
     641                 :             :         {
     642                 :           3 :                 rootbuf = _bt_relandgetbuf(rel, rootbuf, rootblkno, BT_READ);
     643                 :           3 :                 rootpage = BufferGetPage(rootbuf);
     644                 :           3 :                 rootopaque = BTPageGetOpaque(rootpage);
     645                 :             : 
     646         [ -  + ]:           3 :                 if (!P_IGNORE(rootopaque))
     647                 :           3 :                         break;
     648                 :             : 
     649                 :             :                 /* it's dead, Jim.  step right one page */
     650         [ #  # ]:           0 :                 if (P_RIGHTMOST(rootopaque))
     651   [ #  #  #  # ]:           0 :                         elog(ERROR, "no live root page found in index \"%s\"",
     652                 :             :                                  RelationGetRelationName(rel));
     653                 :           0 :                 rootblkno = rootopaque->btpo_next;
     654                 :             :         }
     655                 :             : 
     656         [ +  - ]:           3 :         if (rootopaque->btpo_level != rootlevel)
     657   [ #  #  #  # ]:           0 :                 elog(ERROR, "root page %u of index \"%s\" has level %u, expected %u",
     658                 :             :                          rootblkno, RelationGetRelationName(rel),
     659                 :             :                          rootopaque->btpo_level, rootlevel);
     660                 :             : 
     661                 :           3 :         return rootbuf;
     662                 :           3 : }
     663                 :             : 
     664                 :             : /*
     665                 :             :  *      _bt_getrootheight() -- Get the height of the btree search tree.
     666                 :             :  *
     667                 :             :  *              We return the level (counting from zero) of the current fast root.
     668                 :             :  *              This represents the number of tree levels we'd have to descend through
     669                 :             :  *              to start any btree index search.
     670                 :             :  *
     671                 :             :  *              This is used by the planner for cost-estimation purposes.  Since it's
     672                 :             :  *              only an estimate, slightly-stale data is fine, hence we don't worry
     673                 :             :  *              about updating previously cached data.
     674                 :             :  */
     675                 :             : int
     676                 :      644878 : _bt_getrootheight(Relation rel)
     677                 :             : {
     678                 :      644878 :         BTMetaPageData *metad;
     679                 :             : 
     680         [ +  + ]:      644878 :         if (rel->rd_amcache == NULL)
     681                 :             :         {
     682                 :        6820 :                 Buffer          metabuf;
     683                 :             : 
     684                 :        6820 :                 metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
     685                 :        6820 :                 metad = _bt_getmeta(rel, metabuf);
     686                 :             : 
     687                 :             :                 /*
     688                 :             :                  * If there's no root page yet, _bt_getroot() doesn't expect a cache
     689                 :             :                  * to be made, so just stop here and report the index height is zero.
     690                 :             :                  * (XXX perhaps _bt_getroot() should be changed to allow this case.)
     691                 :             :                  */
     692         [ +  + ]:        6820 :                 if (metad->btm_root == P_NONE)
     693                 :             :                 {
     694                 :        5053 :                         _bt_relbuf(rel, metabuf);
     695                 :        5053 :                         return 0;
     696                 :             :                 }
     697                 :             : 
     698                 :             :                 /*
     699                 :             :                  * Cache the metapage data for next time
     700                 :             :                  */
     701                 :        1767 :                 rel->rd_amcache = MemoryContextAlloc(rel->rd_indexcxt,
     702                 :             :                                                                                          sizeof(BTMetaPageData));
     703                 :        1767 :                 memcpy(rel->rd_amcache, metad, sizeof(BTMetaPageData));
     704                 :        1767 :                 _bt_relbuf(rel, metabuf);
     705         [ +  + ]:        6820 :         }
     706                 :             : 
     707                 :             :         /* Get cached page */
     708                 :      639825 :         metad = (BTMetaPageData *) rel->rd_amcache;
     709                 :             :         /* We shouldn't have cached it if any of these fail */
     710         [ +  - ]:      639825 :         Assert(metad->btm_magic == BTREE_MAGIC);
     711         [ +  - ]:      639825 :         Assert(metad->btm_version >= BTREE_MIN_VERSION);
     712         [ +  - ]:      639825 :         Assert(metad->btm_version <= BTREE_VERSION);
     713   [ +  +  +  - ]:      639825 :         Assert(!metad->btm_allequalimage ||
     714                 :             :                    metad->btm_version > BTREE_NOVAC_VERSION);
     715         [ +  - ]:      639825 :         Assert(metad->btm_fastroot != P_NONE);
     716                 :             : 
     717                 :      639825 :         return metad->btm_fastlevel;
     718                 :      644878 : }
     719                 :             : 
     720                 :             : /*
     721                 :             :  *      _bt_metaversion() -- Get version/status info from metapage.
     722                 :             :  *
     723                 :             :  *              Sets caller's *heapkeyspace and *allequalimage arguments using data
     724                 :             :  *              from the B-Tree metapage (could be locally-cached version).  This
     725                 :             :  *              information needs to be stashed in insertion scankey, so we provide a
     726                 :             :  *              single function that fetches both at once.
     727                 :             :  *
     728                 :             :  *              This is used to determine the rules that must be used to descend a
     729                 :             :  *              btree.  Version 4 indexes treat heap TID as a tiebreaker attribute.
     730                 :             :  *              pg_upgrade'd version 3 indexes need extra steps to preserve reasonable
     731                 :             :  *              performance when inserting a new BTScanInsert-wise duplicate tuple
     732                 :             :  *              among many leaf pages already full of such duplicates.
     733                 :             :  *
     734                 :             :  *              Also sets allequalimage field, which indicates whether or not it is
     735                 :             :  *              safe to apply deduplication.  We rely on the assumption that
     736                 :             :  *              btm_allequalimage will be zero'ed on heapkeyspace indexes that were
     737                 :             :  *              pg_upgrade'd from Postgres 12.
     738                 :             :  */
     739                 :             : void
     740                 :     1839280 : _bt_metaversion(Relation rel, bool *heapkeyspace, bool *allequalimage)
     741                 :             : {
     742                 :     1839280 :         BTMetaPageData *metad;
     743                 :             : 
     744         [ +  + ]:     1839280 :         if (rel->rd_amcache == NULL)
     745                 :             :         {
     746                 :       53286 :                 Buffer          metabuf;
     747                 :             : 
     748                 :       53286 :                 metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
     749                 :       53286 :                 metad = _bt_getmeta(rel, metabuf);
     750                 :             : 
     751                 :             :                 /*
     752                 :             :                  * If there's no root page yet, _bt_getroot() doesn't expect a cache
     753                 :             :                  * to be made, so just stop here.  (XXX perhaps _bt_getroot() should
     754                 :             :                  * be changed to allow this case.)
     755                 :             :                  */
     756         [ +  + ]:       53286 :                 if (metad->btm_root == P_NONE)
     757                 :             :                 {
     758                 :       27903 :                         *heapkeyspace = metad->btm_version > BTREE_NOVAC_VERSION;
     759                 :       27903 :                         *allequalimage = metad->btm_allequalimage;
     760                 :             : 
     761                 :       27903 :                         _bt_relbuf(rel, metabuf);
     762                 :       27903 :                         return;
     763                 :             :                 }
     764                 :             : 
     765                 :             :                 /*
     766                 :             :                  * Cache the metapage data for next time
     767                 :             :                  *
     768                 :             :                  * An on-the-fly version upgrade performed by _bt_upgrademetapage()
     769                 :             :                  * can change the nbtree version for an index without invalidating any
     770                 :             :                  * local cache.  This is okay because it can only happen when moving
     771                 :             :                  * from version 2 to version 3, both of which are !heapkeyspace
     772                 :             :                  * versions.
     773                 :             :                  */
     774                 :       25383 :                 rel->rd_amcache = MemoryContextAlloc(rel->rd_indexcxt,
     775                 :             :                                                                                          sizeof(BTMetaPageData));
     776                 :       25383 :                 memcpy(rel->rd_amcache, metad, sizeof(BTMetaPageData));
     777                 :       25383 :                 _bt_relbuf(rel, metabuf);
     778         [ +  + ]:       53286 :         }
     779                 :             : 
     780                 :             :         /* Get cached page */
     781                 :     1811377 :         metad = (BTMetaPageData *) rel->rd_amcache;
     782                 :             :         /* We shouldn't have cached it if any of these fail */
     783         [ +  - ]:     1811377 :         Assert(metad->btm_magic == BTREE_MAGIC);
     784         [ +  - ]:     1811377 :         Assert(metad->btm_version >= BTREE_MIN_VERSION);
     785         [ +  - ]:     1811377 :         Assert(metad->btm_version <= BTREE_VERSION);
     786   [ +  +  +  - ]:     1811377 :         Assert(!metad->btm_allequalimage ||
     787                 :             :                    metad->btm_version > BTREE_NOVAC_VERSION);
     788         [ +  - ]:     1811377 :         Assert(metad->btm_fastroot != P_NONE);
     789                 :             : 
     790                 :     1811377 :         *heapkeyspace = metad->btm_version > BTREE_NOVAC_VERSION;
     791                 :     1811377 :         *allequalimage = metad->btm_allequalimage;
     792         [ -  + ]:     1839280 : }
     793                 :             : 
     794                 :             : /*
     795                 :             :  *      _bt_checkpage() -- Verify that a freshly-read page looks sane.
     796                 :             :  */
     797                 :             : void
     798                 :     3508720 : _bt_checkpage(Relation rel, Buffer buf)
     799                 :             : {
     800                 :     3508720 :         Page            page = BufferGetPage(buf);
     801                 :             : 
     802                 :             :         /*
     803                 :             :          * ReadBuffer verifies that every newly-read page passes
     804                 :             :          * PageHeaderIsValid, which means it either contains a reasonably sane
     805                 :             :          * page header or is all-zero.  We have to defend against the all-zero
     806                 :             :          * case, however.
     807                 :             :          */
     808         [ +  - ]:     3508720 :         if (PageIsNew(page))
     809   [ #  #  #  # ]:           0 :                 ereport(ERROR,
     810                 :             :                                 (errcode(ERRCODE_INDEX_CORRUPTED),
     811                 :             :                                  errmsg("index \"%s\" contains unexpected zero page at block %u",
     812                 :             :                                                 RelationGetRelationName(rel),
     813                 :             :                                                 BufferGetBlockNumber(buf)),
     814                 :             :                                  errhint("Please REINDEX it.")));
     815                 :             : 
     816                 :             :         /*
     817                 :             :          * Additionally check that the special area looks sane.
     818                 :             :          */
     819         [ +  - ]:     3508720 :         if (PageGetSpecialSize(page) != MAXALIGN(sizeof(BTPageOpaqueData)))
     820   [ #  #  #  # ]:           0 :                 ereport(ERROR,
     821                 :             :                                 (errcode(ERRCODE_INDEX_CORRUPTED),
     822                 :             :                                  errmsg("index \"%s\" contains corrupted page at block %u",
     823                 :             :                                                 RelationGetRelationName(rel),
     824                 :             :                                                 BufferGetBlockNumber(buf)),
     825                 :             :                                  errhint("Please REINDEX it.")));
     826                 :     3508720 : }
     827                 :             : 
     828                 :             : /*
     829                 :             :  *      _bt_getbuf() -- Get an existing block in a buffer, for read or write.
     830                 :             :  *
     831                 :             :  *              The general rule in nbtree is that it's never okay to access a
     832                 :             :  *              page without holding both a buffer pin and a buffer lock on
     833                 :             :  *              the page's buffer.
     834                 :             :  *
     835                 :             :  *              When this routine returns, the appropriate lock is set on the
     836                 :             :  *              requested buffer and its reference count has been incremented
     837                 :             :  *              (ie, the buffer is "locked and pinned").  Also, we apply
     838                 :             :  *              _bt_checkpage to sanity-check the page, and perform Valgrind
     839                 :             :  *              client requests that help Valgrind detect unsafe page accesses.
     840                 :             :  *
     841                 :             :  *              Note: raw LockBuffer() calls are disallowed in nbtree; all
     842                 :             :  *              buffer lock requests need to go through wrapper functions such
     843                 :             :  *              as _bt_lockbuf().
     844                 :             :  */
     845                 :             : Buffer
     846                 :     1921354 : _bt_getbuf(Relation rel, BlockNumber blkno, int access)
     847                 :             : {
     848                 :     1921354 :         Buffer          buf;
     849                 :             : 
     850         [ +  - ]:     1921354 :         Assert(BlockNumberIsValid(blkno));
     851                 :             : 
     852                 :             :         /* Read an existing block of the relation */
     853                 :     1921354 :         buf = ReadBuffer(rel, blkno);
     854                 :     1921354 :         _bt_lockbuf(rel, buf, access);
     855                 :     1921354 :         _bt_checkpage(rel, buf);
     856                 :             : 
     857                 :     3842708 :         return buf;
     858                 :     1921354 : }
     859                 :             : 
     860                 :             : /*
     861                 :             :  *      _bt_allocbuf() -- Allocate a new block/page.
     862                 :             :  *
     863                 :             :  * Returns a write-locked buffer containing an unallocated nbtree page.
     864                 :             :  *
     865                 :             :  * Callers are required to pass a valid heaprel.  We need heaprel so that we
     866                 :             :  * can handle generating a snapshotConflictHorizon that makes reusing a page
     867                 :             :  * from the FSM safe for queries that may be running on standbys.
     868                 :             :  */
     869                 :             : Buffer
     870                 :        3077 : _bt_allocbuf(Relation rel, Relation heaprel)
     871                 :             : {
     872                 :        3077 :         Buffer          buf;
     873                 :        3077 :         BlockNumber blkno;
     874                 :        3077 :         Page            page;
     875                 :             : 
     876         [ +  - ]:        3077 :         Assert(heaprel != NULL);
     877                 :             : 
     878                 :             :         /*
     879                 :             :          * First see if the FSM knows of any free pages.
     880                 :             :          *
     881                 :             :          * We can't trust the FSM's report unreservedly; we have to check that the
     882                 :             :          * page is still free.  (For example, an already-free page could have been
     883                 :             :          * re-used between the time the last VACUUM scanned it and the time the
     884                 :             :          * VACUUM made its FSM updates.)
     885                 :             :          *
     886                 :             :          * In fact, it's worse than that: we can't even assume that it's safe to
     887                 :             :          * take a lock on the reported page.  If somebody else has a lock on it,
     888                 :             :          * or even worse our own caller does, we could deadlock.  (The own-caller
     889                 :             :          * scenario is actually not improbable. Consider an index on a serial or
     890                 :             :          * timestamp column.  Nearly all splits will be at the rightmost page, so
     891                 :             :          * it's entirely likely that _bt_split will call us while holding a lock
     892                 :             :          * on the page most recently acquired from FSM. A VACUUM running
     893                 :             :          * concurrently with the previous split could well have placed that page
     894                 :             :          * back in FSM.)
     895                 :             :          *
     896                 :             :          * To get around that, we ask for only a conditional lock on the reported
     897                 :             :          * page.  If we fail, then someone else is using the page, and we may
     898                 :             :          * reasonably assume it's not free.  (If we happen to be wrong, the worst
     899                 :             :          * consequence is the page will be lost to use till the next VACUUM, which
     900                 :             :          * is no big problem.)
     901                 :             :          */
     902                 :        3077 :         for (;;)
     903                 :             :         {
     904                 :        3077 :                 blkno = GetFreeIndexPage(rel);
     905         [ +  + ]:        3077 :                 if (blkno == InvalidBlockNumber)
     906                 :        3066 :                         break;
     907                 :          11 :                 buf = ReadBuffer(rel, blkno);
     908         [ +  - ]:          11 :                 if (_bt_conditionallockbuf(rel, buf))
     909                 :             :                 {
     910                 :          11 :                         page = BufferGetPage(buf);
     911                 :             : 
     912                 :             :                         /*
     913                 :             :                          * It's possible to find an all-zeroes page in an index.  For
     914                 :             :                          * example, a backend might successfully extend the relation one
     915                 :             :                          * page and then crash before it is able to make a WAL entry for
     916                 :             :                          * adding the page.  If we find a zeroed page then reclaim it
     917                 :             :                          * immediately.
     918                 :             :                          */
     919         [ -  + ]:          11 :                         if (PageIsNew(page))
     920                 :             :                         {
     921                 :             :                                 /* Okay to use page.  Initialize and return it. */
     922                 :           0 :                                 _bt_pageinit(page, BufferGetPageSize(buf));
     923                 :           0 :                                 return buf;
     924                 :             :                         }
     925                 :             : 
     926         [ -  + ]:          11 :                         if (BTPageIsRecyclable(page, heaprel))
     927                 :             :                         {
     928                 :             :                                 /*
     929                 :             :                                  * If we are generating WAL for Hot Standby then create a WAL
     930                 :             :                                  * record that will allow us to conflict with queries running
     931                 :             :                                  * on standby, in case they have snapshots older than safexid
     932                 :             :                                  * value
     933                 :             :                                  */
     934   [ +  -  +  -  :          11 :                                 if (RelationNeedsWAL(rel) && XLogStandbyInfoActive())
             +  -  #  # ]
     935                 :             :                                 {
     936                 :           0 :                                         xl_btree_reuse_page xlrec_reuse;
     937                 :             : 
     938                 :             :                                         /*
     939                 :             :                                          * Note that we don't register the buffer with the record,
     940                 :             :                                          * because this operation doesn't modify the page (that
     941                 :             :                                          * already happened, back when VACUUM deleted the page).
     942                 :             :                                          * This record only exists to provide a conflict point for
     943                 :             :                                          * Hot Standby.  See record REDO routine comments.
     944                 :             :                                          */
     945                 :           0 :                                         xlrec_reuse.locator = rel->rd_locator;
     946                 :           0 :                                         xlrec_reuse.block = blkno;
     947                 :           0 :                                         xlrec_reuse.snapshotConflictHorizon = BTPageGetDeleteXid(page);
     948                 :           0 :                                         xlrec_reuse.isCatalogRel =
     949   [ #  #  #  #  :           0 :                                                 RelationIsAccessibleInLogicalDecoding(heaprel);
          #  #  #  #  #  
          #  #  #  #  #  
             #  #  #  # ]
     950                 :             : 
     951                 :           0 :                                         XLogBeginInsert();
     952                 :           0 :                                         XLogRegisterData(&xlrec_reuse, SizeOfBtreeReusePage);
     953                 :             : 
     954                 :           0 :                                         XLogInsert(RM_BTREE_ID, XLOG_BTREE_REUSE_PAGE);
     955                 :           0 :                                 }
     956                 :             : 
     957                 :             :                                 /* Okay to use page.  Re-initialize and return it. */
     958                 :          11 :                                 _bt_pageinit(page, BufferGetPageSize(buf));
     959                 :          11 :                                 return buf;
     960                 :             :                         }
     961   [ #  #  #  # ]:           0 :                         elog(DEBUG2, "FSM returned nonrecyclable page");
     962                 :           0 :                         _bt_relbuf(rel, buf);
     963                 :           0 :                 }
     964                 :             :                 else
     965                 :             :                 {
     966   [ #  #  #  # ]:           0 :                         elog(DEBUG2, "FSM returned nonlockable page");
     967                 :             :                         /* couldn't get lock, so just drop pin */
     968                 :           0 :                         ReleaseBuffer(buf);
     969                 :             :                 }
     970                 :             :         }
     971                 :             : 
     972                 :             :         /*
     973                 :             :          * Extend the relation by one page. Need to use RBM_ZERO_AND_LOCK or we
     974                 :             :          * risk a race condition against btvacuumscan --- see comments therein.
     975                 :             :          * This forces us to repeat the valgrind request that _bt_lockbuf()
     976                 :             :          * otherwise would make, as we can't use _bt_lockbuf() without introducing
     977                 :             :          * a race.
     978                 :             :          */
     979                 :        3066 :         buf = ExtendBufferedRel(BMR_REL(rel), MAIN_FORKNUM, NULL, EB_LOCK_FIRST);
     980         [ +  + ]:        3066 :         if (!RelationUsesLocalBuffers(rel))
     981                 :        2966 :                 VALGRIND_MAKE_MEM_DEFINED(BufferGetPage(buf), BLCKSZ);
     982                 :             : 
     983                 :             :         /* Initialize the new page before returning it */
     984                 :        3066 :         page = BufferGetPage(buf);
     985         [ +  - ]:        3066 :         Assert(PageIsNew(page));
     986                 :        3066 :         _bt_pageinit(page, BufferGetPageSize(buf));
     987                 :             : 
     988                 :        3066 :         return buf;
     989                 :        3077 : }
     990                 :             : 
     991                 :             : /*
     992                 :             :  *      _bt_relandgetbuf() -- release a locked buffer and get another one.
     993                 :             :  *
     994                 :             :  * This is equivalent to _bt_relbuf followed by _bt_getbuf.  Also, if obuf is
     995                 :             :  * InvalidBuffer then it reduces to just _bt_getbuf; allowing this case
     996                 :             :  * simplifies some callers.
     997                 :             :  *
     998                 :             :  * The original motivation for using this was to avoid two entries to the
     999                 :             :  * bufmgr when one would do.  However, now it's mainly just a notational
    1000                 :             :  * convenience.  The only case where it saves work over _bt_relbuf/_bt_getbuf
    1001                 :             :  * is when the target page is the same one already in the buffer.
    1002                 :             :  */
    1003                 :             : Buffer
    1004                 :     1586453 : _bt_relandgetbuf(Relation rel, Buffer obuf, BlockNumber blkno, int access)
    1005                 :             : {
    1006                 :     1586453 :         Buffer          buf;
    1007                 :             : 
    1008         [ +  - ]:     1586453 :         Assert(BlockNumberIsValid(blkno));
    1009         [ +  + ]:     1586453 :         if (BufferIsValid(obuf))
    1010                 :     1583819 :                 _bt_unlockbuf(rel, obuf);
    1011                 :     1586453 :         buf = ReleaseAndReadBuffer(obuf, rel, blkno);
    1012                 :     1586453 :         _bt_lockbuf(rel, buf, access);
    1013                 :             : 
    1014                 :     1586453 :         _bt_checkpage(rel, buf);
    1015                 :     3172906 :         return buf;
    1016                 :     1586453 : }
    1017                 :             : 
    1018                 :             : /*
    1019                 :             :  *      _bt_relbuf() -- release a locked buffer.
    1020                 :             :  *
    1021                 :             :  * Lock and pin (refcount) are both dropped.
    1022                 :             :  */
    1023                 :             : void
    1024                 :     1605453 : _bt_relbuf(Relation rel, Buffer buf)
    1025                 :             : {
    1026                 :     1605453 :         _bt_unlockbuf(rel, buf);
    1027                 :     1605453 :         ReleaseBuffer(buf);
    1028                 :     1605453 : }
    1029                 :             : 
    1030                 :             : /*
    1031                 :             :  *      _bt_lockbuf() -- lock a pinned buffer.
    1032                 :             :  *
    1033                 :             :  * Lock is acquired without acquiring another pin.  This is like a raw
    1034                 :             :  * LockBuffer() call, but performs extra steps needed by Valgrind.
    1035                 :             :  *
    1036                 :             :  * Note: Caller may need to call _bt_checkpage() with buf when pin on buf
    1037                 :             :  * wasn't originally acquired in _bt_getbuf() or _bt_relandgetbuf().
    1038                 :             :  */
    1039                 :             : void
    1040                 :     3574648 : _bt_lockbuf(Relation rel, Buffer buf, int access)
    1041                 :             : {
    1042                 :             :         /* LockBuffer() asserts that pin is held by this backend */
    1043                 :     3574648 :         LockBuffer(buf, access);
    1044                 :             : 
    1045                 :             :         /*
    1046                 :             :          * It doesn't matter that _bt_unlockbuf() won't get called in the event of
    1047                 :             :          * an nbtree error (e.g. a unique violation error).  That won't cause
    1048                 :             :          * Valgrind false positives.
    1049                 :             :          *
    1050                 :             :          * The nbtree client requests are superimposed on top of the bufmgr.c
    1051                 :             :          * buffer pin client requests.  In the event of an nbtree error the buffer
    1052                 :             :          * will certainly get marked as defined when the backend once again
    1053                 :             :          * acquires its first pin on the buffer. (Of course, if the backend never
    1054                 :             :          * touches the buffer again then it doesn't matter that it remains
    1055                 :             :          * non-accessible to Valgrind.)
    1056                 :             :          *
    1057                 :             :          * Note: When an IndexTuple C pointer gets computed using an ItemId read
    1058                 :             :          * from a page while a lock was held, the C pointer becomes unsafe to
    1059                 :             :          * dereference forever as soon as the lock is released.  Valgrind can only
    1060                 :             :          * detect cases where the pointer gets dereferenced with no _current_
    1061                 :             :          * lock/pin held, though.
    1062                 :             :          */
    1063         [ +  + ]:     3574648 :         if (!RelationUsesLocalBuffers(rel))
    1064                 :     3548080 :                 VALGRIND_MAKE_MEM_DEFINED(BufferGetPage(buf), BLCKSZ);
    1065                 :     3574648 : }
    1066                 :             : 
    1067                 :             : /*
    1068                 :             :  *      _bt_unlockbuf() -- unlock a pinned buffer.
    1069                 :             :  */
    1070                 :             : void
    1071                 :     3577763 : _bt_unlockbuf(Relation rel, Buffer buf)
    1072                 :             : {
    1073                 :             :         /*
    1074                 :             :          * Buffer is pinned and locked, which means that it is expected to be
    1075                 :             :          * defined and addressable.  Check that proactively.
    1076                 :             :          */
    1077                 :     3577763 :         VALGRIND_CHECK_MEM_IS_DEFINED(BufferGetPage(buf), BLCKSZ);
    1078                 :             : 
    1079                 :             :         /* LockBuffer() asserts that pin is held by this backend */
    1080                 :     3577763 :         LockBuffer(buf, BUFFER_LOCK_UNLOCK);
    1081                 :             : 
    1082         [ +  + ]:     3577763 :         if (!RelationUsesLocalBuffers(rel))
    1083                 :     3551095 :                 VALGRIND_MAKE_MEM_NOACCESS(BufferGetPage(buf), BLCKSZ);
    1084                 :     3577763 : }
    1085                 :             : 
    1086                 :             : /*
    1087                 :             :  *      _bt_conditionallockbuf() -- conditionally BT_WRITE lock pinned
    1088                 :             :  *      buffer.
    1089                 :             :  *
    1090                 :             :  * Note: Caller may need to call _bt_checkpage() with buf when pin on buf
    1091                 :             :  * wasn't originally acquired in _bt_getbuf() or _bt_relandgetbuf().
    1092                 :             :  */
    1093                 :             : bool
    1094                 :          49 : _bt_conditionallockbuf(Relation rel, Buffer buf)
    1095                 :             : {
    1096                 :             :         /* ConditionalLockBuffer() asserts that pin is held by this backend */
    1097         [ -  + ]:          49 :         if (!ConditionalLockBuffer(buf))
    1098                 :           0 :                 return false;
    1099                 :             : 
    1100         [ -  + ]:          49 :         if (!RelationUsesLocalBuffers(rel))
    1101                 :          49 :                 VALGRIND_MAKE_MEM_DEFINED(BufferGetPage(buf), BLCKSZ);
    1102                 :             : 
    1103                 :          49 :         return true;
    1104                 :          49 : }
    1105                 :             : 
    1106                 :             : /*
    1107                 :             :  *      _bt_upgradelockbufcleanup() -- upgrade lock to a full cleanup lock.
    1108                 :             :  */
    1109                 :             : void
    1110                 :         733 : _bt_upgradelockbufcleanup(Relation rel, Buffer buf)
    1111                 :             : {
    1112                 :             :         /*
    1113                 :             :          * Buffer is pinned and locked, which means that it is expected to be
    1114                 :             :          * defined and addressable.  Check that proactively.
    1115                 :             :          */
    1116                 :         733 :         VALGRIND_CHECK_MEM_IS_DEFINED(BufferGetPage(buf), BLCKSZ);
    1117                 :             : 
    1118                 :             :         /* LockBuffer() asserts that pin is held by this backend */
    1119                 :         733 :         LockBuffer(buf, BUFFER_LOCK_UNLOCK);
    1120                 :         733 :         LockBufferForCleanup(buf);
    1121                 :         733 : }
    1122                 :             : 
    1123                 :             : /*
    1124                 :             :  *      _bt_pageinit() -- Initialize a new page.
    1125                 :             :  *
    1126                 :             :  * On return, the page header is initialized; data space is empty;
    1127                 :             :  * special space is zeroed out.
    1128                 :             :  */
    1129                 :             : void
    1130                 :       13079 : _bt_pageinit(Page page, Size size)
    1131                 :             : {
    1132                 :       13079 :         PageInit(page, size, sizeof(BTPageOpaqueData));
    1133                 :       13079 : }
    1134                 :             : 
    1135                 :             : /*
    1136                 :             :  * Delete item(s) from a btree leaf page during VACUUM.
    1137                 :             :  *
    1138                 :             :  * This routine assumes that the caller already has a full cleanup lock on
    1139                 :             :  * the buffer.  Also, the given deletable and updatable arrays *must* be
    1140                 :             :  * sorted in ascending order.
    1141                 :             :  *
    1142                 :             :  * Routine deals with deleting TIDs when some (but not all) of the heap TIDs
    1143                 :             :  * in an existing posting list item are to be removed.  This works by
    1144                 :             :  * updating/overwriting an existing item with caller's new version of the item
    1145                 :             :  * (a version that lacks the TIDs that are to be deleted).
    1146                 :             :  *
    1147                 :             :  * We record VACUUMs and b-tree deletes differently in WAL.  Deletes must
    1148                 :             :  * generate their own snapshotConflictHorizon directly from the tableam,
    1149                 :             :  * whereas VACUUMs rely on the initial VACUUM table scan performing
    1150                 :             :  * WAL-logging that takes care of the issue for the table's indexes
    1151                 :             :  * indirectly.  Also, we remove the VACUUM cycle ID from pages, which b-tree
    1152                 :             :  * deletes don't do.
    1153                 :             :  */
    1154                 :             : void
    1155                 :         515 : _bt_delitems_vacuum(Relation rel, Buffer buf,
    1156                 :             :                                         OffsetNumber *deletable, int ndeletable,
    1157                 :             :                                         BTVacuumPosting *updatable, int nupdatable)
    1158                 :             : {
    1159                 :         515 :         Page            page = BufferGetPage(buf);
    1160                 :         515 :         BTPageOpaque opaque;
    1161   [ -  +  +  +  :        1030 :         bool            needswal = RelationNeedsWAL(rel);
                   -  + ]
    1162                 :         515 :         char       *updatedbuf = NULL;
    1163                 :         515 :         Size            updatedbuflen = 0;
    1164                 :         515 :         OffsetNumber updatedoffsets[MaxIndexTuplesPerPage];
    1165                 :             : 
    1166                 :             :         /* Shouldn't be called unless there's something to do */
    1167   [ +  +  +  - ]:         515 :         Assert(ndeletable > 0 || nupdatable > 0);
    1168                 :             : 
    1169                 :             :         /* Generate new version of posting lists without deleted TIDs */
    1170         [ +  + ]:         515 :         if (nupdatable > 0)
    1171                 :         102 :                 updatedbuf = _bt_delitems_update(updatable, nupdatable,
    1172                 :          51 :                                                                                  updatedoffsets, &updatedbuflen,
    1173                 :          51 :                                                                                  needswal);
    1174                 :             : 
    1175                 :             :         /* No ereport(ERROR) until changes are logged */
    1176                 :         515 :         START_CRIT_SECTION();
    1177                 :             : 
    1178                 :             :         /*
    1179                 :             :          * Handle posting tuple updates.
    1180                 :             :          *
    1181                 :             :          * Deliberately do this before handling simple deletes.  If we did it the
    1182                 :             :          * other way around (i.e. WAL record order -- simple deletes before
    1183                 :             :          * updates) then we'd have to make compensating changes to the 'updatable'
    1184                 :             :          * array of offset numbers.
    1185                 :             :          *
    1186                 :             :          * PageIndexTupleOverwrite() won't unset each item's LP_DEAD bit when it
    1187                 :             :          * happens to already be set.  It's important that we not interfere with
    1188                 :             :          * any future simple index tuple deletion operations.
    1189                 :             :          */
    1190         [ +  + ]:        1648 :         for (int i = 0; i < nupdatable; i++)
    1191                 :             :         {
    1192                 :        1133 :                 OffsetNumber updatedoffset = updatedoffsets[i];
    1193                 :        1133 :                 IndexTuple      itup;
    1194                 :        1133 :                 Size            itemsz;
    1195                 :             : 
    1196                 :        1133 :                 itup = updatable[i]->itup;
    1197                 :        1133 :                 itemsz = MAXALIGN(IndexTupleSize(itup));
    1198         [ +  - ]:        1133 :                 if (!PageIndexTupleOverwrite(page, updatedoffset, itup, itemsz))
    1199   [ #  #  #  # ]:           0 :                         elog(PANIC, "failed to update partially dead item in block %u of index \"%s\"",
    1200                 :             :                                  BufferGetBlockNumber(buf), RelationGetRelationName(rel));
    1201                 :        1133 :         }
    1202                 :             : 
    1203                 :             :         /* Now handle simple deletes of entire tuples */
    1204         [ +  + ]:         515 :         if (ndeletable > 0)
    1205                 :         509 :                 PageIndexMultiDelete(page, deletable, ndeletable);
    1206                 :             : 
    1207                 :             :         /*
    1208                 :             :          * We can clear the vacuum cycle ID since this page has certainly been
    1209                 :             :          * processed by the current vacuum scan.
    1210                 :             :          */
    1211                 :         515 :         opaque = BTPageGetOpaque(page);
    1212                 :         515 :         opaque->btpo_cycleid = 0;
    1213                 :             : 
    1214                 :             :         /*
    1215                 :             :          * Clear the BTP_HAS_GARBAGE page flag.
    1216                 :             :          *
    1217                 :             :          * This flag indicates the presence of LP_DEAD items on the page (though
    1218                 :             :          * not reliably).  Note that we only rely on it with pg_upgrade'd
    1219                 :             :          * !heapkeyspace indexes.  That's why clearing it here won't usually
    1220                 :             :          * interfere with simple index tuple deletion.
    1221                 :             :          */
    1222                 :         515 :         opaque->btpo_flags &= ~BTP_HAS_GARBAGE;
    1223                 :             : 
    1224                 :         515 :         MarkBufferDirty(buf);
    1225                 :             : 
    1226                 :             :         /* XLOG stuff */
    1227         [ -  + ]:         515 :         if (needswal)
    1228                 :             :         {
    1229                 :         515 :                 XLogRecPtr      recptr;
    1230                 :         515 :                 xl_btree_vacuum xlrec_vacuum;
    1231                 :             : 
    1232                 :         515 :                 xlrec_vacuum.ndeleted = ndeletable;
    1233                 :         515 :                 xlrec_vacuum.nupdated = nupdatable;
    1234                 :             : 
    1235                 :         515 :                 XLogBeginInsert();
    1236                 :         515 :                 XLogRegisterBuffer(0, buf, REGBUF_STANDARD);
    1237                 :         515 :                 XLogRegisterData(&xlrec_vacuum, SizeOfBtreeVacuum);
    1238                 :             : 
    1239         [ +  + ]:         515 :                 if (ndeletable > 0)
    1240                 :        1018 :                         XLogRegisterBufData(0, deletable,
    1241                 :         509 :                                                                 ndeletable * sizeof(OffsetNumber));
    1242                 :             : 
    1243         [ +  + ]:         515 :                 if (nupdatable > 0)
    1244                 :             :                 {
    1245                 :         102 :                         XLogRegisterBufData(0, updatedoffsets,
    1246                 :          51 :                                                                 nupdatable * sizeof(OffsetNumber));
    1247                 :          51 :                         XLogRegisterBufData(0, updatedbuf, updatedbuflen);
    1248                 :          51 :                 }
    1249                 :             : 
    1250                 :         515 :                 recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_VACUUM);
    1251                 :             : 
    1252                 :         515 :                 PageSetLSN(page, recptr);
    1253                 :         515 :         }
    1254                 :             : 
    1255         [ +  - ]:         515 :         END_CRIT_SECTION();
    1256                 :             : 
    1257                 :             :         /* can't leak memory here */
    1258         [ +  + ]:         515 :         if (updatedbuf != NULL)
    1259                 :          51 :                 pfree(updatedbuf);
    1260                 :             :         /* free tuples allocated within _bt_delitems_update() */
    1261         [ +  + ]:        1648 :         for (int i = 0; i < nupdatable; i++)
    1262                 :        1133 :                 pfree(updatable[i]->itup);
    1263                 :         515 : }
    1264                 :             : 
    1265                 :             : /*
    1266                 :             :  * Delete item(s) from a btree leaf page during single-page cleanup.
    1267                 :             :  *
    1268                 :             :  * This routine assumes that the caller has pinned and write locked the
    1269                 :             :  * buffer.  Also, the given deletable and updatable arrays *must* be sorted in
    1270                 :             :  * ascending order.
    1271                 :             :  *
    1272                 :             :  * Routine deals with deleting TIDs when some (but not all) of the heap TIDs
    1273                 :             :  * in an existing posting list item are to be removed.  This works by
    1274                 :             :  * updating/overwriting an existing item with caller's new version of the item
    1275                 :             :  * (a version that lacks the TIDs that are to be deleted).
    1276                 :             :  *
    1277                 :             :  * This is nearly the same as _bt_delitems_vacuum as far as what it does to
    1278                 :             :  * the page, but it needs its own snapshotConflictHorizon and isCatalogRel
    1279                 :             :  * (from the tableam).  This is used by the REDO routine to generate recovery
    1280                 :             :  * conflicts.  The other difference is that only _bt_delitems_vacuum will
    1281                 :             :  * clear page's VACUUM cycle ID.
    1282                 :             :  */
    1283                 :             : static void
    1284                 :         885 : _bt_delitems_delete(Relation rel, Buffer buf,
    1285                 :             :                                         TransactionId snapshotConflictHorizon, bool isCatalogRel,
    1286                 :             :                                         OffsetNumber *deletable, int ndeletable,
    1287                 :             :                                         BTVacuumPosting *updatable, int nupdatable)
    1288                 :             : {
    1289                 :         885 :         Page            page = BufferGetPage(buf);
    1290                 :         885 :         BTPageOpaque opaque;
    1291   [ -  +  +  +  :        1770 :         bool            needswal = RelationNeedsWAL(rel);
                   +  + ]
    1292                 :         885 :         char       *updatedbuf = NULL;
    1293                 :         885 :         Size            updatedbuflen = 0;
    1294                 :         885 :         OffsetNumber updatedoffsets[MaxIndexTuplesPerPage];
    1295                 :             : 
    1296                 :             :         /* Shouldn't be called unless there's something to do */
    1297   [ +  +  +  - ]:         885 :         Assert(ndeletable > 0 || nupdatable > 0);
    1298                 :             : 
    1299                 :             :         /* Generate new versions of posting lists without deleted TIDs */
    1300         [ +  + ]:         885 :         if (nupdatable > 0)
    1301                 :         152 :                 updatedbuf = _bt_delitems_update(updatable, nupdatable,
    1302                 :          76 :                                                                                  updatedoffsets, &updatedbuflen,
    1303                 :          76 :                                                                                  needswal);
    1304                 :             : 
    1305                 :             :         /* No ereport(ERROR) until changes are logged */
    1306                 :         885 :         START_CRIT_SECTION();
    1307                 :             : 
    1308                 :             :         /* Handle updates and deletes just like _bt_delitems_vacuum */
    1309         [ +  + ]:        1157 :         for (int i = 0; i < nupdatable; i++)
    1310                 :             :         {
    1311                 :         272 :                 OffsetNumber updatedoffset = updatedoffsets[i];
    1312                 :         272 :                 IndexTuple      itup;
    1313                 :         272 :                 Size            itemsz;
    1314                 :             : 
    1315                 :         272 :                 itup = updatable[i]->itup;
    1316                 :         272 :                 itemsz = MAXALIGN(IndexTupleSize(itup));
    1317         [ +  - ]:         272 :                 if (!PageIndexTupleOverwrite(page, updatedoffset, itup, itemsz))
    1318   [ #  #  #  # ]:           0 :                         elog(PANIC, "failed to update partially dead item in block %u of index \"%s\"",
    1319                 :             :                                  BufferGetBlockNumber(buf), RelationGetRelationName(rel));
    1320                 :         272 :         }
    1321                 :             : 
    1322         [ +  + ]:         885 :         if (ndeletable > 0)
    1323                 :         884 :                 PageIndexMultiDelete(page, deletable, ndeletable);
    1324                 :             : 
    1325                 :             :         /*
    1326                 :             :          * Unlike _bt_delitems_vacuum, we *must not* clear the vacuum cycle ID at
    1327                 :             :          * this point.  The VACUUM command alone controls vacuum cycle IDs.
    1328                 :             :          */
    1329                 :         885 :         opaque = BTPageGetOpaque(page);
    1330                 :             : 
    1331                 :             :         /*
    1332                 :             :          * Clear the BTP_HAS_GARBAGE page flag.
    1333                 :             :          *
    1334                 :             :          * This flag indicates the presence of LP_DEAD items on the page (though
    1335                 :             :          * not reliably).  Note that we only rely on it with pg_upgrade'd
    1336                 :             :          * !heapkeyspace indexes.
    1337                 :             :          */
    1338                 :         885 :         opaque->btpo_flags &= ~BTP_HAS_GARBAGE;
    1339                 :             : 
    1340                 :         885 :         MarkBufferDirty(buf);
    1341                 :             : 
    1342                 :             :         /* XLOG stuff */
    1343         [ +  + ]:         885 :         if (needswal)
    1344                 :             :         {
    1345                 :         861 :                 XLogRecPtr      recptr;
    1346                 :         861 :                 xl_btree_delete xlrec_delete;
    1347                 :             : 
    1348                 :         861 :                 xlrec_delete.snapshotConflictHorizon = snapshotConflictHorizon;
    1349                 :         861 :                 xlrec_delete.ndeleted = ndeletable;
    1350                 :         861 :                 xlrec_delete.nupdated = nupdatable;
    1351                 :         861 :                 xlrec_delete.isCatalogRel = isCatalogRel;
    1352                 :             : 
    1353                 :         861 :                 XLogBeginInsert();
    1354                 :         861 :                 XLogRegisterBuffer(0, buf, REGBUF_STANDARD);
    1355                 :         861 :                 XLogRegisterData(&xlrec_delete, SizeOfBtreeDelete);
    1356                 :             : 
    1357         [ +  + ]:         861 :                 if (ndeletable > 0)
    1358                 :        1720 :                         XLogRegisterBufData(0, deletable,
    1359                 :         860 :                                                                 ndeletable * sizeof(OffsetNumber));
    1360                 :             : 
    1361         [ +  + ]:         861 :                 if (nupdatable > 0)
    1362                 :             :                 {
    1363                 :         152 :                         XLogRegisterBufData(0, updatedoffsets,
    1364                 :          76 :                                                                 nupdatable * sizeof(OffsetNumber));
    1365                 :          76 :                         XLogRegisterBufData(0, updatedbuf, updatedbuflen);
    1366                 :          76 :                 }
    1367                 :             : 
    1368                 :         861 :                 recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_DELETE);
    1369                 :             : 
    1370                 :         861 :                 PageSetLSN(page, recptr);
    1371                 :         861 :         }
    1372                 :             : 
    1373         [ +  - ]:         885 :         END_CRIT_SECTION();
    1374                 :             : 
    1375                 :             :         /* can't leak memory here */
    1376         [ +  + ]:         885 :         if (updatedbuf != NULL)
    1377                 :          76 :                 pfree(updatedbuf);
    1378                 :             :         /* free tuples allocated within _bt_delitems_update() */
    1379         [ +  + ]:        1157 :         for (int i = 0; i < nupdatable; i++)
    1380                 :         272 :                 pfree(updatable[i]->itup);
    1381                 :         885 : }
    1382                 :             : 
    1383                 :             : /*
    1384                 :             :  * Set up state needed to delete TIDs from posting list tuples via "updating"
    1385                 :             :  * the tuple.  Performs steps common to both _bt_delitems_vacuum and
    1386                 :             :  * _bt_delitems_delete.  These steps must take place before each function's
    1387                 :             :  * critical section begins.
    1388                 :             :  *
    1389                 :             :  * updatable and nupdatable are inputs, though note that we will use
    1390                 :             :  * _bt_update_posting() to replace the original itup with a pointer to a final
    1391                 :             :  * version in palloc()'d memory.  Caller should free the tuples when its done.
    1392                 :             :  *
    1393                 :             :  * The first nupdatable entries from updatedoffsets are set to the page offset
    1394                 :             :  * number for posting list tuples that caller updates.  This is mostly useful
    1395                 :             :  * because caller may need to WAL-log the page offsets (though we always do
    1396                 :             :  * this for caller out of convenience).
    1397                 :             :  *
    1398                 :             :  * Returns buffer consisting of an array of xl_btree_update structs that
    1399                 :             :  * describe the steps we perform here for caller (though only when needswal is
    1400                 :             :  * true).  Also sets *updatedbuflen to the final size of the buffer.  This
    1401                 :             :  * buffer is used by caller when WAL logging is required.
    1402                 :             :  */
    1403                 :             : static char *
    1404                 :         127 : _bt_delitems_update(BTVacuumPosting *updatable, int nupdatable,
    1405                 :             :                                         OffsetNumber *updatedoffsets, Size *updatedbuflen,
    1406                 :             :                                         bool needswal)
    1407                 :             : {
    1408                 :         127 :         char       *updatedbuf = NULL;
    1409                 :         127 :         Size            buflen = 0;
    1410                 :             : 
    1411                 :             :         /* Shouldn't be called unless there's something to do */
    1412         [ +  - ]:         127 :         Assert(nupdatable > 0);
    1413                 :             : 
    1414         [ +  + ]:        1532 :         for (int i = 0; i < nupdatable; i++)
    1415                 :             :         {
    1416                 :        1405 :                 BTVacuumPosting vacposting = updatable[i];
    1417                 :        1405 :                 Size            itemsz;
    1418                 :             : 
    1419                 :             :                 /* Replace work area IndexTuple with updated version */
    1420                 :        1405 :                 _bt_update_posting(vacposting);
    1421                 :             : 
    1422                 :             :                 /* Keep track of size of xl_btree_update for updatedbuf in passing */
    1423                 :        1405 :                 itemsz = SizeOfBtreeUpdate + vacposting->ndeletedtids * sizeof(uint16);
    1424                 :        1405 :                 buflen += itemsz;
    1425                 :             : 
    1426                 :             :                 /* Build updatedoffsets buffer in passing */
    1427                 :        1405 :                 updatedoffsets[i] = vacposting->updatedoffset;
    1428                 :        1405 :         }
    1429                 :             : 
    1430                 :             :         /* XLOG stuff */
    1431         [ -  + ]:         127 :         if (needswal)
    1432                 :             :         {
    1433                 :         127 :                 Size            offset = 0;
    1434                 :             : 
    1435                 :             :                 /* Allocate, set final size for caller */
    1436                 :         127 :                 updatedbuf = palloc(buflen);
    1437                 :         127 :                 *updatedbuflen = buflen;
    1438         [ +  + ]:        1532 :                 for (int i = 0; i < nupdatable; i++)
    1439                 :             :                 {
    1440                 :        1405 :                         BTVacuumPosting vacposting = updatable[i];
    1441                 :        1405 :                         Size            itemsz;
    1442                 :        1405 :                         xl_btree_update update;
    1443                 :             : 
    1444                 :        1405 :                         update.ndeletedtids = vacposting->ndeletedtids;
    1445                 :        1405 :                         memcpy(updatedbuf + offset, &update.ndeletedtids,
    1446                 :             :                                    SizeOfBtreeUpdate);
    1447                 :        1405 :                         offset += SizeOfBtreeUpdate;
    1448                 :             : 
    1449                 :        1405 :                         itemsz = update.ndeletedtids * sizeof(uint16);
    1450                 :        1405 :                         memcpy(updatedbuf + offset, vacposting->deletetids, itemsz);
    1451                 :        1405 :                         offset += itemsz;
    1452                 :        1405 :                 }
    1453                 :         127 :         }
    1454                 :             : 
    1455                 :         254 :         return updatedbuf;
    1456                 :         127 : }
    1457                 :             : 
    1458                 :             : /*
    1459                 :             :  * Comparator used by _bt_delitems_delete_check() to restore deltids array
    1460                 :             :  * back to its original leaf-page-wise sort order
    1461                 :             :  */
    1462                 :             : static int
    1463                 :      506683 : _bt_delitems_cmp(const void *a, const void *b)
    1464                 :             : {
    1465                 :      506683 :         const TM_IndexDelete *indexdelete1 = a;
    1466                 :      506683 :         const TM_IndexDelete *indexdelete2 = b;
    1467                 :             : 
    1468         [ +  - ]:      506683 :         Assert(indexdelete1->id != indexdelete2->id);
    1469                 :             : 
    1470                 :     1013366 :         return pg_cmp_s16(indexdelete1->id, indexdelete2->id);
    1471                 :      506683 : }
    1472                 :             : 
    1473                 :             : /*
    1474                 :             :  * Try to delete item(s) from a btree leaf page during single-page cleanup.
    1475                 :             :  *
    1476                 :             :  * nbtree interface to table_index_delete_tuples().  Deletes a subset of index
    1477                 :             :  * tuples from caller's deltids array: those whose TIDs are found safe to
    1478                 :             :  * delete by the tableam (or already marked LP_DEAD in index, and so already
    1479                 :             :  * known to be deletable by our simple index deletion caller).  We physically
    1480                 :             :  * delete index tuples from buf leaf page last of all (for index tuples where
    1481                 :             :  * that is known to be safe following our table_index_delete_tuples() call).
    1482                 :             :  *
    1483                 :             :  * Simple index deletion caller only includes TIDs from index tuples marked
    1484                 :             :  * LP_DEAD, as well as extra TIDs it found on the same leaf page that can be
    1485                 :             :  * included without increasing the total number of distinct table blocks for
    1486                 :             :  * the deletion operation as a whole.  This approach often allows us to delete
    1487                 :             :  * some extra index tuples that were practically free for tableam to check in
    1488                 :             :  * passing (when they actually turn out to be safe to delete).  It probably
    1489                 :             :  * only makes sense for the tableam to go ahead with these extra checks when
    1490                 :             :  * it is block-oriented (otherwise the checks probably won't be practically
    1491                 :             :  * free, which we rely on).  The tableam interface requires the tableam side
    1492                 :             :  * to handle the problem, though, so this is okay (we as an index AM are free
    1493                 :             :  * to make the simplifying assumption that all tableams must be block-based).
    1494                 :             :  *
    1495                 :             :  * Bottom-up index deletion caller provides all the TIDs from the leaf page,
    1496                 :             :  * without expecting that tableam will check most of them.  The tableam has
    1497                 :             :  * considerable discretion around which entries/blocks it checks.  Our role in
    1498                 :             :  * costing the bottom-up deletion operation is strictly advisory.
    1499                 :             :  *
    1500                 :             :  * Note: Caller must have added deltids entries (i.e. entries that go in
    1501                 :             :  * delstate's main array) in leaf-page-wise order: page offset number order,
    1502                 :             :  * TID order among entries taken from the same posting list tuple (tiebreak on
    1503                 :             :  * TID).  This order is convenient to work with here.
    1504                 :             :  *
    1505                 :             :  * Note: We also rely on the id field of each deltids element "capturing" this
    1506                 :             :  * original leaf-page-wise order.  That is, we expect to be able to get back
    1507                 :             :  * to the original leaf-page-wise order just by sorting deltids on the id
    1508                 :             :  * field (tableam will sort deltids for its own reasons, so we'll need to put
    1509                 :             :  * it back in leaf-page-wise order afterwards).
    1510                 :             :  */
    1511                 :             : void
    1512                 :        1037 : _bt_delitems_delete_check(Relation rel, Buffer buf, Relation heapRel,
    1513                 :             :                                                   TM_IndexDeleteOp *delstate)
    1514                 :             : {
    1515                 :        1037 :         Page            page = BufferGetPage(buf);
    1516                 :        1037 :         TransactionId snapshotConflictHorizon;
    1517                 :        1037 :         bool            isCatalogRel;
    1518                 :        1037 :         OffsetNumber postingidxoffnum = InvalidOffsetNumber;
    1519                 :        1037 :         int                     ndeletable = 0,
    1520                 :        1037 :                                 nupdatable = 0;
    1521                 :        1037 :         OffsetNumber deletable[MaxIndexTuplesPerPage];
    1522                 :        1037 :         BTVacuumPosting updatable[MaxIndexTuplesPerPage];
    1523                 :             : 
    1524                 :             :         /* Use tableam interface to determine which tuples to delete first */
    1525                 :        1037 :         snapshotConflictHorizon = table_index_delete_tuples(heapRel, delstate);
    1526   [ +  -  -  +  :        1037 :         isCatalogRel = RelationIsAccessibleInLogicalDecoding(heapRel);
          #  #  #  #  #  
          #  #  #  #  #  
             #  #  #  # ]
    1527                 :             : 
    1528                 :             :         /* Should not WAL-log snapshotConflictHorizon unless it's required */
    1529         [ +  + ]:        1037 :         if (!XLogStandbyInfoActive())
    1530                 :        1026 :                 snapshotConflictHorizon = InvalidTransactionId;
    1531                 :             : 
    1532                 :             :         /*
    1533                 :             :          * Construct a leaf-page-wise description of what _bt_delitems_delete()
    1534                 :             :          * needs to do to physically delete index tuples from the page.
    1535                 :             :          *
    1536                 :             :          * Must sort deltids array to restore leaf-page-wise order (original order
    1537                 :             :          * before call to tableam).  This is the order that the loop expects.
    1538                 :             :          *
    1539                 :             :          * Note that deltids array might be a lot smaller now.  It might even have
    1540                 :             :          * no entries at all (with bottom-up deletion caller), in which case there
    1541                 :             :          * is nothing left to do.
    1542                 :             :          */
    1543                 :        1037 :         qsort(delstate->deltids, delstate->ndeltids, sizeof(TM_IndexDelete),
    1544                 :             :                   _bt_delitems_cmp);
    1545         [ +  + ]:        1037 :         if (delstate->ndeltids == 0)
    1546                 :             :         {
    1547         [ +  - ]:         152 :                 Assert(delstate->bottomup);
    1548                 :         152 :                 return;
    1549                 :             :         }
    1550                 :             : 
    1551                 :             :         /* We definitely have to delete at least one index tuple (or one TID) */
    1552         [ +  + ]:       73426 :         for (int i = 0; i < delstate->ndeltids; i++)
    1553                 :             :         {
    1554                 :       72541 :                 TM_IndexStatus *dstatus = delstate->status + delstate->deltids[i].id;
    1555                 :       72541 :                 OffsetNumber idxoffnum = dstatus->idxoffnum;
    1556                 :       72541 :                 ItemId          itemid = PageGetItemId(page, idxoffnum);
    1557                 :       72541 :                 IndexTuple      itup = (IndexTuple) PageGetItem(page, itemid);
    1558                 :       72541 :                 int                     nestedi,
    1559                 :             :                                         nitem;
    1560                 :       72541 :                 BTVacuumPosting vacposting;
    1561                 :             : 
    1562   [ -  +  +  - ]:       72541 :                 Assert(OffsetNumberIsValid(idxoffnum));
    1563                 :             : 
    1564         [ +  + ]:       72541 :                 if (idxoffnum == postingidxoffnum)
    1565                 :             :                 {
    1566                 :             :                         /*
    1567                 :             :                          * This deltid entry is a TID from a posting list tuple that has
    1568                 :             :                          * already been completely processed
    1569                 :             :                          */
    1570         [ -  + ]:        2445 :                         Assert(BTreeTupleIsPosting(itup));
    1571         [ -  + ]:        2445 :                         Assert(ItemPointerCompare(BTreeTupleGetHeapTID(itup),
    1572                 :             :                                                                           &delstate->deltids[i].tid) < 0);
    1573         [ -  + ]:        2445 :                         Assert(ItemPointerCompare(BTreeTupleGetMaxHeapTID(itup),
    1574                 :             :                                                                           &delstate->deltids[i].tid) >= 0);
    1575                 :        2445 :                         continue;
    1576                 :             :                 }
    1577                 :             : 
    1578         [ +  + ]:       70096 :                 if (!BTreeTupleIsPosting(itup))
    1579                 :             :                 {
    1580                 :             :                         /* Plain non-pivot tuple */
    1581         [ -  + ]:       68822 :                         Assert(ItemPointerEquals(&itup->t_tid, &delstate->deltids[i].tid));
    1582         [ +  + ]:       68822 :                         if (dstatus->knowndeletable)
    1583                 :       53978 :                                 deletable[ndeletable++] = idxoffnum;
    1584                 :       68822 :                         continue;
    1585                 :             :                 }
    1586                 :             : 
    1587                 :             :                 /*
    1588                 :             :                  * itup is a posting list tuple whose lowest deltids entry (which may
    1589                 :             :                  * or may not be for the first TID from itup) is considered here now.
    1590                 :             :                  * We should process all of the deltids entries for the posting list
    1591                 :             :                  * together now, though (not just the lowest).  Remember to skip over
    1592                 :             :                  * later itup-related entries during later iterations of outermost
    1593                 :             :                  * loop.
    1594                 :             :                  */
    1595                 :        1274 :                 postingidxoffnum = idxoffnum;   /* Remember work in outermost loop */
    1596                 :        1274 :                 nestedi = i;                    /* Initialize for first itup deltids entry */
    1597                 :        1274 :                 vacposting = NULL;              /* Describes final action for itup */
    1598                 :        1274 :                 nitem = BTreeTupleGetNPosting(itup);
    1599         [ +  + ]:        6996 :                 for (int p = 0; p < nitem; p++)
    1600                 :             :                 {
    1601                 :        5722 :                         ItemPointer ptid = BTreeTupleGetPostingN(itup, p);
    1602                 :        5722 :                         int                     ptidcmp = -1;
    1603                 :             : 
    1604                 :             :                         /*
    1605                 :             :                          * This nested loop reuses work across ptid TIDs taken from itup.
    1606                 :             :                          * We take advantage of the fact that both itup's TIDs and deltids
    1607                 :             :                          * entries (within a single itup/posting list grouping) must both
    1608                 :             :                          * be in ascending TID order.
    1609                 :             :                          */
    1610         [ +  + ]:        8601 :                         for (; nestedi < delstate->ndeltids; nestedi++)
    1611                 :             :                         {
    1612                 :        8456 :                                 TM_IndexDelete *tcdeltid = &delstate->deltids[nestedi];
    1613                 :        8456 :                                 TM_IndexStatus *tdstatus = (delstate->status + tcdeltid->id);
    1614                 :             : 
    1615                 :             :                                 /* Stop once we get past all itup related deltids entries */
    1616         [ -  + ]:        8456 :                                 Assert(tdstatus->idxoffnum >= idxoffnum);
    1617         [ +  + ]:        8456 :                                 if (tdstatus->idxoffnum != idxoffnum)
    1618                 :        1355 :                                         break;
    1619                 :             : 
    1620                 :             :                                 /* Skip past non-deletable itup related entries up front */
    1621         [ +  + ]:        7101 :                                 if (!tdstatus->knowndeletable)
    1622                 :        1112 :                                         continue;
    1623                 :             : 
    1624                 :             :                                 /* Entry is first partial ptid match (or an exact match)? */
    1625                 :        5989 :                                 ptidcmp = ItemPointerCompare(&tcdeltid->tid, ptid);
    1626         [ +  + ]:        5989 :                                 if (ptidcmp >= 0)
    1627                 :             :                                 {
    1628                 :             :                                         /* Greater than or equal (partial or exact) match... */
    1629                 :        4222 :                                         break;
    1630                 :             :                                 }
    1631      [ +  +  + ]:        8456 :                         }
    1632                 :             : 
    1633                 :             :                         /* ...exact ptid match to a deletable deltids entry? */
    1634         [ +  + ]:        5722 :                         if (ptidcmp != 0)
    1635                 :        3115 :                                 continue;
    1636                 :             : 
    1637                 :             :                         /* Exact match for deletable deltids entry -- ptid gets deleted */
    1638         [ +  + ]:        2607 :                         if (vacposting == NULL)
    1639                 :             :                         {
    1640                 :         923 :                                 vacposting = palloc(offsetof(BTVacuumPostingData, deletetids) +
    1641                 :         923 :                                                                         nitem * sizeof(uint16));
    1642                 :         923 :                                 vacposting->itup = itup;
    1643                 :         923 :                                 vacposting->updatedoffset = idxoffnum;
    1644                 :         923 :                                 vacposting->ndeletedtids = 0;
    1645                 :         923 :                         }
    1646                 :        2607 :                         vacposting->deletetids[vacposting->ndeletedtids++] = p;
    1647         [ +  + ]:        5722 :                 }
    1648                 :             : 
    1649                 :             :                 /* Final decision on itup, a posting list tuple */
    1650                 :             : 
    1651         [ +  + ]:        1274 :                 if (vacposting == NULL)
    1652                 :             :                 {
    1653                 :             :                         /* No TIDs to delete from itup -- do nothing */
    1654                 :         351 :                 }
    1655         [ +  + ]:         923 :                 else if (vacposting->ndeletedtids == nitem)
    1656                 :             :                 {
    1657                 :             :                         /* Straight delete of itup (to delete all TIDs) */
    1658                 :         651 :                         deletable[ndeletable++] = idxoffnum;
    1659                 :             :                         /* Turns out we won't need granular information */
    1660                 :         651 :                         pfree(vacposting);
    1661                 :         651 :                 }
    1662                 :             :                 else
    1663                 :             :                 {
    1664                 :             :                         /* Delete some (but not all) TIDs from itup */
    1665         [ +  - ]:         272 :                         Assert(vacposting->ndeletedtids > 0 &&
    1666                 :             :                                    vacposting->ndeletedtids < nitem);
    1667                 :         272 :                         updatable[nupdatable++] = vacposting;
    1668                 :             :                 }
    1669         [ +  + ]:       72541 :         }
    1670                 :             : 
    1671                 :             :         /* Physically delete tuples (or TIDs) using deletable (or updatable) */
    1672                 :        1770 :         _bt_delitems_delete(rel, buf, snapshotConflictHorizon, isCatalogRel,
    1673                 :         885 :                                                 deletable, ndeletable, updatable, nupdatable);
    1674                 :             : 
    1675                 :             :         /* be tidy */
    1676         [ +  + ]:        1157 :         for (int i = 0; i < nupdatable; i++)
    1677                 :         272 :                 pfree(updatable[i]);
    1678                 :        1037 : }
    1679                 :             : 
    1680                 :             : /*
    1681                 :             :  * Check that leftsib page (the btpo_prev of target page) is not marked with
    1682                 :             :  * INCOMPLETE_SPLIT flag.  Used during page deletion.
    1683                 :             :  *
    1684                 :             :  * Returning true indicates that page flag is set in leftsib (which is
    1685                 :             :  * definitely still the left sibling of target).  When that happens, the
    1686                 :             :  * target doesn't have a downlink in parent, and the page deletion algorithm
    1687                 :             :  * isn't prepared to handle that.  Deletion of the target page (or the whole
    1688                 :             :  * subtree that contains the target page) cannot take place.
    1689                 :             :  *
    1690                 :             :  * Caller should not have a lock on the target page itself, since pages on the
    1691                 :             :  * same level must always be locked left to right to avoid deadlocks.
    1692                 :             :  */
    1693                 :             : static bool
    1694                 :         201 : _bt_leftsib_splitflag(Relation rel, BlockNumber leftsib, BlockNumber target)
    1695                 :             : {
    1696                 :         201 :         Buffer          buf;
    1697                 :         201 :         Page            page;
    1698                 :         201 :         BTPageOpaque opaque;
    1699                 :         201 :         bool            result;
    1700                 :             : 
    1701                 :             :         /* Easy case: No left sibling */
    1702         [ +  + ]:         201 :         if (leftsib == P_NONE)
    1703                 :         124 :                 return false;
    1704                 :             : 
    1705                 :          77 :         buf = _bt_getbuf(rel, leftsib, BT_READ);
    1706                 :          77 :         page = BufferGetPage(buf);
    1707                 :          77 :         opaque = BTPageGetOpaque(page);
    1708                 :             : 
    1709                 :             :         /*
    1710                 :             :          * If the left sibling was concurrently split, so that its next-pointer
    1711                 :             :          * doesn't point to the current page anymore, the split that created
    1712                 :             :          * target must be completed.  Caller can reasonably expect that there will
    1713                 :             :          * be a downlink to the target page that it can relocate using its stack.
    1714                 :             :          * (We don't allow splitting an incompletely split page again until the
    1715                 :             :          * previous split has been completed.)
    1716                 :             :          */
    1717         [ -  + ]:          77 :         result = (opaque->btpo_next == target && P_INCOMPLETE_SPLIT(opaque));
    1718                 :          77 :         _bt_relbuf(rel, buf);
    1719                 :             : 
    1720                 :          77 :         return result;
    1721                 :         201 : }
    1722                 :             : 
    1723                 :             : /*
    1724                 :             :  * Check that leafrightsib page (the btpo_next of target leaf page) is not
    1725                 :             :  * marked with ISHALFDEAD flag.  Used during page deletion.
    1726                 :             :  *
    1727                 :             :  * Returning true indicates that page flag is set in leafrightsib, so page
    1728                 :             :  * deletion cannot go ahead.  Our caller is not prepared to deal with the case
    1729                 :             :  * where the parent page does not have a pivot tuples whose downlink points to
    1730                 :             :  * leafrightsib (due to an earlier interrupted VACUUM operation).  It doesn't
    1731                 :             :  * seem worth going to the trouble of teaching our caller to deal with it.
    1732                 :             :  * The situation will be resolved after VACUUM finishes the deletion of the
    1733                 :             :  * half-dead page (when a future VACUUM operation reaches the target page
    1734                 :             :  * again).
    1735                 :             :  *
    1736                 :             :  * _bt_leftsib_splitflag() is called for both leaf pages and internal pages.
    1737                 :             :  * _bt_rightsib_halfdeadflag() is only called for leaf pages, though.  This is
    1738                 :             :  * okay because of the restriction on deleting pages that are the rightmost
    1739                 :             :  * page of their parent (i.e. that such deletions can only take place when the
    1740                 :             :  * entire subtree must be deleted).  The leaf level check made here will apply
    1741                 :             :  * to a right "cousin" leaf page rather than a simple right sibling leaf page
    1742                 :             :  * in cases where caller actually goes on to attempt deleting pages that are
    1743                 :             :  * above the leaf page.  The right cousin leaf page is representative of the
    1744                 :             :  * left edge of the subtree to the right of the to-be-deleted subtree as a
    1745                 :             :  * whole, which is exactly the condition that our caller cares about.
    1746                 :             :  * (Besides, internal pages are never marked half-dead, so it isn't even
    1747                 :             :  * possible to _directly_ assess if an internal page is part of some other
    1748                 :             :  * to-be-deleted subtree.)
    1749                 :             :  */
    1750                 :             : static bool
    1751                 :         151 : _bt_rightsib_halfdeadflag(Relation rel, BlockNumber leafrightsib)
    1752                 :             : {
    1753                 :         151 :         Buffer          buf;
    1754                 :         151 :         Page            page;
    1755                 :         151 :         BTPageOpaque opaque;
    1756                 :         151 :         bool            result;
    1757                 :             : 
    1758         [ +  - ]:         151 :         Assert(leafrightsib != P_NONE);
    1759                 :             : 
    1760                 :         151 :         buf = _bt_getbuf(rel, leafrightsib, BT_READ);
    1761                 :         151 :         page = BufferGetPage(buf);
    1762                 :         151 :         opaque = BTPageGetOpaque(page);
    1763                 :             : 
    1764         [ +  - ]:         151 :         Assert(P_ISLEAF(opaque) && !P_ISDELETED(opaque));
    1765                 :         151 :         result = P_ISHALFDEAD(opaque);
    1766                 :         151 :         _bt_relbuf(rel, buf);
    1767                 :             : 
    1768                 :         302 :         return result;
    1769                 :         151 : }
    1770                 :             : 
    1771                 :             : /*
    1772                 :             :  * _bt_pagedel() -- Delete a leaf page from the b-tree, if legal to do so.
    1773                 :             :  *
    1774                 :             :  * This action unlinks the leaf page from the b-tree structure, removing all
    1775                 :             :  * pointers leading to it --- but not touching its own left and right links.
    1776                 :             :  * The page cannot be physically reclaimed right away, since other processes
    1777                 :             :  * may currently be trying to follow links leading to the page; they have to
    1778                 :             :  * be allowed to use its right-link to recover.  See nbtree/README.
    1779                 :             :  *
    1780                 :             :  * On entry, the target buffer must be pinned and locked (either read or write
    1781                 :             :  * lock is OK).  The page must be an empty leaf page, which may be half-dead
    1782                 :             :  * already (a half-dead page should only be passed to us when an earlier
    1783                 :             :  * VACUUM operation was interrupted, though).  Note in particular that caller
    1784                 :             :  * should never pass a buffer containing an existing deleted page here.  The
    1785                 :             :  * lock and pin on caller's buffer will be dropped before we return.
    1786                 :             :  *
    1787                 :             :  * Maintains bulk delete stats for caller, which are taken from vstate.  We
    1788                 :             :  * need to cooperate closely with caller here so that whole VACUUM operation
    1789                 :             :  * reliably avoids any double counting of subsidiary-to-leafbuf pages that we
    1790                 :             :  * delete in passing.  If such pages happen to be from a block number that is
    1791                 :             :  * ahead of the current scanblkno position, then caller is expected to count
    1792                 :             :  * them directly later on.  It's simpler for us to understand caller's
    1793                 :             :  * requirements than it would be for caller to understand when or how a
    1794                 :             :  * deleted page became deleted after the fact.
    1795                 :             :  *
    1796                 :             :  * NOTE: this leaks memory.  Rather than trying to clean up everything
    1797                 :             :  * carefully, it's better to run it in a temp context that can be reset
    1798                 :             :  * frequently.
    1799                 :             :  */
    1800                 :             : void
    1801                 :         164 : _bt_pagedel(Relation rel, Buffer leafbuf, BTVacState *vstate)
    1802                 :             : {
    1803                 :         164 :         BlockNumber rightsib;
    1804                 :         164 :         bool            rightsib_empty;
    1805                 :         164 :         Page            page;
    1806                 :         164 :         BTPageOpaque opaque;
    1807                 :             : 
    1808                 :             :         /*
    1809                 :             :          * Save original leafbuf block number from caller.  Only deleted blocks
    1810                 :             :          * that are <= scanblkno are added to bulk delete stat's pages_deleted
    1811                 :             :          * count.
    1812                 :             :          */
    1813                 :         164 :         BlockNumber scanblkno = BufferGetBlockNumber(leafbuf);
    1814                 :             : 
    1815                 :             :         /*
    1816                 :             :          * "stack" is a search stack leading (approximately) to the target page.
    1817                 :             :          * It is initially NULL, but when iterating, we keep it to avoid
    1818                 :             :          * duplicated search effort.
    1819                 :             :          *
    1820                 :             :          * Also, when "stack" is not NULL, we have already checked that the
    1821                 :             :          * current page is not the right half of an incomplete split, i.e. the
    1822                 :             :          * left sibling does not have its INCOMPLETE_SPLIT flag set, including
    1823                 :             :          * when the current target page is to the right of caller's initial page
    1824                 :             :          * (the scanblkno page).
    1825                 :             :          */
    1826                 :         164 :         BTStack         stack = NULL;
    1827                 :             : 
    1828                 :         167 :         for (;;)
    1829                 :             :         {
    1830                 :         315 :                 page = BufferGetPage(leafbuf);
    1831                 :         315 :                 opaque = BTPageGetOpaque(page);
    1832                 :             : 
    1833                 :             :                 /*
    1834                 :             :                  * Internal pages are never deleted directly, only as part of deleting
    1835                 :             :                  * the whole subtree all the way down to leaf level.
    1836                 :             :                  *
    1837                 :             :                  * Also check for deleted pages here.  Caller never passes us a fully
    1838                 :             :                  * deleted page.  Only VACUUM can delete pages, so there can't have
    1839                 :             :                  * been a concurrent deletion.  Assume that we reached any deleted
    1840                 :             :                  * page encountered here by following a sibling link, and that the
    1841                 :             :                  * index is corrupt.
    1842                 :             :                  */
    1843         [ +  - ]:         315 :                 Assert(!P_ISDELETED(opaque));
    1844   [ +  -  -  + ]:         315 :                 if (!P_ISLEAF(opaque) || P_ISDELETED(opaque))
    1845                 :             :                 {
    1846                 :             :                         /*
    1847                 :             :                          * Pre-9.4 page deletion only marked internal pages as half-dead,
    1848                 :             :                          * but now we only use that flag on leaf pages. The old algorithm
    1849                 :             :                          * was never supposed to leave half-dead pages in the tree, it was
    1850                 :             :                          * just a transient state, but it was nevertheless possible in
    1851                 :             :                          * error scenarios. We don't know how to deal with them here. They
    1852                 :             :                          * are harmless as far as searches are considered, but inserts
    1853                 :             :                          * into the deleted keyspace could add out-of-order downlinks in
    1854                 :             :                          * the upper levels. Log a notice, hopefully the admin will notice
    1855                 :             :                          * and reindex.
    1856                 :             :                          */
    1857         [ #  # ]:           0 :                         if (P_ISHALFDEAD(opaque))
    1858   [ #  #  #  # ]:           0 :                                 ereport(LOG,
    1859                 :             :                                                 (errcode(ERRCODE_INDEX_CORRUPTED),
    1860                 :             :                                                  errmsg("index \"%s\" contains a half-dead internal page",
    1861                 :             :                                                                 RelationGetRelationName(rel)),
    1862                 :             :                                                  errhint("This can be caused by an interrupted VACUUM in version 9.3 or older, before upgrade. Please REINDEX it.")));
    1863                 :             : 
    1864         [ #  # ]:           0 :                         if (P_ISDELETED(opaque))
    1865   [ #  #  #  # ]:           0 :                                 ereport(LOG,
    1866                 :             :                                                 (errcode(ERRCODE_INDEX_CORRUPTED),
    1867                 :             :                                                  errmsg_internal("found deleted block %u while following right link from block %u in index \"%s\"",
    1868                 :             :                                                                                  BufferGetBlockNumber(leafbuf),
    1869                 :             :                                                                                  scanblkno,
    1870                 :             :                                                                                  RelationGetRelationName(rel))));
    1871                 :             : 
    1872                 :           0 :                         _bt_relbuf(rel, leafbuf);
    1873                 :           0 :                         return;
    1874                 :             :                 }
    1875                 :             : 
    1876                 :             :                 /*
    1877                 :             :                  * We can never delete rightmost pages nor root pages.  While at it,
    1878                 :             :                  * check that page is empty, since it's possible that the leafbuf page
    1879                 :             :                  * was empty a moment ago, but has since had some inserts.
    1880                 :             :                  *
    1881                 :             :                  * To keep the algorithm simple, we also never delete an incompletely
    1882                 :             :                  * split page (they should be rare enough that this doesn't make any
    1883                 :             :                  * meaningful difference to disk usage):
    1884                 :             :                  *
    1885                 :             :                  * The INCOMPLETE_SPLIT flag on the page tells us if the page is the
    1886                 :             :                  * left half of an incomplete split, but ensuring that it's not the
    1887                 :             :                  * right half is more complicated.  For that, we have to check that
    1888                 :             :                  * the left sibling doesn't have its INCOMPLETE_SPLIT flag set using
    1889                 :             :                  * _bt_leftsib_splitflag().  On the first iteration, we temporarily
    1890                 :             :                  * release the lock on scanblkno/leafbuf, check the left sibling, and
    1891                 :             :                  * construct a search stack to scanblkno.  On subsequent iterations,
    1892                 :             :                  * we know we stepped right from a page that passed these tests, so
    1893                 :             :                  * it's OK.
    1894                 :             :                  */
    1895   [ +  +  +  - ]:         315 :                 if (P_RIGHTMOST(opaque) || P_ISROOT(opaque) ||
    1896   [ +  -  -  + ]:         299 :                         P_FIRSTDATAKEY(opaque) <= PageGetMaxOffsetNumber(page) ||
    1897                 :         299 :                         P_INCOMPLETE_SPLIT(opaque))
    1898                 :             :                 {
    1899                 :             :                         /* Should never fail to delete a half-dead page */
    1900         [ +  - ]:          16 :                         Assert(!P_ISHALFDEAD(opaque));
    1901                 :             : 
    1902                 :          16 :                         _bt_relbuf(rel, leafbuf);
    1903                 :          16 :                         return;
    1904                 :             :                 }
    1905                 :             : 
    1906                 :             :                 /*
    1907                 :             :                  * First, remove downlink pointing to the page (or a parent of the
    1908                 :             :                  * page, if we are going to delete a taller subtree), and mark the
    1909                 :             :                  * leafbuf page half-dead
    1910                 :             :                  */
    1911         [ -  + ]:         299 :                 if (!P_ISHALFDEAD(opaque))
    1912                 :             :                 {
    1913                 :             :                         /*
    1914                 :             :                          * We need an approximate pointer to the page's parent page.  We
    1915                 :             :                          * use a variant of the standard search mechanism to search for
    1916                 :             :                          * the page's high key; this will give us a link to either the
    1917                 :             :                          * current parent or someplace to its left (if there are multiple
    1918                 :             :                          * equal high keys, which is possible with !heapkeyspace indexes).
    1919                 :             :                          *
    1920                 :             :                          * Also check if this is the right-half of an incomplete split
    1921                 :             :                          * (see comment above).
    1922                 :             :                          */
    1923         [ +  + ]:         299 :                         if (!stack)
    1924                 :             :                         {
    1925                 :         148 :                                 BTScanInsert itup_key;
    1926                 :         148 :                                 ItemId          itemid;
    1927                 :         148 :                                 IndexTuple      targetkey;
    1928                 :         148 :                                 BlockNumber leftsib,
    1929                 :             :                                                         leafblkno;
    1930                 :         148 :                                 Buffer          sleafbuf;
    1931                 :             : 
    1932                 :         148 :                                 itemid = PageGetItemId(page, P_HIKEY);
    1933                 :         148 :                                 targetkey = CopyIndexTuple((IndexTuple) PageGetItem(page, itemid));
    1934                 :             : 
    1935                 :         148 :                                 leftsib = opaque->btpo_prev;
    1936                 :         148 :                                 leafblkno = BufferGetBlockNumber(leafbuf);
    1937                 :             : 
    1938                 :             :                                 /*
    1939                 :             :                                  * To avoid deadlocks, we'd better drop the leaf page lock
    1940                 :             :                                  * before going further.
    1941                 :             :                                  */
    1942                 :         148 :                                 _bt_unlockbuf(rel, leafbuf);
    1943                 :             : 
    1944                 :             :                                 /*
    1945                 :             :                                  * Check that the left sibling of leafbuf (if any) is not
    1946                 :             :                                  * marked with INCOMPLETE_SPLIT flag before proceeding
    1947                 :             :                                  */
    1948         [ -  + ]:         148 :                                 Assert(leafblkno == scanblkno);
    1949         [ -  + ]:         148 :                                 if (_bt_leftsib_splitflag(rel, leftsib, leafblkno))
    1950                 :             :                                 {
    1951                 :           0 :                                         ReleaseBuffer(leafbuf);
    1952                 :           0 :                                         return;
    1953                 :             :                                 }
    1954                 :             : 
    1955                 :             :                                 /*
    1956                 :             :                                  * We need an insertion scan key, so build one.
    1957                 :             :                                  *
    1958                 :             :                                  * _bt_search searches for the leaf page that contains any
    1959                 :             :                                  * matching non-pivot tuples, but we need it to "search" for
    1960                 :             :                                  * the high key pivot from the page that we're set to delete.
    1961                 :             :                                  * Compensate for the mismatch by having _bt_search locate the
    1962                 :             :                                  * last position < equal-to-untruncated-prefix non-pivots.
    1963                 :             :                                  */
    1964                 :         148 :                                 itup_key = _bt_mkscankey(rel, targetkey);
    1965                 :             : 
    1966                 :             :                                 /* Set up a BTLessStrategyNumber-like insertion scan key */
    1967                 :         148 :                                 itup_key->nextkey = false;
    1968                 :         148 :                                 itup_key->backward = true;
    1969                 :         148 :                                 stack = _bt_search(rel, NULL, itup_key, &sleafbuf, BT_READ);
    1970                 :             :                                 /* won't need a second lock or pin on leafbuf */
    1971                 :         148 :                                 _bt_relbuf(rel, sleafbuf);
    1972                 :             : 
    1973                 :             :                                 /*
    1974                 :             :                                  * Re-lock the leaf page, and start over to use our stack
    1975                 :             :                                  * within _bt_mark_page_halfdead.  We must do it that way
    1976                 :             :                                  * because it's possible that leafbuf can no longer be
    1977                 :             :                                  * deleted.  We need to recheck.
    1978                 :             :                                  *
    1979                 :             :                                  * Note: We can't simply hold on to the sleafbuf lock instead,
    1980                 :             :                                  * because it's barely possible that sleafbuf is not the same
    1981                 :             :                                  * page as leafbuf.  This happens when leafbuf split after our
    1982                 :             :                                  * original lock was dropped, but before _bt_search finished
    1983                 :             :                                  * its descent.  We rely on the assumption that we'll find
    1984                 :             :                                  * leafbuf isn't safe to delete anymore in this scenario.
    1985                 :             :                                  * (Page deletion can cope with the stack being to the left of
    1986                 :             :                                  * leafbuf, but not to the right of leafbuf.)
    1987                 :             :                                  */
    1988                 :         148 :                                 _bt_lockbuf(rel, leafbuf, BT_WRITE);
    1989                 :         148 :                                 continue;
    1990         [ +  - ]:         148 :                         }
    1991                 :             : 
    1992                 :             :                         /*
    1993                 :             :                          * See if it's safe to delete the leaf page, and determine how
    1994                 :             :                          * many parent/internal pages above the leaf level will be
    1995                 :             :                          * deleted.  If it's safe then _bt_mark_page_halfdead will also
    1996                 :             :                          * perform the first phase of deletion, which includes marking the
    1997                 :             :                          * leafbuf page half-dead.
    1998                 :             :                          */
    1999         [ +  - ]:         151 :                         Assert(P_ISLEAF(opaque) && !P_IGNORE(opaque));
    2000   [ +  +  +  + ]:         302 :                         if (!_bt_mark_page_halfdead(rel, vstate->info->heaprel, leafbuf,
    2001                 :         151 :                                                                                 stack))
    2002                 :             :                         {
    2003                 :           3 :                                 _bt_relbuf(rel, leafbuf);
    2004                 :           3 :                                 return;
    2005                 :             :                         }
    2006                 :         148 :                 }
    2007                 :             :                 else
    2008                 :             :                 {
    2009                 :             :                         INJECTION_POINT("nbtree-finish-half-dead-page-vacuum", NULL);
    2010                 :             :                 }
    2011                 :             : 
    2012                 :             :                 /*
    2013                 :             :                  * Then unlink it from its siblings.  Each call to
    2014                 :             :                  * _bt_unlink_halfdead_page unlinks the topmost page from the subtree,
    2015                 :             :                  * making it shallower.  Iterate until the leafbuf page is deleted.
    2016                 :             :                  */
    2017                 :         148 :                 rightsib_empty = false;
    2018         [ +  - ]:         148 :                 Assert(P_ISLEAF(opaque) && P_ISHALFDEAD(opaque));
    2019         [ +  + ]:         342 :                 while (P_ISHALFDEAD(opaque))
    2020                 :             :                 {
    2021                 :             :                         /* Check for interrupts in _bt_unlink_halfdead_page */
    2022   [ +  -  +  - ]:         388 :                         if (!_bt_unlink_halfdead_page(rel, leafbuf, scanblkno,
    2023                 :         194 :                                                                                   &rightsib_empty, vstate))
    2024                 :             :                         {
    2025                 :             :                                 /*
    2026                 :             :                                  * _bt_unlink_halfdead_page should never fail, since we
    2027                 :             :                                  * established that deletion is generally safe in
    2028                 :             :                                  * _bt_mark_page_halfdead -- index must be corrupt.
    2029                 :             :                                  *
    2030                 :             :                                  * Note that _bt_unlink_halfdead_page already released the
    2031                 :             :                                  * lock and pin on leafbuf for us.
    2032                 :             :                                  */
    2033                 :           0 :                                 Assert(false);
    2034                 :           0 :                                 return;
    2035                 :             :                         }
    2036                 :             :                 }
    2037                 :             : 
    2038         [ +  - ]:         148 :                 Assert(P_ISLEAF(opaque) && P_ISDELETED(opaque));
    2039                 :             : 
    2040                 :         148 :                 rightsib = opaque->btpo_next;
    2041                 :             : 
    2042                 :         148 :                 _bt_relbuf(rel, leafbuf);
    2043                 :             : 
    2044                 :             :                 /*
    2045                 :             :                  * Check here, as calling loops will have locks held, preventing
    2046                 :             :                  * interrupts from being processed.
    2047                 :             :                  */
    2048         [ +  - ]:         148 :                 CHECK_FOR_INTERRUPTS();
    2049                 :             : 
    2050                 :             :                 /*
    2051                 :             :                  * The page has now been deleted. If its right sibling is completely
    2052                 :             :                  * empty, it's possible that the reason we haven't deleted it earlier
    2053                 :             :                  * is that it was the rightmost child of the parent. Now that we
    2054                 :             :                  * removed the downlink for this page, the right sibling might now be
    2055                 :             :                  * the only child of the parent, and could be removed. It would be
    2056                 :             :                  * picked up by the next vacuum anyway, but might as well try to
    2057                 :             :                  * remove it now, so loop back to process the right sibling.
    2058                 :             :                  *
    2059                 :             :                  * Note: This relies on the assumption that _bt_getstackbuf() will be
    2060                 :             :                  * able to reuse our original descent stack with a different child
    2061                 :             :                  * block (provided that the child block is to the right of the
    2062                 :             :                  * original leaf page reached by _bt_search()). It will even update
    2063                 :             :                  * the descent stack each time we loop around, avoiding repeated work.
    2064                 :             :                  */
    2065         [ +  + ]:         148 :                 if (!rightsib_empty)
    2066                 :         145 :                         break;
    2067                 :             : 
    2068                 :           3 :                 leafbuf = _bt_getbuf(rel, rightsib, BT_WRITE);
    2069                 :             :         }
    2070         [ -  + ]:         164 : }
    2071                 :             : 
    2072                 :             : /*
    2073                 :             :  * First stage of page deletion.
    2074                 :             :  *
    2075                 :             :  * Establish the height of the to-be-deleted subtree with leafbuf at its
    2076                 :             :  * lowest level, remove the downlink to the subtree, and mark leafbuf
    2077                 :             :  * half-dead.  The final to-be-deleted subtree is usually just leafbuf itself,
    2078                 :             :  * but may include additional internal pages (at most one per level of the
    2079                 :             :  * tree below the root).
    2080                 :             :  *
    2081                 :             :  * Caller must pass a valid heaprel, since it's just about possible that our
    2082                 :             :  * call to _bt_lock_subtree_parent will need to allocate a new index page to
    2083                 :             :  * complete a page split.  Every call to _bt_allocbuf needs to pass a heaprel.
    2084                 :             :  *
    2085                 :             :  * Returns 'false' if leafbuf is unsafe to delete, usually because leafbuf is
    2086                 :             :  * the rightmost child of its parent (and parent has more than one downlink).
    2087                 :             :  * Returns 'true' when the first stage of page deletion completed
    2088                 :             :  * successfully.
    2089                 :             :  */
    2090                 :             : static bool
    2091                 :         151 : _bt_mark_page_halfdead(Relation rel, Relation heaprel, Buffer leafbuf,
    2092                 :             :                                            BTStack stack)
    2093                 :             : {
    2094                 :         151 :         BlockNumber leafblkno;
    2095                 :         151 :         BlockNumber leafrightsib;
    2096                 :         151 :         BlockNumber topparent;
    2097                 :         151 :         BlockNumber topparentrightsib;
    2098                 :         151 :         ItemId          itemid;
    2099                 :         151 :         Page            page;
    2100                 :         151 :         BTPageOpaque opaque;
    2101                 :         151 :         Buffer          subtreeparent;
    2102                 :         151 :         OffsetNumber poffset;
    2103                 :         151 :         OffsetNumber nextoffset;
    2104                 :         151 :         IndexTuple      itup;
    2105                 :         151 :         IndexTupleData trunctuple;
    2106                 :             : 
    2107                 :         151 :         page = BufferGetPage(leafbuf);
    2108                 :         151 :         opaque = BTPageGetOpaque(page);
    2109                 :             : 
    2110         [ +  - ]:         151 :         Assert(!P_RIGHTMOST(opaque) && !P_ISROOT(opaque) &&
    2111                 :             :                    P_ISLEAF(opaque) && !P_IGNORE(opaque) &&
    2112                 :             :                    P_FIRSTDATAKEY(opaque) > PageGetMaxOffsetNumber(page));
    2113         [ +  - ]:         151 :         Assert(heaprel != NULL);
    2114                 :             : 
    2115                 :             :         /*
    2116                 :             :          * Save info about the leaf page.
    2117                 :             :          */
    2118                 :         151 :         leafblkno = BufferGetBlockNumber(leafbuf);
    2119                 :         151 :         leafrightsib = opaque->btpo_next;
    2120                 :             : 
    2121                 :             :         /*
    2122                 :             :          * Before attempting to lock the parent page, check that the right sibling
    2123                 :             :          * is not in half-dead state.  A half-dead right sibling would have no
    2124                 :             :          * downlink in the parent, which would be highly confusing later when we
    2125                 :             :          * delete the downlink.  It would fail the "right sibling of target page
    2126                 :             :          * is also the next child in parent page" cross-check below.
    2127                 :             :          */
    2128         [ -  + ]:         151 :         if (_bt_rightsib_halfdeadflag(rel, leafrightsib))
    2129                 :             :         {
    2130   [ #  #  #  # ]:           0 :                 elog(DEBUG1, "could not delete page %u because its right sibling %u is half-dead",
    2131                 :             :                          leafblkno, leafrightsib);
    2132                 :           0 :                 return false;
    2133                 :             :         }
    2134                 :             : 
    2135                 :             :         /*
    2136                 :             :          * We cannot delete a page that is the rightmost child of its immediate
    2137                 :             :          * parent, unless it is the only child --- in which case the parent has to
    2138                 :             :          * be deleted too, and the same condition applies recursively to it. We
    2139                 :             :          * have to check this condition all the way up before trying to delete,
    2140                 :             :          * and lock the parent of the root of the to-be-deleted subtree (the
    2141                 :             :          * "subtree parent").  _bt_lock_subtree_parent() locks the subtree parent
    2142                 :             :          * for us.  We remove the downlink to the "top parent" page (subtree root
    2143                 :             :          * page) from the subtree parent page below.
    2144                 :             :          *
    2145                 :             :          * Initialize topparent to be leafbuf page now.  The final to-be-deleted
    2146                 :             :          * subtree is often a degenerate one page subtree consisting only of the
    2147                 :             :          * leafbuf page.  When that happens, the leafbuf page is the final subtree
    2148                 :             :          * root page/top parent page.
    2149                 :             :          */
    2150                 :         151 :         topparent = leafblkno;
    2151                 :         151 :         topparentrightsib = leafrightsib;
    2152         [ +  + ]:         151 :         if (!_bt_lock_subtree_parent(rel, heaprel, leafblkno, stack,
    2153                 :             :                                                                  &subtreeparent, &poffset,
    2154                 :             :                                                                  &topparent, &topparentrightsib))
    2155                 :           3 :                 return false;
    2156                 :             : 
    2157                 :         148 :         page = BufferGetPage(subtreeparent);
    2158                 :         148 :         opaque = BTPageGetOpaque(page);
    2159                 :             : 
    2160                 :             : #ifdef USE_ASSERT_CHECKING
    2161                 :             : 
    2162                 :             :         /*
    2163                 :             :          * This is just an assertion because _bt_lock_subtree_parent should have
    2164                 :             :          * guaranteed tuple has the expected contents
    2165                 :             :          */
    2166                 :         148 :         itemid = PageGetItemId(page, poffset);
    2167                 :         148 :         itup = (IndexTuple) PageGetItem(page, itemid);
    2168         [ +  - ]:         148 :         Assert(BTreeTupleGetDownLink(itup) == topparent);
    2169                 :             : #endif
    2170                 :             : 
    2171                 :         148 :         nextoffset = OffsetNumberNext(poffset);
    2172                 :         148 :         itemid = PageGetItemId(page, nextoffset);
    2173                 :         148 :         itup = (IndexTuple) PageGetItem(page, itemid);
    2174                 :             : 
    2175                 :             :         /*
    2176                 :             :          * Check that the parent-page index items we're about to delete/overwrite
    2177                 :             :          * in subtree parent page contain what we expect.  This can fail if the
    2178                 :             :          * index has become corrupt for some reason.  When that happens we back
    2179                 :             :          * out of deletion of the leafbuf subtree.  (This is just like the case
    2180                 :             :          * where _bt_lock_subtree_parent() cannot "re-find" leafbuf's downlink.)
    2181                 :             :          */
    2182         [ +  - ]:         148 :         if (BTreeTupleGetDownLink(itup) != topparentrightsib)
    2183                 :             :         {
    2184   [ #  #  #  # ]:           0 :                 ereport(LOG,
    2185                 :             :                                 (errcode(ERRCODE_INDEX_CORRUPTED),
    2186                 :             :                                  errmsg_internal("right sibling %u of block %u is not next child %u of block %u in index \"%s\"",
    2187                 :             :                                                                  topparentrightsib, topparent,
    2188                 :             :                                                                  BTreeTupleGetDownLink(itup),
    2189                 :             :                                                                  BufferGetBlockNumber(subtreeparent),
    2190                 :             :                                                                  RelationGetRelationName(rel))));
    2191                 :             : 
    2192                 :           0 :                 _bt_relbuf(rel, subtreeparent);
    2193                 :           0 :                 Assert(false);
    2194                 :           0 :                 return false;
    2195                 :             :         }
    2196                 :             : 
    2197                 :             :         /*
    2198                 :             :          * Any insert which would have gone on the leaf block will now go to its
    2199                 :             :          * right sibling.  In other words, the key space moves right.
    2200                 :             :          */
    2201                 :         148 :         PredicateLockPageCombine(rel, leafblkno, leafrightsib);
    2202                 :             : 
    2203                 :             :         /* No ereport(ERROR) until changes are logged */
    2204                 :         148 :         START_CRIT_SECTION();
    2205                 :             : 
    2206                 :             :         /*
    2207                 :             :          * Update parent of subtree.  We want to delete the downlink to the top
    2208                 :             :          * parent page/root of the subtree, and the *following* key.  Easiest way
    2209                 :             :          * is to copy the right sibling's downlink over the downlink that points
    2210                 :             :          * to top parent page, and then delete the right sibling's original pivot
    2211                 :             :          * tuple.
    2212                 :             :          *
    2213                 :             :          * Lanin and Shasha make the key space move left when deleting a page,
    2214                 :             :          * whereas the key space moves right here.  That's why we cannot simply
    2215                 :             :          * delete the pivot tuple with the downlink to the top parent page.  See
    2216                 :             :          * nbtree/README.
    2217                 :             :          */
    2218                 :         148 :         page = BufferGetPage(subtreeparent);
    2219                 :         148 :         opaque = BTPageGetOpaque(page);
    2220                 :             : 
    2221                 :         148 :         itemid = PageGetItemId(page, poffset);
    2222                 :         148 :         itup = (IndexTuple) PageGetItem(page, itemid);
    2223                 :         148 :         BTreeTupleSetDownLink(itup, topparentrightsib);
    2224                 :             : 
    2225                 :         148 :         nextoffset = OffsetNumberNext(poffset);
    2226                 :         148 :         PageIndexTupleDelete(page, nextoffset);
    2227                 :             : 
    2228                 :             :         /*
    2229                 :             :          * Mark the leaf page as half-dead, and stamp it with a link to the top
    2230                 :             :          * parent page.  When the leaf page is also the top parent page, the link
    2231                 :             :          * is set to InvalidBlockNumber.
    2232                 :             :          */
    2233                 :         148 :         page = BufferGetPage(leafbuf);
    2234                 :         148 :         opaque = BTPageGetOpaque(page);
    2235                 :         148 :         opaque->btpo_flags |= BTP_HALF_DEAD;
    2236                 :             : 
    2237         [ +  - ]:         148 :         Assert(PageGetMaxOffsetNumber(page) == P_HIKEY);
    2238   [ +  -  +  -  :         296 :         MemSet(&trunctuple, 0, sizeof(IndexTupleData));
          +  -  -  +  +  
                      + ]
    2239                 :         148 :         trunctuple.t_info = sizeof(IndexTupleData);
    2240         [ +  + ]:         148 :         if (topparent != leafblkno)
    2241                 :          20 :                 BTreeTupleSetTopParent(&trunctuple, topparent);
    2242                 :             :         else
    2243                 :         128 :                 BTreeTupleSetTopParent(&trunctuple, InvalidBlockNumber);
    2244                 :             : 
    2245         [ +  - ]:         148 :         if (!PageIndexTupleOverwrite(page, P_HIKEY, &trunctuple, IndexTupleSize(&trunctuple)))
    2246   [ #  #  #  # ]:           0 :                 elog(ERROR, "could not overwrite high key in half-dead page");
    2247                 :             : 
    2248                 :             :         /* Must mark buffers dirty before XLogInsert */
    2249                 :         148 :         MarkBufferDirty(subtreeparent);
    2250                 :         148 :         MarkBufferDirty(leafbuf);
    2251                 :             : 
    2252                 :             :         /* XLOG stuff */
    2253   [ +  -  +  -  :         148 :         if (RelationNeedsWAL(rel))
             +  -  -  + ]
    2254                 :             :         {
    2255                 :         148 :                 xl_btree_mark_page_halfdead xlrec;
    2256                 :         148 :                 XLogRecPtr      recptr;
    2257                 :             : 
    2258                 :         148 :                 xlrec.poffset = poffset;
    2259                 :         148 :                 xlrec.leafblk = leafblkno;
    2260         [ +  + ]:         148 :                 if (topparent != leafblkno)
    2261                 :          20 :                         xlrec.topparent = topparent;
    2262                 :             :                 else
    2263                 :         128 :                         xlrec.topparent = InvalidBlockNumber;
    2264                 :             : 
    2265                 :         148 :                 XLogBeginInsert();
    2266                 :         148 :                 XLogRegisterBuffer(0, leafbuf, REGBUF_WILL_INIT);
    2267                 :         148 :                 XLogRegisterBuffer(1, subtreeparent, REGBUF_STANDARD);
    2268                 :             : 
    2269                 :         148 :                 page = BufferGetPage(leafbuf);
    2270                 :         148 :                 opaque = BTPageGetOpaque(page);
    2271                 :         148 :                 xlrec.leftblk = opaque->btpo_prev;
    2272                 :         148 :                 xlrec.rightblk = opaque->btpo_next;
    2273                 :             : 
    2274                 :         148 :                 XLogRegisterData(&xlrec, SizeOfBtreeMarkPageHalfDead);
    2275                 :             : 
    2276                 :         148 :                 recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_MARK_PAGE_HALFDEAD);
    2277                 :             : 
    2278                 :         148 :                 page = BufferGetPage(subtreeparent);
    2279                 :         148 :                 PageSetLSN(page, recptr);
    2280                 :         148 :                 page = BufferGetPage(leafbuf);
    2281                 :         148 :                 PageSetLSN(page, recptr);
    2282                 :         148 :         }
    2283                 :             : 
    2284         [ +  - ]:         148 :         END_CRIT_SECTION();
    2285                 :             : 
    2286                 :         148 :         _bt_relbuf(rel, subtreeparent);
    2287                 :         148 :         return true;
    2288                 :         151 : }
    2289                 :             : 
    2290                 :             : /*
    2291                 :             :  * Second stage of page deletion.
    2292                 :             :  *
    2293                 :             :  * Unlinks a single page (in the subtree undergoing deletion) from its
    2294                 :             :  * siblings.  Also marks the page deleted.
    2295                 :             :  *
    2296                 :             :  * To get rid of the whole subtree, including the leaf page itself, call here
    2297                 :             :  * until the leaf page is deleted.  The original "top parent" established in
    2298                 :             :  * the first stage of deletion is deleted in the first call here, while the
    2299                 :             :  * leaf page is deleted in the last call here.  Note that the leaf page itself
    2300                 :             :  * is often the initial top parent page.
    2301                 :             :  *
    2302                 :             :  * Returns 'false' if the page could not be unlinked (shouldn't happen).  If
    2303                 :             :  * the right sibling of the current target page is empty, *rightsib_empty is
    2304                 :             :  * set to true, allowing caller to delete the target's right sibling page in
    2305                 :             :  * passing.  Note that *rightsib_empty is only actually used by caller when
    2306                 :             :  * target page is leafbuf, following last call here for leafbuf/the subtree
    2307                 :             :  * containing leafbuf.  (We always set *rightsib_empty for caller, just to be
    2308                 :             :  * consistent.)
    2309                 :             :  *
    2310                 :             :  * Must hold pin and lock on leafbuf at entry (read or write doesn't matter).
    2311                 :             :  * On success exit, we'll be holding pin and write lock.  On failure exit,
    2312                 :             :  * we'll release both pin and lock before returning (we define it that way
    2313                 :             :  * to avoid having to reacquire a lock we already released).
    2314                 :             :  */
    2315                 :             : static bool
    2316                 :         194 : _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, BlockNumber scanblkno,
    2317                 :             :                                                  bool *rightsib_empty, BTVacState *vstate)
    2318                 :             : {
    2319                 :         194 :         BlockNumber leafblkno = BufferGetBlockNumber(leafbuf);
    2320                 :         194 :         IndexBulkDeleteResult *stats = vstate->stats;
    2321                 :         194 :         BlockNumber leafleftsib;
    2322                 :         194 :         BlockNumber leafrightsib;
    2323                 :         194 :         BlockNumber target;
    2324                 :         194 :         BlockNumber leftsib;
    2325                 :         194 :         BlockNumber rightsib;
    2326                 :         194 :         Buffer          lbuf = InvalidBuffer;
    2327                 :         194 :         Buffer          buf;
    2328                 :         194 :         Buffer          rbuf;
    2329                 :         194 :         Buffer          metabuf = InvalidBuffer;
    2330                 :         194 :         Page            metapg = NULL;
    2331                 :         194 :         BTMetaPageData *metad = NULL;
    2332                 :         194 :         ItemId          itemid;
    2333                 :         194 :         Page            page;
    2334                 :         194 :         BTPageOpaque opaque;
    2335                 :         194 :         FullTransactionId safexid;
    2336                 :         194 :         bool            rightsib_is_rightmost;
    2337                 :         194 :         uint32          targetlevel;
    2338                 :         194 :         IndexTuple      leafhikey;
    2339                 :         194 :         BlockNumber leaftopparent;
    2340                 :             : 
    2341                 :         194 :         page = BufferGetPage(leafbuf);
    2342                 :         194 :         opaque = BTPageGetOpaque(page);
    2343                 :             : 
    2344         [ +  - ]:         194 :         Assert(P_ISLEAF(opaque) && !P_ISDELETED(opaque) && P_ISHALFDEAD(opaque));
    2345                 :             : 
    2346                 :             :         /*
    2347                 :             :          * Remember some information about the leaf page.
    2348                 :             :          */
    2349                 :         194 :         itemid = PageGetItemId(page, P_HIKEY);
    2350                 :         194 :         leafhikey = (IndexTuple) PageGetItem(page, itemid);
    2351                 :         194 :         target = BTreeTupleGetTopParent(leafhikey);
    2352                 :         194 :         leafleftsib = opaque->btpo_prev;
    2353                 :         194 :         leafrightsib = opaque->btpo_next;
    2354                 :             : 
    2355                 :         194 :         _bt_unlockbuf(rel, leafbuf);
    2356                 :             : 
    2357                 :             :         INJECTION_POINT("nbtree-leave-page-half-dead", NULL);
    2358                 :             : 
    2359                 :             :         /*
    2360                 :             :          * Check here, as calling loops will have locks held, preventing
    2361                 :             :          * interrupts from being processed.
    2362                 :             :          */
    2363         [ +  - ]:         194 :         CHECK_FOR_INTERRUPTS();
    2364                 :             : 
    2365                 :             :         /* Unlink the current top parent of the subtree */
    2366         [ +  + ]:         194 :         if (!BlockNumberIsValid(target))
    2367                 :             :         {
    2368                 :             :                 /* Target is leaf page (or leaf page is top parent, if you prefer) */
    2369                 :         148 :                 target = leafblkno;
    2370                 :             : 
    2371                 :         148 :                 buf = leafbuf;
    2372                 :         148 :                 leftsib = leafleftsib;
    2373                 :         148 :                 targetlevel = 0;
    2374                 :         148 :         }
    2375                 :             :         else
    2376                 :             :         {
    2377                 :             :                 /* Target is the internal page taken from leaf's top parent link */
    2378         [ +  - ]:          46 :                 Assert(target != leafblkno);
    2379                 :             : 
    2380                 :             :                 /* Fetch the block number of the target's left sibling */
    2381                 :          46 :                 buf = _bt_getbuf(rel, target, BT_READ);
    2382                 :          46 :                 page = BufferGetPage(buf);
    2383                 :          46 :                 opaque = BTPageGetOpaque(page);
    2384                 :          46 :                 leftsib = opaque->btpo_prev;
    2385                 :          46 :                 targetlevel = opaque->btpo_level;
    2386         [ +  - ]:          46 :                 Assert(targetlevel > 0);
    2387                 :             : 
    2388                 :             :                 /*
    2389                 :             :                  * To avoid deadlocks, we'd better drop the target page lock before
    2390                 :             :                  * going further.
    2391                 :             :                  */
    2392                 :          46 :                 _bt_unlockbuf(rel, buf);
    2393                 :             :         }
    2394                 :             : 
    2395                 :             :         /*
    2396                 :             :          * We have to lock the pages we need to modify in the standard order:
    2397                 :             :          * moving right, then up.  Else we will deadlock against other writers.
    2398                 :             :          *
    2399                 :             :          * So, first lock the leaf page, if it's not the target.  Then find and
    2400                 :             :          * write-lock the current left sibling of the target page.  The sibling
    2401                 :             :          * that was current a moment ago could have split, so we may have to move
    2402                 :             :          * right.
    2403                 :             :          */
    2404         [ +  + ]:         194 :         if (target != leafblkno)
    2405                 :          46 :                 _bt_lockbuf(rel, leafbuf, BT_WRITE);
    2406         [ +  + ]:         194 :         if (leftsib != P_NONE)
    2407                 :             :         {
    2408                 :          67 :                 lbuf = _bt_getbuf(rel, leftsib, BT_WRITE);
    2409                 :          67 :                 page = BufferGetPage(lbuf);
    2410                 :          67 :                 opaque = BTPageGetOpaque(page);
    2411   [ -  +  -  + ]:          67 :                 while (P_ISDELETED(opaque) || opaque->btpo_next != target)
    2412                 :             :                 {
    2413                 :           0 :                         bool            leftsibvalid = true;
    2414                 :             : 
    2415                 :             :                         /*
    2416                 :             :                          * Before we follow the link from the page that was the left
    2417                 :             :                          * sibling mere moments ago, validate its right link.  This
    2418                 :             :                          * reduces the opportunities for loop to fail to ever make any
    2419                 :             :                          * progress in the presence of index corruption.
    2420                 :             :                          *
    2421                 :             :                          * Note: we rely on the assumption that there can only be one
    2422                 :             :                          * vacuum process running at a time (against the same index).
    2423                 :             :                          */
    2424   [ #  #  #  #  :           0 :                         if (P_RIGHTMOST(opaque) || P_ISDELETED(opaque) ||
                   #  # ]
    2425                 :           0 :                                 leftsib == opaque->btpo_next)
    2426                 :           0 :                                 leftsibvalid = false;
    2427                 :             : 
    2428                 :           0 :                         leftsib = opaque->btpo_next;
    2429                 :           0 :                         _bt_relbuf(rel, lbuf);
    2430                 :             : 
    2431         [ #  # ]:           0 :                         if (!leftsibvalid)
    2432                 :             :                         {
    2433                 :             :                                 /*
    2434                 :             :                                  * This is known to fail in the field; sibling link corruption
    2435                 :             :                                  * is relatively common.  Press on with vacuuming rather than
    2436                 :             :                                  * just throwing an ERROR.
    2437                 :             :                                  */
    2438   [ #  #  #  # ]:           0 :                                 ereport(LOG,
    2439                 :             :                                                 (errcode(ERRCODE_INDEX_CORRUPTED),
    2440                 :             :                                                  errmsg_internal("valid left sibling for deletion target could not be located: "
    2441                 :             :                                                                                  "left sibling %u of target %u with leafblkno %u and scanblkno %u on level %u of index \"%s\"",
    2442                 :             :                                                                                  leftsib, target, leafblkno, scanblkno,
    2443                 :             :                                                                                  targetlevel, RelationGetRelationName(rel))));
    2444                 :             : 
    2445                 :             :                                 /* Must release all pins and locks on failure exit */
    2446                 :           0 :                                 ReleaseBuffer(buf);
    2447         [ #  # ]:           0 :                                 if (target != leafblkno)
    2448                 :           0 :                                         _bt_relbuf(rel, leafbuf);
    2449                 :             : 
    2450                 :           0 :                                 return false;
    2451                 :             :                         }
    2452                 :             : 
    2453         [ #  # ]:           0 :                         CHECK_FOR_INTERRUPTS();
    2454                 :             : 
    2455                 :             :                         /* step right one page */
    2456                 :           0 :                         lbuf = _bt_getbuf(rel, leftsib, BT_WRITE);
    2457                 :           0 :                         page = BufferGetPage(lbuf);
    2458                 :           0 :                         opaque = BTPageGetOpaque(page);
    2459         [ #  # ]:           0 :                 }
    2460                 :          67 :         }
    2461                 :             :         else
    2462                 :         127 :                 lbuf = InvalidBuffer;
    2463                 :             : 
    2464                 :             :         /* Next write-lock the target page itself */
    2465                 :         194 :         _bt_lockbuf(rel, buf, BT_WRITE);
    2466                 :         194 :         page = BufferGetPage(buf);
    2467                 :         194 :         opaque = BTPageGetOpaque(page);
    2468                 :             : 
    2469                 :             :         /*
    2470                 :             :          * Check page is still empty etc, else abandon deletion.  This is just for
    2471                 :             :          * paranoia's sake; a half-dead page cannot resurrect because there can be
    2472                 :             :          * only one vacuum process running at a time.
    2473                 :             :          */
    2474         [ +  - ]:         194 :         if (P_RIGHTMOST(opaque) || P_ISROOT(opaque) || P_ISDELETED(opaque))
    2475   [ #  #  #  # ]:           0 :                 elog(ERROR, "target page changed status unexpectedly in block %u of index \"%s\"",
    2476                 :             :                          target, RelationGetRelationName(rel));
    2477                 :             : 
    2478         [ +  - ]:         194 :         if (opaque->btpo_prev != leftsib)
    2479   [ #  #  #  # ]:           0 :                 ereport(ERROR,
    2480                 :             :                                 (errcode(ERRCODE_INDEX_CORRUPTED),
    2481                 :             :                                  errmsg_internal("target page left link unexpectedly changed from %u to %u in block %u of index \"%s\"",
    2482                 :             :                                                                  leftsib, opaque->btpo_prev, target,
    2483                 :             :                                                                  RelationGetRelationName(rel))));
    2484                 :             : 
    2485         [ +  + ]:         194 :         if (target == leafblkno)
    2486                 :             :         {
    2487         [ +  - ]:         148 :                 if (P_FIRSTDATAKEY(opaque) <= PageGetMaxOffsetNumber(page) ||
    2488                 :         148 :                         !P_ISLEAF(opaque) || !P_ISHALFDEAD(opaque))
    2489   [ #  #  #  # ]:           0 :                         elog(ERROR, "target leaf page changed status unexpectedly in block %u of index \"%s\"",
    2490                 :             :                                  target, RelationGetRelationName(rel));
    2491                 :             : 
    2492                 :             :                 /* Leaf page is also target page: don't set leaftopparent */
    2493                 :         148 :                 leaftopparent = InvalidBlockNumber;
    2494                 :         148 :         }
    2495                 :             :         else
    2496                 :             :         {
    2497                 :          46 :                 IndexTuple      finaldataitem;
    2498                 :             : 
    2499         [ +  - ]:          46 :                 if (P_FIRSTDATAKEY(opaque) != PageGetMaxOffsetNumber(page) ||
    2500                 :          46 :                         P_ISLEAF(opaque))
    2501   [ #  #  #  # ]:           0 :                         elog(ERROR, "target internal page on level %u changed status unexpectedly in block %u of index \"%s\"",
    2502                 :             :                                  targetlevel, target, RelationGetRelationName(rel));
    2503                 :             : 
    2504                 :             :                 /* Target is internal: set leaftopparent for next call here...  */
    2505                 :          46 :                 itemid = PageGetItemId(page, P_FIRSTDATAKEY(opaque));
    2506                 :          46 :                 finaldataitem = (IndexTuple) PageGetItem(page, itemid);
    2507                 :          46 :                 leaftopparent = BTreeTupleGetDownLink(finaldataitem);
    2508                 :             :                 /* ...except when it would be a redundant pointer-to-self */
    2509         [ +  + ]:          46 :                 if (leaftopparent == leafblkno)
    2510                 :          20 :                         leaftopparent = InvalidBlockNumber;
    2511                 :          46 :         }
    2512                 :             : 
    2513                 :             :         /* No leaftopparent for level 0 (leaf page) or level 1 target */
    2514   [ +  +  +  - ]:         194 :         Assert(!BlockNumberIsValid(leaftopparent) || targetlevel > 1);
    2515                 :             : 
    2516                 :             :         /*
    2517                 :             :          * And next write-lock the (current) right sibling.
    2518                 :             :          */
    2519                 :         194 :         rightsib = opaque->btpo_next;
    2520                 :         194 :         rbuf = _bt_getbuf(rel, rightsib, BT_WRITE);
    2521                 :         194 :         page = BufferGetPage(rbuf);
    2522                 :         194 :         opaque = BTPageGetOpaque(page);
    2523                 :             : 
    2524                 :             :         /*
    2525                 :             :          * Validate target's right sibling page.  Its left link must point back to
    2526                 :             :          * the target page.
    2527                 :             :          */
    2528         [ -  + ]:         194 :         if (opaque->btpo_prev != target)
    2529                 :             :         {
    2530                 :             :                 /*
    2531                 :             :                  * This is known to fail in the field; sibling link corruption is
    2532                 :             :                  * relatively common.  Press on with vacuuming rather than just
    2533                 :             :                  * throwing an ERROR (same approach used for left-sibling's-right-link
    2534                 :             :                  * validation check a moment ago).
    2535                 :             :                  */
    2536   [ #  #  #  # ]:           0 :                 ereport(LOG,
    2537                 :             :                                 (errcode(ERRCODE_INDEX_CORRUPTED),
    2538                 :             :                                  errmsg_internal("right sibling's left-link doesn't match: "
    2539                 :             :                                                                  "right sibling %u of target %u with leafblkno %u "
    2540                 :             :                                                                  "and scanblkno %u spuriously links to non-target %u "
    2541                 :             :                                                                  "on level %u of index \"%s\"",
    2542                 :             :                                                                  rightsib, target, leafblkno,
    2543                 :             :                                                                  scanblkno, opaque->btpo_prev,
    2544                 :             :                                                                  targetlevel, RelationGetRelationName(rel))));
    2545                 :             : 
    2546                 :             :                 /* Must release all pins and locks on failure exit */
    2547         [ #  # ]:           0 :                 if (BufferIsValid(lbuf))
    2548                 :           0 :                         _bt_relbuf(rel, lbuf);
    2549                 :           0 :                 _bt_relbuf(rel, rbuf);
    2550                 :           0 :                 _bt_relbuf(rel, buf);
    2551         [ #  # ]:           0 :                 if (target != leafblkno)
    2552                 :           0 :                         _bt_relbuf(rel, leafbuf);
    2553                 :             : 
    2554                 :           0 :                 return false;
    2555                 :             :         }
    2556                 :             : 
    2557                 :         194 :         rightsib_is_rightmost = P_RIGHTMOST(opaque);
    2558                 :         194 :         *rightsib_empty = (P_FIRSTDATAKEY(opaque) > PageGetMaxOffsetNumber(page));
    2559                 :             : 
    2560                 :             :         /*
    2561                 :             :          * If we are deleting the next-to-last page on the target's level, then
    2562                 :             :          * the rightsib is a candidate to become the new fast root. (In theory, it
    2563                 :             :          * might be possible to push the fast root even further down, but the odds
    2564                 :             :          * of doing so are slim, and the locking considerations daunting.)
    2565                 :             :          *
    2566                 :             :          * We can safely acquire a lock on the metapage here --- see comments for
    2567                 :             :          * _bt_newlevel().
    2568                 :             :          */
    2569   [ +  +  +  + ]:         194 :         if (leftsib == P_NONE && rightsib_is_rightmost)
    2570                 :             :         {
    2571                 :           8 :                 page = BufferGetPage(rbuf);
    2572                 :           8 :                 opaque = BTPageGetOpaque(page);
    2573         [ +  - ]:           8 :                 if (P_RIGHTMOST(opaque))
    2574                 :             :                 {
    2575                 :             :                         /* rightsib will be the only one left on the level */
    2576                 :           8 :                         metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_WRITE);
    2577                 :           8 :                         metapg = BufferGetPage(metabuf);
    2578                 :           8 :                         metad = BTPageGetMeta(metapg);
    2579                 :             : 
    2580                 :             :                         /*
    2581                 :             :                          * The expected case here is btm_fastlevel == targetlevel+1; if
    2582                 :             :                          * the fastlevel is <= targetlevel, something is wrong, and we
    2583                 :             :                          * choose to overwrite it to fix it.
    2584                 :             :                          */
    2585         [ +  - ]:           8 :                         if (metad->btm_fastlevel > targetlevel + 1)
    2586                 :             :                         {
    2587                 :             :                                 /* no update wanted */
    2588                 :           0 :                                 _bt_relbuf(rel, metabuf);
    2589                 :           0 :                                 metabuf = InvalidBuffer;
    2590                 :           0 :                         }
    2591                 :           8 :                 }
    2592                 :           8 :         }
    2593                 :             : 
    2594                 :             :         /*
    2595                 :             :          * Here we begin doing the deletion.
    2596                 :             :          */
    2597                 :             : 
    2598                 :             :         /* No ereport(ERROR) until changes are logged */
    2599                 :         194 :         START_CRIT_SECTION();
    2600                 :             : 
    2601                 :             :         /*
    2602                 :             :          * Update siblings' side-links.  Note the target page's side-links will
    2603                 :             :          * continue to point to the siblings.  Asserts here are just rechecking
    2604                 :             :          * things we already verified above.
    2605                 :             :          */
    2606         [ +  + ]:         194 :         if (BufferIsValid(lbuf))
    2607                 :             :         {
    2608                 :          67 :                 page = BufferGetPage(lbuf);
    2609                 :          67 :                 opaque = BTPageGetOpaque(page);
    2610         [ +  - ]:          67 :                 Assert(opaque->btpo_next == target);
    2611                 :          67 :                 opaque->btpo_next = rightsib;
    2612                 :          67 :         }
    2613                 :         194 :         page = BufferGetPage(rbuf);
    2614                 :         194 :         opaque = BTPageGetOpaque(page);
    2615         [ +  - ]:         194 :         Assert(opaque->btpo_prev == target);
    2616                 :         194 :         opaque->btpo_prev = leftsib;
    2617                 :             : 
    2618                 :             :         /*
    2619                 :             :          * If we deleted a parent of the targeted leaf page, instead of the leaf
    2620                 :             :          * itself, update the leaf to point to the next remaining child in the
    2621                 :             :          * subtree.
    2622                 :             :          *
    2623                 :             :          * Note: We rely on the fact that a buffer pin on the leaf page has been
    2624                 :             :          * held since leafhikey was initialized.  This is safe, though only
    2625                 :             :          * because the page was already half-dead at that point.  The leaf page
    2626                 :             :          * cannot have been modified by any other backend during the period when
    2627                 :             :          * no lock was held.
    2628                 :             :          */
    2629         [ +  + ]:         194 :         if (target != leafblkno)
    2630                 :          46 :                 BTreeTupleSetTopParent(leafhikey, leaftopparent);
    2631                 :             : 
    2632                 :             :         /*
    2633                 :             :          * Mark the page itself deleted.  It can be recycled when all current
    2634                 :             :          * transactions are gone.  Storing GetTopTransactionId() would work, but
    2635                 :             :          * we're in VACUUM and would not otherwise have an XID.  Having already
    2636                 :             :          * updated links to the target, ReadNextFullTransactionId() suffices as an
    2637                 :             :          * upper bound.  Any scan having retained a now-stale link is advertising
    2638                 :             :          * in its PGPROC an xmin less than or equal to the value we read here.  It
    2639                 :             :          * will continue to do so, holding back the xmin horizon, for the duration
    2640                 :             :          * of that scan.
    2641                 :             :          */
    2642                 :         194 :         page = BufferGetPage(buf);
    2643                 :         194 :         opaque = BTPageGetOpaque(page);
    2644   [ +  +  +  - ]:         194 :         Assert(P_ISHALFDEAD(opaque) || !P_ISLEAF(opaque));
    2645                 :             : 
    2646                 :             :         /*
    2647                 :             :          * Store upper bound XID that's used to determine when deleted page is no
    2648                 :             :          * longer needed as a tombstone
    2649                 :             :          */
    2650                 :         194 :         safexid = ReadNextFullTransactionId();
    2651                 :         194 :         BTPageSetDeleted(page, safexid);
    2652                 :         194 :         opaque->btpo_cycleid = 0;
    2653                 :             : 
    2654                 :             :         /* And update the metapage, if needed */
    2655         [ +  + ]:         194 :         if (BufferIsValid(metabuf))
    2656                 :             :         {
    2657                 :             :                 /* upgrade metapage if needed */
    2658         [ +  - ]:           8 :                 if (metad->btm_version < BTREE_NOVAC_VERSION)
    2659                 :           0 :                         _bt_upgrademetapage(metapg);
    2660                 :           8 :                 metad->btm_fastroot = rightsib;
    2661                 :           8 :                 metad->btm_fastlevel = targetlevel;
    2662                 :           8 :                 MarkBufferDirty(metabuf);
    2663                 :           8 :         }
    2664                 :             : 
    2665                 :             :         /* Must mark buffers dirty before XLogInsert */
    2666                 :         194 :         MarkBufferDirty(rbuf);
    2667                 :         194 :         MarkBufferDirty(buf);
    2668         [ +  + ]:         194 :         if (BufferIsValid(lbuf))
    2669                 :          67 :                 MarkBufferDirty(lbuf);
    2670         [ +  + ]:         194 :         if (target != leafblkno)
    2671                 :          46 :                 MarkBufferDirty(leafbuf);
    2672                 :             : 
    2673                 :             :         /* XLOG stuff */
    2674   [ +  -  +  -  :         194 :         if (RelationNeedsWAL(rel))
             +  -  -  + ]
    2675                 :             :         {
    2676                 :         194 :                 xl_btree_unlink_page xlrec;
    2677                 :         194 :                 xl_btree_metadata xlmeta;
    2678                 :         194 :                 uint8           xlinfo;
    2679                 :         194 :                 XLogRecPtr      recptr;
    2680                 :             : 
    2681                 :         194 :                 XLogBeginInsert();
    2682                 :             : 
    2683                 :         194 :                 XLogRegisterBuffer(0, buf, REGBUF_WILL_INIT);
    2684         [ +  + ]:         194 :                 if (BufferIsValid(lbuf))
    2685                 :          67 :                         XLogRegisterBuffer(1, lbuf, REGBUF_STANDARD);
    2686                 :         194 :                 XLogRegisterBuffer(2, rbuf, REGBUF_STANDARD);
    2687         [ +  + ]:         194 :                 if (target != leafblkno)
    2688                 :          46 :                         XLogRegisterBuffer(3, leafbuf, REGBUF_WILL_INIT);
    2689                 :             : 
    2690                 :             :                 /* information stored on the target/to-be-unlinked block */
    2691                 :         194 :                 xlrec.leftsib = leftsib;
    2692                 :         194 :                 xlrec.rightsib = rightsib;
    2693                 :         194 :                 xlrec.level = targetlevel;
    2694                 :         194 :                 xlrec.safexid = safexid;
    2695                 :             : 
    2696                 :             :                 /* information needed to recreate the leaf block (if not the target) */
    2697                 :         194 :                 xlrec.leafleftsib = leafleftsib;
    2698                 :         194 :                 xlrec.leafrightsib = leafrightsib;
    2699                 :         194 :                 xlrec.leaftopparent = leaftopparent;
    2700                 :             : 
    2701                 :         194 :                 XLogRegisterData(&xlrec, SizeOfBtreeUnlinkPage);
    2702                 :             : 
    2703         [ +  + ]:         194 :                 if (BufferIsValid(metabuf))
    2704                 :             :                 {
    2705                 :           8 :                         XLogRegisterBuffer(4, metabuf, REGBUF_WILL_INIT | REGBUF_STANDARD);
    2706                 :             : 
    2707         [ +  - ]:           8 :                         Assert(metad->btm_version >= BTREE_NOVAC_VERSION);
    2708                 :           8 :                         xlmeta.version = metad->btm_version;
    2709                 :           8 :                         xlmeta.root = metad->btm_root;
    2710                 :           8 :                         xlmeta.level = metad->btm_level;
    2711                 :           8 :                         xlmeta.fastroot = metad->btm_fastroot;
    2712                 :           8 :                         xlmeta.fastlevel = metad->btm_fastlevel;
    2713                 :           8 :                         xlmeta.last_cleanup_num_delpages = metad->btm_last_cleanup_num_delpages;
    2714                 :           8 :                         xlmeta.allequalimage = metad->btm_allequalimage;
    2715                 :             : 
    2716                 :           8 :                         XLogRegisterBufData(4, &xlmeta, sizeof(xl_btree_metadata));
    2717                 :           8 :                         xlinfo = XLOG_BTREE_UNLINK_PAGE_META;
    2718                 :           8 :                 }
    2719                 :             :                 else
    2720                 :         186 :                         xlinfo = XLOG_BTREE_UNLINK_PAGE;
    2721                 :             : 
    2722                 :         194 :                 recptr = XLogInsert(RM_BTREE_ID, xlinfo);
    2723                 :             : 
    2724         [ +  + ]:         194 :                 if (BufferIsValid(metabuf))
    2725                 :             :                 {
    2726                 :           8 :                         PageSetLSN(metapg, recptr);
    2727                 :           8 :                 }
    2728                 :         194 :                 page = BufferGetPage(rbuf);
    2729                 :         194 :                 PageSetLSN(page, recptr);
    2730                 :         194 :                 page = BufferGetPage(buf);
    2731                 :         194 :                 PageSetLSN(page, recptr);
    2732         [ +  + ]:         194 :                 if (BufferIsValid(lbuf))
    2733                 :             :                 {
    2734                 :          67 :                         page = BufferGetPage(lbuf);
    2735                 :          67 :                         PageSetLSN(page, recptr);
    2736                 :          67 :                 }
    2737         [ +  + ]:         194 :                 if (target != leafblkno)
    2738                 :             :                 {
    2739                 :          46 :                         page = BufferGetPage(leafbuf);
    2740                 :          46 :                         PageSetLSN(page, recptr);
    2741                 :          46 :                 }
    2742                 :         194 :         }
    2743                 :             : 
    2744         [ +  - ]:         194 :         END_CRIT_SECTION();
    2745                 :             : 
    2746                 :             :         /* release metapage */
    2747         [ +  + ]:         194 :         if (BufferIsValid(metabuf))
    2748                 :           8 :                 _bt_relbuf(rel, metabuf);
    2749                 :             : 
    2750                 :             :         /* release siblings */
    2751         [ +  + ]:         194 :         if (BufferIsValid(lbuf))
    2752                 :          67 :                 _bt_relbuf(rel, lbuf);
    2753                 :         194 :         _bt_relbuf(rel, rbuf);
    2754                 :             : 
    2755                 :             :         /* If the target is not leafbuf, we're done with it now -- release it */
    2756         [ +  + ]:         194 :         if (target != leafblkno)
    2757                 :          46 :                 _bt_relbuf(rel, buf);
    2758                 :             : 
    2759                 :             :         /*
    2760                 :             :          * Maintain pages_newly_deleted, which is simply the number of pages
    2761                 :             :          * deleted by the ongoing VACUUM operation.
    2762                 :             :          *
    2763                 :             :          * Maintain pages_deleted in a way that takes into account how
    2764                 :             :          * btvacuumpage() will count deleted pages that have yet to become
    2765                 :             :          * scanblkno -- only count page when it's not going to get that treatment
    2766                 :             :          * later on.
    2767                 :             :          */
    2768                 :         194 :         stats->pages_newly_deleted++;
    2769         [ +  + ]:         194 :         if (target <= scanblkno)
    2770                 :         155 :                 stats->pages_deleted++;
    2771                 :             : 
    2772                 :             :         /*
    2773                 :             :          * Remember information about the target page (now a newly deleted page)
    2774                 :             :          * in dedicated vstate space for later.  The page will be considered as a
    2775                 :             :          * candidate to place in the FSM at the end of the current btvacuumscan()
    2776                 :             :          * call.
    2777                 :             :          */
    2778                 :         194 :         _bt_pendingfsm_add(vstate, target, safexid);
    2779                 :             : 
    2780                 :             :         /* Success - hold on to lock on leafbuf (might also have been target) */
    2781                 :         194 :         return true;
    2782                 :         194 : }
    2783                 :             : 
    2784                 :             : /*
    2785                 :             :  * Establish how tall the to-be-deleted subtree will be during the first stage
    2786                 :             :  * of page deletion.
    2787                 :             :  *
    2788                 :             :  * Caller's child argument is the block number of the page caller wants to
    2789                 :             :  * delete (this is leafbuf's block number, except when we're called
    2790                 :             :  * recursively).  stack is a search stack leading to it.  Note that we will
    2791                 :             :  * update the stack entry(s) to reflect current downlink positions --- this is
    2792                 :             :  * similar to the corresponding point in page split handling.
    2793                 :             :  *
    2794                 :             :  * If "first stage" caller cannot go ahead with deleting _any_ pages, returns
    2795                 :             :  * false.  Returns true on success, in which case caller can use certain
    2796                 :             :  * details established here to perform the first stage of deletion.  This
    2797                 :             :  * function is the last point at which page deletion may be deemed unsafe
    2798                 :             :  * (barring index corruption, or unexpected concurrent page deletions).
    2799                 :             :  *
    2800                 :             :  * We write lock the parent of the root of the to-be-deleted subtree for
    2801                 :             :  * caller on success (i.e. we leave our lock on the *subtreeparent buffer for
    2802                 :             :  * caller).  Caller will have to remove a downlink from *subtreeparent.  We
    2803                 :             :  * also set a *subtreeparent offset number in *poffset, to indicate the
    2804                 :             :  * location of the pivot tuple that contains the relevant downlink.
    2805                 :             :  *
    2806                 :             :  * The root of the to-be-deleted subtree is called the "top parent".  Note
    2807                 :             :  * that the leafbuf page is often the final "top parent" page (you can think
    2808                 :             :  * of the leafbuf page as a degenerate single page subtree when that happens).
    2809                 :             :  * Caller should initialize *topparent to the target leafbuf page block number
    2810                 :             :  * (while *topparentrightsib should be set to leafbuf's right sibling block
    2811                 :             :  * number).  We will update *topparent (and *topparentrightsib) for caller
    2812                 :             :  * here, though only when it turns out that caller will delete at least one
    2813                 :             :  * internal page (i.e. only when caller needs to store a valid link to the top
    2814                 :             :  * parent block in the leafbuf page using BTreeTupleSetTopParent()).
    2815                 :             :  */
    2816                 :             : static bool
    2817                 :         204 : _bt_lock_subtree_parent(Relation rel, Relation heaprel, BlockNumber child,
    2818                 :             :                                                 BTStack stack, Buffer *subtreeparent,
    2819                 :             :                                                 OffsetNumber *poffset, BlockNumber *topparent,
    2820                 :             :                                                 BlockNumber *topparentrightsib)
    2821                 :             : {
    2822                 :         204 :         BlockNumber parent,
    2823                 :             :                                 leftsibparent;
    2824                 :         204 :         OffsetNumber parentoffset,
    2825                 :             :                                 maxoff;
    2826                 :         204 :         Buffer          pbuf;
    2827                 :         204 :         Page            page;
    2828                 :         204 :         BTPageOpaque opaque;
    2829                 :             : 
    2830                 :             :         /*
    2831                 :             :          * Locate the pivot tuple whose downlink points to "child".  Write lock
    2832                 :             :          * the parent page itself.
    2833                 :             :          */
    2834                 :         204 :         pbuf = _bt_getstackbuf(rel, heaprel, stack, child);
    2835         [ +  - ]:         204 :         if (pbuf == InvalidBuffer)
    2836                 :             :         {
    2837                 :             :                 /*
    2838                 :             :                  * Failed to "re-find" a pivot tuple whose downlink matched our child
    2839                 :             :                  * block number on the parent level -- the index must be corrupt.
    2840                 :             :                  * Don't even try to delete the leafbuf subtree.  Just report the
    2841                 :             :                  * issue and press on with vacuuming the index.
    2842                 :             :                  *
    2843                 :             :                  * Note: _bt_getstackbuf() recovers from concurrent page splits that
    2844                 :             :                  * take place on the parent level.  Its approach is a near-exhaustive
    2845                 :             :                  * linear search.  This also gives it a surprisingly good chance of
    2846                 :             :                  * recovering in the event of a buggy or inconsistent opclass.  But we
    2847                 :             :                  * don't rely on that here.
    2848                 :             :                  */
    2849   [ #  #  #  # ]:           0 :                 ereport(LOG,
    2850                 :             :                                 (errcode(ERRCODE_INDEX_CORRUPTED),
    2851                 :             :                                  errmsg_internal("failed to re-find parent key in index \"%s\" for deletion target page %u",
    2852                 :             :                                                                  RelationGetRelationName(rel), child)));
    2853                 :           0 :                 Assert(false);
    2854                 :           0 :                 return false;
    2855                 :             :         }
    2856                 :             : 
    2857                 :         204 :         parent = stack->bts_blkno;
    2858                 :         204 :         parentoffset = stack->bts_offset;
    2859                 :             : 
    2860                 :         204 :         page = BufferGetPage(pbuf);
    2861                 :         204 :         opaque = BTPageGetOpaque(page);
    2862                 :         204 :         maxoff = PageGetMaxOffsetNumber(page);
    2863                 :         204 :         leftsibparent = opaque->btpo_prev;
    2864                 :             : 
    2865                 :             :         /*
    2866                 :             :          * _bt_getstackbuf() completes page splits on returned parent buffer when
    2867                 :             :          * required.
    2868                 :             :          *
    2869                 :             :          * In general it's a bad idea for VACUUM to use up more disk space, which
    2870                 :             :          * is why page deletion does not finish incomplete page splits most of the
    2871                 :             :          * time.  We allow this limited exception because the risk is much lower,
    2872                 :             :          * and the potential downside of not proceeding is much higher:  A single
    2873                 :             :          * internal page with the INCOMPLETE_SPLIT flag set might otherwise
    2874                 :             :          * prevent us from deleting hundreds of empty leaf pages from one level
    2875                 :             :          * down.
    2876                 :             :          */
    2877         [ +  - ]:         204 :         Assert(!P_INCOMPLETE_SPLIT(opaque));
    2878                 :             : 
    2879         [ +  + ]:         204 :         if (parentoffset < maxoff)
    2880                 :             :         {
    2881                 :             :                 /*
    2882                 :             :                  * Child is not the rightmost child in parent, so it's safe to delete
    2883                 :             :                  * the subtree whose root/topparent is child page
    2884                 :             :                  */
    2885                 :         148 :                 *subtreeparent = pbuf;
    2886                 :         148 :                 *poffset = parentoffset;
    2887                 :         148 :                 return true;
    2888                 :             :         }
    2889                 :             : 
    2890                 :             :         /*
    2891                 :             :          * Child is the rightmost child of parent.
    2892                 :             :          *
    2893                 :             :          * Since it's the rightmost child of parent, deleting the child (or
    2894                 :             :          * deleting the subtree whose root/topparent is the child page) is only
    2895                 :             :          * safe when it's also possible to delete the parent.
    2896                 :             :          */
    2897         [ +  - ]:          56 :         Assert(parentoffset == maxoff);
    2898   [ +  +  -  + ]:          56 :         if (parentoffset != P_FIRSTDATAKEY(opaque) || P_RIGHTMOST(opaque))
    2899                 :             :         {
    2900                 :             :                 /*
    2901                 :             :                  * Child isn't parent's only child, or parent is rightmost on its
    2902                 :             :                  * entire level.  Definitely cannot delete any pages.
    2903                 :             :                  */
    2904                 :           3 :                 _bt_relbuf(rel, pbuf);
    2905                 :           3 :                 return false;
    2906                 :             :         }
    2907                 :             : 
    2908                 :             :         /*
    2909                 :             :          * Now make sure that the parent deletion is itself safe by examining the
    2910                 :             :          * child's grandparent page.  Recurse, passing the parent page as the
    2911                 :             :          * child page (child's grandparent is the parent on the next level up). If
    2912                 :             :          * parent deletion is unsafe, then child deletion must also be unsafe (in
    2913                 :             :          * which case caller cannot delete any pages at all).
    2914                 :             :          */
    2915                 :          53 :         *topparent = parent;
    2916                 :          53 :         *topparentrightsib = opaque->btpo_next;
    2917                 :             : 
    2918                 :             :         /*
    2919                 :             :          * Release lock on parent before recursing.
    2920                 :             :          *
    2921                 :             :          * It's OK to release page locks on parent before recursive call locks
    2922                 :             :          * grandparent.  An internal page can only acquire an entry if the child
    2923                 :             :          * is split, but that cannot happen as long as we still hold a lock on the
    2924                 :             :          * leafbuf page.
    2925                 :             :          */
    2926                 :          53 :         _bt_relbuf(rel, pbuf);
    2927                 :             : 
    2928                 :             :         /*
    2929                 :             :          * Before recursing, check that the left sibling of parent (if any) is not
    2930                 :             :          * marked with INCOMPLETE_SPLIT flag first (must do so after we drop the
    2931                 :             :          * parent lock).
    2932                 :             :          *
    2933                 :             :          * Note: We deliberately avoid completing incomplete splits here.
    2934                 :             :          */
    2935         [ -  + ]:          53 :         if (_bt_leftsib_splitflag(rel, leftsibparent, parent))
    2936                 :           0 :                 return false;
    2937                 :             : 
    2938                 :             :         /* Recurse to examine child page's grandparent page */
    2939                 :         106 :         return _bt_lock_subtree_parent(rel, heaprel, parent, stack->bts_parent,
    2940                 :          53 :                                                                    subtreeparent, poffset,
    2941                 :          53 :                                                                    topparent, topparentrightsib);
    2942                 :         204 : }
    2943                 :             : 
    2944                 :             : /*
    2945                 :             :  * Initialize local memory state used by VACUUM for _bt_pendingfsm_finalize
    2946                 :             :  * optimization.
    2947                 :             :  *
    2948                 :             :  * Called at the start of a btvacuumscan().  Caller's cleanuponly argument
    2949                 :             :  * indicates if ongoing VACUUM has not (and will not) call btbulkdelete().
    2950                 :             :  *
    2951                 :             :  * We expect to allocate memory inside VACUUM's top-level memory context here.
    2952                 :             :  * The working buffer is subject to a limit based on work_mem.  Our strategy
    2953                 :             :  * when the array can no longer grow within the bounds of that limit is to
    2954                 :             :  * stop saving additional newly deleted pages, while proceeding as usual with
    2955                 :             :  * the pages that we can fit.
    2956                 :             :  */
    2957                 :             : void
    2958                 :         170 : _bt_pendingfsm_init(Relation rel, BTVacState *vstate, bool cleanuponly)
    2959                 :             : {
    2960                 :         170 :         Size            maxbufsize;
    2961                 :             : 
    2962                 :             :         /*
    2963                 :             :          * Don't bother with optimization in cleanup-only case -- we don't expect
    2964                 :             :          * any newly deleted pages.  Besides, cleanup-only calls to btvacuumscan()
    2965                 :             :          * can only take place because this optimization didn't work out during
    2966                 :             :          * the last VACUUM.
    2967                 :             :          */
    2968         [ +  + ]:         170 :         if (cleanuponly)
    2969                 :           1 :                 return;
    2970                 :             : 
    2971                 :             :         /*
    2972                 :             :          * Cap maximum size of array so that we always respect work_mem.  Avoid
    2973                 :             :          * int overflow here.
    2974                 :             :          */
    2975                 :         169 :         vstate->bufsize = 256;
    2976                 :         169 :         maxbufsize = (work_mem * (Size) 1024) / sizeof(BTPendingFSM);
    2977         [ +  - ]:         169 :         maxbufsize = Min(maxbufsize, MaxAllocSize / sizeof(BTPendingFSM));
    2978                 :             :         /* BTVacState.maxbufsize has type int */
    2979         [ +  - ]:         169 :         maxbufsize = Min(maxbufsize, INT_MAX);
    2980                 :             :         /* Stay sane with small work_mem */
    2981         [ +  - ]:         169 :         maxbufsize = Max(maxbufsize, vstate->bufsize);
    2982                 :         169 :         vstate->maxbufsize = (int) maxbufsize;
    2983                 :             : 
    2984                 :             :         /* Allocate buffer, indicate that there are currently 0 pending pages */
    2985                 :         169 :         vstate->pendingpages = palloc_array(BTPendingFSM, vstate->bufsize);
    2986                 :         169 :         vstate->npendingpages = 0;
    2987         [ -  + ]:         170 : }
    2988                 :             : 
    2989                 :             : /*
    2990                 :             :  * Place any newly deleted pages (i.e. pages that _bt_pagedel() deleted during
    2991                 :             :  * the ongoing VACUUM operation) into the free space map -- though only when
    2992                 :             :  * it is actually safe to do so by now.
    2993                 :             :  *
    2994                 :             :  * Called at the end of a btvacuumscan(), just before free space map vacuuming
    2995                 :             :  * takes place.
    2996                 :             :  *
    2997                 :             :  * Frees memory allocated by _bt_pendingfsm_init(), if any.
    2998                 :             :  */
    2999                 :             : void
    3000                 :         170 : _bt_pendingfsm_finalize(Relation rel, BTVacState *vstate)
    3001                 :             : {
    3002                 :         170 :         IndexBulkDeleteResult *stats = vstate->stats;
    3003                 :         170 :         Relation        heaprel = vstate->info->heaprel;
    3004                 :             : 
    3005         [ +  - ]:         170 :         Assert(stats->pages_newly_deleted >= vstate->npendingpages);
    3006         [ +  - ]:         170 :         Assert(heaprel != NULL);
    3007                 :             : 
    3008         [ +  + ]:         170 :         if (vstate->npendingpages == 0)
    3009                 :             :         {
    3010                 :             :                 /* Just free memory when nothing to do */
    3011         [ +  + ]:         156 :                 if (vstate->pendingpages)
    3012                 :         155 :                         pfree(vstate->pendingpages);
    3013                 :             : 
    3014                 :         156 :                 return;
    3015                 :             :         }
    3016                 :             : 
    3017                 :             : #ifdef DEBUG_BTREE_PENDING_FSM
    3018                 :             : 
    3019                 :             :         /*
    3020                 :             :          * Debugging aid: Sleep for 5 seconds to greatly increase the chances of
    3021                 :             :          * placing pending pages in the FSM.  Note that the optimization will
    3022                 :             :          * never be effective without some other backend concurrently consuming an
    3023                 :             :          * XID.
    3024                 :             :          */
    3025                 :             :         pg_usleep(5000000L);
    3026                 :             : #endif
    3027                 :             : 
    3028                 :             :         /*
    3029                 :             :          * Recompute VACUUM XID boundaries.
    3030                 :             :          *
    3031                 :             :          * We don't actually care about the oldest non-removable XID.  Computing
    3032                 :             :          * the oldest such XID has a useful side-effect that we rely on: it
    3033                 :             :          * forcibly updates the XID horizon state for this backend.  This step is
    3034                 :             :          * essential; GlobalVisCheckRemovableFullXid() will not reliably recognize
    3035                 :             :          * that it is now safe to recycle newly deleted pages without this step.
    3036                 :             :          */
    3037                 :          14 :         GetOldestNonRemovableTransactionId(heaprel);
    3038                 :             : 
    3039         [ +  - ]:          28 :         for (int i = 0; i < vstate->npendingpages; i++)
    3040                 :             :         {
    3041                 :          14 :                 BlockNumber target = vstate->pendingpages[i].target;
    3042                 :          14 :                 FullTransactionId safexid = vstate->pendingpages[i].safexid;
    3043                 :             : 
    3044                 :             :                 /*
    3045                 :             :                  * Do the equivalent of checking BTPageIsRecyclable(), but without
    3046                 :             :                  * accessing the page again a second time.
    3047                 :             :                  *
    3048                 :             :                  * Give up on finding the first non-recyclable page -- all later pages
    3049                 :             :                  * must be non-recyclable too, since _bt_pendingfsm_add() adds pages
    3050                 :             :                  * to the array in safexid order.
    3051                 :             :                  */
    3052         [ -  + ]:          14 :                 if (!GlobalVisCheckRemovableFullXid(heaprel, safexid))
    3053                 :          14 :                         break;
    3054                 :             : 
    3055                 :           0 :                 RecordFreeIndexPage(rel, target);
    3056                 :           0 :                 stats->pages_free++;
    3057         [ +  - ]:          14 :         }
    3058                 :             : 
    3059                 :          14 :         pfree(vstate->pendingpages);
    3060         [ -  + ]:         170 : }
    3061                 :             : 
    3062                 :             : /*
    3063                 :             :  * Maintain array of pages that were deleted during current btvacuumscan()
    3064                 :             :  * call, for use in _bt_pendingfsm_finalize()
    3065                 :             :  */
    3066                 :             : static void
    3067                 :         194 : _bt_pendingfsm_add(BTVacState *vstate,
    3068                 :             :                                    BlockNumber target,
    3069                 :             :                                    FullTransactionId safexid)
    3070                 :             : {
    3071         [ +  - ]:         194 :         Assert(vstate->npendingpages <= vstate->bufsize);
    3072         [ +  - ]:         194 :         Assert(vstate->bufsize <= vstate->maxbufsize);
    3073                 :             : 
    3074                 :             : #ifdef USE_ASSERT_CHECKING
    3075                 :             : 
    3076                 :             :         /*
    3077                 :             :          * Verify an assumption made by _bt_pendingfsm_finalize(): pages from the
    3078                 :             :          * array will always be in safexid order (since that is the order that we
    3079                 :             :          * save them in here)
    3080                 :             :          */
    3081         [ +  + ]:         194 :         if (vstate->npendingpages > 0)
    3082                 :             :         {
    3083                 :         180 :                 FullTransactionId lastsafexid =
    3084                 :         180 :                         vstate->pendingpages[vstate->npendingpages - 1].safexid;
    3085                 :             : 
    3086         [ +  - ]:         180 :                 Assert(FullTransactionIdFollowsOrEquals(safexid, lastsafexid));
    3087                 :         180 :         }
    3088                 :             : #endif
    3089                 :             : 
    3090                 :             :         /*
    3091                 :             :          * If temp buffer reaches maxbufsize/work_mem capacity then we discard
    3092                 :             :          * information about this page.
    3093                 :             :          *
    3094                 :             :          * Note that this also covers the case where we opted to not use the
    3095                 :             :          * optimization in _bt_pendingfsm_init().
    3096                 :             :          */
    3097         [ -  + ]:         194 :         if (vstate->npendingpages == vstate->maxbufsize)
    3098                 :           0 :                 return;
    3099                 :             : 
    3100                 :             :         /* Consider enlarging buffer */
    3101         [ +  - ]:         194 :         if (vstate->npendingpages == vstate->bufsize)
    3102                 :             :         {
    3103                 :           0 :                 int                     newbufsize = vstate->bufsize * 2;
    3104                 :             : 
    3105                 :             :                 /* Respect work_mem */
    3106         [ #  # ]:           0 :                 if (newbufsize > vstate->maxbufsize)
    3107                 :           0 :                         newbufsize = vstate->maxbufsize;
    3108                 :             : 
    3109                 :           0 :                 vstate->bufsize = newbufsize;
    3110                 :           0 :                 vstate->pendingpages =
    3111                 :           0 :                         repalloc(vstate->pendingpages,
    3112                 :           0 :                                          sizeof(BTPendingFSM) * vstate->bufsize);
    3113                 :           0 :         }
    3114                 :             : 
    3115                 :             :         /* Save metadata for newly deleted page */
    3116                 :         194 :         vstate->pendingpages[vstate->npendingpages].target = target;
    3117                 :         194 :         vstate->pendingpages[vstate->npendingpages].safexid = safexid;
    3118                 :         194 :         vstate->npendingpages++;
    3119                 :         194 : }
        

Generated by: LCOV version 2.3.2-1