LCOV - code coverage report
Current view: top level - src/backend/access/nbtree - nbtdedup.c (source / functions) Coverage Total Hit
Test: Code coverage Lines: 96.2 % 419 403
Test Date: 2026-01-26 10:56:24 Functions: 100.0 % 12 12
Legend: Lines:     hit not hit
Branches: + taken - not taken # not executed
Branches: 65.6 % 224 147

             Branch data     Line data    Source code
       1                 :             : /*-------------------------------------------------------------------------
       2                 :             :  *
       3                 :             :  * nbtdedup.c
       4                 :             :  *        Deduplicate or bottom-up delete items in Postgres btrees.
       5                 :             :  *
       6                 :             :  * Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group
       7                 :             :  * Portions Copyright (c) 1994, Regents of the University of California
       8                 :             :  *
       9                 :             :  *
      10                 :             :  * IDENTIFICATION
      11                 :             :  *        src/backend/access/nbtree/nbtdedup.c
      12                 :             :  *
      13                 :             :  *-------------------------------------------------------------------------
      14                 :             :  */
      15                 :             : #include "postgres.h"
      16                 :             : 
      17                 :             : #include "access/nbtree.h"
      18                 :             : #include "access/nbtxlog.h"
      19                 :             : #include "access/tableam.h"
      20                 :             : #include "access/xloginsert.h"
      21                 :             : #include "miscadmin.h"
      22                 :             : #include "utils/rel.h"
      23                 :             : 
      24                 :             : static void _bt_bottomupdel_finish_pending(Page page, BTDedupState state,
      25                 :             :                                                                                    TM_IndexDeleteOp *delstate);
      26                 :             : static bool _bt_do_singleval(Relation rel, Page page, BTDedupState state,
      27                 :             :                                                          OffsetNumber minoff, IndexTuple newitem);
      28                 :             : static void _bt_singleval_fillfactor(Page page, BTDedupState state,
      29                 :             :                                                                          Size newitemsz);
      30                 :             : #ifdef USE_ASSERT_CHECKING
      31                 :             : static bool _bt_posting_valid(IndexTuple posting);
      32                 :             : #endif
      33                 :             : 
      34                 :             : /*
      35                 :             :  * Perform a deduplication pass.
      36                 :             :  *
      37                 :             :  * The general approach taken here is to perform as much deduplication as
      38                 :             :  * possible to free as much space as possible.  Note, however, that "single
      39                 :             :  * value" strategy is used for !bottomupdedup callers when the page is full of
      40                 :             :  * tuples of a single value.  Deduplication passes that apply the strategy
      41                 :             :  * will leave behind a few untouched tuples at the end of the page, preparing
      42                 :             :  * the page for an anticipated page split that uses nbtsplitloc.c's own single
      43                 :             :  * value strategy.  Our high level goal is to delay merging the untouched
      44                 :             :  * tuples until after the page splits.
      45                 :             :  *
      46                 :             :  * When a call to _bt_bottomupdel_pass() just took place (and failed), our
      47                 :             :  * high level goal is to prevent a page split entirely by buying more time.
      48                 :             :  * We still hope that a page split can be avoided altogether.  That's why
      49                 :             :  * single value strategy is not even considered for bottomupdedup callers.
      50                 :             :  *
      51                 :             :  * The page will have to be split if we cannot successfully free at least
      52                 :             :  * newitemsz (we also need space for newitem's line pointer, which isn't
      53                 :             :  * included in caller's newitemsz).
      54                 :             :  *
      55                 :             :  * Note: Caller should have already deleted all existing items with their
      56                 :             :  * LP_DEAD bits set.
      57                 :             :  */
      58                 :             : void
      59                 :        2559 : _bt_dedup_pass(Relation rel, Buffer buf, IndexTuple newitem, Size newitemsz,
      60                 :             :                            bool bottomupdedup)
      61                 :             : {
      62                 :        2559 :         OffsetNumber offnum,
      63                 :             :                                 minoff,
      64                 :             :                                 maxoff;
      65                 :        2559 :         Page            page = BufferGetPage(buf);
      66                 :        2559 :         BTPageOpaque opaque = BTPageGetOpaque(page);
      67                 :        2559 :         Page            newpage;
      68                 :        2559 :         BTDedupState state;
      69                 :        2559 :         Size            pagesaving PG_USED_FOR_ASSERTS_ONLY = 0;
      70                 :        2559 :         bool            singlevalstrat = false;
      71                 :        2559 :         int                     nkeyatts = IndexRelationGetNumberOfKeyAttributes(rel);
      72                 :             : 
      73                 :             :         /* Passed-in newitemsz is MAXALIGNED but does not include line pointer */
      74                 :        2559 :         newitemsz += sizeof(ItemIdData);
      75                 :             : 
      76                 :             :         /*
      77                 :             :          * Initialize deduplication state.
      78                 :             :          *
      79                 :             :          * It would be possible for maxpostingsize (limit on posting list tuple
      80                 :             :          * size) to be set to one third of the page.  However, it seems like a
      81                 :             :          * good idea to limit the size of posting lists to one sixth of a page.
      82                 :             :          * That ought to leave us with a good split point when pages full of
      83                 :             :          * duplicates can be split several times.
      84                 :             :          */
      85                 :        2559 :         state = palloc_object(BTDedupStateData);
      86                 :        2559 :         state->deduplicate = true;
      87                 :        2559 :         state->nmaxitems = 0;
      88                 :        2559 :         state->maxpostingsize = Min(BTMaxItemSize / 2, INDEX_SIZE_MASK);
      89                 :             :         /* Metadata about base tuple of current pending posting list */
      90                 :        2559 :         state->base = NULL;
      91                 :        2559 :         state->baseoff = InvalidOffsetNumber;
      92                 :        2559 :         state->basetupsize = 0;
      93                 :             :         /* Metadata about current pending posting list TIDs */
      94                 :        2559 :         state->htids = palloc(state->maxpostingsize);
      95                 :        2559 :         state->nhtids = 0;
      96                 :        2559 :         state->nitems = 0;
      97                 :             :         /* Size of all physical tuples to be replaced by pending posting list */
      98                 :        2559 :         state->phystupsize = 0;
      99                 :             :         /* nintervals should be initialized to zero */
     100                 :        2559 :         state->nintervals = 0;
     101                 :             : 
     102                 :        2559 :         minoff = P_FIRSTDATAKEY(opaque);
     103                 :        2559 :         maxoff = PageGetMaxOffsetNumber(page);
     104                 :             : 
     105                 :             :         /*
     106                 :             :          * Consider applying "single value" strategy, though only if the page
     107                 :             :          * seems likely to be split in the near future
     108                 :             :          */
     109         [ +  + ]:        2559 :         if (!bottomupdedup)
     110                 :        2400 :                 singlevalstrat = _bt_do_singleval(rel, page, state, minoff, newitem);
     111                 :             : 
     112                 :             :         /*
     113                 :             :          * Deduplicate items from page, and write them to newpage.
     114                 :             :          *
     115                 :             :          * Copy the original page's LSN into newpage copy.  This will become the
     116                 :             :          * updated version of the page.  We need this because XLogInsert will
     117                 :             :          * examine the LSN and possibly dump it in a page image.
     118                 :             :          */
     119                 :        2559 :         newpage = PageGetTempPageCopySpecial(page);
     120                 :        2559 :         PageSetLSN(newpage, PageGetLSN(page));
     121                 :             : 
     122                 :             :         /* Copy high key, if any */
     123         [ +  + ]:        2559 :         if (!P_RIGHTMOST(opaque))
     124                 :             :         {
     125                 :        2031 :                 ItemId          hitemid = PageGetItemId(page, P_HIKEY);
     126                 :        2031 :                 Size            hitemsz = ItemIdGetLength(hitemid);
     127                 :        2031 :                 IndexTuple      hitem = (IndexTuple) PageGetItem(page, hitemid);
     128                 :             : 
     129         [ +  - ]:        2031 :                 if (PageAddItem(newpage, hitem, hitemsz, P_HIKEY, false, false) == InvalidOffsetNumber)
     130   [ #  #  #  # ]:           0 :                         elog(ERROR, "deduplication failed to add highkey");
     131                 :        2031 :         }
     132                 :             : 
     133         [ +  + ]:      547047 :         for (offnum = minoff;
     134                 :      547047 :                  offnum <= maxoff;
     135                 :      544488 :                  offnum = OffsetNumberNext(offnum))
     136                 :             :         {
     137                 :      544488 :                 ItemId          itemid = PageGetItemId(page, offnum);
     138                 :      544488 :                 IndexTuple      itup = (IndexTuple) PageGetItem(page, itemid);
     139                 :             : 
     140         [ +  - ]:      544488 :                 Assert(!ItemIdIsDead(itemid));
     141                 :             : 
     142         [ +  + ]:      544488 :                 if (offnum == minoff)
     143                 :             :                 {
     144                 :             :                         /*
     145                 :             :                          * No previous/base tuple for the data item -- use the data item
     146                 :             :                          * as base tuple of pending posting list
     147                 :             :                          */
     148                 :        2559 :                         _bt_dedup_start_pending(state, itup, offnum);
     149                 :        2559 :                 }
     150         [ +  + ]:      541929 :                 else if (state->deduplicate &&
     151   [ +  +  +  + ]:      541587 :                                  _bt_keep_natts_fast(rel, state->base, itup) > nkeyatts &&
     152                 :       78727 :                                  _bt_dedup_save_htid(state, itup))
     153                 :             :                 {
     154                 :             :                         /*
     155                 :             :                          * Tuple is equal to base tuple of pending posting list.  Heap
     156                 :             :                          * TID(s) for itup have been saved in state.
     157                 :             :                          */
     158                 :       77271 :                 }
     159                 :             :                 else
     160                 :             :                 {
     161                 :             :                         /*
     162                 :             :                          * Tuple is not equal to pending posting list tuple, or
     163                 :             :                          * _bt_dedup_save_htid() opted to not merge current item into
     164                 :             :                          * pending posting list for some other reason (e.g., adding more
     165                 :             :                          * TIDs would have caused posting list to exceed current
     166                 :             :                          * maxpostingsize).
     167                 :             :                          *
     168                 :             :                          * If state contains pending posting list with more than one item,
     169                 :             :                          * form new posting tuple and add it to our temp page (newpage).
     170                 :             :                          * Else add pending interval's base tuple to the temp page as-is.
     171                 :             :                          */
     172                 :      464658 :                         pagesaving += _bt_dedup_finish_pending(newpage, state);
     173                 :             : 
     174         [ +  + ]:      464658 :                         if (singlevalstrat)
     175                 :             :                         {
     176                 :             :                                 /*
     177                 :             :                                  * Single value strategy's extra steps.
     178                 :             :                                  *
     179                 :             :                                  * Lower maxpostingsize for sixth and final large posting list
     180                 :             :                                  * tuple at the point where 5 maxpostingsize-capped tuples
     181                 :             :                                  * have either been formed or observed.
     182                 :             :                                  *
     183                 :             :                                  * When a sixth maxpostingsize-capped item is formed/observed,
     184                 :             :                                  * stop merging together tuples altogether.  The few tuples
     185                 :             :                                  * that remain at the end of the page won't be merged together
     186                 :             :                                  * at all (at least not until after a future page split takes
     187                 :             :                                  * place, when this page's newly allocated right sibling page
     188                 :             :                                  * gets its first deduplication pass).
     189                 :             :                                  */
     190         [ +  + ]:         772 :                                 if (state->nmaxitems == 5)
     191                 :         102 :                                         _bt_singleval_fillfactor(page, state, newitemsz);
     192         [ +  + ]:         670 :                                 else if (state->nmaxitems == 6)
     193                 :             :                                 {
     194                 :          35 :                                         state->deduplicate = false;
     195                 :          35 :                                         singlevalstrat = false; /* won't be back here */
     196                 :          35 :                                 }
     197                 :         772 :                         }
     198                 :             : 
     199                 :             :                         /* itup starts new pending posting list */
     200                 :      464658 :                         _bt_dedup_start_pending(state, itup, offnum);
     201                 :             :                 }
     202                 :      544488 :         }
     203                 :             : 
     204                 :             :         /* Handle the last item */
     205                 :        2559 :         pagesaving += _bt_dedup_finish_pending(newpage, state);
     206                 :             : 
     207                 :             :         /*
     208                 :             :          * If no items suitable for deduplication were found, newpage must be
     209                 :             :          * exactly the same as the original page, so just return from function.
     210                 :             :          *
     211                 :             :          * We could determine whether or not to proceed on the basis the space
     212                 :             :          * savings being sufficient to avoid an immediate page split instead.  We
     213                 :             :          * don't do that because there is some small value in nbtsplitloc.c always
     214                 :             :          * operating against a page that is fully deduplicated (apart from
     215                 :             :          * newitem).  Besides, most of the cost has already been paid.
     216                 :             :          */
     217         [ +  + ]:        2559 :         if (state->nintervals == 0)
     218                 :             :         {
     219                 :             :                 /* cannot leak memory here */
     220                 :         505 :                 pfree(newpage);
     221                 :         505 :                 pfree(state->htids);
     222                 :         505 :                 pfree(state);
     223                 :         505 :                 return;
     224                 :             :         }
     225                 :             : 
     226                 :             :         /*
     227                 :             :          * By here, it's clear that deduplication will definitely go ahead.
     228                 :             :          *
     229                 :             :          * Clear the BTP_HAS_GARBAGE page flag.  The index must be a heapkeyspace
     230                 :             :          * index, and as such we'll never pay attention to BTP_HAS_GARBAGE anyway.
     231                 :             :          * But keep things tidy.
     232                 :             :          */
     233         [ +  - ]:        2054 :         if (P_HAS_GARBAGE(opaque))
     234                 :             :         {
     235                 :           0 :                 BTPageOpaque nopaque = BTPageGetOpaque(newpage);
     236                 :             : 
     237                 :           0 :                 nopaque->btpo_flags &= ~BTP_HAS_GARBAGE;
     238                 :           0 :         }
     239                 :             : 
     240                 :        2054 :         START_CRIT_SECTION();
     241                 :             : 
     242                 :        2054 :         PageRestoreTempPage(newpage, page);
     243                 :        2054 :         MarkBufferDirty(buf);
     244                 :             : 
     245                 :             :         /* XLOG stuff */
     246   [ +  +  +  +  :        2054 :         if (RelationNeedsWAL(rel))
             +  -  -  + ]
     247                 :             :         {
     248                 :        2033 :                 XLogRecPtr      recptr;
     249                 :        2033 :                 xl_btree_dedup xlrec_dedup;
     250                 :             : 
     251                 :        2033 :                 xlrec_dedup.nintervals = state->nintervals;
     252                 :             : 
     253                 :        2033 :                 XLogBeginInsert();
     254                 :        2033 :                 XLogRegisterBuffer(0, buf, REGBUF_STANDARD);
     255                 :        2033 :                 XLogRegisterData(&xlrec_dedup, SizeOfBtreeDedup);
     256                 :             : 
     257                 :             :                 /*
     258                 :             :                  * The intervals array is not in the buffer, but pretend that it is.
     259                 :             :                  * When XLogInsert stores the whole buffer, the array need not be
     260                 :             :                  * stored too.
     261                 :             :                  */
     262                 :        4066 :                 XLogRegisterBufData(0, state->intervals,
     263                 :        2033 :                                                         state->nintervals * sizeof(BTDedupInterval));
     264                 :             : 
     265                 :        2033 :                 recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_DEDUP);
     266                 :             : 
     267                 :        2033 :                 PageSetLSN(page, recptr);
     268                 :        2033 :         }
     269                 :             : 
     270         [ +  - ]:        2054 :         END_CRIT_SECTION();
     271                 :             : 
     272                 :             :         /* Local space accounting should agree with page accounting */
     273   [ +  +  +  - ]:        2054 :         Assert(pagesaving < newitemsz || PageGetExactFreeSpace(page) >= newitemsz);
     274                 :             : 
     275                 :             :         /* cannot leak memory here */
     276                 :        2054 :         pfree(state->htids);
     277                 :        2054 :         pfree(state);
     278         [ -  + ]:        2559 : }
     279                 :             : 
     280                 :             : /*
     281                 :             :  * Perform bottom-up index deletion pass.
     282                 :             :  *
     283                 :             :  * See if duplicate index tuples (plus certain nearby tuples) are eligible to
     284                 :             :  * be deleted via bottom-up index deletion.  The high level goal here is to
     285                 :             :  * entirely prevent "unnecessary" page splits caused by MVCC version churn
     286                 :             :  * from UPDATEs (when the UPDATEs don't logically modify any of the columns
     287                 :             :  * covered by the 'rel' index).  This is qualitative, not quantitative -- we
     288                 :             :  * do not particularly care about once-off opportunities to delete many index
     289                 :             :  * tuples together.
     290                 :             :  *
     291                 :             :  * See nbtree/README for details on the design of nbtree bottom-up deletion.
     292                 :             :  * See access/tableam.h for a description of how we're expected to cooperate
     293                 :             :  * with the tableam.
     294                 :             :  *
     295                 :             :  * Returns true on success, in which case caller can assume page split will be
     296                 :             :  * avoided for a reasonable amount of time.  Returns false when caller should
     297                 :             :  * deduplicate the page (if possible at all).
     298                 :             :  *
     299                 :             :  * Note: Occasionally we return true despite failing to delete enough items to
     300                 :             :  * avoid a split.  This makes caller skip deduplication and go split the page
     301                 :             :  * right away.  Our return value is always just advisory information.
     302                 :             :  *
     303                 :             :  * Note: Caller should have already deleted all existing items with their
     304                 :             :  * LP_DEAD bits set.
     305                 :             :  */
     306                 :             : bool
     307                 :         183 : _bt_bottomupdel_pass(Relation rel, Buffer buf, Relation heapRel,
     308                 :             :                                          Size newitemsz)
     309                 :             : {
     310                 :         183 :         OffsetNumber offnum,
     311                 :             :                                 minoff,
     312                 :             :                                 maxoff;
     313                 :         183 :         Page            page = BufferGetPage(buf);
     314                 :         183 :         BTPageOpaque opaque = BTPageGetOpaque(page);
     315                 :         183 :         BTDedupState state;
     316                 :         183 :         TM_IndexDeleteOp delstate;
     317                 :         183 :         bool            neverdedup;
     318                 :         183 :         int                     nkeyatts = IndexRelationGetNumberOfKeyAttributes(rel);
     319                 :             : 
     320                 :             :         /* Passed-in newitemsz is MAXALIGNED but does not include line pointer */
     321                 :         183 :         newitemsz += sizeof(ItemIdData);
     322                 :             : 
     323                 :             :         /* Initialize deduplication state */
     324                 :         183 :         state = palloc_object(BTDedupStateData);
     325                 :         183 :         state->deduplicate = true;
     326                 :         183 :         state->nmaxitems = 0;
     327                 :         183 :         state->maxpostingsize = BLCKSZ; /* We're not really deduplicating */
     328                 :         183 :         state->base = NULL;
     329                 :         183 :         state->baseoff = InvalidOffsetNumber;
     330                 :         183 :         state->basetupsize = 0;
     331                 :         183 :         state->htids = palloc(state->maxpostingsize);
     332                 :         183 :         state->nhtids = 0;
     333                 :         183 :         state->nitems = 0;
     334                 :         183 :         state->phystupsize = 0;
     335                 :         183 :         state->nintervals = 0;
     336                 :             : 
     337                 :             :         /*
     338                 :             :          * Initialize tableam state that describes bottom-up index deletion
     339                 :             :          * operation.
     340                 :             :          *
     341                 :             :          * We'll go on to ask the tableam to search for TIDs whose index tuples we
     342                 :             :          * can safely delete.  The tableam will search until our leaf page space
     343                 :             :          * target is satisfied, or until the cost of continuing with the tableam
     344                 :             :          * operation seems too high.  It focuses its efforts on TIDs associated
     345                 :             :          * with duplicate index tuples that we mark "promising".
     346                 :             :          *
     347                 :             :          * This space target is a little arbitrary.  The tableam must be able to
     348                 :             :          * keep the costs and benefits in balance.  We provide the tableam with
     349                 :             :          * exhaustive information about what might work, without directly
     350                 :             :          * concerning ourselves with avoiding work during the tableam call.  Our
     351                 :             :          * role in costing the bottom-up deletion process is strictly advisory.
     352                 :             :          */
     353                 :         183 :         delstate.irel = rel;
     354                 :         183 :         delstate.iblknum = BufferGetBlockNumber(buf);
     355                 :         183 :         delstate.bottomup = true;
     356         [ +  - ]:         183 :         delstate.bottomupfreespace = Max(BLCKSZ / 16, newitemsz);
     357                 :         183 :         delstate.ndeltids = 0;
     358                 :         183 :         delstate.deltids = palloc_array(TM_IndexDelete, MaxTIDsPerBTreePage);
     359                 :         183 :         delstate.status = palloc_array(TM_IndexStatus, MaxTIDsPerBTreePage);
     360                 :             : 
     361                 :         183 :         minoff = P_FIRSTDATAKEY(opaque);
     362                 :         183 :         maxoff = PageGetMaxOffsetNumber(page);
     363         [ +  + ]:       36301 :         for (offnum = minoff;
     364                 :       36301 :                  offnum <= maxoff;
     365                 :       36118 :                  offnum = OffsetNumberNext(offnum))
     366                 :             :         {
     367                 :       36118 :                 ItemId          itemid = PageGetItemId(page, offnum);
     368                 :       36118 :                 IndexTuple      itup = (IndexTuple) PageGetItem(page, itemid);
     369                 :             : 
     370         [ +  - ]:       36118 :                 Assert(!ItemIdIsDead(itemid));
     371                 :             : 
     372         [ +  + ]:       36118 :                 if (offnum == minoff)
     373                 :             :                 {
     374                 :             :                         /* itup starts first pending interval */
     375                 :         183 :                         _bt_dedup_start_pending(state, itup, offnum);
     376                 :         183 :                 }
     377   [ +  +  -  + ]:       35935 :                 else if (_bt_keep_natts_fast(rel, state->base, itup) > nkeyatts &&
     378                 :        9431 :                                  _bt_dedup_save_htid(state, itup))
     379                 :             :                 {
     380                 :             :                         /* Tuple is equal; just added its TIDs to pending interval */
     381                 :        9431 :                 }
     382                 :             :                 else
     383                 :             :                 {
     384                 :             :                         /* Finalize interval -- move its TIDs to delete state */
     385                 :       26504 :                         _bt_bottomupdel_finish_pending(page, state, &delstate);
     386                 :             : 
     387                 :             :                         /* itup starts new pending interval */
     388                 :       26504 :                         _bt_dedup_start_pending(state, itup, offnum);
     389                 :             :                 }
     390                 :       36118 :         }
     391                 :             :         /* Finalize final interval -- move its TIDs to delete state */
     392                 :         183 :         _bt_bottomupdel_finish_pending(page, state, &delstate);
     393                 :             : 
     394                 :             :         /*
     395                 :             :          * We don't give up now in the event of having few (or even zero)
     396                 :             :          * promising tuples for the tableam because it's not up to us as the index
     397                 :             :          * AM to manage costs (note that the tableam might have heuristics of its
     398                 :             :          * own that work out what to do).  We should at least avoid having our
     399                 :             :          * caller do a useless deduplication pass after we return in the event of
     400                 :             :          * zero promising tuples, though.
     401                 :             :          */
     402                 :         183 :         neverdedup = false;
     403         [ +  + ]:         183 :         if (state->nintervals == 0)
     404                 :           2 :                 neverdedup = true;
     405                 :             : 
     406                 :         183 :         pfree(state->htids);
     407                 :         183 :         pfree(state);
     408                 :             : 
     409                 :             :         /* Ask tableam which TIDs are deletable, then physically delete them */
     410                 :         183 :         _bt_delitems_delete_check(rel, buf, heapRel, &delstate);
     411                 :             : 
     412                 :         183 :         pfree(delstate.deltids);
     413                 :         183 :         pfree(delstate.status);
     414                 :             : 
     415                 :             :         /* Report "success" to caller unconditionally to avoid deduplication */
     416         [ +  + ]:         183 :         if (neverdedup)
     417                 :           2 :                 return true;
     418                 :             : 
     419                 :             :         /* Don't dedup when we won't end up back here any time soon anyway */
     420         [ +  - ]:         181 :         return PageGetExactFreeSpace(page) >= Max(BLCKSZ / 24, newitemsz);
     421                 :         183 : }
     422                 :             : 
     423                 :             : /*
     424                 :             :  * Create a new pending posting list tuple based on caller's base tuple.
     425                 :             :  *
     426                 :             :  * Every tuple processed by deduplication either becomes the base tuple for a
     427                 :             :  * posting list, or gets its heap TID(s) accepted into a pending posting list.
     428                 :             :  * A tuple that starts out as the base tuple for a posting list will only
     429                 :             :  * actually be rewritten within _bt_dedup_finish_pending() when it turns out
     430                 :             :  * that there are duplicates that can be merged into the base tuple.
     431                 :             :  */
     432                 :             : void
     433                 :     1172827 : _bt_dedup_start_pending(BTDedupState state, IndexTuple base,
     434                 :             :                                                 OffsetNumber baseoff)
     435                 :             : {
     436         [ +  - ]:     1172827 :         Assert(state->nhtids == 0);
     437         [ +  - ]:     1172827 :         Assert(state->nitems == 0);
     438         [ +  - ]:     1172827 :         Assert(!BTreeTupleIsPivot(base));
     439                 :             : 
     440                 :             :         /*
     441                 :             :          * Copy heap TID(s) from new base tuple for new candidate posting list
     442                 :             :          * into working state's array
     443                 :             :          */
     444         [ +  + ]:     1172827 :         if (!BTreeTupleIsPosting(base))
     445                 :             :         {
     446                 :     1024876 :                 memcpy(state->htids, &base->t_tid, sizeof(ItemPointerData));
     447                 :     1024876 :                 state->nhtids = 1;
     448                 :     1024876 :                 state->basetupsize = IndexTupleSize(base);
     449                 :     1024876 :         }
     450                 :             :         else
     451                 :             :         {
     452                 :      147951 :                 int                     nposting;
     453                 :             : 
     454                 :      147951 :                 nposting = BTreeTupleGetNPosting(base);
     455                 :      147951 :                 memcpy(state->htids, BTreeTupleGetPosting(base),
     456                 :             :                            sizeof(ItemPointerData) * nposting);
     457                 :      147951 :                 state->nhtids = nposting;
     458                 :             :                 /* basetupsize should not include existing posting list */
     459                 :      147951 :                 state->basetupsize = BTreeTupleGetPostingOffset(base);
     460                 :      147951 :         }
     461                 :             : 
     462                 :             :         /*
     463                 :             :          * Save new base tuple itself -- it'll be needed if we actually create a
     464                 :             :          * new posting list from new pending posting list.
     465                 :             :          *
     466                 :             :          * Must maintain physical size of all existing tuples (including line
     467                 :             :          * pointer overhead) so that we can calculate space savings on page.
     468                 :             :          */
     469                 :     1172827 :         state->nitems = 1;
     470                 :     1172827 :         state->base = base;
     471                 :     1172827 :         state->baseoff = baseoff;
     472                 :     1172827 :         state->phystupsize = MAXALIGN(IndexTupleSize(base)) + sizeof(ItemIdData);
     473                 :             :         /* Also save baseoff in pending state for interval */
     474                 :     1172827 :         state->intervals[state->nintervals].baseoff = state->baseoff;
     475                 :     1172827 : }
     476                 :             : 
     477                 :             : /*
     478                 :             :  * Save itup heap TID(s) into pending posting list where possible.
     479                 :             :  *
     480                 :             :  * Returns bool indicating if the pending posting list managed by state now
     481                 :             :  * includes itup's heap TID(s).
     482                 :             :  */
     483                 :             : bool
     484                 :      402088 : _bt_dedup_save_htid(BTDedupState state, IndexTuple itup)
     485                 :             : {
     486                 :      402088 :         int                     nhtids;
     487                 :      402088 :         ItemPointer htids;
     488                 :      402088 :         Size            mergedtupsz;
     489                 :             : 
     490         [ +  - ]:      402088 :         Assert(!BTreeTupleIsPivot(itup));
     491                 :             : 
     492         [ +  + ]:      402088 :         if (!BTreeTupleIsPosting(itup))
     493                 :             :         {
     494                 :      400638 :                 nhtids = 1;
     495                 :      400638 :                 htids = &itup->t_tid;
     496                 :      400638 :         }
     497                 :             :         else
     498                 :             :         {
     499                 :        1450 :                 nhtids = BTreeTupleGetNPosting(itup);
     500                 :        1450 :                 htids = BTreeTupleGetPosting(itup);
     501                 :             :         }
     502                 :             : 
     503                 :             :         /*
     504                 :             :          * Don't append (have caller finish pending posting list as-is) if
     505                 :             :          * appending heap TID(s) from itup would put us over maxpostingsize limit.
     506                 :             :          *
     507                 :             :          * This calculation needs to match the code used within _bt_form_posting()
     508                 :             :          * for new posting list tuples.
     509                 :             :          */
     510                 :      402088 :         mergedtupsz = MAXALIGN(state->basetupsize +
     511                 :             :                                                    (state->nhtids + nhtids) * sizeof(ItemPointerData));
     512                 :             : 
     513         [ +  + ]:      402088 :         if (mergedtupsz > state->maxpostingsize)
     514                 :             :         {
     515                 :             :                 /*
     516                 :             :                  * Count this as an oversized item for single value strategy, though
     517                 :             :                  * only when there are 50 TIDs in the final posting list tuple.  This
     518                 :             :                  * limit (which is fairly arbitrary) avoids confusion about how many
     519                 :             :                  * 1/6 of a page tuples have been encountered/created by the current
     520                 :             :                  * deduplication pass.
     521                 :             :                  *
     522                 :             :                  * Note: We deliberately don't consider which deduplication pass
     523                 :             :                  * merged together tuples to create this item (could be a previous
     524                 :             :                  * deduplication pass, or current pass).  See _bt_do_singleval()
     525                 :             :                  * comments.
     526                 :             :                  */
     527         [ +  + ]:        3463 :                 if (state->nhtids > 50)
     528                 :        3318 :                         state->nmaxitems++;
     529                 :             : 
     530                 :        3463 :                 return false;
     531                 :             :         }
     532                 :             : 
     533                 :             :         /*
     534                 :             :          * Save heap TIDs to pending posting list tuple -- itup can be merged into
     535                 :             :          * pending posting list
     536                 :             :          */
     537                 :      398625 :         state->nitems++;
     538                 :      398625 :         memcpy(state->htids + state->nhtids, htids,
     539                 :             :                    sizeof(ItemPointerData) * nhtids);
     540                 :      398625 :         state->nhtids += nhtids;
     541                 :      398625 :         state->phystupsize += MAXALIGN(IndexTupleSize(itup)) + sizeof(ItemIdData);
     542                 :             : 
     543                 :      398625 :         return true;
     544                 :      402088 : }
     545                 :             : 
     546                 :             : /*
     547                 :             :  * Finalize pending posting list tuple, and add it to the page.  Final tuple
     548                 :             :  * is based on saved base tuple, and saved list of heap TIDs.
     549                 :             :  *
     550                 :             :  * Returns space saving from deduplicating to make a new posting list tuple.
     551                 :             :  * Note that this includes line pointer overhead.  This is zero in the case
     552                 :             :  * where no deduplication was possible.
     553                 :             :  */
     554                 :             : Size
     555                 :      467217 : _bt_dedup_finish_pending(Page newpage, BTDedupState state)
     556                 :             : {
     557                 :      467217 :         OffsetNumber tupoff;
     558                 :      467217 :         Size            tuplesz;
     559                 :      467217 :         Size            spacesaving;
     560                 :             : 
     561         [ +  - ]:      467217 :         Assert(state->nitems > 0);
     562         [ +  - ]:      467217 :         Assert(state->nitems <= state->nhtids);
     563         [ +  - ]:      467217 :         Assert(state->intervals[state->nintervals].baseoff == state->baseoff);
     564                 :             : 
     565                 :      467217 :         tupoff = OffsetNumberNext(PageGetMaxOffsetNumber(newpage));
     566         [ +  + ]:      467217 :         if (state->nitems == 1)
     567                 :             :         {
     568                 :             :                 /* Use original, unchanged base tuple */
     569                 :      438816 :                 tuplesz = IndexTupleSize(state->base);
     570         [ +  - ]:      438816 :                 Assert(tuplesz == MAXALIGN(IndexTupleSize(state->base)));
     571         [ +  - ]:      438816 :                 Assert(tuplesz <= BTMaxItemSize);
     572         [ +  - ]:      438816 :                 if (PageAddItem(newpage, state->base, tuplesz, tupoff, false, false) == InvalidOffsetNumber)
     573   [ #  #  #  # ]:           0 :                         elog(ERROR, "deduplication failed to add tuple to page");
     574                 :             : 
     575                 :      438816 :                 spacesaving = 0;
     576                 :      438816 :         }
     577                 :             :         else
     578                 :             :         {
     579                 :       28401 :                 IndexTuple      final;
     580                 :             : 
     581                 :             :                 /* Form a tuple with a posting list */
     582                 :       28401 :                 final = _bt_form_posting(state->base, state->htids, state->nhtids);
     583                 :       28401 :                 tuplesz = IndexTupleSize(final);
     584         [ +  - ]:       28401 :                 Assert(tuplesz <= state->maxpostingsize);
     585                 :             : 
     586                 :             :                 /* Save final number of items for posting list */
     587                 :       28401 :                 state->intervals[state->nintervals].nitems = state->nitems;
     588                 :             : 
     589         [ +  - ]:       28401 :                 Assert(tuplesz == MAXALIGN(IndexTupleSize(final)));
     590         [ +  - ]:       28401 :                 Assert(tuplesz <= BTMaxItemSize);
     591         [ +  - ]:       28401 :                 if (PageAddItem(newpage, final, tuplesz, tupoff, false, false) == InvalidOffsetNumber)
     592   [ #  #  #  # ]:           0 :                         elog(ERROR, "deduplication failed to add tuple to page");
     593                 :             : 
     594                 :       28401 :                 pfree(final);
     595                 :       28401 :                 spacesaving = state->phystupsize - (tuplesz + sizeof(ItemIdData));
     596                 :             :                 /* Increment nintervals, since we wrote a new posting list tuple */
     597                 :       28401 :                 state->nintervals++;
     598         [ +  - ]:       28401 :                 Assert(spacesaving > 0 && spacesaving < BLCKSZ);
     599                 :       28401 :         }
     600                 :             : 
     601                 :             :         /* Reset state for next pending posting list */
     602                 :      467217 :         state->nhtids = 0;
     603                 :      467217 :         state->nitems = 0;
     604                 :      467217 :         state->phystupsize = 0;
     605                 :             : 
     606                 :      934434 :         return spacesaving;
     607                 :      467217 : }
     608                 :             : 
     609                 :             : /*
     610                 :             :  * Finalize interval during bottom-up index deletion.
     611                 :             :  *
     612                 :             :  * During a bottom-up pass we expect that TIDs will be recorded in dedup state
     613                 :             :  * first, and then get moved over to delstate (in variable-sized batches) by
     614                 :             :  * calling here.  Call here happens when the number of TIDs in a dedup
     615                 :             :  * interval is known, and interval gets finalized (i.e. when caller sees next
     616                 :             :  * tuple on the page is not a duplicate, or when caller runs out of tuples to
     617                 :             :  * process from leaf page).
     618                 :             :  *
     619                 :             :  * This is where bottom-up deletion determines and remembers which entries are
     620                 :             :  * duplicates.  This will be important information to the tableam delete
     621                 :             :  * infrastructure later on.  Plain index tuple duplicates are marked
     622                 :             :  * "promising" here, per tableam contract.
     623                 :             :  *
     624                 :             :  * Our approach to marking entries whose TIDs come from posting lists is more
     625                 :             :  * complicated.  Posting lists can only be formed by a deduplication pass (or
     626                 :             :  * during an index build), so recent version churn affecting the pointed-to
     627                 :             :  * logical rows is not particularly likely.  We may still give a weak signal
     628                 :             :  * about posting list tuples' entries (by marking just one of its TIDs/entries
     629                 :             :  * promising), though this is only a possibility in the event of further
     630                 :             :  * duplicate index tuples in final interval that covers posting list tuple (as
     631                 :             :  * in the plain tuple case).  A weak signal/hint will be useful to the tableam
     632                 :             :  * when it has no stronger signal to go with for the deletion operation as a
     633                 :             :  * whole.
     634                 :             :  *
     635                 :             :  * The heuristics we use work well in practice because we only need to give
     636                 :             :  * the tableam the right _general_ idea about where to look.  Garbage tends to
     637                 :             :  * naturally get concentrated in relatively few table blocks with workloads
     638                 :             :  * that bottom-up deletion targets.  The tableam cannot possibly rank all
     639                 :             :  * available table blocks sensibly based on the hints we provide, but that's
     640                 :             :  * okay -- only the extremes matter.  The tableam just needs to be able to
     641                 :             :  * predict which few table blocks will have the most tuples that are safe to
     642                 :             :  * delete for each deletion operation, with low variance across related
     643                 :             :  * deletion operations.
     644                 :             :  */
     645                 :             : static void
     646                 :       26687 : _bt_bottomupdel_finish_pending(Page page, BTDedupState state,
     647                 :             :                                                            TM_IndexDeleteOp *delstate)
     648                 :             : {
     649                 :       26687 :         bool            dupinterval = (state->nitems > 1);
     650                 :             : 
     651         [ +  - ]:       26687 :         Assert(state->nitems > 0);
     652         [ +  - ]:       26687 :         Assert(state->nitems <= state->nhtids);
     653         [ +  - ]:       26687 :         Assert(state->intervals[state->nintervals].baseoff == state->baseoff);
     654                 :             : 
     655         [ +  + ]:       62805 :         for (int i = 0; i < state->nitems; i++)
     656                 :             :         {
     657                 :       36118 :                 OffsetNumber offnum = state->baseoff + i;
     658                 :       36118 :                 ItemId          itemid = PageGetItemId(page, offnum);
     659                 :       36118 :                 IndexTuple      itup = (IndexTuple) PageGetItem(page, itemid);
     660                 :       36118 :                 TM_IndexDelete *ideltid = &delstate->deltids[delstate->ndeltids];
     661                 :       36118 :                 TM_IndexStatus *istatus = &delstate->status[delstate->ndeltids];
     662                 :             : 
     663         [ +  + ]:       36118 :                 if (!BTreeTupleIsPosting(itup))
     664                 :             :                 {
     665                 :             :                         /* Simple case: A plain non-pivot tuple */
     666                 :       26648 :                         ideltid->tid = itup->t_tid;
     667                 :       26648 :                         ideltid->id = delstate->ndeltids;
     668                 :       26648 :                         istatus->idxoffnum = offnum;
     669                 :       26648 :                         istatus->knowndeletable = false;     /* for now */
     670                 :       26648 :                         istatus->promising = dupinterval;    /* simple rule */
     671                 :       26648 :                         istatus->freespace = ItemIdGetLength(itemid) + sizeof(ItemIdData);
     672                 :             : 
     673                 :       26648 :                         delstate->ndeltids++;
     674                 :       26648 :                 }
     675                 :             :                 else
     676                 :             :                 {
     677                 :             :                         /*
     678                 :             :                          * Complicated case: A posting list tuple.
     679                 :             :                          *
     680                 :             :                          * We make the conservative assumption that there can only be at
     681                 :             :                          * most one affected logical row per posting list tuple.  There
     682                 :             :                          * will be at most one promising entry in deltids to represent
     683                 :             :                          * this presumed lone logical row.  Note that this isn't even
     684                 :             :                          * considered unless the posting list tuple is also in an interval
     685                 :             :                          * of duplicates -- this complicated rule is just a variant of the
     686                 :             :                          * simple rule used to decide if plain index tuples are promising.
     687                 :             :                          */
     688                 :        9470 :                         int                     nitem = BTreeTupleGetNPosting(itup);
     689                 :        9470 :                         bool            firstpromising = false;
     690                 :        9470 :                         bool            lastpromising = false;
     691                 :             : 
     692         [ -  + ]:        9470 :                         Assert(_bt_posting_valid(itup));
     693                 :             : 
     694         [ +  + ]:        9470 :                         if (dupinterval)
     695                 :             :                         {
     696                 :             :                                 /*
     697                 :             :                                  * Complicated rule: either the first or last TID in the
     698                 :             :                                  * posting list gets marked promising (if any at all)
     699                 :             :                                  */
     700                 :        1844 :                                 BlockNumber minblocklist,
     701                 :             :                                                         midblocklist,
     702                 :             :                                                         maxblocklist;
     703                 :        1844 :                                 ItemPointer mintid,
     704                 :             :                                                         midtid,
     705                 :             :                                                         maxtid;
     706                 :             : 
     707                 :        1844 :                                 mintid = BTreeTupleGetHeapTID(itup);
     708                 :        1844 :                                 midtid = BTreeTupleGetPostingN(itup, nitem / 2);
     709                 :        1844 :                                 maxtid = BTreeTupleGetMaxHeapTID(itup);
     710                 :        1844 :                                 minblocklist = ItemPointerGetBlockNumber(mintid);
     711                 :        1844 :                                 midblocklist = ItemPointerGetBlockNumber(midtid);
     712                 :        1844 :                                 maxblocklist = ItemPointerGetBlockNumber(maxtid);
     713                 :             : 
     714                 :             :                                 /* Only entry with predominant table block can be promising */
     715                 :        1844 :                                 firstpromising = (minblocklist == midblocklist);
     716         [ +  + ]:        3642 :                                 lastpromising = (!firstpromising &&
     717                 :        1798 :                                                                  midblocklist == maxblocklist);
     718                 :        1844 :                         }
     719                 :             : 
     720         [ +  + ]:      109915 :                         for (int p = 0; p < nitem; p++)
     721                 :             :                         {
     722                 :      100445 :                                 ItemPointer htid = BTreeTupleGetPostingN(itup, p);
     723                 :             : 
     724                 :      100445 :                                 ideltid->tid = *htid;
     725                 :      100445 :                                 ideltid->id = delstate->ndeltids;
     726                 :      100445 :                                 istatus->idxoffnum = offnum;
     727                 :      100445 :                                 istatus->knowndeletable = false;     /* for now */
     728                 :      100445 :                                 istatus->promising = false;
     729   [ +  +  +  + ]:      119285 :                                 if ((firstpromising && p == 0) ||
     730         [ +  + ]:      100445 :                                         (lastpromising && p == nitem - 1))
     731                 :        1057 :                                         istatus->promising = true;
     732                 :      100445 :                                 istatus->freespace = sizeof(ItemPointerData);        /* at worst */
     733                 :             : 
     734                 :      100445 :                                 ideltid++;
     735                 :      100445 :                                 istatus++;
     736                 :      100445 :                                 delstate->ndeltids++;
     737                 :      100445 :                         }
     738                 :        9470 :                 }
     739                 :       36118 :         }
     740                 :             : 
     741         [ +  + ]:       26687 :         if (dupinterval)
     742                 :             :         {
     743                 :        3961 :                 state->intervals[state->nintervals].nitems = state->nitems;
     744                 :        3961 :                 state->nintervals++;
     745                 :        3961 :         }
     746                 :             : 
     747                 :             :         /* Reset state for next interval */
     748                 :       26687 :         state->nhtids = 0;
     749                 :       26687 :         state->nitems = 0;
     750                 :       26687 :         state->phystupsize = 0;
     751                 :       26687 : }
     752                 :             : 
     753                 :             : /*
     754                 :             :  * Determine if page non-pivot tuples (data items) are all duplicates of the
     755                 :             :  * same value -- if they are, deduplication's "single value" strategy should
     756                 :             :  * be applied.  The general goal of this strategy is to ensure that
     757                 :             :  * nbtsplitloc.c (which uses its own single value strategy) will find a useful
     758                 :             :  * split point as further duplicates are inserted, and successive rightmost
     759                 :             :  * page splits occur among pages that store the same duplicate value.  When
     760                 :             :  * the page finally splits, it should end up BTREE_SINGLEVAL_FILLFACTOR% full,
     761                 :             :  * just like it would if deduplication were disabled.
     762                 :             :  *
     763                 :             :  * We expect that affected workloads will require _several_ single value
     764                 :             :  * strategy deduplication passes (over a page that only stores duplicates)
     765                 :             :  * before the page is finally split.  The first deduplication pass should only
     766                 :             :  * find regular non-pivot tuples.  Later deduplication passes will find
     767                 :             :  * existing maxpostingsize-capped posting list tuples, which must be skipped
     768                 :             :  * over.  The penultimate pass is generally the first pass that actually
     769                 :             :  * reaches _bt_singleval_fillfactor(), and so will deliberately leave behind a
     770                 :             :  * few untouched non-pivot tuples.  The final deduplication pass won't free
     771                 :             :  * any space -- it will skip over everything without merging anything (it
     772                 :             :  * retraces the steps of the penultimate pass).
     773                 :             :  *
     774                 :             :  * Fortunately, having several passes isn't too expensive.  Each pass (after
     775                 :             :  * the first pass) won't spend many cycles on the large posting list tuples
     776                 :             :  * left by previous passes.  Each pass will find a large contiguous group of
     777                 :             :  * smaller duplicate tuples to merge together at the end of the page.
     778                 :             :  */
     779                 :             : static bool
     780                 :        2400 : _bt_do_singleval(Relation rel, Page page, BTDedupState state,
     781                 :             :                                  OffsetNumber minoff, IndexTuple newitem)
     782                 :             : {
     783                 :        2400 :         int                     nkeyatts = IndexRelationGetNumberOfKeyAttributes(rel);
     784                 :        2400 :         ItemId          itemid;
     785                 :        2400 :         IndexTuple      itup;
     786                 :             : 
     787                 :        2400 :         itemid = PageGetItemId(page, minoff);
     788                 :        2400 :         itup = (IndexTuple) PageGetItem(page, itemid);
     789                 :             : 
     790         [ +  + ]:        2400 :         if (_bt_keep_natts_fast(rel, newitem, itup) > nkeyatts)
     791                 :             :         {
     792                 :         290 :                 itemid = PageGetItemId(page, PageGetMaxOffsetNumber(page));
     793                 :         290 :                 itup = (IndexTuple) PageGetItem(page, itemid);
     794                 :             : 
     795         [ +  + ]:         290 :                 if (_bt_keep_natts_fast(rel, newitem, itup) > nkeyatts)
     796                 :         194 :                         return true;
     797                 :          96 :         }
     798                 :             : 
     799                 :        2206 :         return false;
     800                 :        2400 : }
     801                 :             : 
     802                 :             : /*
     803                 :             :  * Lower maxpostingsize when using "single value" strategy, to avoid a sixth
     804                 :             :  * and final maxpostingsize-capped tuple.  The sixth and final posting list
     805                 :             :  * tuple will end up somewhat smaller than the first five.  (Note: The first
     806                 :             :  * five tuples could actually just be very large duplicate tuples that
     807                 :             :  * couldn't be merged together at all.  Deduplication will simply not modify
     808                 :             :  * the page when that happens.)
     809                 :             :  *
     810                 :             :  * When there are six posting lists on the page (after current deduplication
     811                 :             :  * pass goes on to create/observe a sixth very large tuple), caller should end
     812                 :             :  * its deduplication pass.  It isn't useful to try to deduplicate items that
     813                 :             :  * are supposed to end up on the new right sibling page following the
     814                 :             :  * anticipated page split.  A future deduplication pass of future right
     815                 :             :  * sibling page might take care of it.  (This is why the first single value
     816                 :             :  * strategy deduplication pass for a given leaf page will generally find only
     817                 :             :  * plain non-pivot tuples -- see _bt_do_singleval() comments.)
     818                 :             :  */
     819                 :             : static void
     820                 :         102 : _bt_singleval_fillfactor(Page page, BTDedupState state, Size newitemsz)
     821                 :             : {
     822                 :         102 :         Size            leftfree;
     823                 :         102 :         int                     reduction;
     824                 :             : 
     825                 :             :         /* This calculation needs to match nbtsplitloc.c */
     826                 :         102 :         leftfree = PageGetPageSize(page) - SizeOfPageHeaderData -
     827                 :             :                 MAXALIGN(sizeof(BTPageOpaqueData));
     828                 :             :         /* Subtract size of new high key (includes pivot heap TID space) */
     829                 :         102 :         leftfree -= newitemsz + MAXALIGN(sizeof(ItemPointerData));
     830                 :             : 
     831                 :             :         /*
     832                 :             :          * Reduce maxpostingsize by an amount equal to target free space on left
     833                 :             :          * half of page
     834                 :             :          */
     835                 :         102 :         reduction = leftfree * ((100 - BTREE_SINGLEVAL_FILLFACTOR) / 100.0);
     836         [ +  - ]:         102 :         if (state->maxpostingsize > reduction)
     837                 :         102 :                 state->maxpostingsize -= reduction;
     838                 :             :         else
     839                 :           0 :                 state->maxpostingsize = 0;
     840                 :         102 : }
     841                 :             : 
     842                 :             : /*
     843                 :             :  * Build a posting list tuple based on caller's "base" index tuple and list of
     844                 :             :  * heap TIDs.  When nhtids == 1, builds a standard non-pivot tuple without a
     845                 :             :  * posting list. (Posting list tuples can never have a single heap TID, partly
     846                 :             :  * because that ensures that deduplication always reduces final MAXALIGN()'d
     847                 :             :  * size of entire tuple.)
     848                 :             :  *
     849                 :             :  * Convention is that posting list starts at a MAXALIGN()'d offset (rather
     850                 :             :  * than a SHORTALIGN()'d offset), in line with the approach taken when
     851                 :             :  * appending a heap TID to new pivot tuple/high key during suffix truncation.
     852                 :             :  * This sometimes wastes a little space that was only needed as alignment
     853                 :             :  * padding in the original tuple.  Following this convention simplifies the
     854                 :             :  * space accounting used when deduplicating a page (the same convention
     855                 :             :  * simplifies the accounting for choosing a point to split a page at).
     856                 :             :  *
     857                 :             :  * Note: Caller's "htids" array must be unique and already in ascending TID
     858                 :             :  * order.  Any existing heap TIDs from "base" won't automatically appear in
     859                 :             :  * returned posting list tuple (they must be included in htids array.)
     860                 :             :  */
     861                 :             : IndexTuple
     862                 :       33580 : _bt_form_posting(IndexTuple base, const ItemPointerData *htids, int nhtids)
     863                 :             : {
     864                 :       33580 :         uint32          keysize,
     865                 :             :                                 newsize;
     866                 :       33580 :         IndexTuple      itup;
     867                 :             : 
     868         [ +  + ]:       33580 :         if (BTreeTupleIsPosting(base))
     869                 :        7631 :                 keysize = BTreeTupleGetPostingOffset(base);
     870                 :             :         else
     871                 :       25949 :                 keysize = IndexTupleSize(base);
     872                 :             : 
     873         [ +  - ]:       33580 :         Assert(!BTreeTupleIsPivot(base));
     874         [ +  - ]:       33580 :         Assert(nhtids > 0 && nhtids <= PG_UINT16_MAX);
     875         [ +  - ]:       33580 :         Assert(keysize == MAXALIGN(keysize));
     876                 :             : 
     877                 :             :         /* Determine final size of new tuple */
     878         [ +  - ]:       33580 :         if (nhtids > 1)
     879                 :       33580 :                 newsize = MAXALIGN(keysize +
     880                 :             :                                                    nhtids * sizeof(ItemPointerData));
     881                 :             :         else
     882                 :           0 :                 newsize = keysize;
     883                 :             : 
     884         [ +  - ]:       33580 :         Assert(newsize <= INDEX_SIZE_MASK);
     885         [ +  - ]:       33580 :         Assert(newsize == MAXALIGN(newsize));
     886                 :             : 
     887                 :             :         /* Allocate memory using palloc0() (matches index_form_tuple()) */
     888                 :       33580 :         itup = palloc0(newsize);
     889                 :       33580 :         memcpy(itup, base, keysize);
     890                 :       33580 :         itup->t_info &= ~INDEX_SIZE_MASK;
     891                 :       33580 :         itup->t_info |= newsize;
     892         [ +  - ]:       33580 :         if (nhtids > 1)
     893                 :             :         {
     894                 :             :                 /* Form posting list tuple */
     895                 :       33580 :                 BTreeTupleSetPosting(itup, nhtids, keysize);
     896                 :       33580 :                 memcpy(BTreeTupleGetPosting(itup), htids,
     897                 :             :                            sizeof(ItemPointerData) * nhtids);
     898         [ +  - ]:       33580 :                 Assert(_bt_posting_valid(itup));
     899                 :       33580 :         }
     900                 :             :         else
     901                 :             :         {
     902                 :             :                 /* Form standard non-pivot tuple */
     903                 :           0 :                 itup->t_info &= ~INDEX_ALT_TID_MASK;
     904                 :           0 :                 ItemPointerCopy(htids, &itup->t_tid);
     905         [ #  # ]:           0 :                 Assert(ItemPointerIsValid(&itup->t_tid));
     906                 :             :         }
     907                 :             : 
     908                 :       67160 :         return itup;
     909                 :       33580 : }
     910                 :             : 
     911                 :             : /*
     912                 :             :  * Generate a replacement tuple by "updating" a posting list tuple so that it
     913                 :             :  * no longer has TIDs that need to be deleted.
     914                 :             :  *
     915                 :             :  * Used by both VACUUM and index deletion.  Caller's vacposting argument
     916                 :             :  * points to the existing posting list tuple to be updated.
     917                 :             :  *
     918                 :             :  * On return, caller's vacposting argument will point to final "updated"
     919                 :             :  * tuple, which will be palloc()'d in caller's memory context.
     920                 :             :  */
     921                 :             : void
     922                 :        1405 : _bt_update_posting(BTVacuumPosting vacposting)
     923                 :             : {
     924                 :        1405 :         IndexTuple      origtuple = vacposting->itup;
     925                 :        1405 :         uint32          keysize,
     926                 :             :                                 newsize;
     927                 :        1405 :         IndexTuple      itup;
     928                 :        1405 :         int                     nhtids;
     929                 :        1405 :         int                     ui,
     930                 :             :                                 d;
     931                 :        1405 :         ItemPointer htids;
     932                 :             : 
     933                 :        1405 :         nhtids = BTreeTupleGetNPosting(origtuple) - vacposting->ndeletedtids;
     934                 :             : 
     935         [ +  - ]:        1405 :         Assert(_bt_posting_valid(origtuple));
     936         [ +  - ]:        1405 :         Assert(nhtids > 0 && nhtids < BTreeTupleGetNPosting(origtuple));
     937                 :             : 
     938                 :             :         /*
     939                 :             :          * Determine final size of new tuple.
     940                 :             :          *
     941                 :             :          * This calculation needs to match the code used within _bt_form_posting()
     942                 :             :          * for new posting list tuples.  We avoid calling _bt_form_posting() here
     943                 :             :          * to save ourselves a second memory allocation for a htids workspace.
     944                 :             :          */
     945                 :        1405 :         keysize = BTreeTupleGetPostingOffset(origtuple);
     946         [ +  + ]:        1405 :         if (nhtids > 1)
     947                 :         225 :                 newsize = MAXALIGN(keysize +
     948                 :             :                                                    nhtids * sizeof(ItemPointerData));
     949                 :             :         else
     950                 :        1180 :                 newsize = keysize;
     951                 :             : 
     952         [ +  - ]:        1405 :         Assert(newsize <= INDEX_SIZE_MASK);
     953         [ +  - ]:        1405 :         Assert(newsize == MAXALIGN(newsize));
     954                 :             : 
     955                 :             :         /* Allocate memory using palloc0() (matches index_form_tuple()) */
     956                 :        1405 :         itup = palloc0(newsize);
     957                 :        1405 :         memcpy(itup, origtuple, keysize);
     958                 :        1405 :         itup->t_info &= ~INDEX_SIZE_MASK;
     959                 :        1405 :         itup->t_info |= newsize;
     960                 :             : 
     961         [ +  + ]:        1405 :         if (nhtids > 1)
     962                 :             :         {
     963                 :             :                 /* Form posting list tuple */
     964                 :         225 :                 BTreeTupleSetPosting(itup, nhtids, keysize);
     965                 :         225 :                 htids = BTreeTupleGetPosting(itup);
     966                 :         225 :         }
     967                 :             :         else
     968                 :             :         {
     969                 :             :                 /* Form standard non-pivot tuple */
     970                 :        1180 :                 itup->t_info &= ~INDEX_ALT_TID_MASK;
     971                 :        1180 :                 htids = &itup->t_tid;
     972                 :             :         }
     973                 :             : 
     974                 :        1405 :         ui = 0;
     975                 :        1405 :         d = 0;
     976         [ +  + ]:       10509 :         for (int i = 0; i < BTreeTupleGetNPosting(origtuple); i++)
     977                 :             :         {
     978   [ +  +  +  + ]:        9104 :                 if (d < vacposting->ndeletedtids && vacposting->deletetids[d] == i)
     979                 :             :                 {
     980                 :        4555 :                         d++;
     981                 :        4555 :                         continue;
     982                 :             :                 }
     983                 :        4549 :                 htids[ui++] = *BTreeTupleGetPostingN(origtuple, i);
     984                 :        4549 :         }
     985         [ +  - ]:        1405 :         Assert(ui == nhtids);
     986         [ +  - ]:        1405 :         Assert(d == vacposting->ndeletedtids);
     987   [ +  +  +  - ]:        1405 :         Assert(nhtids == 1 || _bt_posting_valid(itup));
     988   [ +  +  +  - ]:        1405 :         Assert(nhtids > 1 || ItemPointerIsValid(&itup->t_tid));
     989                 :             : 
     990                 :             :         /* vacposting arg's itup will now point to updated version */
     991                 :        1405 :         vacposting->itup = itup;
     992                 :        1405 : }
     993                 :             : 
     994                 :             : /*
     995                 :             :  * Prepare for a posting list split by swapping heap TID in newitem with heap
     996                 :             :  * TID from original posting list (the 'oposting' heap TID located at offset
     997                 :             :  * 'postingoff').  Modifies newitem, so caller should pass their own private
     998                 :             :  * copy that can safely be modified.
     999                 :             :  *
    1000                 :             :  * Returns new posting list tuple, which is palloc()'d in caller's context.
    1001                 :             :  * This is guaranteed to be the same size as 'oposting'.  Modified newitem is
    1002                 :             :  * what caller actually inserts. (This happens inside the same critical
    1003                 :             :  * section that performs an in-place update of old posting list using new
    1004                 :             :  * posting list returned here.)
    1005                 :             :  *
    1006                 :             :  * While the keys from newitem and oposting must be opclass equal, and must
    1007                 :             :  * generate identical output when run through the underlying type's output
    1008                 :             :  * function, it doesn't follow that their representations match exactly.
    1009                 :             :  * Caller must avoid assuming that there can't be representational differences
    1010                 :             :  * that make datums from oposting bigger or smaller than the corresponding
    1011                 :             :  * datums from newitem.  For example, differences in TOAST input state might
    1012                 :             :  * break a faulty assumption about tuple size (the executor is entitled to
    1013                 :             :  * apply TOAST compression based on its own criteria).  It also seems possible
    1014                 :             :  * that further representational variation will be introduced in the future,
    1015                 :             :  * in order to support nbtree features like page-level prefix compression.
    1016                 :             :  *
    1017                 :             :  * See nbtree/README for details on the design of posting list splits.
    1018                 :             :  */
    1019                 :             : IndexTuple
    1020                 :        2766 : _bt_swap_posting(IndexTuple newitem, IndexTuple oposting, int postingoff)
    1021                 :             : {
    1022                 :        2766 :         int                     nhtids;
    1023                 :        2766 :         char       *replacepos;
    1024                 :        2766 :         char       *replaceposright;
    1025                 :        2766 :         Size            nmovebytes;
    1026                 :        2766 :         IndexTuple      nposting;
    1027                 :             : 
    1028                 :        2766 :         nhtids = BTreeTupleGetNPosting(oposting);
    1029         [ +  - ]:        2766 :         Assert(_bt_posting_valid(oposting));
    1030                 :             : 
    1031                 :             :         /*
    1032                 :             :          * The postingoff argument originated as a _bt_binsrch_posting() return
    1033                 :             :          * value.  It will be 0 in the event of corruption that makes a leaf page
    1034                 :             :          * contain a non-pivot tuple that's somehow identical to newitem (no two
    1035                 :             :          * non-pivot tuples should ever have the same TID).  This has been known
    1036                 :             :          * to happen in the field from time to time.
    1037                 :             :          *
    1038                 :             :          * Perform a basic sanity check to catch this case now.
    1039                 :             :          */
    1040         [ +  - ]:        2766 :         if (!(postingoff > 0 && postingoff < nhtids))
    1041   [ #  #  #  # ]:           0 :                 elog(ERROR, "posting list tuple with %d items cannot be split at offset %d",
    1042                 :             :                          nhtids, postingoff);
    1043                 :             : 
    1044                 :             :         /*
    1045                 :             :          * Move item pointers in posting list to make a gap for the new item's
    1046                 :             :          * heap TID.  We shift TIDs one place to the right, losing original
    1047                 :             :          * rightmost TID. (nmovebytes must not include TIDs to the left of
    1048                 :             :          * postingoff, nor the existing rightmost/max TID that gets overwritten.)
    1049                 :             :          */
    1050                 :        2766 :         nposting = CopyIndexTuple(oposting);
    1051                 :        2766 :         replacepos = (char *) BTreeTupleGetPostingN(nposting, postingoff);
    1052                 :        2766 :         replaceposright = (char *) BTreeTupleGetPostingN(nposting, postingoff + 1);
    1053                 :        2766 :         nmovebytes = (nhtids - postingoff - 1) * sizeof(ItemPointerData);
    1054                 :        2766 :         memmove(replaceposright, replacepos, nmovebytes);
    1055                 :             : 
    1056                 :             :         /* Fill the gap at postingoff with TID of new item (original new TID) */
    1057         [ +  - ]:        2766 :         Assert(!BTreeTupleIsPivot(newitem) && !BTreeTupleIsPosting(newitem));
    1058                 :        2766 :         ItemPointerCopy(&newitem->t_tid, (ItemPointer) replacepos);
    1059                 :             : 
    1060                 :             :         /* Now copy oposting's rightmost/max TID into new item (final new TID) */
    1061                 :        2766 :         ItemPointerCopy(BTreeTupleGetMaxHeapTID(oposting), &newitem->t_tid);
    1062                 :             : 
    1063         [ +  - ]:        2766 :         Assert(ItemPointerCompare(BTreeTupleGetMaxHeapTID(nposting),
    1064                 :             :                                                           BTreeTupleGetHeapTID(newitem)) < 0);
    1065         [ +  - ]:        2766 :         Assert(_bt_posting_valid(nposting));
    1066                 :             : 
    1067                 :        5532 :         return nposting;
    1068                 :        2766 : }
    1069                 :             : 
    1070                 :             : /*
    1071                 :             :  * Verify posting list invariants for "posting", which must be a posting list
    1072                 :             :  * tuple.  Used within assertions.
    1073                 :             :  */
    1074                 :             : #ifdef USE_ASSERT_CHECKING
    1075                 :             : static bool
    1076                 :       50212 : _bt_posting_valid(IndexTuple posting)
    1077                 :             : {
    1078                 :       50212 :         ItemPointerData last;
    1079                 :       50212 :         ItemPointer htid;
    1080                 :             : 
    1081   [ +  -  -  + ]:       50212 :         if (!BTreeTupleIsPosting(posting) || BTreeTupleGetNPosting(posting) < 2)
    1082                 :           0 :                 return false;
    1083                 :             : 
    1084                 :             :         /* Remember first heap TID for loop */
    1085                 :       50212 :         ItemPointerCopy(BTreeTupleGetHeapTID(posting), &last);
    1086         [ +  - ]:       50212 :         if (!ItemPointerIsValid(&last))
    1087                 :           0 :                 return false;
    1088                 :             : 
    1089                 :             :         /* Iterate, starting from second TID */
    1090   [ +  +  -  + ]:     1426867 :         for (int i = 1; i < BTreeTupleGetNPosting(posting); i++)
    1091                 :             :         {
    1092                 :     1376655 :                 htid = BTreeTupleGetPostingN(posting, i);
    1093                 :             : 
    1094         [ -  + ]:     1376655 :                 if (!ItemPointerIsValid(htid))
    1095                 :           0 :                         return false;
    1096         [ -  + ]:     1376655 :                 if (ItemPointerCompare(htid, &last) <= 0)
    1097                 :           0 :                         return false;
    1098                 :     1376655 :                 ItemPointerCopy(htid, &last);
    1099                 :     1376655 :         }
    1100                 :             : 
    1101                 :       50212 :         return true;
    1102                 :       50212 : }
    1103                 :             : #endif
        

Generated by: LCOV version 2.3.2-1