LCOV - code coverage report
Current view: top level - src/backend/access/hash - hashpage.c (source / functions) Coverage Total Hit
Test: Code coverage Lines: 82.7 % 623 515
Test Date: 2026-01-26 10:56:24 Functions: 94.7 % 19 18
Legend: Lines:     hit not hit
Branches: + taken - not taken # not executed
Branches: 53.6 % 291 156

             Branch data     Line data    Source code
       1                 :             : /*-------------------------------------------------------------------------
       2                 :             :  *
       3                 :             :  * hashpage.c
       4                 :             :  *        Hash table page management code for the Postgres hash access method
       5                 :             :  *
       6                 :             :  * Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group
       7                 :             :  * Portions Copyright (c) 1994, Regents of the University of California
       8                 :             :  *
       9                 :             :  *
      10                 :             :  * IDENTIFICATION
      11                 :             :  *        src/backend/access/hash/hashpage.c
      12                 :             :  *
      13                 :             :  * NOTES
      14                 :             :  *        Postgres hash pages look like ordinary relation pages.  The opaque
      15                 :             :  *        data at high addresses includes information about the page including
      16                 :             :  *        whether a page is an overflow page or a true bucket, the bucket
      17                 :             :  *        number, and the block numbers of the preceding and following pages
      18                 :             :  *        in the same bucket.
      19                 :             :  *
      20                 :             :  *        The first page in a hash relation, page zero, is special -- it stores
      21                 :             :  *        information describing the hash table; it is referred to as the
      22                 :             :  *        "meta page." Pages one and higher store the actual data.
      23                 :             :  *
      24                 :             :  *        There are also bitmap pages, which are not manipulated here;
      25                 :             :  *        see hashovfl.c.
      26                 :             :  *
      27                 :             :  *-------------------------------------------------------------------------
      28                 :             :  */
      29                 :             : #include "postgres.h"
      30                 :             : 
      31                 :             : #include "access/hash.h"
      32                 :             : #include "access/hash_xlog.h"
      33                 :             : #include "access/xloginsert.h"
      34                 :             : #include "miscadmin.h"
      35                 :             : #include "port/pg_bitutils.h"
      36                 :             : #include "storage/predicate.h"
      37                 :             : #include "storage/smgr.h"
      38                 :             : #include "utils/rel.h"
      39                 :             : 
      40                 :             : static bool _hash_alloc_buckets(Relation rel, BlockNumber firstblock,
      41                 :             :                                                                 uint32 nblocks);
      42                 :             : static void _hash_splitbucket(Relation rel, Buffer metabuf,
      43                 :             :                                                           Bucket obucket, Bucket nbucket,
      44                 :             :                                                           Buffer obuf,
      45                 :             :                                                           Buffer nbuf,
      46                 :             :                                                           HTAB *htab,
      47                 :             :                                                           uint32 maxbucket,
      48                 :             :                                                           uint32 highmask, uint32 lowmask);
      49                 :             : static void log_split_page(Relation rel, Buffer buf);
      50                 :             : 
      51                 :             : 
      52                 :             : /*
      53                 :             :  *      _hash_getbuf() -- Get a buffer by block number for read or write.
      54                 :             :  *
      55                 :             :  *              'access' must be HASH_READ, HASH_WRITE, or HASH_NOLOCK.
      56                 :             :  *              'flags' is a bitwise OR of the allowed page types.
      57                 :             :  *
      58                 :             :  *              This must be used only to fetch pages that are expected to be valid
      59                 :             :  *              already.  _hash_checkpage() is applied using the given flags.
      60                 :             :  *
      61                 :             :  *              When this routine returns, the appropriate lock is set on the
      62                 :             :  *              requested buffer and its reference count has been incremented
      63                 :             :  *              (ie, the buffer is "locked and pinned").
      64                 :             :  *
      65                 :             :  *              P_NEW is disallowed because this routine can only be used
      66                 :             :  *              to access pages that are known to be before the filesystem EOF.
      67                 :             :  *              Extending the index should be done with _hash_getnewbuf.
      68                 :             :  */
      69                 :             : Buffer
      70                 :      326355 : _hash_getbuf(Relation rel, BlockNumber blkno, int access, int flags)
      71                 :             : {
      72                 :      326355 :         Buffer          buf;
      73                 :             : 
      74         [ +  - ]:      326355 :         if (blkno == P_NEW)
      75   [ #  #  #  # ]:           0 :                 elog(ERROR, "hash AM does not use P_NEW");
      76                 :             : 
      77                 :      326355 :         buf = ReadBuffer(rel, blkno);
      78                 :             : 
      79         [ +  + ]:      326355 :         if (access != HASH_NOLOCK)
      80                 :      206735 :                 LockBuffer(buf, access);
      81                 :             : 
      82                 :             :         /* ref count and lock type are correct */
      83                 :             : 
      84                 :      326355 :         _hash_checkpage(rel, buf, flags);
      85                 :             : 
      86                 :      652710 :         return buf;
      87                 :      326355 : }
      88                 :             : 
      89                 :             : /*
      90                 :             :  * _hash_getbuf_with_condlock_cleanup() -- Try to get a buffer for cleanup.
      91                 :             :  *
      92                 :             :  *              We read the page and try to acquire a cleanup lock.  If we get it,
      93                 :             :  *              we return the buffer; otherwise, we return InvalidBuffer.
      94                 :             :  */
      95                 :             : Buffer
      96                 :         221 : _hash_getbuf_with_condlock_cleanup(Relation rel, BlockNumber blkno, int flags)
      97                 :             : {
      98                 :         221 :         Buffer          buf;
      99                 :             : 
     100         [ +  - ]:         221 :         if (blkno == P_NEW)
     101   [ #  #  #  # ]:           0 :                 elog(ERROR, "hash AM does not use P_NEW");
     102                 :             : 
     103                 :         221 :         buf = ReadBuffer(rel, blkno);
     104                 :             : 
     105         [ +  - ]:         221 :         if (!ConditionalLockBufferForCleanup(buf))
     106                 :             :         {
     107                 :           0 :                 ReleaseBuffer(buf);
     108                 :           0 :                 return InvalidBuffer;
     109                 :             :         }
     110                 :             : 
     111                 :             :         /* ref count and lock type are correct */
     112                 :             : 
     113                 :         221 :         _hash_checkpage(rel, buf, flags);
     114                 :             : 
     115                 :         221 :         return buf;
     116                 :         221 : }
     117                 :             : 
     118                 :             : /*
     119                 :             :  *      _hash_getinitbuf() -- Get and initialize a buffer by block number.
     120                 :             :  *
     121                 :             :  *              This must be used only to fetch pages that are known to be before
     122                 :             :  *              the index's filesystem EOF, but are to be filled from scratch.
     123                 :             :  *              _hash_pageinit() is applied automatically.  Otherwise it has
     124                 :             :  *              effects similar to _hash_getbuf() with access = HASH_WRITE.
     125                 :             :  *
     126                 :             :  *              When this routine returns, a write lock is set on the
     127                 :             :  *              requested buffer and its reference count has been incremented
     128                 :             :  *              (ie, the buffer is "locked and pinned").
     129                 :             :  *
     130                 :             :  *              P_NEW is disallowed because this routine can only be used
     131                 :             :  *              to access pages that are known to be before the filesystem EOF.
     132                 :             :  *              Extending the index should be done with _hash_getnewbuf.
     133                 :             :  */
     134                 :             : Buffer
     135                 :          12 : _hash_getinitbuf(Relation rel, BlockNumber blkno)
     136                 :             : {
     137                 :          12 :         Buffer          buf;
     138                 :             : 
     139         [ +  - ]:          12 :         if (blkno == P_NEW)
     140   [ #  #  #  # ]:           0 :                 elog(ERROR, "hash AM does not use P_NEW");
     141                 :             : 
     142                 :          12 :         buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_ZERO_AND_LOCK,
     143                 :             :                                                          NULL);
     144                 :             : 
     145                 :             :         /* ref count and lock type are correct */
     146                 :             : 
     147                 :             :         /* initialize the page */
     148                 :          12 :         _hash_pageinit(BufferGetPage(buf), BufferGetPageSize(buf));
     149                 :             : 
     150                 :          24 :         return buf;
     151                 :          12 : }
     152                 :             : 
     153                 :             : /*
     154                 :             :  *      _hash_initbuf() -- Get and initialize a buffer by bucket number.
     155                 :             :  */
     156                 :             : void
     157                 :        1200 : _hash_initbuf(Buffer buf, uint32 max_bucket, uint32 num_bucket, uint32 flag,
     158                 :             :                           bool initpage)
     159                 :             : {
     160                 :        1200 :         HashPageOpaque pageopaque;
     161                 :        1200 :         Page            page;
     162                 :             : 
     163                 :        1200 :         page = BufferGetPage(buf);
     164                 :             : 
     165                 :             :         /* initialize the page */
     166         [ +  - ]:        1200 :         if (initpage)
     167                 :           0 :                 _hash_pageinit(page, BufferGetPageSize(buf));
     168                 :             : 
     169                 :        1200 :         pageopaque = HashPageGetOpaque(page);
     170                 :             : 
     171                 :             :         /*
     172                 :             :          * Set hasho_prevblkno with current hashm_maxbucket. This value will be
     173                 :             :          * used to validate cached HashMetaPageData. See
     174                 :             :          * _hash_getbucketbuf_from_hashkey().
     175                 :             :          */
     176                 :        1200 :         pageopaque->hasho_prevblkno = max_bucket;
     177                 :        1200 :         pageopaque->hasho_nextblkno = InvalidBlockNumber;
     178                 :        1200 :         pageopaque->hasho_bucket = num_bucket;
     179                 :        1200 :         pageopaque->hasho_flag = flag;
     180                 :        1200 :         pageopaque->hasho_page_id = HASHO_PAGE_ID;
     181                 :        1200 : }
     182                 :             : 
     183                 :             : /*
     184                 :             :  *      _hash_getnewbuf() -- Get a new page at the end of the index.
     185                 :             :  *
     186                 :             :  *              This has the same API as _hash_getinitbuf, except that we are adding
     187                 :             :  *              a page to the index, and hence expect the page to be past the
     188                 :             :  *              logical EOF.  (However, we have to support the case where it isn't,
     189                 :             :  *              since a prior try might have crashed after extending the filesystem
     190                 :             :  *              EOF but before updating the metapage to reflect the added page.)
     191                 :             :  *
     192                 :             :  *              It is caller's responsibility to ensure that only one process can
     193                 :             :  *              extend the index at a time.  In practice, this function is called
     194                 :             :  *              only while holding write lock on the metapage, because adding a page
     195                 :             :  *              is always associated with an update of metapage data.
     196                 :             :  */
     197                 :             : Buffer
     198                 :        1536 : _hash_getnewbuf(Relation rel, BlockNumber blkno, ForkNumber forkNum)
     199                 :             : {
     200                 :        1536 :         BlockNumber nblocks = RelationGetNumberOfBlocksInFork(rel, forkNum);
     201                 :        1536 :         Buffer          buf;
     202                 :             : 
     203         [ +  - ]:        1536 :         if (blkno == P_NEW)
     204   [ #  #  #  # ]:           0 :                 elog(ERROR, "hash AM does not use P_NEW");
     205         [ +  - ]:        1536 :         if (blkno > nblocks)
     206   [ #  #  #  # ]:           0 :                 elog(ERROR, "access to noncontiguous page in hash index \"%s\"",
     207                 :             :                          RelationGetRelationName(rel));
     208                 :             : 
     209                 :             :         /* smgr insists we explicitly extend the relation */
     210         [ +  + ]:        1536 :         if (blkno == nblocks)
     211                 :             :         {
     212                 :        1315 :                 buf = ExtendBufferedRel(BMR_REL(rel), forkNum, NULL,
     213                 :             :                                                                 EB_LOCK_FIRST | EB_SKIP_EXTENSION_LOCK);
     214         [ +  - ]:        1315 :                 if (BufferGetBlockNumber(buf) != blkno)
     215   [ #  #  #  # ]:           0 :                         elog(ERROR, "unexpected hash relation size: %u, should be %u",
     216                 :             :                                  BufferGetBlockNumber(buf), blkno);
     217                 :        1315 :         }
     218                 :             :         else
     219                 :             :         {
     220                 :         221 :                 buf = ReadBufferExtended(rel, forkNum, blkno, RBM_ZERO_AND_LOCK,
     221                 :             :                                                                  NULL);
     222                 :             :         }
     223                 :             : 
     224                 :             :         /* ref count and lock type are correct */
     225                 :             : 
     226                 :             :         /* initialize the page */
     227                 :        1536 :         _hash_pageinit(BufferGetPage(buf), BufferGetPageSize(buf));
     228                 :             : 
     229                 :        3072 :         return buf;
     230                 :        1536 : }
     231                 :             : 
     232                 :             : /*
     233                 :             :  *      _hash_getbuf_with_strategy() -- Get a buffer with nondefault strategy.
     234                 :             :  *
     235                 :             :  *              This is identical to _hash_getbuf() but also allows a buffer access
     236                 :             :  *              strategy to be specified.  We use this for VACUUM operations.
     237                 :             :  */
     238                 :             : Buffer
     239                 :         199 : _hash_getbuf_with_strategy(Relation rel, BlockNumber blkno,
     240                 :             :                                                    int access, int flags,
     241                 :             :                                                    BufferAccessStrategy bstrategy)
     242                 :             : {
     243                 :         199 :         Buffer          buf;
     244                 :             : 
     245         [ +  - ]:         199 :         if (blkno == P_NEW)
     246   [ #  #  #  # ]:           0 :                 elog(ERROR, "hash AM does not use P_NEW");
     247                 :             : 
     248                 :         199 :         buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL, bstrategy);
     249                 :             : 
     250         [ -  + ]:         199 :         if (access != HASH_NOLOCK)
     251                 :         199 :                 LockBuffer(buf, access);
     252                 :             : 
     253                 :             :         /* ref count and lock type are correct */
     254                 :             : 
     255                 :         199 :         _hash_checkpage(rel, buf, flags);
     256                 :             : 
     257                 :         398 :         return buf;
     258                 :         199 : }
     259                 :             : 
     260                 :             : /*
     261                 :             :  *      _hash_relbuf() -- release a locked buffer.
     262                 :             :  *
     263                 :             :  * Lock and pin (refcount) are both dropped.
     264                 :             :  */
     265                 :             : void
     266                 :      192564 : _hash_relbuf(Relation rel, Buffer buf)
     267                 :             : {
     268                 :      192564 :         UnlockReleaseBuffer(buf);
     269                 :      192564 : }
     270                 :             : 
     271                 :             : /*
     272                 :             :  *      _hash_dropbuf() -- release an unlocked buffer.
     273                 :             :  *
     274                 :             :  * This is used to unpin a buffer on which we hold no lock.
     275                 :             :  */
     276                 :             : void
     277                 :      135790 : _hash_dropbuf(Relation rel, Buffer buf)
     278                 :             : {
     279                 :      135790 :         ReleaseBuffer(buf);
     280                 :      135790 : }
     281                 :             : 
     282                 :             : /*
     283                 :             :  *      _hash_dropscanbuf() -- release buffers used in scan.
     284                 :             :  *
     285                 :             :  * This routine unpins the buffers used during scan on which we
     286                 :             :  * hold no lock.
     287                 :             :  */
     288                 :             : void
     289                 :         134 : _hash_dropscanbuf(Relation rel, HashScanOpaque so)
     290                 :             : {
     291                 :             :         /* release pin we hold on primary bucket page */
     292   [ +  +  +  + ]:         134 :         if (BufferIsValid(so->hashso_bucket_buf) &&
     293                 :          60 :                 so->hashso_bucket_buf != so->currPos.buf)
     294                 :          21 :                 _hash_dropbuf(rel, so->hashso_bucket_buf);
     295                 :         134 :         so->hashso_bucket_buf = InvalidBuffer;
     296                 :             : 
     297                 :             :         /* release pin we hold on primary bucket page  of bucket being split */
     298   [ -  +  #  # ]:         134 :         if (BufferIsValid(so->hashso_split_bucket_buf) &&
     299                 :           0 :                 so->hashso_split_bucket_buf != so->currPos.buf)
     300                 :           0 :                 _hash_dropbuf(rel, so->hashso_split_bucket_buf);
     301                 :         134 :         so->hashso_split_bucket_buf = InvalidBuffer;
     302                 :             : 
     303                 :             :         /* release any pin we still hold */
     304         [ +  + ]:         134 :         if (BufferIsValid(so->currPos.buf))
     305                 :          39 :                 _hash_dropbuf(rel, so->currPos.buf);
     306                 :         134 :         so->currPos.buf = InvalidBuffer;
     307                 :             : 
     308                 :             :         /* reset split scan */
     309                 :         134 :         so->hashso_buc_populated = false;
     310                 :         134 :         so->hashso_buc_split = false;
     311                 :         134 : }
     312                 :             : 
     313                 :             : 
     314                 :             : /*
     315                 :             :  *      _hash_init() -- Initialize the metadata page of a hash index,
     316                 :             :  *                              the initial buckets, and the initial bitmap page.
     317                 :             :  *
     318                 :             :  * The initial number of buckets is dependent on num_tuples, an estimate
     319                 :             :  * of the number of tuples to be loaded into the index initially.  The
     320                 :             :  * chosen number of buckets is returned.
     321                 :             :  *
     322                 :             :  * We are fairly cavalier about locking here, since we know that no one else
     323                 :             :  * could be accessing this index.  In particular the rule about not holding
     324                 :             :  * multiple buffer locks is ignored.
     325                 :             :  */
     326                 :             : uint32
     327                 :          30 : _hash_init(Relation rel, double num_tuples, ForkNumber forkNum)
     328                 :             : {
     329                 :          30 :         Buffer          metabuf;
     330                 :          30 :         Buffer          buf;
     331                 :          30 :         Buffer          bitmapbuf;
     332                 :          30 :         Page            pg;
     333                 :          30 :         HashMetaPage metap;
     334                 :          30 :         RegProcedure procid;
     335                 :          30 :         int32           data_width;
     336                 :          30 :         int32           item_width;
     337                 :          30 :         int32           ffactor;
     338                 :          30 :         uint32          num_buckets;
     339                 :          30 :         uint32          i;
     340                 :          30 :         bool            use_wal;
     341                 :             : 
     342                 :             :         /* safety check */
     343         [ +  - ]:          30 :         if (RelationGetNumberOfBlocksInFork(rel, forkNum) != 0)
     344   [ #  #  #  # ]:           0 :                 elog(ERROR, "cannot initialize non-empty hash index \"%s\"",
     345                 :             :                          RelationGetRelationName(rel));
     346                 :             : 
     347                 :             :         /*
     348                 :             :          * WAL log creation of pages if the relation is persistent, or this is the
     349                 :             :          * init fork.  Init forks for unlogged relations always need to be WAL
     350                 :             :          * logged.
     351                 :             :          */
     352   [ +  +  +  -  :          30 :         use_wal = RelationNeedsWAL(rel) || forkNum == INIT_FORKNUM;
             +  +  +  - ]
     353                 :             : 
     354                 :             :         /*
     355                 :             :          * Determine the target fill factor (in tuples per bucket) for this index.
     356                 :             :          * The idea is to make the fill factor correspond to pages about as full
     357                 :             :          * as the user-settable fillfactor parameter says.  We can compute it
     358                 :             :          * exactly since the index datatype (i.e. uint32 hash key) is fixed-width.
     359                 :             :          */
     360                 :          30 :         data_width = sizeof(uint32);
     361                 :          30 :         item_width = MAXALIGN(sizeof(IndexTupleData)) + MAXALIGN(data_width) +
     362                 :             :                 sizeof(ItemIdData);             /* include the line pointer */
     363   [ +  -  +  + ]:          30 :         ffactor = HashGetTargetPageUsage(rel) / item_width;
     364                 :             :         /* keep to a sane range */
     365         [ +  - ]:          30 :         if (ffactor < 10)
     366                 :           0 :                 ffactor = 10;
     367                 :             : 
     368                 :          30 :         procid = index_getprocid(rel, 1, HASHSTANDARD_PROC);
     369                 :             : 
     370                 :             :         /*
     371                 :             :          * We initialize the metapage, the first N bucket pages, and the first
     372                 :             :          * bitmap page in sequence, using _hash_getnewbuf to cause smgrextend()
     373                 :             :          * calls to occur.  This ensures that the smgr level has the right idea of
     374                 :             :          * the physical index length.
     375                 :             :          *
     376                 :             :          * Critical section not required, because on error the creation of the
     377                 :             :          * whole relation will be rolled back.
     378                 :             :          */
     379                 :          30 :         metabuf = _hash_getnewbuf(rel, HASH_METAPAGE, forkNum);
     380                 :          30 :         _hash_init_metabuffer(metabuf, num_tuples, procid, ffactor, false);
     381                 :          30 :         MarkBufferDirty(metabuf);
     382                 :             : 
     383                 :          30 :         pg = BufferGetPage(metabuf);
     384                 :          30 :         metap = HashPageGetMeta(pg);
     385                 :             : 
     386                 :             :         /* XLOG stuff */
     387         [ +  + ]:          30 :         if (use_wal)
     388                 :             :         {
     389                 :           1 :                 xl_hash_init_meta_page xlrec;
     390                 :           1 :                 XLogRecPtr      recptr;
     391                 :             : 
     392                 :           1 :                 xlrec.num_tuples = num_tuples;
     393                 :           1 :                 xlrec.procid = metap->hashm_procid;
     394                 :           1 :                 xlrec.ffactor = metap->hashm_ffactor;
     395                 :             : 
     396                 :           1 :                 XLogBeginInsert();
     397                 :           1 :                 XLogRegisterData(&xlrec, SizeOfHashInitMetaPage);
     398                 :           1 :                 XLogRegisterBuffer(0, metabuf, REGBUF_WILL_INIT | REGBUF_STANDARD);
     399                 :             : 
     400                 :           1 :                 recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_INIT_META_PAGE);
     401                 :             : 
     402                 :           1 :                 PageSetLSN(BufferGetPage(metabuf), recptr);
     403                 :           1 :         }
     404                 :             : 
     405                 :          30 :         num_buckets = metap->hashm_maxbucket + 1;
     406                 :             : 
     407                 :             :         /*
     408                 :             :          * Release buffer lock on the metapage while we initialize buckets.
     409                 :             :          * Otherwise, we'll be in interrupt holdoff and the CHECK_FOR_INTERRUPTS
     410                 :             :          * won't accomplish anything.  It's a bad idea to hold buffer locks for
     411                 :             :          * long intervals in any case, since that can block the bgwriter.
     412                 :             :          */
     413                 :          30 :         LockBuffer(metabuf, BUFFER_LOCK_UNLOCK);
     414                 :             : 
     415                 :             :         /*
     416                 :             :          * Initialize and WAL Log the first N buckets
     417                 :             :          */
     418         [ +  + ]:        1230 :         for (i = 0; i < num_buckets; i++)
     419                 :             :         {
     420                 :        1200 :                 BlockNumber blkno;
     421                 :             : 
     422                 :             :                 /* Allow interrupts, in case N is huge */
     423         [ +  - ]:        1200 :                 CHECK_FOR_INTERRUPTS();
     424                 :             : 
     425         [ +  + ]:        1200 :                 blkno = BUCKET_TO_BLKNO(metap, i);
     426                 :        1200 :                 buf = _hash_getnewbuf(rel, blkno, forkNum);
     427                 :        1200 :                 _hash_initbuf(buf, metap->hashm_maxbucket, i, LH_BUCKET_PAGE, false);
     428                 :        1200 :                 MarkBufferDirty(buf);
     429                 :             : 
     430         [ +  + ]:        1200 :                 if (use_wal)
     431                 :           4 :                         log_newpage(&rel->rd_locator,
     432                 :           2 :                                                 forkNum,
     433                 :           2 :                                                 blkno,
     434                 :           2 :                                                 BufferGetPage(buf),
     435                 :             :                                                 true);
     436                 :        1200 :                 _hash_relbuf(rel, buf);
     437                 :        1200 :         }
     438                 :             : 
     439                 :             :         /* Now reacquire buffer lock on metapage */
     440                 :          30 :         LockBuffer(metabuf, BUFFER_LOCK_EXCLUSIVE);
     441                 :             : 
     442                 :             :         /*
     443                 :             :          * Initialize bitmap page
     444                 :             :          */
     445                 :          30 :         bitmapbuf = _hash_getnewbuf(rel, num_buckets + 1, forkNum);
     446                 :          30 :         _hash_initbitmapbuffer(bitmapbuf, metap->hashm_bmsize, false);
     447                 :          30 :         MarkBufferDirty(bitmapbuf);
     448                 :             : 
     449                 :             :         /* add the new bitmap page to the metapage's list of bitmaps */
     450                 :             :         /* metapage already has a write lock */
     451         [ +  - ]:          30 :         if (metap->hashm_nmaps >= HASH_MAX_BITMAPS)
     452   [ #  #  #  # ]:           0 :                 ereport(ERROR,
     453                 :             :                                 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
     454                 :             :                                  errmsg("out of overflow pages in hash index \"%s\"",
     455                 :             :                                                 RelationGetRelationName(rel))));
     456                 :             : 
     457                 :          30 :         metap->hashm_mapp[metap->hashm_nmaps] = num_buckets + 1;
     458                 :             : 
     459                 :          30 :         metap->hashm_nmaps++;
     460                 :          30 :         MarkBufferDirty(metabuf);
     461                 :             : 
     462                 :             :         /* XLOG stuff */
     463         [ +  + ]:          30 :         if (use_wal)
     464                 :             :         {
     465                 :           1 :                 xl_hash_init_bitmap_page xlrec;
     466                 :           1 :                 XLogRecPtr      recptr;
     467                 :             : 
     468                 :           1 :                 xlrec.bmsize = metap->hashm_bmsize;
     469                 :             : 
     470                 :           1 :                 XLogBeginInsert();
     471                 :           1 :                 XLogRegisterData(&xlrec, SizeOfHashInitBitmapPage);
     472                 :           1 :                 XLogRegisterBuffer(0, bitmapbuf, REGBUF_WILL_INIT);
     473                 :             : 
     474                 :             :                 /*
     475                 :             :                  * This is safe only because nobody else can be modifying the index at
     476                 :             :                  * this stage; it's only visible to the transaction that is creating
     477                 :             :                  * it.
     478                 :             :                  */
     479                 :           1 :                 XLogRegisterBuffer(1, metabuf, REGBUF_STANDARD);
     480                 :             : 
     481                 :           1 :                 recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_INIT_BITMAP_PAGE);
     482                 :             : 
     483                 :           1 :                 PageSetLSN(BufferGetPage(bitmapbuf), recptr);
     484                 :           1 :                 PageSetLSN(BufferGetPage(metabuf), recptr);
     485                 :           1 :         }
     486                 :             : 
     487                 :             :         /* all done */
     488                 :          30 :         _hash_relbuf(rel, bitmapbuf);
     489                 :          30 :         _hash_relbuf(rel, metabuf);
     490                 :             : 
     491                 :          60 :         return num_buckets;
     492                 :          30 : }
     493                 :             : 
     494                 :             : /*
     495                 :             :  *      _hash_init_metabuffer() -- Initialize the metadata page of a hash index.
     496                 :             :  */
     497                 :             : void
     498                 :          30 : _hash_init_metabuffer(Buffer buf, double num_tuples, RegProcedure procid,
     499                 :             :                                           uint16 ffactor, bool initpage)
     500                 :             : {
     501                 :          30 :         HashMetaPage metap;
     502                 :          30 :         HashPageOpaque pageopaque;
     503                 :          30 :         Page            page;
     504                 :          30 :         double          dnumbuckets;
     505                 :          30 :         uint32          num_buckets;
     506                 :          30 :         uint32          spare_index;
     507                 :          30 :         uint32          lshift;
     508                 :             : 
     509                 :             :         /*
     510                 :             :          * Choose the number of initial bucket pages to match the fill factor
     511                 :             :          * given the estimated number of tuples.  We round up the result to the
     512                 :             :          * total number of buckets which has to be allocated before using its
     513                 :             :          * hashm_spares element. However always force at least 2 bucket pages. The
     514                 :             :          * upper limit is determined by considerations explained in
     515                 :             :          * _hash_expandtable().
     516                 :             :          */
     517                 :          30 :         dnumbuckets = num_tuples / ffactor;
     518         [ +  + ]:          30 :         if (dnumbuckets <= 2.0)
     519                 :           7 :                 num_buckets = 2;
     520         [ -  + ]:          23 :         else if (dnumbuckets >= (double) 0x40000000)
     521                 :           0 :                 num_buckets = 0x40000000;
     522                 :             :         else
     523                 :          23 :                 num_buckets = _hash_get_totalbuckets(_hash_spareindex(dnumbuckets));
     524                 :             : 
     525                 :          30 :         spare_index = _hash_spareindex(num_buckets);
     526         [ +  - ]:          30 :         Assert(spare_index < HASH_MAX_SPLITPOINTS);
     527                 :             : 
     528                 :          30 :         page = BufferGetPage(buf);
     529         [ +  - ]:          30 :         if (initpage)
     530                 :           0 :                 _hash_pageinit(page, BufferGetPageSize(buf));
     531                 :             : 
     532                 :          30 :         pageopaque = HashPageGetOpaque(page);
     533                 :          30 :         pageopaque->hasho_prevblkno = InvalidBlockNumber;
     534                 :          30 :         pageopaque->hasho_nextblkno = InvalidBlockNumber;
     535                 :          30 :         pageopaque->hasho_bucket = InvalidBucket;
     536                 :          30 :         pageopaque->hasho_flag = LH_META_PAGE;
     537                 :          30 :         pageopaque->hasho_page_id = HASHO_PAGE_ID;
     538                 :             : 
     539                 :          30 :         metap = HashPageGetMeta(page);
     540                 :             : 
     541                 :          30 :         metap->hashm_magic = HASH_MAGIC;
     542                 :          30 :         metap->hashm_version = HASH_VERSION;
     543                 :          30 :         metap->hashm_ntuples = 0;
     544                 :          30 :         metap->hashm_nmaps = 0;
     545                 :          30 :         metap->hashm_ffactor = ffactor;
     546                 :          30 :         metap->hashm_bsize = HashGetMaxBitmapSize(page);
     547                 :             : 
     548                 :             :         /* find largest bitmap array size that will fit in page size */
     549                 :          30 :         lshift = pg_leftmost_one_pos32(metap->hashm_bsize);
     550         [ +  - ]:          30 :         Assert(lshift > 0);
     551                 :          30 :         metap->hashm_bmsize = 1 << lshift;
     552                 :          30 :         metap->hashm_bmshift = lshift + BYTE_TO_BIT;
     553         [ +  - ]:          30 :         Assert((1 << BMPG_SHIFT(metap)) == (BMPG_MASK(metap) + 1));
     554                 :             : 
     555                 :             :         /*
     556                 :             :          * Label the index with its primary hash support function's OID.  This is
     557                 :             :          * pretty useless for normal operation (in fact, hashm_procid is not used
     558                 :             :          * anywhere), but it might be handy for forensic purposes so we keep it.
     559                 :             :          */
     560                 :          30 :         metap->hashm_procid = procid;
     561                 :             : 
     562                 :             :         /*
     563                 :             :          * We initialize the index with N buckets, 0 .. N-1, occupying physical
     564                 :             :          * blocks 1 to N.  The first freespace bitmap page is in block N+1.
     565                 :             :          */
     566                 :          30 :         metap->hashm_maxbucket = num_buckets - 1;
     567                 :             : 
     568                 :             :         /*
     569                 :             :          * Set highmask as next immediate ((2 ^ x) - 1), which should be
     570                 :             :          * sufficient to cover num_buckets.
     571                 :             :          */
     572                 :          30 :         metap->hashm_highmask = pg_nextpower2_32(num_buckets + 1) - 1;
     573                 :          30 :         metap->hashm_lowmask = (metap->hashm_highmask >> 1);
     574                 :             : 
     575   [ -  +  #  #  :          30 :         MemSet(metap->hashm_spares, 0, sizeof(metap->hashm_spares));
          #  #  #  #  #  
                      # ]
     576   [ -  +  #  #  :          30 :         MemSet(metap->hashm_mapp, 0, sizeof(metap->hashm_mapp));
          #  #  #  #  #  
                      # ]
     577                 :             : 
     578                 :             :         /* Set up mapping for one spare page after the initial splitpoints */
     579                 :          30 :         metap->hashm_spares[spare_index] = 1;
     580                 :          30 :         metap->hashm_ovflpoint = spare_index;
     581                 :          30 :         metap->hashm_firstfree = 0;
     582                 :             : 
     583                 :             :         /*
     584                 :             :          * Set pd_lower just past the end of the metadata.  This is essential,
     585                 :             :          * because without doing so, metadata will be lost if xlog.c compresses
     586                 :             :          * the page.
     587                 :             :          */
     588                 :          30 :         ((PageHeader) page)->pd_lower =
     589                 :          30 :                 ((char *) metap + sizeof(HashMetaPageData)) - (char *) page;
     590                 :          30 : }
     591                 :             : 
     592                 :             : /*
     593                 :             :  *      _hash_pageinit() -- Initialize a new hash index page.
     594                 :             :  */
     595                 :             : void
     596                 :        1588 : _hash_pageinit(Page page, Size size)
     597                 :             : {
     598                 :        1588 :         PageInit(page, size, sizeof(HashPageOpaqueData));
     599                 :        1588 : }
     600                 :             : 
     601                 :             : /*
     602                 :             :  * Attempt to expand the hash table by creating one new bucket.
     603                 :             :  *
     604                 :             :  * This will silently do nothing if we don't get cleanup lock on old or
     605                 :             :  * new bucket.
     606                 :             :  *
     607                 :             :  * Complete the pending splits and remove the tuples from old bucket,
     608                 :             :  * if there are any left over from the previous split.
     609                 :             :  *
     610                 :             :  * The caller must hold a pin, but no lock, on the metapage buffer.
     611                 :             :  * The buffer is returned in the same state.
     612                 :             :  */
     613                 :             : void
     614                 :         221 : _hash_expandtable(Relation rel, Buffer metabuf)
     615                 :             : {
     616                 :         221 :         HashMetaPage metap;
     617                 :         221 :         Bucket          old_bucket;
     618                 :         221 :         Bucket          new_bucket;
     619                 :         221 :         uint32          spare_ndx;
     620                 :         221 :         BlockNumber start_oblkno;
     621                 :         221 :         BlockNumber start_nblkno;
     622                 :         221 :         Buffer          buf_nblkno;
     623                 :         221 :         Buffer          buf_oblkno;
     624                 :         221 :         Page            opage;
     625                 :         221 :         Page            npage;
     626                 :         221 :         HashPageOpaque oopaque;
     627                 :         221 :         HashPageOpaque nopaque;
     628                 :         221 :         uint32          maxbucket;
     629                 :         221 :         uint32          highmask;
     630                 :         221 :         uint32          lowmask;
     631                 :         221 :         bool            metap_update_masks = false;
     632                 :         221 :         bool            metap_update_splitpoint = false;
     633                 :             : 
     634                 :             : restart_expand:
     635                 :             : 
     636                 :             :         /*
     637                 :             :          * Write-lock the meta page.  It used to be necessary to acquire a
     638                 :             :          * heavyweight lock to begin a split, but that is no longer required.
     639                 :             :          */
     640                 :         221 :         LockBuffer(metabuf, BUFFER_LOCK_EXCLUSIVE);
     641                 :             : 
     642                 :         221 :         _hash_checkpage(rel, metabuf, LH_META_PAGE);
     643                 :         221 :         metap = HashPageGetMeta(BufferGetPage(metabuf));
     644                 :             : 
     645                 :             :         /*
     646                 :             :          * Check to see if split is still needed; someone else might have already
     647                 :             :          * done one while we waited for the lock.
     648                 :             :          *
     649                 :             :          * Make sure this stays in sync with _hash_doinsert()
     650                 :             :          */
     651   [ +  -  +  - ]:         442 :         if (metap->hashm_ntuples <=
     652                 :         221 :                 (double) metap->hashm_ffactor * (metap->hashm_maxbucket + 1))
     653                 :           0 :                 goto fail;
     654                 :             : 
     655                 :             :         /*
     656                 :             :          * Can't split anymore if maxbucket has reached its maximum possible
     657                 :             :          * value.
     658                 :             :          *
     659                 :             :          * Ideally we'd allow bucket numbers up to UINT_MAX-1 (no higher because
     660                 :             :          * the calculation maxbucket+1 mustn't overflow).  Currently we restrict
     661                 :             :          * to half that to prevent failure of pg_ceil_log2_32() and insufficient
     662                 :             :          * space in hashm_spares[].  It's moot anyway because an index with 2^32
     663                 :             :          * buckets would certainly overflow BlockNumber and hence
     664                 :             :          * _hash_alloc_buckets() would fail, but if we supported buckets smaller
     665                 :             :          * than a disk block then this would be an independent constraint.
     666                 :             :          *
     667                 :             :          * If you change this, see also the maximum initial number of buckets in
     668                 :             :          * _hash_init().
     669                 :             :          */
     670         [ +  - ]:         221 :         if (metap->hashm_maxbucket >= (uint32) 0x7FFFFFFE)
     671                 :           0 :                 goto fail;
     672                 :             : 
     673                 :             :         /*
     674                 :             :          * Determine which bucket is to be split, and attempt to take cleanup lock
     675                 :             :          * on the old bucket.  If we can't get the lock, give up.
     676                 :             :          *
     677                 :             :          * The cleanup lock protects us not only against other backends, but
     678                 :             :          * against our own backend as well.
     679                 :             :          *
     680                 :             :          * The cleanup lock is mainly to protect the split from concurrent
     681                 :             :          * inserts. See src/backend/access/hash/README, Lock Definitions for
     682                 :             :          * further details.  Due to this locking restriction, if there is any
     683                 :             :          * pending scan, the split will give up which is not good, but harmless.
     684                 :             :          */
     685                 :         221 :         new_bucket = metap->hashm_maxbucket + 1;
     686                 :             : 
     687                 :         221 :         old_bucket = (new_bucket & metap->hashm_lowmask);
     688                 :             : 
     689         [ +  + ]:         221 :         start_oblkno = BUCKET_TO_BLKNO(metap, old_bucket);
     690                 :             : 
     691                 :         221 :         buf_oblkno = _hash_getbuf_with_condlock_cleanup(rel, start_oblkno, LH_BUCKET_PAGE);
     692         [ -  + ]:         221 :         if (!buf_oblkno)
     693                 :           0 :                 goto fail;
     694                 :             : 
     695                 :         221 :         opage = BufferGetPage(buf_oblkno);
     696                 :         221 :         oopaque = HashPageGetOpaque(opage);
     697                 :             : 
     698                 :             :         /*
     699                 :             :          * We want to finish the split from a bucket as there is no apparent
     700                 :             :          * benefit by not doing so and it will make the code complicated to finish
     701                 :             :          * the split that involves multiple buckets considering the case where new
     702                 :             :          * split also fails.  We don't need to consider the new bucket for
     703                 :             :          * completing the split here as it is not possible that a re-split of new
     704                 :             :          * bucket starts when there is still a pending split from old bucket.
     705                 :             :          */
     706         [ -  + ]:         221 :         if (H_BUCKET_BEING_SPLIT(oopaque))
     707                 :             :         {
     708                 :             :                 /*
     709                 :             :                  * Copy bucket mapping info now; refer the comment in code below where
     710                 :             :                  * we copy this information before calling _hash_splitbucket to see
     711                 :             :                  * why this is okay.
     712                 :             :                  */
     713                 :           0 :                 maxbucket = metap->hashm_maxbucket;
     714                 :           0 :                 highmask = metap->hashm_highmask;
     715                 :           0 :                 lowmask = metap->hashm_lowmask;
     716                 :             : 
     717                 :             :                 /*
     718                 :             :                  * Release the lock on metapage and old_bucket, before completing the
     719                 :             :                  * split.
     720                 :             :                  */
     721                 :           0 :                 LockBuffer(metabuf, BUFFER_LOCK_UNLOCK);
     722                 :           0 :                 LockBuffer(buf_oblkno, BUFFER_LOCK_UNLOCK);
     723                 :             : 
     724                 :           0 :                 _hash_finish_split(rel, metabuf, buf_oblkno, old_bucket, maxbucket,
     725                 :           0 :                                                    highmask, lowmask);
     726                 :             : 
     727                 :             :                 /* release the pin on old buffer and retry for expand. */
     728                 :           0 :                 _hash_dropbuf(rel, buf_oblkno);
     729                 :             : 
     730                 :           0 :                 goto restart_expand;
     731                 :             :         }
     732                 :             : 
     733                 :             :         /*
     734                 :             :          * Clean the tuples remained from the previous split.  This operation
     735                 :             :          * requires cleanup lock and we already have one on the old bucket, so
     736                 :             :          * let's do it. We also don't want to allow further splits from the bucket
     737                 :             :          * till the garbage of previous split is cleaned.  This has two
     738                 :             :          * advantages; first, it helps in avoiding the bloat due to garbage and
     739                 :             :          * second is, during cleanup of bucket, we are always sure that the
     740                 :             :          * garbage tuples belong to most recently split bucket.  On the contrary,
     741                 :             :          * if we allow cleanup of bucket after meta page is updated to indicate
     742                 :             :          * the new split and before the actual split, the cleanup operation won't
     743                 :             :          * be able to decide whether the tuple has been moved to the newly created
     744                 :             :          * bucket and ended up deleting such tuples.
     745                 :             :          */
     746         [ -  + ]:         221 :         if (H_NEEDS_SPLIT_CLEANUP(oopaque))
     747                 :             :         {
     748                 :             :                 /*
     749                 :             :                  * Copy bucket mapping info now; refer to the comment in code below
     750                 :             :                  * where we copy this information before calling _hash_splitbucket to
     751                 :             :                  * see why this is okay.
     752                 :             :                  */
     753                 :           0 :                 maxbucket = metap->hashm_maxbucket;
     754                 :           0 :                 highmask = metap->hashm_highmask;
     755                 :           0 :                 lowmask = metap->hashm_lowmask;
     756                 :             : 
     757                 :             :                 /* Release the metapage lock. */
     758                 :           0 :                 LockBuffer(metabuf, BUFFER_LOCK_UNLOCK);
     759                 :             : 
     760                 :           0 :                 hashbucketcleanup(rel, old_bucket, buf_oblkno, start_oblkno, NULL,
     761                 :           0 :                                                   maxbucket, highmask, lowmask, NULL, NULL, true,
     762                 :             :                                                   NULL, NULL);
     763                 :             : 
     764                 :           0 :                 _hash_dropbuf(rel, buf_oblkno);
     765                 :             : 
     766                 :           0 :                 goto restart_expand;
     767                 :             :         }
     768                 :             : 
     769                 :             :         /*
     770                 :             :          * There shouldn't be any active scan on new bucket.
     771                 :             :          *
     772                 :             :          * Note: it is safe to compute the new bucket's blkno here, even though we
     773                 :             :          * may still need to update the BUCKET_TO_BLKNO mapping.  This is because
     774                 :             :          * the current value of hashm_spares[hashm_ovflpoint] correctly shows
     775                 :             :          * where we are going to put a new splitpoint's worth of buckets.
     776                 :             :          */
     777         [ +  - ]:         221 :         start_nblkno = BUCKET_TO_BLKNO(metap, new_bucket);
     778                 :             : 
     779                 :             :         /*
     780                 :             :          * If the split point is increasing we need to allocate a new batch of
     781                 :             :          * bucket pages.
     782                 :             :          */
     783                 :         221 :         spare_ndx = _hash_spareindex(new_bucket + 1);
     784         [ +  + ]:         221 :         if (spare_ndx > metap->hashm_ovflpoint)
     785                 :             :         {
     786                 :           9 :                 uint32          buckets_to_add;
     787                 :             : 
     788         [ +  - ]:           9 :                 Assert(spare_ndx == metap->hashm_ovflpoint + 1);
     789                 :             : 
     790                 :             :                 /*
     791                 :             :                  * We treat allocation of buckets as a separate WAL-logged action.
     792                 :             :                  * Even if we fail after this operation, won't leak bucket pages;
     793                 :             :                  * rather, the next split will consume this space. In any case, even
     794                 :             :                  * without failure we don't use all the space in one split operation.
     795                 :             :                  */
     796                 :           9 :                 buckets_to_add = _hash_get_totalbuckets(spare_ndx) - new_bucket;
     797         [ +  - ]:           9 :                 if (!_hash_alloc_buckets(rel, start_nblkno, buckets_to_add))
     798                 :             :                 {
     799                 :             :                         /* can't split due to BlockNumber overflow */
     800                 :           0 :                         _hash_relbuf(rel, buf_oblkno);
     801                 :           0 :                         goto fail;
     802                 :             :                 }
     803      [ +  -  - ]:           9 :         }
     804                 :             : 
     805                 :             :         /*
     806                 :             :          * Physically allocate the new bucket's primary page.  We want to do this
     807                 :             :          * before changing the metapage's mapping info, in case we can't get the
     808                 :             :          * disk space.
     809                 :             :          *
     810                 :             :          * XXX It doesn't make sense to call _hash_getnewbuf first, zeroing the
     811                 :             :          * buffer, and then only afterwards check whether we have a cleanup lock.
     812                 :             :          * However, since no scan can be accessing the buffer yet, any concurrent
     813                 :             :          * accesses will just be from processes like the bgwriter or checkpointer
     814                 :             :          * which don't care about its contents, so it doesn't really matter.
     815                 :             :          */
     816                 :         221 :         buf_nblkno = _hash_getnewbuf(rel, start_nblkno, MAIN_FORKNUM);
     817         [ +  - ]:         221 :         if (!IsBufferCleanupOK(buf_nblkno))
     818                 :             :         {
     819                 :           0 :                 _hash_relbuf(rel, buf_oblkno);
     820                 :           0 :                 _hash_relbuf(rel, buf_nblkno);
     821                 :           0 :                 goto fail;
     822                 :             :         }
     823                 :             : 
     824                 :             :         /*
     825                 :             :          * Since we are scribbling on the pages in the shared buffers, establish a
     826                 :             :          * critical section.  Any failure in this next code leaves us with a big
     827                 :             :          * problem: the metapage is effectively corrupt but could get written back
     828                 :             :          * to disk.
     829                 :             :          */
     830                 :         221 :         START_CRIT_SECTION();
     831                 :             : 
     832                 :             :         /*
     833                 :             :          * Okay to proceed with split.  Update the metapage bucket mapping info.
     834                 :             :          */
     835                 :         221 :         metap->hashm_maxbucket = new_bucket;
     836                 :             : 
     837         [ +  + ]:         221 :         if (new_bucket > metap->hashm_highmask)
     838                 :             :         {
     839                 :             :                 /* Starting a new doubling */
     840                 :           3 :                 metap->hashm_lowmask = metap->hashm_highmask;
     841                 :           3 :                 metap->hashm_highmask = new_bucket | metap->hashm_lowmask;
     842                 :           3 :                 metap_update_masks = true;
     843                 :           3 :         }
     844                 :             : 
     845                 :             :         /*
     846                 :             :          * If the split point is increasing we need to adjust the hashm_spares[]
     847                 :             :          * array and hashm_ovflpoint so that future overflow pages will be created
     848                 :             :          * beyond this new batch of bucket pages.
     849                 :             :          */
     850         [ +  + ]:         221 :         if (spare_ndx > metap->hashm_ovflpoint)
     851                 :             :         {
     852                 :           9 :                 metap->hashm_spares[spare_ndx] = metap->hashm_spares[metap->hashm_ovflpoint];
     853                 :           9 :                 metap->hashm_ovflpoint = spare_ndx;
     854                 :           9 :                 metap_update_splitpoint = true;
     855                 :           9 :         }
     856                 :             : 
     857                 :         221 :         MarkBufferDirty(metabuf);
     858                 :             : 
     859                 :             :         /*
     860                 :             :          * Copy bucket mapping info now; this saves re-accessing the meta page
     861                 :             :          * inside _hash_splitbucket's inner loop.  Note that once we drop the
     862                 :             :          * split lock, other splits could begin, so these values might be out of
     863                 :             :          * date before _hash_splitbucket finishes.  That's okay, since all it
     864                 :             :          * needs is to tell which of these two buckets to map hashkeys into.
     865                 :             :          */
     866                 :         221 :         maxbucket = metap->hashm_maxbucket;
     867                 :         221 :         highmask = metap->hashm_highmask;
     868                 :         221 :         lowmask = metap->hashm_lowmask;
     869                 :             : 
     870                 :         221 :         opage = BufferGetPage(buf_oblkno);
     871                 :         221 :         oopaque = HashPageGetOpaque(opage);
     872                 :             : 
     873                 :             :         /*
     874                 :             :          * Mark the old bucket to indicate that split is in progress.  (At
     875                 :             :          * operation end, we will clear the split-in-progress flag.)  Also, for a
     876                 :             :          * primary bucket page, hasho_prevblkno stores the number of buckets that
     877                 :             :          * existed as of the last split, so we must update that value here.
     878                 :             :          */
     879                 :         221 :         oopaque->hasho_flag |= LH_BUCKET_BEING_SPLIT;
     880                 :         221 :         oopaque->hasho_prevblkno = maxbucket;
     881                 :             : 
     882                 :         221 :         MarkBufferDirty(buf_oblkno);
     883                 :             : 
     884                 :         221 :         npage = BufferGetPage(buf_nblkno);
     885                 :             : 
     886                 :             :         /*
     887                 :             :          * initialize the new bucket's primary page and mark it to indicate that
     888                 :             :          * split is in progress.
     889                 :             :          */
     890                 :         221 :         nopaque = HashPageGetOpaque(npage);
     891                 :         221 :         nopaque->hasho_prevblkno = maxbucket;
     892                 :         221 :         nopaque->hasho_nextblkno = InvalidBlockNumber;
     893                 :         221 :         nopaque->hasho_bucket = new_bucket;
     894                 :         221 :         nopaque->hasho_flag = LH_BUCKET_PAGE | LH_BUCKET_BEING_POPULATED;
     895                 :         221 :         nopaque->hasho_page_id = HASHO_PAGE_ID;
     896                 :             : 
     897                 :         221 :         MarkBufferDirty(buf_nblkno);
     898                 :             : 
     899                 :             :         /* XLOG stuff */
     900   [ +  -  +  -  :         221 :         if (RelationNeedsWAL(rel))
             +  +  +  + ]
     901                 :             :         {
     902                 :          95 :                 xl_hash_split_allocate_page xlrec;
     903                 :          95 :                 XLogRecPtr      recptr;
     904                 :             : 
     905                 :          95 :                 xlrec.new_bucket = maxbucket;
     906                 :          95 :                 xlrec.old_bucket_flag = oopaque->hasho_flag;
     907                 :          95 :                 xlrec.new_bucket_flag = nopaque->hasho_flag;
     908                 :          95 :                 xlrec.flags = 0;
     909                 :             : 
     910                 :          95 :                 XLogBeginInsert();
     911                 :             : 
     912                 :          95 :                 XLogRegisterBuffer(0, buf_oblkno, REGBUF_STANDARD);
     913                 :          95 :                 XLogRegisterBuffer(1, buf_nblkno, REGBUF_WILL_INIT);
     914                 :          95 :                 XLogRegisterBuffer(2, metabuf, REGBUF_STANDARD);
     915                 :             : 
     916         [ +  + ]:          95 :                 if (metap_update_masks)
     917                 :             :                 {
     918                 :           3 :                         xlrec.flags |= XLH_SPLIT_META_UPDATE_MASKS;
     919                 :           3 :                         XLogRegisterBufData(2, &metap->hashm_lowmask, sizeof(uint32));
     920                 :           3 :                         XLogRegisterBufData(2, &metap->hashm_highmask, sizeof(uint32));
     921                 :           3 :                 }
     922                 :             : 
     923         [ +  + ]:          95 :                 if (metap_update_splitpoint)
     924                 :             :                 {
     925                 :           5 :                         xlrec.flags |= XLH_SPLIT_META_UPDATE_SPLITPOINT;
     926                 :           5 :                         XLogRegisterBufData(2, &metap->hashm_ovflpoint,
     927                 :             :                                                                 sizeof(uint32));
     928                 :           5 :                         XLogRegisterBufData(2,
     929                 :           5 :                                                                 &metap->hashm_spares[metap->hashm_ovflpoint],
     930                 :             :                                                                 sizeof(uint32));
     931                 :           5 :                 }
     932                 :             : 
     933                 :          95 :                 XLogRegisterData(&xlrec, SizeOfHashSplitAllocPage);
     934                 :             : 
     935                 :          95 :                 recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_SPLIT_ALLOCATE_PAGE);
     936                 :             : 
     937                 :          95 :                 PageSetLSN(BufferGetPage(buf_oblkno), recptr);
     938                 :          95 :                 PageSetLSN(BufferGetPage(buf_nblkno), recptr);
     939                 :          95 :                 PageSetLSN(BufferGetPage(metabuf), recptr);
     940                 :          95 :         }
     941                 :             : 
     942         [ +  - ]:         221 :         END_CRIT_SECTION();
     943                 :             : 
     944                 :             :         /* drop lock, but keep pin */
     945                 :         221 :         LockBuffer(metabuf, BUFFER_LOCK_UNLOCK);
     946                 :             : 
     947                 :             :         /* Relocate records to the new bucket */
     948                 :         442 :         _hash_splitbucket(rel, metabuf,
     949                 :         221 :                                           old_bucket, new_bucket,
     950                 :         221 :                                           buf_oblkno, buf_nblkno, NULL,
     951                 :         221 :                                           maxbucket, highmask, lowmask);
     952                 :             : 
     953                 :             :         /* all done, now release the pins on primary buckets. */
     954                 :         221 :         _hash_dropbuf(rel, buf_oblkno);
     955                 :         221 :         _hash_dropbuf(rel, buf_nblkno);
     956                 :             : 
     957                 :         221 :         return;
     958                 :             : 
     959                 :             :         /* Here if decide not to split or fail to acquire old bucket lock */
     960                 :             : fail:
     961                 :             : 
     962                 :             :         /* We didn't write the metapage, so just drop lock */
     963                 :           0 :         LockBuffer(metabuf, BUFFER_LOCK_UNLOCK);
     964         [ -  + ]:         221 : }
     965                 :             : 
     966                 :             : 
     967                 :             : /*
     968                 :             :  * _hash_alloc_buckets -- allocate a new splitpoint's worth of bucket pages
     969                 :             :  *
     970                 :             :  * This does not need to initialize the new bucket pages; we'll do that as
     971                 :             :  * each one is used by _hash_expandtable().  But we have to extend the logical
     972                 :             :  * EOF to the end of the splitpoint; this keeps smgr's idea of the EOF in
     973                 :             :  * sync with ours, so that we don't get complaints from smgr.
     974                 :             :  *
     975                 :             :  * We do this by writing a page of zeroes at the end of the splitpoint range.
     976                 :             :  * We expect that the filesystem will ensure that the intervening pages read
     977                 :             :  * as zeroes too.  On many filesystems this "hole" will not be allocated
     978                 :             :  * immediately, which means that the index file may end up more fragmented
     979                 :             :  * than if we forced it all to be allocated now; but since we don't scan
     980                 :             :  * hash indexes sequentially anyway, that probably doesn't matter.
     981                 :             :  *
     982                 :             :  * XXX It's annoying that this code is executed with the metapage lock held.
     983                 :             :  * We need to interlock against _hash_addovflpage() adding a new overflow page
     984                 :             :  * concurrently, but it'd likely be better to use LockRelationForExtension
     985                 :             :  * for the purpose.  OTOH, adding a splitpoint is a very infrequent operation,
     986                 :             :  * so it may not be worth worrying about.
     987                 :             :  *
     988                 :             :  * Returns true if successful, or false if allocation failed due to
     989                 :             :  * BlockNumber overflow.
     990                 :             :  */
     991                 :             : static bool
     992                 :           9 : _hash_alloc_buckets(Relation rel, BlockNumber firstblock, uint32 nblocks)
     993                 :             : {
     994                 :           9 :         BlockNumber lastblock;
     995                 :           9 :         PGIOAlignedBlock zerobuf;
     996                 :           9 :         Page            page;
     997                 :           9 :         HashPageOpaque ovflopaque;
     998                 :             : 
     999                 :           9 :         lastblock = firstblock + nblocks - 1;
    1000                 :             : 
    1001                 :             :         /*
    1002                 :             :          * Check for overflow in block number calculation; if so, we cannot extend
    1003                 :             :          * the index anymore.
    1004                 :             :          */
    1005   [ +  -  -  + ]:           9 :         if (lastblock < firstblock || lastblock == InvalidBlockNumber)
    1006                 :           0 :                 return false;
    1007                 :             : 
    1008                 :           9 :         page = (Page) zerobuf.data;
    1009                 :             : 
    1010                 :             :         /*
    1011                 :             :          * Initialize the page.  Just zeroing the page won't work; see
    1012                 :             :          * _hash_freeovflpage for similar usage.  We take care to make the special
    1013                 :             :          * space valid for the benefit of tools such as pageinspect.
    1014                 :             :          */
    1015                 :           9 :         _hash_pageinit(page, BLCKSZ);
    1016                 :             : 
    1017                 :           9 :         ovflopaque = HashPageGetOpaque(page);
    1018                 :             : 
    1019                 :           9 :         ovflopaque->hasho_prevblkno = InvalidBlockNumber;
    1020                 :           9 :         ovflopaque->hasho_nextblkno = InvalidBlockNumber;
    1021                 :           9 :         ovflopaque->hasho_bucket = InvalidBucket;
    1022                 :           9 :         ovflopaque->hasho_flag = LH_UNUSED_PAGE;
    1023                 :           9 :         ovflopaque->hasho_page_id = HASHO_PAGE_ID;
    1024                 :             : 
    1025   [ +  -  +  -  :           9 :         if (RelationNeedsWAL(rel))
             +  +  +  + ]
    1026                 :          10 :                 log_newpage(&rel->rd_locator,
    1027                 :             :                                         MAIN_FORKNUM,
    1028                 :           5 :                                         lastblock,
    1029                 :           5 :                                         zerobuf.data,
    1030                 :             :                                         true);
    1031                 :             : 
    1032                 :           9 :         PageSetChecksumInplace(page, lastblock);
    1033                 :           9 :         smgrextend(RelationGetSmgr(rel), MAIN_FORKNUM, lastblock, zerobuf.data,
    1034                 :             :                            false);
    1035                 :             : 
    1036                 :           9 :         return true;
    1037                 :           9 : }
    1038                 :             : 
    1039                 :             : 
    1040                 :             : /*
    1041                 :             :  * _hash_splitbucket -- split 'obucket' into 'obucket' and 'nbucket'
    1042                 :             :  *
    1043                 :             :  * This routine is used to partition the tuples between old and new bucket and
    1044                 :             :  * is used to finish the incomplete split operations.  To finish the previously
    1045                 :             :  * interrupted split operation, the caller needs to fill htab.  If htab is set,
    1046                 :             :  * then we skip the movement of tuples that exists in htab, otherwise NULL
    1047                 :             :  * value of htab indicates movement of all the tuples that belong to the new
    1048                 :             :  * bucket.
    1049                 :             :  *
    1050                 :             :  * We are splitting a bucket that consists of a base bucket page and zero
    1051                 :             :  * or more overflow (bucket chain) pages.  We must relocate tuples that
    1052                 :             :  * belong in the new bucket.
    1053                 :             :  *
    1054                 :             :  * The caller must hold cleanup locks on both buckets to ensure that
    1055                 :             :  * no one else is trying to access them (see README).
    1056                 :             :  *
    1057                 :             :  * The caller must hold a pin, but no lock, on the metapage buffer.
    1058                 :             :  * The buffer is returned in the same state.  (The metapage is only
    1059                 :             :  * touched if it becomes necessary to add or remove overflow pages.)
    1060                 :             :  *
    1061                 :             :  * Split needs to retain pin on primary bucket pages of both old and new
    1062                 :             :  * buckets till end of operation.  This is to prevent vacuum from starting
    1063                 :             :  * while a split is in progress.
    1064                 :             :  *
    1065                 :             :  * In addition, the caller must have created the new bucket's base page,
    1066                 :             :  * which is passed in buffer nbuf, pinned and write-locked.  The lock will be
    1067                 :             :  * released here and pin must be released by the caller.  (The API is set up
    1068                 :             :  * this way because we must do _hash_getnewbuf() before releasing the metapage
    1069                 :             :  * write lock.  So instead of passing the new bucket's start block number, we
    1070                 :             :  * pass an actual buffer.)
    1071                 :             :  */
    1072                 :             : static void
    1073                 :         221 : _hash_splitbucket(Relation rel,
    1074                 :             :                                   Buffer metabuf,
    1075                 :             :                                   Bucket obucket,
    1076                 :             :                                   Bucket nbucket,
    1077                 :             :                                   Buffer obuf,
    1078                 :             :                                   Buffer nbuf,
    1079                 :             :                                   HTAB *htab,
    1080                 :             :                                   uint32 maxbucket,
    1081                 :             :                                   uint32 highmask,
    1082                 :             :                                   uint32 lowmask)
    1083                 :             : {
    1084                 :         221 :         Buffer          bucket_obuf;
    1085                 :         221 :         Buffer          bucket_nbuf;
    1086                 :         221 :         Page            opage;
    1087                 :         221 :         Page            npage;
    1088                 :         221 :         HashPageOpaque oopaque;
    1089                 :         221 :         HashPageOpaque nopaque;
    1090                 :         221 :         OffsetNumber itup_offsets[MaxIndexTuplesPerPage];
    1091                 :         221 :         IndexTuple      itups[MaxIndexTuplesPerPage];
    1092                 :         221 :         Size            all_tups_size = 0;
    1093                 :         221 :         int                     i;
    1094                 :         221 :         uint16          nitups = 0;
    1095                 :             : 
    1096                 :         221 :         bucket_obuf = obuf;
    1097                 :         221 :         opage = BufferGetPage(obuf);
    1098                 :         221 :         oopaque = HashPageGetOpaque(opage);
    1099                 :             : 
    1100                 :         221 :         bucket_nbuf = nbuf;
    1101                 :         221 :         npage = BufferGetPage(nbuf);
    1102                 :         221 :         nopaque = HashPageGetOpaque(npage);
    1103                 :             : 
    1104                 :             :         /* Copy the predicate locks from old bucket to new bucket. */
    1105                 :         442 :         PredicateLockPageSplit(rel,
    1106                 :         221 :                                                    BufferGetBlockNumber(bucket_obuf),
    1107                 :         221 :                                                    BufferGetBlockNumber(bucket_nbuf));
    1108                 :             : 
    1109                 :             :         /*
    1110                 :             :          * Partition the tuples in the old bucket between the old bucket and the
    1111                 :             :          * new bucket, advancing along the old bucket's overflow bucket chain and
    1112                 :             :          * adding overflow pages to the new bucket as needed.  Outer loop iterates
    1113                 :             :          * once per page in old bucket.
    1114                 :             :          */
    1115                 :         277 :         for (;;)
    1116                 :             :         {
    1117                 :         277 :                 BlockNumber oblkno;
    1118                 :         277 :                 OffsetNumber ooffnum;
    1119                 :         277 :                 OffsetNumber omaxoffnum;
    1120                 :             : 
    1121                 :             :                 /* Scan each tuple in old page */
    1122                 :         277 :                 omaxoffnum = PageGetMaxOffsetNumber(opage);
    1123         [ +  + ]:       50540 :                 for (ooffnum = FirstOffsetNumber;
    1124                 :       50540 :                          ooffnum <= omaxoffnum;
    1125                 :       50263 :                          ooffnum = OffsetNumberNext(ooffnum))
    1126                 :             :                 {
    1127                 :       50263 :                         IndexTuple      itup;
    1128                 :       50263 :                         Size            itemsz;
    1129                 :       50263 :                         Bucket          bucket;
    1130                 :       50263 :                         bool            found = false;
    1131                 :             : 
    1132                 :             :                         /* skip dead tuples */
    1133         [ -  + ]:       50263 :                         if (ItemIdIsDead(PageGetItemId(opage, ooffnum)))
    1134                 :           0 :                                 continue;
    1135                 :             : 
    1136                 :             :                         /*
    1137                 :             :                          * Before inserting a tuple, probe the hash table containing TIDs
    1138                 :             :                          * of tuples belonging to new bucket, if we find a match, then
    1139                 :             :                          * skip that tuple, else fetch the item's hash key (conveniently
    1140                 :             :                          * stored in the item) and determine which bucket it now belongs
    1141                 :             :                          * in.
    1142                 :             :                          */
    1143                 :      100526 :                         itup = (IndexTuple) PageGetItem(opage,
    1144                 :       50263 :                                                                                         PageGetItemId(opage, ooffnum));
    1145                 :             : 
    1146         [ +  - ]:       50263 :                         if (htab)
    1147                 :           0 :                                 (void) hash_search(htab, &itup->t_tid, HASH_FIND, &found);
    1148                 :             : 
    1149         [ -  + ]:       50263 :                         if (found)
    1150                 :           0 :                                 continue;
    1151                 :             : 
    1152                 :      100526 :                         bucket = _hash_hashkey2bucket(_hash_get_indextuple_hashkey(itup),
    1153                 :       50263 :                                                                                   maxbucket, highmask, lowmask);
    1154                 :             : 
    1155         [ +  + ]:       50263 :                         if (bucket == nbucket)
    1156                 :             :                         {
    1157                 :       20512 :                                 IndexTuple      new_itup;
    1158                 :             : 
    1159                 :             :                                 /*
    1160                 :             :                                  * make a copy of index tuple as we have to scribble on it.
    1161                 :             :                                  */
    1162                 :       20512 :                                 new_itup = CopyIndexTuple(itup);
    1163                 :             : 
    1164                 :             :                                 /*
    1165                 :             :                                  * mark the index tuple as moved by split, such tuples are
    1166                 :             :                                  * skipped by scan if there is split in progress for a bucket.
    1167                 :             :                                  */
    1168                 :       20512 :                                 new_itup->t_info |= INDEX_MOVED_BY_SPLIT_MASK;
    1169                 :             : 
    1170                 :             :                                 /*
    1171                 :             :                                  * insert the tuple into the new bucket.  if it doesn't fit on
    1172                 :             :                                  * the current page in the new bucket, we must allocate a new
    1173                 :             :                                  * overflow page and place the tuple on that page instead.
    1174                 :             :                                  */
    1175                 :       20512 :                                 itemsz = IndexTupleSize(new_itup);
    1176                 :       20512 :                                 itemsz = MAXALIGN(itemsz);
    1177                 :             : 
    1178         [ +  + ]:       20512 :                                 if (PageGetFreeSpaceForMultipleTuples(npage, nitups + 1) < (all_tups_size + itemsz))
    1179                 :             :                                 {
    1180                 :             :                                         /*
    1181                 :             :                                          * Change the shared buffer state in critical section,
    1182                 :             :                                          * otherwise any error could make it unrecoverable.
    1183                 :             :                                          */
    1184                 :          13 :                                         START_CRIT_SECTION();
    1185                 :             : 
    1186                 :          13 :                                         _hash_pgaddmultitup(rel, nbuf, itups, itup_offsets, nitups);
    1187                 :          13 :                                         MarkBufferDirty(nbuf);
    1188                 :             :                                         /* log the split operation before releasing the lock */
    1189                 :          13 :                                         log_split_page(rel, nbuf);
    1190                 :             : 
    1191         [ -  + ]:          13 :                                         END_CRIT_SECTION();
    1192                 :             : 
    1193                 :             :                                         /* drop lock, but keep pin */
    1194                 :          13 :                                         LockBuffer(nbuf, BUFFER_LOCK_UNLOCK);
    1195                 :             : 
    1196                 :             :                                         /* be tidy */
    1197         [ +  + ]:        5304 :                                         for (i = 0; i < nitups; i++)
    1198                 :        5291 :                                                 pfree(itups[i]);
    1199                 :          13 :                                         nitups = 0;
    1200                 :          13 :                                         all_tups_size = 0;
    1201                 :             : 
    1202                 :             :                                         /* chain to a new overflow page */
    1203                 :          13 :                                         nbuf = _hash_addovflpage(rel, metabuf, nbuf, (nbuf == bucket_nbuf));
    1204                 :          13 :                                         npage = BufferGetPage(nbuf);
    1205                 :          13 :                                         nopaque = HashPageGetOpaque(npage);
    1206                 :          13 :                                 }
    1207                 :             : 
    1208                 :       20512 :                                 itups[nitups++] = new_itup;
    1209                 :       20512 :                                 all_tups_size += itemsz;
    1210                 :       20512 :                         }
    1211                 :             :                         else
    1212                 :             :                         {
    1213                 :             :                                 /*
    1214                 :             :                                  * the tuple stays on this page, so nothing to do.
    1215                 :             :                                  */
    1216         [ -  + ]:       29751 :                                 Assert(bucket == obucket);
    1217                 :             :                         }
    1218         [ -  + ]:       50263 :                 }
    1219                 :             : 
    1220                 :         277 :                 oblkno = oopaque->hasho_nextblkno;
    1221                 :             : 
    1222                 :             :                 /* retain the pin on the old primary bucket */
    1223         [ +  + ]:         277 :                 if (obuf == bucket_obuf)
    1224                 :         221 :                         LockBuffer(obuf, BUFFER_LOCK_UNLOCK);
    1225                 :             :                 else
    1226                 :          56 :                         _hash_relbuf(rel, obuf);
    1227                 :             : 
    1228                 :             :                 /* Exit loop if no more overflow pages in old bucket */
    1229         [ +  + ]:         277 :                 if (!BlockNumberIsValid(oblkno))
    1230                 :             :                 {
    1231                 :             :                         /*
    1232                 :             :                          * Change the shared buffer state in critical section, otherwise
    1233                 :             :                          * any error could make it unrecoverable.
    1234                 :             :                          */
    1235                 :         221 :                         START_CRIT_SECTION();
    1236                 :             : 
    1237                 :         221 :                         _hash_pgaddmultitup(rel, nbuf, itups, itup_offsets, nitups);
    1238                 :         221 :                         MarkBufferDirty(nbuf);
    1239                 :             :                         /* log the split operation before releasing the lock */
    1240                 :         221 :                         log_split_page(rel, nbuf);
    1241                 :             : 
    1242         [ -  + ]:         221 :                         END_CRIT_SECTION();
    1243                 :             : 
    1244         [ +  + ]:         221 :                         if (nbuf == bucket_nbuf)
    1245                 :         220 :                                 LockBuffer(nbuf, BUFFER_LOCK_UNLOCK);
    1246                 :             :                         else
    1247                 :           1 :                                 _hash_relbuf(rel, nbuf);
    1248                 :             : 
    1249                 :             :                         /* be tidy */
    1250         [ +  + ]:       15442 :                         for (i = 0; i < nitups; i++)
    1251                 :       15221 :                                 pfree(itups[i]);
    1252                 :         221 :                         break;
    1253                 :             :                 }
    1254                 :             : 
    1255                 :             :                 /* Else, advance to next old page */
    1256                 :          56 :                 obuf = _hash_getbuf(rel, oblkno, HASH_READ, LH_OVERFLOW_PAGE);
    1257                 :          56 :                 opage = BufferGetPage(obuf);
    1258                 :          56 :                 oopaque = HashPageGetOpaque(opage);
    1259         [ +  + ]:         277 :         }
    1260                 :             : 
    1261                 :             :         /*
    1262                 :             :          * We're at the end of the old bucket chain, so we're done partitioning
    1263                 :             :          * the tuples.  Mark the old and new buckets to indicate split is
    1264                 :             :          * finished.
    1265                 :             :          *
    1266                 :             :          * To avoid deadlocks due to locking order of buckets, first lock the old
    1267                 :             :          * bucket and then the new bucket.
    1268                 :             :          */
    1269                 :         221 :         LockBuffer(bucket_obuf, BUFFER_LOCK_EXCLUSIVE);
    1270                 :         221 :         opage = BufferGetPage(bucket_obuf);
    1271                 :         221 :         oopaque = HashPageGetOpaque(opage);
    1272                 :             : 
    1273                 :         221 :         LockBuffer(bucket_nbuf, BUFFER_LOCK_EXCLUSIVE);
    1274                 :         221 :         npage = BufferGetPage(bucket_nbuf);
    1275                 :         221 :         nopaque = HashPageGetOpaque(npage);
    1276                 :             : 
    1277                 :         221 :         START_CRIT_SECTION();
    1278                 :             : 
    1279                 :         221 :         oopaque->hasho_flag &= ~LH_BUCKET_BEING_SPLIT;
    1280                 :         221 :         nopaque->hasho_flag &= ~LH_BUCKET_BEING_POPULATED;
    1281                 :             : 
    1282                 :             :         /*
    1283                 :             :          * After the split is finished, mark the old bucket to indicate that it
    1284                 :             :          * contains deletable tuples.  We will clear split-cleanup flag after
    1285                 :             :          * deleting such tuples either at the end of split or at the next split
    1286                 :             :          * from old bucket or at the time of vacuum.
    1287                 :             :          */
    1288                 :         221 :         oopaque->hasho_flag |= LH_BUCKET_NEEDS_SPLIT_CLEANUP;
    1289                 :             : 
    1290                 :             :         /*
    1291                 :             :          * now write the buffers, here we don't release the locks as caller is
    1292                 :             :          * responsible to release locks.
    1293                 :             :          */
    1294                 :         221 :         MarkBufferDirty(bucket_obuf);
    1295                 :         221 :         MarkBufferDirty(bucket_nbuf);
    1296                 :             : 
    1297   [ +  -  +  -  :         221 :         if (RelationNeedsWAL(rel))
             +  +  +  + ]
    1298                 :             :         {
    1299                 :          95 :                 XLogRecPtr      recptr;
    1300                 :          95 :                 xl_hash_split_complete xlrec;
    1301                 :             : 
    1302                 :          95 :                 xlrec.old_bucket_flag = oopaque->hasho_flag;
    1303                 :          95 :                 xlrec.new_bucket_flag = nopaque->hasho_flag;
    1304                 :             : 
    1305                 :          95 :                 XLogBeginInsert();
    1306                 :             : 
    1307                 :          95 :                 XLogRegisterData(&xlrec, SizeOfHashSplitComplete);
    1308                 :             : 
    1309                 :          95 :                 XLogRegisterBuffer(0, bucket_obuf, REGBUF_STANDARD);
    1310                 :          95 :                 XLogRegisterBuffer(1, bucket_nbuf, REGBUF_STANDARD);
    1311                 :             : 
    1312                 :          95 :                 recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_SPLIT_COMPLETE);
    1313                 :             : 
    1314                 :          95 :                 PageSetLSN(BufferGetPage(bucket_obuf), recptr);
    1315                 :          95 :                 PageSetLSN(BufferGetPage(bucket_nbuf), recptr);
    1316                 :          95 :         }
    1317                 :             : 
    1318         [ +  - ]:         221 :         END_CRIT_SECTION();
    1319                 :             : 
    1320                 :             :         /*
    1321                 :             :          * If possible, clean up the old bucket.  We might not be able to do this
    1322                 :             :          * if someone else has a pin on it, but if not then we can go ahead.  This
    1323                 :             :          * isn't absolutely necessary, but it reduces bloat; if we don't do it
    1324                 :             :          * now, VACUUM will do it eventually, but maybe not until new overflow
    1325                 :             :          * pages have been allocated.  Note that there's no need to clean up the
    1326                 :             :          * new bucket.
    1327                 :             :          */
    1328         [ +  - ]:         221 :         if (IsBufferCleanupOK(bucket_obuf))
    1329                 :             :         {
    1330                 :         221 :                 LockBuffer(bucket_nbuf, BUFFER_LOCK_UNLOCK);
    1331                 :         442 :                 hashbucketcleanup(rel, obucket, bucket_obuf,
    1332                 :         221 :                                                   BufferGetBlockNumber(bucket_obuf), NULL,
    1333                 :         221 :                                                   maxbucket, highmask, lowmask, NULL, NULL, true,
    1334                 :             :                                                   NULL, NULL);
    1335                 :         221 :         }
    1336                 :             :         else
    1337                 :             :         {
    1338                 :           0 :                 LockBuffer(bucket_nbuf, BUFFER_LOCK_UNLOCK);
    1339                 :           0 :                 LockBuffer(bucket_obuf, BUFFER_LOCK_UNLOCK);
    1340                 :             :         }
    1341                 :         221 : }
    1342                 :             : 
    1343                 :             : /*
    1344                 :             :  *      _hash_finish_split() -- Finish the previously interrupted split operation
    1345                 :             :  *
    1346                 :             :  * To complete the split operation, we form the hash table of TIDs in new
    1347                 :             :  * bucket which is then used by split operation to skip tuples that are
    1348                 :             :  * already moved before the split operation was previously interrupted.
    1349                 :             :  *
    1350                 :             :  * The caller must hold a pin, but no lock, on the metapage and old bucket's
    1351                 :             :  * primary page buffer.  The buffers are returned in the same state.  (The
    1352                 :             :  * metapage is only touched if it becomes necessary to add or remove overflow
    1353                 :             :  * pages.)
    1354                 :             :  */
    1355                 :             : void
    1356                 :           0 : _hash_finish_split(Relation rel, Buffer metabuf, Buffer obuf, Bucket obucket,
    1357                 :             :                                    uint32 maxbucket, uint32 highmask, uint32 lowmask)
    1358                 :             : {
    1359                 :           0 :         HASHCTL         hash_ctl;
    1360                 :           0 :         HTAB       *tidhtab;
    1361                 :           0 :         Buffer          bucket_nbuf = InvalidBuffer;
    1362                 :           0 :         Buffer          nbuf;
    1363                 :           0 :         Page            npage;
    1364                 :           0 :         BlockNumber nblkno;
    1365                 :           0 :         BlockNumber bucket_nblkno;
    1366                 :           0 :         HashPageOpaque npageopaque;
    1367                 :           0 :         Bucket          nbucket;
    1368                 :           0 :         bool            found;
    1369                 :             : 
    1370                 :             :         /* Initialize hash tables used to track TIDs */
    1371                 :           0 :         hash_ctl.keysize = sizeof(ItemPointerData);
    1372                 :           0 :         hash_ctl.entrysize = sizeof(ItemPointerData);
    1373                 :           0 :         hash_ctl.hcxt = CurrentMemoryContext;
    1374                 :             : 
    1375                 :           0 :         tidhtab =
    1376                 :           0 :                 hash_create("bucket ctids",
    1377                 :             :                                         256,            /* arbitrary initial size */
    1378                 :             :                                         &hash_ctl,
    1379                 :             :                                         HASH_ELEM | HASH_BLOBS | HASH_CONTEXT);
    1380                 :             : 
    1381                 :           0 :         bucket_nblkno = nblkno = _hash_get_newblock_from_oldbucket(rel, obucket);
    1382                 :             : 
    1383                 :             :         /*
    1384                 :             :          * Scan the new bucket and build hash table of TIDs
    1385                 :             :          */
    1386                 :           0 :         for (;;)
    1387                 :             :         {
    1388                 :           0 :                 OffsetNumber noffnum;
    1389                 :           0 :                 OffsetNumber nmaxoffnum;
    1390                 :             : 
    1391                 :           0 :                 nbuf = _hash_getbuf(rel, nblkno, HASH_READ,
    1392                 :             :                                                         LH_BUCKET_PAGE | LH_OVERFLOW_PAGE);
    1393                 :             : 
    1394                 :             :                 /* remember the primary bucket buffer to acquire cleanup lock on it. */
    1395         [ #  # ]:           0 :                 if (nblkno == bucket_nblkno)
    1396                 :           0 :                         bucket_nbuf = nbuf;
    1397                 :             : 
    1398                 :           0 :                 npage = BufferGetPage(nbuf);
    1399                 :           0 :                 npageopaque = HashPageGetOpaque(npage);
    1400                 :             : 
    1401                 :             :                 /* Scan each tuple in new page */
    1402                 :           0 :                 nmaxoffnum = PageGetMaxOffsetNumber(npage);
    1403         [ #  # ]:           0 :                 for (noffnum = FirstOffsetNumber;
    1404                 :           0 :                          noffnum <= nmaxoffnum;
    1405                 :           0 :                          noffnum = OffsetNumberNext(noffnum))
    1406                 :             :                 {
    1407                 :           0 :                         IndexTuple      itup;
    1408                 :             : 
    1409                 :             :                         /* Fetch the item's TID and insert it in hash table. */
    1410                 :           0 :                         itup = (IndexTuple) PageGetItem(npage,
    1411                 :           0 :                                                                                         PageGetItemId(npage, noffnum));
    1412                 :             : 
    1413                 :           0 :                         (void) hash_search(tidhtab, &itup->t_tid, HASH_ENTER, &found);
    1414                 :             : 
    1415         [ #  # ]:           0 :                         Assert(!found);
    1416                 :           0 :                 }
    1417                 :             : 
    1418                 :           0 :                 nblkno = npageopaque->hasho_nextblkno;
    1419                 :             : 
    1420                 :             :                 /*
    1421                 :             :                  * release our write lock without modifying buffer and ensure to
    1422                 :             :                  * retain the pin on primary bucket.
    1423                 :             :                  */
    1424         [ #  # ]:           0 :                 if (nbuf == bucket_nbuf)
    1425                 :           0 :                         LockBuffer(nbuf, BUFFER_LOCK_UNLOCK);
    1426                 :             :                 else
    1427                 :           0 :                         _hash_relbuf(rel, nbuf);
    1428                 :             : 
    1429                 :             :                 /* Exit loop if no more overflow pages in new bucket */
    1430         [ #  # ]:           0 :                 if (!BlockNumberIsValid(nblkno))
    1431                 :           0 :                         break;
    1432         [ #  # ]:           0 :         }
    1433                 :             : 
    1434                 :             :         /*
    1435                 :             :          * Conditionally get the cleanup lock on old and new buckets to perform
    1436                 :             :          * the split operation.  If we don't get the cleanup locks, silently give
    1437                 :             :          * up and next insertion on old bucket will try again to complete the
    1438                 :             :          * split.
    1439                 :             :          */
    1440         [ #  # ]:           0 :         if (!ConditionalLockBufferForCleanup(obuf))
    1441                 :             :         {
    1442                 :           0 :                 hash_destroy(tidhtab);
    1443                 :           0 :                 return;
    1444                 :             :         }
    1445         [ #  # ]:           0 :         if (!ConditionalLockBufferForCleanup(bucket_nbuf))
    1446                 :             :         {
    1447                 :           0 :                 LockBuffer(obuf, BUFFER_LOCK_UNLOCK);
    1448                 :           0 :                 hash_destroy(tidhtab);
    1449                 :           0 :                 return;
    1450                 :             :         }
    1451                 :             : 
    1452                 :           0 :         npage = BufferGetPage(bucket_nbuf);
    1453                 :           0 :         npageopaque = HashPageGetOpaque(npage);
    1454                 :           0 :         nbucket = npageopaque->hasho_bucket;
    1455                 :             : 
    1456                 :           0 :         _hash_splitbucket(rel, metabuf, obucket,
    1457                 :           0 :                                           nbucket, obuf, bucket_nbuf, tidhtab,
    1458                 :           0 :                                           maxbucket, highmask, lowmask);
    1459                 :             : 
    1460                 :           0 :         _hash_dropbuf(rel, bucket_nbuf);
    1461                 :           0 :         hash_destroy(tidhtab);
    1462                 :           0 : }
    1463                 :             : 
    1464                 :             : /*
    1465                 :             :  *      log_split_page() -- Log the split operation
    1466                 :             :  *
    1467                 :             :  *      We log the split operation when the new page in new bucket gets full,
    1468                 :             :  *      so we log the entire page.
    1469                 :             :  *
    1470                 :             :  *      'buf' must be locked by the caller which is also responsible for unlocking
    1471                 :             :  *      it.
    1472                 :             :  */
    1473                 :             : static void
    1474                 :         234 : log_split_page(Relation rel, Buffer buf)
    1475                 :             : {
    1476   [ +  -  +  -  :         234 :         if (RelationNeedsWAL(rel))
             +  +  +  + ]
    1477                 :             :         {
    1478                 :         108 :                 XLogRecPtr      recptr;
    1479                 :             : 
    1480                 :         108 :                 XLogBeginInsert();
    1481                 :             : 
    1482                 :         108 :                 XLogRegisterBuffer(0, buf, REGBUF_FORCE_IMAGE | REGBUF_STANDARD);
    1483                 :             : 
    1484                 :         108 :                 recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_SPLIT_PAGE);
    1485                 :             : 
    1486                 :         108 :                 PageSetLSN(BufferGetPage(buf), recptr);
    1487                 :         108 :         }
    1488                 :         234 : }
    1489                 :             : 
    1490                 :             : /*
    1491                 :             :  *      _hash_getcachedmetap() -- Returns cached metapage data.
    1492                 :             :  *
    1493                 :             :  *      If metabuf is not InvalidBuffer, caller must hold a pin, but no lock, on
    1494                 :             :  *      the metapage.  If not set, we'll set it before returning if we have to
    1495                 :             :  *      refresh the cache, and return with a pin but no lock on it; caller is
    1496                 :             :  *      responsible for releasing the pin.
    1497                 :             :  *
    1498                 :             :  *      We refresh the cache if it's not initialized yet or force_refresh is true.
    1499                 :             :  */
    1500                 :             : HashMetaPage
    1501                 :      119787 : _hash_getcachedmetap(Relation rel, Buffer *metabuf, bool force_refresh)
    1502                 :             : {
    1503                 :      119787 :         Page            page;
    1504                 :             : 
    1505         [ +  - ]:      119787 :         Assert(metabuf);
    1506   [ +  +  +  + ]:      119787 :         if (force_refresh || rel->rd_amcache == NULL)
    1507                 :             :         {
    1508                 :         143 :                 char       *cache = NULL;
    1509                 :             : 
    1510                 :             :                 /*
    1511                 :             :                  * It's important that we don't set rd_amcache to an invalid value.
    1512                 :             :                  * Either MemoryContextAlloc or _hash_getbuf could fail, so don't
    1513                 :             :                  * install a pointer to the newly-allocated storage in the actual
    1514                 :             :                  * relcache entry until both have succeeded.
    1515                 :             :                  */
    1516         [ +  + ]:         143 :                 if (rel->rd_amcache == NULL)
    1517                 :          36 :                         cache = MemoryContextAlloc(rel->rd_indexcxt,
    1518                 :             :                                                                            sizeof(HashMetaPageData));
    1519                 :             : 
    1520                 :             :                 /* Read the metapage. */
    1521         [ -  + ]:         143 :                 if (BufferIsValid(*metabuf))
    1522                 :           0 :                         LockBuffer(*metabuf, BUFFER_LOCK_SHARE);
    1523                 :             :                 else
    1524                 :         143 :                         *metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ,
    1525                 :             :                                                                         LH_META_PAGE);
    1526                 :         143 :                 page = BufferGetPage(*metabuf);
    1527                 :             : 
    1528                 :             :                 /* Populate the cache. */
    1529         [ +  + ]:         143 :                 if (rel->rd_amcache == NULL)
    1530                 :          36 :                         rel->rd_amcache = cache;
    1531                 :         143 :                 memcpy(rel->rd_amcache, HashPageGetMeta(page),
    1532                 :             :                            sizeof(HashMetaPageData));
    1533                 :             : 
    1534                 :             :                 /* Release metapage lock, but keep the pin. */
    1535                 :         143 :                 LockBuffer(*metabuf, BUFFER_LOCK_UNLOCK);
    1536                 :         143 :         }
    1537                 :             : 
    1538                 :      239574 :         return (HashMetaPage) rel->rd_amcache;
    1539                 :      119787 : }
    1540                 :             : 
    1541                 :             : /*
    1542                 :             :  *      _hash_getbucketbuf_from_hashkey() -- Get the bucket's buffer for the given
    1543                 :             :  *                                                                               hashkey.
    1544                 :             :  *
    1545                 :             :  *      Bucket pages do not move or get removed once they are allocated. This give
    1546                 :             :  *      us an opportunity to use the previously saved metapage contents to reach
    1547                 :             :  *      the target bucket buffer, instead of reading from the metapage every time.
    1548                 :             :  *      This saves one buffer access every time we want to reach the target bucket
    1549                 :             :  *      buffer, which is very helpful savings in bufmgr traffic and contention.
    1550                 :             :  *
    1551                 :             :  *      The access type parameter (HASH_READ or HASH_WRITE) indicates whether the
    1552                 :             :  *      bucket buffer has to be locked for reading or writing.
    1553                 :             :  *
    1554                 :             :  *      The out parameter cachedmetap is set with metapage contents used for
    1555                 :             :  *      hashkey to bucket buffer mapping. Some callers need this info to reach the
    1556                 :             :  *      old bucket in case of bucket split, see _hash_doinsert().
    1557                 :             :  */
    1558                 :             : Buffer
    1559                 :      119676 : _hash_getbucketbuf_from_hashkey(Relation rel, uint32 hashkey, int access,
    1560                 :             :                                                                 HashMetaPage *cachedmetap)
    1561                 :             : {
    1562                 :      119676 :         HashMetaPage metap;
    1563                 :      119676 :         Buffer          buf;
    1564                 :      119676 :         Buffer          metabuf = InvalidBuffer;
    1565                 :      119676 :         Page            page;
    1566                 :      119676 :         Bucket          bucket;
    1567                 :      119676 :         BlockNumber blkno;
    1568                 :      119676 :         HashPageOpaque opaque;
    1569                 :             : 
    1570                 :             :         /* We read from target bucket buffer, hence locking is must. */
    1571   [ +  +  +  - ]:      119676 :         Assert(access == HASH_READ || access == HASH_WRITE);
    1572                 :             : 
    1573                 :      119676 :         metap = _hash_getcachedmetap(rel, &metabuf, false);
    1574         [ +  - ]:      119676 :         Assert(metap != NULL);
    1575                 :             : 
    1576                 :             :         /*
    1577                 :             :          * Loop until we get a lock on the correct target bucket.
    1578                 :             :          */
    1579                 :      119783 :         for (;;)
    1580                 :             :         {
    1581                 :             :                 /*
    1582                 :             :                  * Compute the target bucket number, and convert to block number.
    1583                 :             :                  */
    1584                 :      239566 :                 bucket = _hash_hashkey2bucket(hashkey,
    1585                 :      119783 :                                                                           metap->hashm_maxbucket,
    1586                 :      119783 :                                                                           metap->hashm_highmask,
    1587                 :      119783 :                                                                           metap->hashm_lowmask);
    1588                 :             : 
    1589         [ +  + ]:      119783 :                 blkno = BUCKET_TO_BLKNO(metap, bucket);
    1590                 :             : 
    1591                 :             :                 /* Fetch the primary bucket page for the bucket */
    1592                 :      119783 :                 buf = _hash_getbuf(rel, blkno, access, LH_BUCKET_PAGE);
    1593                 :      119783 :                 page = BufferGetPage(buf);
    1594                 :      119783 :                 opaque = HashPageGetOpaque(page);
    1595         [ +  - ]:      119783 :                 Assert(opaque->hasho_bucket == bucket);
    1596         [ +  - ]:      119783 :                 Assert(opaque->hasho_prevblkno != InvalidBlockNumber);
    1597                 :             : 
    1598                 :             :                 /*
    1599                 :             :                  * If this bucket hasn't been split, we're done.
    1600                 :             :                  */
    1601         [ +  + ]:      119783 :                 if (opaque->hasho_prevblkno <= metap->hashm_maxbucket)
    1602                 :      119676 :                         break;
    1603                 :             : 
    1604                 :             :                 /* Drop lock on this buffer, update cached metapage, and retry. */
    1605                 :         107 :                 _hash_relbuf(rel, buf);
    1606                 :         107 :                 metap = _hash_getcachedmetap(rel, &metabuf, true);
    1607         [ +  - ]:         107 :                 Assert(metap != NULL);
    1608                 :             :         }
    1609                 :             : 
    1610         [ +  + ]:      119676 :         if (BufferIsValid(metabuf))
    1611                 :         143 :                 _hash_dropbuf(rel, metabuf);
    1612                 :             : 
    1613         [ +  + ]:      119676 :         if (cachedmetap)
    1614                 :      119616 :                 *cachedmetap = metap;
    1615                 :             : 
    1616                 :      239352 :         return buf;
    1617                 :      119676 : }
        

Generated by: LCOV version 2.3.2-1