LCOV - code coverage report
Current view: top level - src/backend/access/nbtree - nbtsort.c (source / functions) Coverage Total Hit
Test: Code coverage Lines: 97.6 % 716 699
Test Date: 2026-01-26 10:56:24 Functions: 100.0 % 22 22
Legend: Lines:     hit not hit
Branches: + taken - not taken # not executed
Branches: 71.7 % 247 177

             Branch data     Line data    Source code
       1                 :             : /*-------------------------------------------------------------------------
       2                 :             :  *
       3                 :             :  * nbtsort.c
       4                 :             :  *              Build a btree from sorted input by loading leaf pages sequentially.
       5                 :             :  *
       6                 :             :  * NOTES
       7                 :             :  *
       8                 :             :  * We use tuplesort.c to sort the given index tuples into order.
       9                 :             :  * Then we scan the index tuples in order and build the btree pages
      10                 :             :  * for each level.  We load source tuples into leaf-level pages.
      11                 :             :  * Whenever we fill a page at one level, we add a link to it to its
      12                 :             :  * parent level (starting a new parent level if necessary).  When
      13                 :             :  * done, we write out each final page on each level, adding it to
      14                 :             :  * its parent level.  When we have only one page on a level, it must be
      15                 :             :  * the root -- it can be attached to the btree metapage and we are done.
      16                 :             :  *
      17                 :             :  * It is not wise to pack the pages entirely full, since then *any*
      18                 :             :  * insertion would cause a split (and not only of the leaf page; the need
      19                 :             :  * for a split would cascade right up the tree).  The steady-state load
      20                 :             :  * factor for btrees is usually estimated at 70%.  We choose to pack leaf
      21                 :             :  * pages to the user-controllable fill factor (default 90%) while upper pages
      22                 :             :  * are always packed to 70%.  This gives us reasonable density (there aren't
      23                 :             :  * many upper pages if the keys are reasonable-size) without risking a lot of
      24                 :             :  * cascading splits during early insertions.
      25                 :             :  *
      26                 :             :  * We use the bulk smgr loading facility to bypass the buffer cache and
      27                 :             :  * WAL-log the pages efficiently.
      28                 :             :  *
      29                 :             :  * This code isn't concerned about the FSM at all. The caller is responsible
      30                 :             :  * for initializing that.
      31                 :             :  *
      32                 :             :  * Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group
      33                 :             :  * Portions Copyright (c) 1994, Regents of the University of California
      34                 :             :  *
      35                 :             :  * IDENTIFICATION
      36                 :             :  *        src/backend/access/nbtree/nbtsort.c
      37                 :             :  *
      38                 :             :  *-------------------------------------------------------------------------
      39                 :             :  */
      40                 :             : 
      41                 :             : #include "postgres.h"
      42                 :             : 
      43                 :             : #include "access/nbtree.h"
      44                 :             : #include "access/parallel.h"
      45                 :             : #include "access/relscan.h"
      46                 :             : #include "access/table.h"
      47                 :             : #include "access/tableam.h"
      48                 :             : #include "access/xact.h"
      49                 :             : #include "catalog/index.h"
      50                 :             : #include "commands/progress.h"
      51                 :             : #include "executor/instrument.h"
      52                 :             : #include "miscadmin.h"
      53                 :             : #include "pgstat.h"
      54                 :             : #include "storage/bulk_write.h"
      55                 :             : #include "tcop/tcopprot.h"
      56                 :             : #include "utils/rel.h"
      57                 :             : #include "utils/sortsupport.h"
      58                 :             : #include "utils/tuplesort.h"
      59                 :             : 
      60                 :             : 
      61                 :             : /* Magic numbers for parallel state sharing */
      62                 :             : #define PARALLEL_KEY_BTREE_SHARED               UINT64CONST(0xA000000000000001)
      63                 :             : #define PARALLEL_KEY_TUPLESORT                  UINT64CONST(0xA000000000000002)
      64                 :             : #define PARALLEL_KEY_TUPLESORT_SPOOL2   UINT64CONST(0xA000000000000003)
      65                 :             : #define PARALLEL_KEY_QUERY_TEXT                 UINT64CONST(0xA000000000000004)
      66                 :             : #define PARALLEL_KEY_WAL_USAGE                  UINT64CONST(0xA000000000000005)
      67                 :             : #define PARALLEL_KEY_BUFFER_USAGE               UINT64CONST(0xA000000000000006)
      68                 :             : 
      69                 :             : /*
      70                 :             :  * DISABLE_LEADER_PARTICIPATION disables the leader's participation in
      71                 :             :  * parallel index builds.  This may be useful as a debugging aid.
      72                 :             : #undef DISABLE_LEADER_PARTICIPATION
      73                 :             :  */
      74                 :             : 
      75                 :             : /*
      76                 :             :  * Status record for spooling/sorting phase.  (Note we may have two of
      77                 :             :  * these due to the special requirements for uniqueness-checking with
      78                 :             :  * dead tuples.)
      79                 :             :  */
      80                 :             : typedef struct BTSpool
      81                 :             : {
      82                 :             :         Tuplesortstate *sortstate;      /* state data for tuplesort.c */
      83                 :             :         Relation        heap;
      84                 :             :         Relation        index;
      85                 :             :         bool            isunique;
      86                 :             :         bool            nulls_not_distinct;
      87                 :             : } BTSpool;
      88                 :             : 
      89                 :             : /*
      90                 :             :  * Status for index builds performed in parallel.  This is allocated in a
      91                 :             :  * dynamic shared memory segment.  Note that there is a separate tuplesort TOC
      92                 :             :  * entry, private to tuplesort.c but allocated by this module on its behalf.
      93                 :             :  */
      94                 :             : typedef struct BTShared
      95                 :             : {
      96                 :             :         /*
      97                 :             :          * These fields are not modified during the sort.  They primarily exist
      98                 :             :          * for the benefit of worker processes that need to create BTSpool state
      99                 :             :          * corresponding to that used by the leader.
     100                 :             :          */
     101                 :             :         Oid                     heaprelid;
     102                 :             :         Oid                     indexrelid;
     103                 :             :         bool            isunique;
     104                 :             :         bool            nulls_not_distinct;
     105                 :             :         bool            isconcurrent;
     106                 :             :         int                     scantuplesortstates;
     107                 :             : 
     108                 :             :         /* Query ID, for report in worker processes */
     109                 :             :         int64           queryid;
     110                 :             : 
     111                 :             :         /*
     112                 :             :          * workersdonecv is used to monitor the progress of workers.  All parallel
     113                 :             :          * participants must indicate that they are done before leader can use
     114                 :             :          * mutable state that workers maintain during scan (and before leader can
     115                 :             :          * proceed to tuplesort_performsort()).
     116                 :             :          */
     117                 :             :         ConditionVariable workersdonecv;
     118                 :             : 
     119                 :             :         /*
     120                 :             :          * mutex protects all fields before heapdesc.
     121                 :             :          *
     122                 :             :          * These fields contain status information of interest to B-Tree index
     123                 :             :          * builds that must work just the same when an index is built in parallel.
     124                 :             :          */
     125                 :             :         slock_t         mutex;
     126                 :             : 
     127                 :             :         /*
     128                 :             :          * Mutable state that is maintained by workers, and reported back to
     129                 :             :          * leader at end of parallel scan.
     130                 :             :          *
     131                 :             :          * nparticipantsdone is number of worker processes finished.
     132                 :             :          *
     133                 :             :          * reltuples is the total number of input heap tuples.
     134                 :             :          *
     135                 :             :          * havedead indicates if RECENTLY_DEAD tuples were encountered during
     136                 :             :          * build.
     137                 :             :          *
     138                 :             :          * indtuples is the total number of tuples that made it into the index.
     139                 :             :          *
     140                 :             :          * brokenhotchain indicates if any worker detected a broken HOT chain
     141                 :             :          * during build.
     142                 :             :          */
     143                 :             :         int                     nparticipantsdone;
     144                 :             :         double          reltuples;
     145                 :             :         bool            havedead;
     146                 :             :         double          indtuples;
     147                 :             :         bool            brokenhotchain;
     148                 :             : 
     149                 :             :         /*
     150                 :             :          * ParallelTableScanDescData data follows. Can't directly embed here, as
     151                 :             :          * implementations of the parallel table scan desc interface might need
     152                 :             :          * stronger alignment.
     153                 :             :          */
     154                 :             : } BTShared;
     155                 :             : 
     156                 :             : /*
     157                 :             :  * Return pointer to a BTShared's parallel table scan.
     158                 :             :  *
     159                 :             :  * c.f. shm_toc_allocate as to why BUFFERALIGN is used, rather than just
     160                 :             :  * MAXALIGN.
     161                 :             :  */
     162                 :             : #define ParallelTableScanFromBTShared(shared) \
     163                 :             :         (ParallelTableScanDesc) ((char *) (shared) + BUFFERALIGN(sizeof(BTShared)))
     164                 :             : 
     165                 :             : /*
     166                 :             :  * Status for leader in parallel index build.
     167                 :             :  */
     168                 :             : typedef struct BTLeader
     169                 :             : {
     170                 :             :         /* parallel context itself */
     171                 :             :         ParallelContext *pcxt;
     172                 :             : 
     173                 :             :         /*
     174                 :             :          * nparticipanttuplesorts is the exact number of worker processes
     175                 :             :          * successfully launched, plus one leader process if it participates as a
     176                 :             :          * worker (only DISABLE_LEADER_PARTICIPATION builds avoid leader
     177                 :             :          * participating as a worker).
     178                 :             :          */
     179                 :             :         int                     nparticipanttuplesorts;
     180                 :             : 
     181                 :             :         /*
     182                 :             :          * Leader process convenience pointers to shared state (leader avoids TOC
     183                 :             :          * lookups).
     184                 :             :          *
     185                 :             :          * btshared is the shared state for entire build.  sharedsort is the
     186                 :             :          * shared, tuplesort-managed state passed to each process tuplesort.
     187                 :             :          * sharedsort2 is the corresponding btspool2 shared state, used only when
     188                 :             :          * building unique indexes.  snapshot is the snapshot used by the scan iff
     189                 :             :          * an MVCC snapshot is required.
     190                 :             :          */
     191                 :             :         BTShared   *btshared;
     192                 :             :         Sharedsort *sharedsort;
     193                 :             :         Sharedsort *sharedsort2;
     194                 :             :         Snapshot        snapshot;
     195                 :             :         WalUsage   *walusage;
     196                 :             :         BufferUsage *bufferusage;
     197                 :             : } BTLeader;
     198                 :             : 
     199                 :             : /*
     200                 :             :  * Working state for btbuild and its callback.
     201                 :             :  *
     202                 :             :  * When parallel CREATE INDEX is used, there is a BTBuildState for each
     203                 :             :  * participant.
     204                 :             :  */
     205                 :             : typedef struct BTBuildState
     206                 :             : {
     207                 :             :         bool            isunique;
     208                 :             :         bool            nulls_not_distinct;
     209                 :             :         bool            havedead;
     210                 :             :         Relation        heap;
     211                 :             :         BTSpool    *spool;
     212                 :             : 
     213                 :             :         /*
     214                 :             :          * spool2 is needed only when the index is a unique index. Dead tuples are
     215                 :             :          * put into spool2 instead of spool in order to avoid uniqueness check.
     216                 :             :          */
     217                 :             :         BTSpool    *spool2;
     218                 :             :         double          indtuples;
     219                 :             : 
     220                 :             :         /*
     221                 :             :          * btleader is only present when a parallel index build is performed, and
     222                 :             :          * only in the leader process. (Actually, only the leader has a
     223                 :             :          * BTBuildState.  Workers have their own spool and spool2, though.)
     224                 :             :          */
     225                 :             :         BTLeader   *btleader;
     226                 :             : } BTBuildState;
     227                 :             : 
     228                 :             : /*
     229                 :             :  * Status record for a btree page being built.  We have one of these
     230                 :             :  * for each active tree level.
     231                 :             :  */
     232                 :             : typedef struct BTPageState
     233                 :             : {
     234                 :             :         BulkWriteBuffer btps_buf;       /* workspace for page building */
     235                 :             :         BlockNumber btps_blkno;         /* block # to write this page at */
     236                 :             :         IndexTuple      btps_lowkey;    /* page's strict lower bound pivot tuple */
     237                 :             :         OffsetNumber btps_lastoff;      /* last item offset loaded */
     238                 :             :         Size            btps_lastextra; /* last item's extra posting list space */
     239                 :             :         uint32          btps_level;             /* tree level (0 = leaf) */
     240                 :             :         Size            btps_full;              /* "full" if less than this much free space */
     241                 :             :         struct BTPageState *btps_next;  /* link to parent level, if any */
     242                 :             : } BTPageState;
     243                 :             : 
     244                 :             : /*
     245                 :             :  * Overall status record for index writing phase.
     246                 :             :  */
     247                 :             : typedef struct BTWriteState
     248                 :             : {
     249                 :             :         Relation        heap;
     250                 :             :         Relation        index;
     251                 :             :         BulkWriteState *bulkstate;
     252                 :             :         BTScanInsert inskey;            /* generic insertion scankey */
     253                 :             :         BlockNumber btws_pages_alloced; /* # pages allocated */
     254                 :             : } BTWriteState;
     255                 :             : 
     256                 :             : 
     257                 :             : static double _bt_spools_heapscan(Relation heap, Relation index,
     258                 :             :                                                                   BTBuildState *buildstate, IndexInfo *indexInfo);
     259                 :             : static void _bt_spooldestroy(BTSpool *btspool);
     260                 :             : static void _bt_spool(BTSpool *btspool, const ItemPointerData *self,
     261                 :             :                                           const Datum *values, const bool *isnull);
     262                 :             : static void _bt_leafbuild(BTSpool *btspool, BTSpool *btspool2);
     263                 :             : static void _bt_build_callback(Relation index, ItemPointer tid, Datum *values,
     264                 :             :                                                            bool *isnull, bool tupleIsAlive, void *state);
     265                 :             : static BulkWriteBuffer _bt_blnewpage(BTWriteState *wstate, uint32 level);
     266                 :             : static BTPageState *_bt_pagestate(BTWriteState *wstate, uint32 level);
     267                 :             : static void _bt_slideleft(Page rightmostpage);
     268                 :             : static void _bt_sortaddtup(Page page, Size itemsize,
     269                 :             :                                                    const IndexTupleData *itup, OffsetNumber itup_off,
     270                 :             :                                                    bool newfirstdataitem);
     271                 :             : static void _bt_buildadd(BTWriteState *wstate, BTPageState *state,
     272                 :             :                                                  IndexTuple itup, Size truncextra);
     273                 :             : static void _bt_sort_dedup_finish_pending(BTWriteState *wstate,
     274                 :             :                                                                                   BTPageState *state,
     275                 :             :                                                                                   BTDedupState dstate);
     276                 :             : static void _bt_uppershutdown(BTWriteState *wstate, BTPageState *state);
     277                 :             : static void _bt_load(BTWriteState *wstate,
     278                 :             :                                          BTSpool *btspool, BTSpool *btspool2);
     279                 :             : static void _bt_begin_parallel(BTBuildState *buildstate, bool isconcurrent,
     280                 :             :                                                            int request);
     281                 :             : static void _bt_end_parallel(BTLeader *btleader);
     282                 :             : static Size _bt_parallel_estimate_shared(Relation heap, Snapshot snapshot);
     283                 :             : static double _bt_parallel_heapscan(BTBuildState *buildstate,
     284                 :             :                                                                         bool *brokenhotchain);
     285                 :             : static void _bt_leader_participate_as_worker(BTBuildState *buildstate);
     286                 :             : static void _bt_parallel_scan_and_sort(BTSpool *btspool, BTSpool *btspool2,
     287                 :             :                                                                            BTShared *btshared, Sharedsort *sharedsort,
     288                 :             :                                                                            Sharedsort *sharedsort2, int sortmem,
     289                 :             :                                                                            bool progress);
     290                 :             : 
     291                 :             : 
     292                 :             : /*
     293                 :             :  *      btbuild() -- build a new btree index.
     294                 :             :  */
     295                 :             : IndexBuildResult *
     296                 :        3711 : btbuild(Relation heap, Relation index, IndexInfo *indexInfo)
     297                 :             : {
     298                 :        3711 :         IndexBuildResult *result;
     299                 :        3711 :         BTBuildState buildstate;
     300                 :        3711 :         double          reltuples;
     301                 :             : 
     302                 :             : #ifdef BTREE_BUILD_STATS
     303                 :             :         if (log_btree_build_stats)
     304                 :             :                 ResetUsage();
     305                 :             : #endif                                                  /* BTREE_BUILD_STATS */
     306                 :             : 
     307                 :        3711 :         buildstate.isunique = indexInfo->ii_Unique;
     308                 :        3711 :         buildstate.nulls_not_distinct = indexInfo->ii_NullsNotDistinct;
     309                 :        3711 :         buildstate.havedead = false;
     310                 :        3711 :         buildstate.heap = heap;
     311                 :        3711 :         buildstate.spool = NULL;
     312                 :        3711 :         buildstate.spool2 = NULL;
     313                 :        3711 :         buildstate.indtuples = 0;
     314                 :        3711 :         buildstate.btleader = NULL;
     315                 :             : 
     316                 :             :         /*
     317                 :             :          * We expect to be called exactly once for any index relation. If that's
     318                 :             :          * not the case, big trouble's what we have.
     319                 :             :          */
     320         [ +  - ]:        3711 :         if (RelationGetNumberOfBlocks(index) != 0)
     321   [ #  #  #  # ]:           0 :                 elog(ERROR, "index \"%s\" already contains data",
     322                 :             :                          RelationGetRelationName(index));
     323                 :             : 
     324                 :        3711 :         reltuples = _bt_spools_heapscan(heap, index, &buildstate, indexInfo);
     325                 :             : 
     326                 :             :         /*
     327                 :             :          * Finish the build by (1) completing the sort of the spool file, (2)
     328                 :             :          * inserting the sorted tuples into btree pages and (3) building the upper
     329                 :             :          * levels.  Finally, it may also be necessary to end use of parallelism.
     330                 :             :          */
     331                 :        3711 :         _bt_leafbuild(buildstate.spool, buildstate.spool2);
     332                 :        3711 :         _bt_spooldestroy(buildstate.spool);
     333         [ +  + ]:        3711 :         if (buildstate.spool2)
     334                 :           2 :                 _bt_spooldestroy(buildstate.spool2);
     335         [ +  + ]:        3711 :         if (buildstate.btleader)
     336                 :          27 :                 _bt_end_parallel(buildstate.btleader);
     337                 :             : 
     338                 :        3711 :         result = palloc_object(IndexBuildResult);
     339                 :             : 
     340                 :        3711 :         result->heap_tuples = reltuples;
     341                 :        3711 :         result->index_tuples = buildstate.indtuples;
     342                 :             : 
     343                 :             : #ifdef BTREE_BUILD_STATS
     344                 :             :         if (log_btree_build_stats)
     345                 :             :         {
     346                 :             :                 ShowUsage("BTREE BUILD STATS");
     347                 :             :                 ResetUsage();
     348                 :             :         }
     349                 :             : #endif                                                  /* BTREE_BUILD_STATS */
     350                 :             : 
     351                 :        7422 :         return result;
     352                 :        3711 : }
     353                 :             : 
     354                 :             : /*
     355                 :             :  * Create and initialize one or two spool structures, and save them in caller's
     356                 :             :  * buildstate argument.  May also fill-in fields within indexInfo used by index
     357                 :             :  * builds.
     358                 :             :  *
     359                 :             :  * Scans the heap, possibly in parallel, filling spools with IndexTuples.  This
     360                 :             :  * routine encapsulates all aspects of managing parallelism.  Caller need only
     361                 :             :  * call _bt_end_parallel() in parallel case after it is done with spool/spool2.
     362                 :             :  *
     363                 :             :  * Returns the total number of heap tuples scanned.
     364                 :             :  */
     365                 :             : static double
     366                 :        3711 : _bt_spools_heapscan(Relation heap, Relation index, BTBuildState *buildstate,
     367                 :             :                                         IndexInfo *indexInfo)
     368                 :             : {
     369                 :        3711 :         BTSpool    *btspool = palloc0_object(BTSpool);
     370                 :        3711 :         SortCoordinate coordinate = NULL;
     371                 :        3711 :         double          reltuples = 0;
     372                 :             : 
     373                 :             :         /*
     374                 :             :          * We size the sort area as maintenance_work_mem rather than work_mem to
     375                 :             :          * speed index creation.  This should be OK since a single backend can't
     376                 :             :          * run multiple index creations in parallel (see also: notes on
     377                 :             :          * parallelism and maintenance_work_mem below).
     378                 :             :          */
     379                 :        3711 :         btspool->heap = heap;
     380                 :        3711 :         btspool->index = index;
     381                 :        3711 :         btspool->isunique = indexInfo->ii_Unique;
     382                 :        3711 :         btspool->nulls_not_distinct = indexInfo->ii_NullsNotDistinct;
     383                 :             : 
     384                 :             :         /* Save as primary spool */
     385                 :        3711 :         buildstate->spool = btspool;
     386                 :             : 
     387                 :             :         /* Report table scan phase started */
     388                 :        3711 :         pgstat_progress_update_param(PROGRESS_CREATEIDX_SUBPHASE,
     389                 :             :                                                                  PROGRESS_BTREE_PHASE_INDEXBUILD_TABLESCAN);
     390                 :             : 
     391                 :             :         /* Attempt to launch parallel worker scan when required */
     392         [ +  + ]:        3711 :         if (indexInfo->ii_ParallelWorkers > 0)
     393                 :          54 :                 _bt_begin_parallel(buildstate, indexInfo->ii_Concurrent,
     394                 :          27 :                                                    indexInfo->ii_ParallelWorkers);
     395                 :             : 
     396                 :             :         /*
     397                 :             :          * If parallel build requested and at least one worker process was
     398                 :             :          * successfully launched, set up coordination state
     399                 :             :          */
     400         [ +  + ]:        3711 :         if (buildstate->btleader)
     401                 :             :         {
     402                 :          27 :                 coordinate = palloc0_object(SortCoordinateData);
     403                 :          27 :                 coordinate->isWorker = false;
     404                 :          27 :                 coordinate->nParticipants =
     405                 :          27 :                         buildstate->btleader->nparticipanttuplesorts;
     406                 :          27 :                 coordinate->sharedsort = buildstate->btleader->sharedsort;
     407                 :          27 :         }
     408                 :             : 
     409                 :             :         /*
     410                 :             :          * Begin serial/leader tuplesort.
     411                 :             :          *
     412                 :             :          * In cases where parallelism is involved, the leader receives the same
     413                 :             :          * share of maintenance_work_mem as a serial sort (it is generally treated
     414                 :             :          * in the same way as a serial sort once we return).  Parallel worker
     415                 :             :          * Tuplesortstates will have received only a fraction of
     416                 :             :          * maintenance_work_mem, though.
     417                 :             :          *
     418                 :             :          * We rely on the lifetime of the Leader Tuplesortstate almost not
     419                 :             :          * overlapping with any worker Tuplesortstate's lifetime.  There may be
     420                 :             :          * some small overlap, but that's okay because we rely on leader
     421                 :             :          * Tuplesortstate only allocating a small, fixed amount of memory here.
     422                 :             :          * When its tuplesort_performsort() is called (by our caller), and
     423                 :             :          * significant amounts of memory are likely to be used, all workers must
     424                 :             :          * have already freed almost all memory held by their Tuplesortstates
     425                 :             :          * (they are about to go away completely, too).  The overall effect is
     426                 :             :          * that maintenance_work_mem always represents an absolute high watermark
     427                 :             :          * on the amount of memory used by a CREATE INDEX operation, regardless of
     428                 :             :          * the use of parallelism or any other factor.
     429                 :             :          */
     430                 :        3711 :         buildstate->spool->sortstate =
     431                 :        7422 :                 tuplesort_begin_index_btree(heap, index, buildstate->isunique,
     432                 :        3711 :                                                                         buildstate->nulls_not_distinct,
     433                 :        3711 :                                                                         maintenance_work_mem, coordinate,
     434                 :             :                                                                         TUPLESORT_NONE);
     435                 :             : 
     436                 :             :         /*
     437                 :             :          * If building a unique index, put dead tuples in a second spool to keep
     438                 :             :          * them out of the uniqueness check.  We expect that the second spool (for
     439                 :             :          * dead tuples) won't get very full, so we give it only work_mem.
     440                 :             :          */
     441         [ +  + ]:        3711 :         if (indexInfo->ii_Unique)
     442                 :             :         {
     443                 :        2615 :                 BTSpool    *btspool2 = palloc0_object(BTSpool);
     444                 :        2615 :                 SortCoordinate coordinate2 = NULL;
     445                 :             : 
     446                 :             :                 /* Initialize secondary spool */
     447                 :        2615 :                 btspool2->heap = heap;
     448                 :        2615 :                 btspool2->index = index;
     449                 :        2615 :                 btspool2->isunique = false;
     450                 :             :                 /* Save as secondary spool */
     451                 :        2615 :                 buildstate->spool2 = btspool2;
     452                 :             : 
     453         [ +  + ]:        2615 :                 if (buildstate->btleader)
     454                 :             :                 {
     455                 :             :                         /*
     456                 :             :                          * Set up non-private state that is passed to
     457                 :             :                          * tuplesort_begin_index_btree() about the basic high level
     458                 :             :                          * coordination of a parallel sort.
     459                 :             :                          */
     460                 :          13 :                         coordinate2 = palloc0_object(SortCoordinateData);
     461                 :          13 :                         coordinate2->isWorker = false;
     462                 :          13 :                         coordinate2->nParticipants =
     463                 :          13 :                                 buildstate->btleader->nparticipanttuplesorts;
     464                 :          13 :                         coordinate2->sharedsort = buildstate->btleader->sharedsort2;
     465                 :          13 :                 }
     466                 :             : 
     467                 :             :                 /*
     468                 :             :                  * We expect that the second one (for dead tuples) won't get very
     469                 :             :                  * full, so we give it only work_mem
     470                 :             :                  */
     471                 :        2615 :                 buildstate->spool2->sortstate =
     472                 :        5230 :                         tuplesort_begin_index_btree(heap, index, false, false, work_mem,
     473                 :        2615 :                                                                                 coordinate2, TUPLESORT_NONE);
     474                 :        2615 :         }
     475                 :             : 
     476                 :             :         /* Fill spool using either serial or parallel heap scan */
     477         [ +  + ]:        3711 :         if (!buildstate->btleader)
     478                 :        7368 :                 reltuples = table_index_build_scan(heap, index, indexInfo, true, true,
     479                 :        3684 :                                                                                    _bt_build_callback, buildstate,
     480                 :             :                                                                                    NULL);
     481                 :             :         else
     482                 :          54 :                 reltuples = _bt_parallel_heapscan(buildstate,
     483                 :          27 :                                                                                   &indexInfo->ii_BrokenHotChain);
     484                 :             : 
     485                 :             :         /*
     486                 :             :          * Set the progress target for the next phase.  Reset the block number
     487                 :             :          * values set by table_index_build_scan
     488                 :             :          */
     489                 :             :         {
     490                 :        3711 :                 const int       progress_index[] = {
     491                 :             :                         PROGRESS_CREATEIDX_TUPLES_TOTAL,
     492                 :             :                         PROGRESS_SCAN_BLOCKS_TOTAL,
     493                 :             :                         PROGRESS_SCAN_BLOCKS_DONE
     494                 :             :                 };
     495                 :        7422 :                 const int64 progress_vals[] = {
     496                 :        3711 :                         buildstate->indtuples,
     497                 :             :                         0, 0
     498                 :             :                 };
     499                 :             : 
     500                 :        3711 :                 pgstat_progress_update_multi_param(3, progress_index, progress_vals);
     501                 :        3711 :         }
     502                 :             : 
     503                 :             :         /* okay, all heap tuples are spooled */
     504   [ +  +  +  + ]:        3711 :         if (buildstate->spool2 && !buildstate->havedead)
     505                 :             :         {
     506                 :             :                 /* spool2 turns out to be unnecessary */
     507                 :        2613 :                 _bt_spooldestroy(buildstate->spool2);
     508                 :        2613 :                 buildstate->spool2 = NULL;
     509                 :        2613 :         }
     510                 :             : 
     511                 :        7422 :         return reltuples;
     512                 :        3711 : }
     513                 :             : 
     514                 :             : /*
     515                 :             :  * clean up a spool structure and its substructures.
     516                 :             :  */
     517                 :             : static void
     518                 :        6310 : _bt_spooldestroy(BTSpool *btspool)
     519                 :             : {
     520                 :        6310 :         tuplesort_end(btspool->sortstate);
     521                 :        6310 :         pfree(btspool);
     522                 :        6310 : }
     523                 :             : 
     524                 :             : /*
     525                 :             :  * spool an index entry into the sort file.
     526                 :             :  */
     527                 :             : static void
     528                 :     1154983 : _bt_spool(BTSpool *btspool, const ItemPointerData *self, const Datum *values, const bool *isnull)
     529                 :             : {
     530                 :     2309966 :         tuplesort_putindextuplevalues(btspool->sortstate, btspool->index,
     531                 :     1154983 :                                                                   self, values, isnull);
     532                 :     1154983 : }
     533                 :             : 
     534                 :             : /*
     535                 :             :  * given a spool loaded by successive calls to _bt_spool,
     536                 :             :  * create an entire btree.
     537                 :             :  */
     538                 :             : static void
     539                 :        3709 : _bt_leafbuild(BTSpool *btspool, BTSpool *btspool2)
     540                 :             : {
     541                 :        3709 :         BTWriteState wstate;
     542                 :             : 
     543                 :             : #ifdef BTREE_BUILD_STATS
     544                 :             :         if (log_btree_build_stats)
     545                 :             :         {
     546                 :             :                 ShowUsage("BTREE BUILD (Spool) STATISTICS");
     547                 :             :                 ResetUsage();
     548                 :             :         }
     549                 :             : #endif                                                  /* BTREE_BUILD_STATS */
     550                 :             : 
     551                 :             :         /* Execute the sort */
     552                 :        3709 :         pgstat_progress_update_param(PROGRESS_CREATEIDX_SUBPHASE,
     553                 :             :                                                                  PROGRESS_BTREE_PHASE_PERFORMSORT_1);
     554                 :        3709 :         tuplesort_performsort(btspool->sortstate);
     555         [ +  + ]:        3709 :         if (btspool2)
     556                 :             :         {
     557                 :           2 :                 pgstat_progress_update_param(PROGRESS_CREATEIDX_SUBPHASE,
     558                 :             :                                                                          PROGRESS_BTREE_PHASE_PERFORMSORT_2);
     559                 :           2 :                 tuplesort_performsort(btspool2->sortstate);
     560                 :           2 :         }
     561                 :             : 
     562                 :        3709 :         wstate.heap = btspool->heap;
     563                 :        3709 :         wstate.index = btspool->index;
     564                 :        3709 :         wstate.inskey = _bt_mkscankey(wstate.index, NULL);
     565                 :             :         /* _bt_mkscankey() won't set allequalimage without metapage */
     566                 :        3709 :         wstate.inskey->allequalimage = _bt_allequalimage(wstate.index, true);
     567                 :             : 
     568                 :             :         /* reserve the metapage */
     569                 :        3709 :         wstate.btws_pages_alloced = BTREE_METAPAGE + 1;
     570                 :             : 
     571                 :        3709 :         pgstat_progress_update_param(PROGRESS_CREATEIDX_SUBPHASE,
     572                 :             :                                                                  PROGRESS_BTREE_PHASE_LEAF_LOAD);
     573                 :        3709 :         _bt_load(&wstate, btspool, btspool2);
     574                 :        3709 : }
     575                 :             : 
     576                 :             : /*
     577                 :             :  * Per-tuple callback for table_index_build_scan
     578                 :             :  */
     579                 :             : static void
     580                 :     1154983 : _bt_build_callback(Relation index,
     581                 :             :                                    ItemPointer tid,
     582                 :             :                                    Datum *values,
     583                 :             :                                    bool *isnull,
     584                 :             :                                    bool tupleIsAlive,
     585                 :             :                                    void *state)
     586                 :             : {
     587                 :     1154983 :         BTBuildState *buildstate = (BTBuildState *) state;
     588                 :             : 
     589                 :             :         /*
     590                 :             :          * insert the index tuple into the appropriate spool file for subsequent
     591                 :             :          * processing
     592                 :             :          */
     593   [ +  +  +  + ]:     1154983 :         if (tupleIsAlive || buildstate->spool2 == NULL)
     594                 :     1154976 :                 _bt_spool(buildstate->spool, tid, values, isnull);
     595                 :             :         else
     596                 :             :         {
     597                 :             :                 /* dead tuples are put into spool2 */
     598                 :           7 :                 buildstate->havedead = true;
     599                 :           7 :                 _bt_spool(buildstate->spool2, tid, values, isnull);
     600                 :             :         }
     601                 :             : 
     602                 :     1154983 :         buildstate->indtuples += 1;
     603                 :     1154983 : }
     604                 :             : 
     605                 :             : /*
     606                 :             :  * allocate workspace for a new, clean btree page, not linked to any siblings.
     607                 :             :  */
     608                 :             : static BulkWriteBuffer
     609                 :        3981 : _bt_blnewpage(BTWriteState *wstate, uint32 level)
     610                 :             : {
     611                 :        3981 :         BulkWriteBuffer buf;
     612                 :        3981 :         Page            page;
     613                 :        3981 :         BTPageOpaque opaque;
     614                 :             : 
     615                 :        3981 :         buf = smgr_bulk_get_buf(wstate->bulkstate);
     616                 :        3981 :         page = (Page) buf;
     617                 :             : 
     618                 :             :         /* Zero the page and set up standard page header info */
     619                 :        3981 :         _bt_pageinit(page, BLCKSZ);
     620                 :             : 
     621                 :             :         /* Initialize BT opaque state */
     622                 :        3981 :         opaque = BTPageGetOpaque(page);
     623                 :        3981 :         opaque->btpo_prev = opaque->btpo_next = P_NONE;
     624                 :        3981 :         opaque->btpo_level = level;
     625                 :        3981 :         opaque->btpo_flags = (level > 0) ? 0 : BTP_LEAF;
     626                 :        3981 :         opaque->btpo_cycleid = 0;
     627                 :             : 
     628                 :             :         /* Make the P_HIKEY line pointer appear allocated */
     629                 :        3981 :         ((PageHeader) page)->pd_lower += sizeof(ItemIdData);
     630                 :             : 
     631                 :        7962 :         return buf;
     632                 :        3981 : }
     633                 :             : 
     634                 :             : /*
     635                 :             :  * emit a completed btree page, and release the working storage.
     636                 :             :  */
     637                 :             : static void
     638                 :        7676 : _bt_blwritepage(BTWriteState *wstate, BulkWriteBuffer buf, BlockNumber blkno)
     639                 :             : {
     640                 :        7676 :         smgr_bulk_write(wstate->bulkstate, blkno, buf, true);
     641                 :             :         /* smgr_bulk_write took ownership of 'buf' */
     642                 :        7676 : }
     643                 :             : 
     644                 :             : /*
     645                 :             :  * allocate and initialize a new BTPageState.  the returned structure
     646                 :             :  * is suitable for immediate use by _bt_buildadd.
     647                 :             :  */
     648                 :             : static BTPageState *
     649                 :         632 : _bt_pagestate(BTWriteState *wstate, uint32 level)
     650                 :             : {
     651                 :         632 :         BTPageState *state = palloc0_object(BTPageState);
     652                 :             : 
     653                 :             :         /* create initial page for level */
     654                 :         632 :         state->btps_buf = _bt_blnewpage(wstate, level);
     655                 :             : 
     656                 :             :         /* and assign it a page position */
     657                 :         632 :         state->btps_blkno = wstate->btws_pages_alloced++;
     658                 :             : 
     659                 :         632 :         state->btps_lowkey = NULL;
     660                 :             :         /* initialize lastoff so first item goes into P_FIRSTKEY */
     661                 :         632 :         state->btps_lastoff = P_HIKEY;
     662                 :         632 :         state->btps_lastextra = 0;
     663                 :         632 :         state->btps_level = level;
     664                 :             :         /* set "full" threshold based on level.  See notes at head of file. */
     665         [ +  + ]:         632 :         if (level > 0)
     666                 :         122 :                 state->btps_full = (BLCKSZ * (100 - BTREE_NONLEAF_FILLFACTOR) / 100);
     667                 :             :         else
     668   [ +  -  -  + ]:         510 :                 state->btps_full = BTGetTargetPageFreeSpace(wstate->index);
     669                 :             : 
     670                 :             :         /* no parent level, yet */
     671                 :         632 :         state->btps_next = NULL;
     672                 :             : 
     673                 :        1264 :         return state;
     674                 :         632 : }
     675                 :             : 
     676                 :             : /*
     677                 :             :  * Slide the array of ItemIds from the page back one slot (from P_FIRSTKEY to
     678                 :             :  * P_HIKEY, overwriting P_HIKEY).
     679                 :             :  *
     680                 :             :  * _bt_blnewpage() makes the P_HIKEY line pointer appear allocated, but the
     681                 :             :  * rightmost page on its level is not supposed to get a high key.  Now that
     682                 :             :  * it's clear that this page is a rightmost page, remove the unneeded empty
     683                 :             :  * P_HIKEY line pointer space.
     684                 :             :  */
     685                 :             : static void
     686                 :         632 : _bt_slideleft(Page rightmostpage)
     687                 :             : {
     688                 :         632 :         OffsetNumber off;
     689                 :         632 :         OffsetNumber maxoff;
     690                 :         632 :         ItemId          previi;
     691                 :             : 
     692                 :         632 :         maxoff = PageGetMaxOffsetNumber(rightmostpage);
     693         [ +  - ]:         632 :         Assert(maxoff >= P_FIRSTKEY);
     694                 :         632 :         previi = PageGetItemId(rightmostpage, P_HIKEY);
     695         [ +  + ]:       25948 :         for (off = P_FIRSTKEY; off <= maxoff; off = OffsetNumberNext(off))
     696                 :             :         {
     697                 :       25316 :                 ItemId          thisii = PageGetItemId(rightmostpage, off);
     698                 :             : 
     699                 :       25316 :                 *previi = *thisii;
     700                 :       25316 :                 previi = thisii;
     701                 :       25316 :         }
     702                 :         632 :         ((PageHeader) rightmostpage)->pd_lower -= sizeof(ItemIdData);
     703                 :         632 : }
     704                 :             : 
     705                 :             : /*
     706                 :             :  * Add an item to a page being built.
     707                 :             :  *
     708                 :             :  * This is very similar to nbtinsert.c's _bt_pgaddtup(), but this variant
     709                 :             :  * raises an error directly.
     710                 :             :  *
     711                 :             :  * Note that our nbtsort.c caller does not know yet if the page will be
     712                 :             :  * rightmost.  Offset P_FIRSTKEY is always assumed to be the first data key by
     713                 :             :  * caller.  Page that turns out to be the rightmost on its level is fixed by
     714                 :             :  * calling _bt_slideleft().
     715                 :             :  */
     716                 :             : static void
     717                 :      849817 : _bt_sortaddtup(Page page,
     718                 :             :                            Size itemsize,
     719                 :             :                            const IndexTupleData *itup,
     720                 :             :                            OffsetNumber itup_off,
     721                 :             :                            bool newfirstdataitem)
     722                 :             : {
     723                 :      849817 :         IndexTupleData trunctuple;
     724                 :             : 
     725         [ +  + ]:      849817 :         if (newfirstdataitem)
     726                 :             :         {
     727                 :         139 :                 trunctuple = *itup;
     728                 :         139 :                 trunctuple.t_info = sizeof(IndexTupleData);
     729                 :         139 :                 BTreeTupleSetNAtts(&trunctuple, 0, false);
     730                 :         139 :                 itup = &trunctuple;
     731                 :         139 :                 itemsize = sizeof(IndexTupleData);
     732                 :         139 :         }
     733                 :             : 
     734         [ +  - ]:      849817 :         if (PageAddItem(page, itup, itemsize, itup_off, false, false) == InvalidOffsetNumber)
     735   [ #  #  #  # ]:           0 :                 elog(ERROR, "failed to add item to the index page");
     736                 :      849817 : }
     737                 :             : 
     738                 :             : /*----------
     739                 :             :  * Add an item to a disk page from the sort output (or add a posting list
     740                 :             :  * item formed from the sort output).
     741                 :             :  *
     742                 :             :  * We must be careful to observe the page layout conventions of nbtsearch.c:
     743                 :             :  * - rightmost pages start data items at P_HIKEY instead of at P_FIRSTKEY.
     744                 :             :  * - on non-leaf pages, the key portion of the first item need not be
     745                 :             :  *       stored, we should store only the link.
     746                 :             :  *
     747                 :             :  * A leaf page being built looks like:
     748                 :             :  *
     749                 :             :  * +----------------+---------------------------------+
     750                 :             :  * | PageHeaderData | linp0 linp1 linp2 ...           |
     751                 :             :  * +-----------+----+---------------------------------+
     752                 :             :  * | ... linpN |                                                                          |
     753                 :             :  * +-----------+--------------------------------------+
     754                 :             :  * |     ^ last                                                                           |
     755                 :             :  * |                                                                                              |
     756                 :             :  * +-------------+------------------------------------+
     757                 :             :  * |                     | itemN ...                          |
     758                 :             :  * +-------------+------------------+-----------------+
     759                 :             :  * |              ... item3 item2 item1 | "special space" |
     760                 :             :  * +--------------------------------+-----------------+
     761                 :             :  *
     762                 :             :  * Contrast this with the diagram in bufpage.h; note the mismatch
     763                 :             :  * between linps and items.  This is because we reserve linp0 as a
     764                 :             :  * placeholder for the pointer to the "high key" item; when we have
     765                 :             :  * filled up the page, we will set linp0 to point to itemN and clear
     766                 :             :  * linpN.  On the other hand, if we find this is the last (rightmost)
     767                 :             :  * page, we leave the items alone and slide the linp array over.  If
     768                 :             :  * the high key is to be truncated, offset 1 is deleted, and we insert
     769                 :             :  * the truncated high key at offset 1.
     770                 :             :  *
     771                 :             :  * 'last' pointer indicates the last offset added to the page.
     772                 :             :  *
     773                 :             :  * 'truncextra' is the size of the posting list in itup, if any.  This
     774                 :             :  * information is stashed for the next call here, when we may benefit
     775                 :             :  * from considering the impact of truncating away the posting list on
     776                 :             :  * the page before deciding to finish the page off.  Posting lists are
     777                 :             :  * often relatively large, so it is worth going to the trouble of
     778                 :             :  * accounting for the saving from truncating away the posting list of
     779                 :             :  * the tuple that becomes the high key (that may be the only way to
     780                 :             :  * get close to target free space on the page).  Note that this is
     781                 :             :  * only used for the soft fillfactor-wise limit, not the critical hard
     782                 :             :  * limit.
     783                 :             :  *----------
     784                 :             :  */
     785                 :             : static void
     786                 :      846468 : _bt_buildadd(BTWriteState *wstate, BTPageState *state, IndexTuple itup,
     787                 :             :                          Size truncextra)
     788                 :             : {
     789                 :      846468 :         BulkWriteBuffer nbuf;
     790                 :      846468 :         Page            npage;
     791                 :      846468 :         BlockNumber nblkno;
     792                 :      846468 :         OffsetNumber last_off;
     793                 :      846468 :         Size            last_truncextra;
     794                 :      846468 :         Size            pgspc;
     795                 :      846468 :         Size            itupsz;
     796                 :      846468 :         bool            isleaf;
     797                 :             : 
     798                 :             :         /*
     799                 :             :          * This is a handy place to check for cancel interrupts during the btree
     800                 :             :          * load phase of index creation.
     801                 :             :          */
     802         [ +  - ]:      846468 :         CHECK_FOR_INTERRUPTS();
     803                 :             : 
     804                 :      846468 :         nbuf = state->btps_buf;
     805                 :      846468 :         npage = (Page) nbuf;
     806                 :      846468 :         nblkno = state->btps_blkno;
     807                 :      846468 :         last_off = state->btps_lastoff;
     808                 :      846468 :         last_truncextra = state->btps_lastextra;
     809                 :      846468 :         state->btps_lastextra = truncextra;
     810                 :             : 
     811                 :      846468 :         pgspc = PageGetFreeSpace(npage);
     812                 :      846468 :         itupsz = IndexTupleSize(itup);
     813                 :      846468 :         itupsz = MAXALIGN(itupsz);
     814                 :             :         /* Leaf case has slightly different rules due to suffix truncation */
     815                 :      846468 :         isleaf = (state->btps_level == 0);
     816                 :             : 
     817                 :             :         /*
     818                 :             :          * Check whether the new item can fit on a btree page on current level at
     819                 :             :          * all.
     820                 :             :          *
     821                 :             :          * Every newly built index will treat heap TID as part of the keyspace,
     822                 :             :          * which imposes the requirement that new high keys must occasionally have
     823                 :             :          * a heap TID appended within _bt_truncate().  That may leave a new pivot
     824                 :             :          * tuple one or two MAXALIGN() quantums larger than the original
     825                 :             :          * firstright tuple it's derived from.  v4 deals with the problem by
     826                 :             :          * decreasing the limit on the size of tuples inserted on the leaf level
     827                 :             :          * by the same small amount.  Enforce the new v4+ limit on the leaf level,
     828                 :             :          * and the old limit on internal levels, since pivot tuples may need to
     829                 :             :          * make use of the reserved space.  This should never fail on internal
     830                 :             :          * pages.
     831                 :             :          */
     832         [ +  + ]:      846468 :         if (unlikely(itupsz > BTMaxItemSize))
     833                 :          88 :                 _bt_check_third_page(wstate->index, wstate->heap, isleaf, npage,
     834                 :          44 :                                                          itup);
     835                 :             : 
     836                 :             :         /*
     837                 :             :          * Check to see if current page will fit new item, with space left over to
     838                 :             :          * append a heap TID during suffix truncation when page is a leaf page.
     839                 :             :          *
     840                 :             :          * It is guaranteed that we can fit at least 2 non-pivot tuples plus a
     841                 :             :          * high key with heap TID when finishing off a leaf page, since we rely on
     842                 :             :          * _bt_check_third_page() rejecting oversized non-pivot tuples.  On
     843                 :             :          * internal pages we can always fit 3 pivot tuples with larger internal
     844                 :             :          * page tuple limit (includes page high key).
     845                 :             :          *
     846                 :             :          * Most of the time, a page is only "full" in the sense that the soft
     847                 :             :          * fillfactor-wise limit has been exceeded.  However, we must always leave
     848                 :             :          * at least two items plus a high key on each page before starting a new
     849                 :             :          * page.  Disregard fillfactor and insert on "full" current page if we
     850                 :             :          * don't have the minimum number of items yet.  (Note that we deliberately
     851                 :             :          * assume that suffix truncation neither enlarges nor shrinks new high key
     852                 :             :          * when applying soft limit, except when last tuple has a posting list.)
     853                 :             :          */
     854   [ +  +  +  - ]:      846468 :         Assert(last_truncextra == 0 || isleaf);
     855   [ +  +  +  - ]:      849544 :         if (pgspc < itupsz + (isleaf ? MAXALIGN(sizeof(ItemPointerData)) : 0) ||
     856         [ +  + ]:      846195 :                 (pgspc + last_truncextra < state->btps_full && last_off > P_FIRSTKEY))
     857                 :             :         {
     858                 :             :                 /*
     859                 :             :                  * Finish off the page and write it out.
     860                 :             :                  */
     861                 :        3349 :                 BulkWriteBuffer obuf = nbuf;
     862                 :        3349 :                 Page            opage = npage;
     863                 :        3349 :                 BlockNumber oblkno = nblkno;
     864                 :        3349 :                 ItemId          ii;
     865                 :        3349 :                 ItemId          hii;
     866                 :        3349 :                 IndexTuple      oitup;
     867                 :             : 
     868                 :             :                 /* Create new page of same level */
     869                 :        3349 :                 nbuf = _bt_blnewpage(wstate, state->btps_level);
     870                 :        3349 :                 npage = (Page) nbuf;
     871                 :             : 
     872                 :             :                 /* and assign it a page position */
     873                 :        3349 :                 nblkno = wstate->btws_pages_alloced++;
     874                 :             : 
     875                 :             :                 /*
     876                 :             :                  * We copy the last item on the page into the new page, and then
     877                 :             :                  * rearrange the old page so that the 'last item' becomes its high key
     878                 :             :                  * rather than a true data item.  There had better be at least two
     879                 :             :                  * items on the page already, else the page would be empty of useful
     880                 :             :                  * data.
     881                 :             :                  */
     882         [ +  - ]:        3349 :                 Assert(last_off > P_FIRSTKEY);
     883                 :        3349 :                 ii = PageGetItemId(opage, last_off);
     884                 :        3349 :                 oitup = (IndexTuple) PageGetItem(opage, ii);
     885                 :        6698 :                 _bt_sortaddtup(npage, ItemIdGetLength(ii), oitup, P_FIRSTKEY,
     886                 :        3349 :                                            !isleaf);
     887                 :             : 
     888                 :             :                 /*
     889                 :             :                  * Move 'last' into the high key position on opage.  _bt_blnewpage()
     890                 :             :                  * allocated empty space for a line pointer when opage was first
     891                 :             :                  * created, so this is a matter of rearranging already-allocated space
     892                 :             :                  * on page, and initializing high key line pointer. (Actually, leaf
     893                 :             :                  * pages must also swap oitup with a truncated version of oitup, which
     894                 :             :                  * is sometimes larger than oitup, though never by more than the space
     895                 :             :                  * needed to append a heap TID.)
     896                 :             :                  */
     897                 :        3349 :                 hii = PageGetItemId(opage, P_HIKEY);
     898                 :        3349 :                 *hii = *ii;
     899                 :        3349 :                 ItemIdSetUnused(ii);    /* redundant */
     900                 :        3349 :                 ((PageHeader) opage)->pd_lower -= sizeof(ItemIdData);
     901                 :             : 
     902         [ +  + ]:        3349 :                 if (isleaf)
     903                 :             :                 {
     904                 :        3332 :                         IndexTuple      lastleft;
     905                 :        3332 :                         IndexTuple      truncated;
     906                 :             : 
     907                 :             :                         /*
     908                 :             :                          * Truncate away any unneeded attributes from high key on leaf
     909                 :             :                          * level.  This is only done at the leaf level because downlinks
     910                 :             :                          * in internal pages are either negative infinity items, or get
     911                 :             :                          * their contents from copying from one level down.  See also:
     912                 :             :                          * _bt_split().
     913                 :             :                          *
     914                 :             :                          * We don't try to bias our choice of split point to make it more
     915                 :             :                          * likely that _bt_truncate() can truncate away more attributes,
     916                 :             :                          * whereas the split point used within _bt_split() is chosen much
     917                 :             :                          * more delicately.  Even still, the lastleft and firstright
     918                 :             :                          * tuples passed to _bt_truncate() here are at least not fully
     919                 :             :                          * equal to each other when deduplication is used, unless there is
     920                 :             :                          * a large group of duplicates (also, unique index builds usually
     921                 :             :                          * have few or no spool2 duplicates).  When the split point is
     922                 :             :                          * between two unequal tuples, _bt_truncate() will avoid including
     923                 :             :                          * a heap TID in the new high key, which is the most important
     924                 :             :                          * benefit of suffix truncation.
     925                 :             :                          *
     926                 :             :                          * Overwrite the old item with new truncated high key directly.
     927                 :             :                          * oitup is already located at the physical beginning of tuple
     928                 :             :                          * space, so this should directly reuse the existing tuple space.
     929                 :             :                          */
     930                 :        3332 :                         ii = PageGetItemId(opage, OffsetNumberPrev(last_off));
     931                 :        3332 :                         lastleft = (IndexTuple) PageGetItem(opage, ii);
     932                 :             : 
     933         [ +  - ]:        3332 :                         Assert(IndexTupleSize(oitup) > last_truncextra);
     934                 :        6664 :                         truncated = _bt_truncate(wstate->index, lastleft, oitup,
     935                 :        3332 :                                                                          wstate->inskey);
     936         [ +  - ]:        3332 :                         if (!PageIndexTupleOverwrite(opage, P_HIKEY, truncated, IndexTupleSize(truncated)))
     937   [ #  #  #  # ]:           0 :                                 elog(ERROR, "failed to add high key to the index page");
     938                 :        3332 :                         pfree(truncated);
     939                 :             : 
     940                 :             :                         /* oitup should continue to point to the page's high key */
     941                 :        3332 :                         hii = PageGetItemId(opage, P_HIKEY);
     942                 :        3332 :                         oitup = (IndexTuple) PageGetItem(opage, hii);
     943                 :        3332 :                 }
     944                 :             : 
     945                 :             :                 /*
     946                 :             :                  * Link the old page into its parent, using its low key.  If we don't
     947                 :             :                  * have a parent, we have to create one; this adds a new btree level.
     948                 :             :                  */
     949         [ +  + ]:        3349 :                 if (state->btps_next == NULL)
     950                 :         122 :                         state->btps_next = _bt_pagestate(wstate, state->btps_level + 1);
     951                 :             : 
     952   [ +  -  -  +  :        3349 :                 Assert((BTreeTupleGetNAtts(state->btps_lowkey, wstate->index) <=
             +  -  #  # ]
     953                 :             :                                 IndexRelationGetNumberOfKeyAttributes(wstate->index) &&
     954                 :             :                                 BTreeTupleGetNAtts(state->btps_lowkey, wstate->index) > 0) ||
     955                 :             :                            P_LEFTMOST(BTPageGetOpaque(opage)));
     956   [ +  -  +  +  :        3349 :                 Assert(BTreeTupleGetNAtts(state->btps_lowkey, wstate->index) == 0 ||
                   +  - ]
     957                 :             :                            !P_LEFTMOST(BTPageGetOpaque(opage)));
     958                 :        3349 :                 BTreeTupleSetDownLink(state->btps_lowkey, oblkno);
     959                 :        3349 :                 _bt_buildadd(wstate, state->btps_next, state->btps_lowkey, 0);
     960                 :        3349 :                 pfree(state->btps_lowkey);
     961                 :             : 
     962                 :             :                 /*
     963                 :             :                  * Save a copy of the high key from the old page.  It is also the low
     964                 :             :                  * key for the new page.
     965                 :             :                  */
     966                 :        3349 :                 state->btps_lowkey = CopyIndexTuple(oitup);
     967                 :             : 
     968                 :             :                 /*
     969                 :             :                  * Set the sibling links for both pages.
     970                 :             :                  */
     971                 :             :                 {
     972                 :        3349 :                         BTPageOpaque oopaque = BTPageGetOpaque(opage);
     973                 :        3349 :                         BTPageOpaque nopaque = BTPageGetOpaque(npage);
     974                 :             : 
     975                 :        3349 :                         oopaque->btpo_next = nblkno;
     976                 :        3349 :                         nopaque->btpo_prev = oblkno;
     977                 :        3349 :                         nopaque->btpo_next = P_NONE; /* redundant */
     978                 :        3349 :                 }
     979                 :             : 
     980                 :             :                 /*
     981                 :             :                  * Write out the old page. _bt_blwritepage takes ownership of the
     982                 :             :                  * 'opage' buffer.
     983                 :             :                  */
     984                 :        3349 :                 _bt_blwritepage(wstate, obuf, oblkno);
     985                 :             : 
     986                 :             :                 /*
     987                 :             :                  * Reset last_off to point to new page
     988                 :             :                  */
     989                 :        3349 :                 last_off = P_FIRSTKEY;
     990                 :        3349 :         }
     991                 :             : 
     992                 :             :         /*
     993                 :             :          * By here, either original page is still the current page, or a new page
     994                 :             :          * was created that became the current page.  Either way, the current page
     995                 :             :          * definitely has space for new item.
     996                 :             :          *
     997                 :             :          * If the new item is the first for its page, it must also be the first
     998                 :             :          * item on its entire level.  On later same-level pages, a low key for a
     999                 :             :          * page will be copied from the prior page in the code above.  Generate a
    1000                 :             :          * minus infinity low key here instead.
    1001                 :             :          */
    1002         [ +  + ]:      846468 :         if (last_off == P_HIKEY)
    1003                 :             :         {
    1004         [ +  - ]:         632 :                 Assert(state->btps_lowkey == NULL);
    1005                 :         632 :                 state->btps_lowkey = palloc0_object(IndexTupleData);
    1006                 :         632 :                 state->btps_lowkey->t_info = sizeof(IndexTupleData);
    1007                 :         632 :                 BTreeTupleSetNAtts(state->btps_lowkey, 0, false);
    1008                 :         632 :         }
    1009                 :             : 
    1010                 :             :         /*
    1011                 :             :          * Add the new item into the current page.
    1012                 :             :          */
    1013                 :      846468 :         last_off = OffsetNumberNext(last_off);
    1014                 :      849939 :         _bt_sortaddtup(npage, itupsz, itup, last_off,
    1015         [ +  + ]:      846468 :                                    !isleaf && last_off == P_FIRSTKEY);
    1016                 :             : 
    1017                 :      846468 :         state->btps_buf = nbuf;
    1018                 :      846468 :         state->btps_blkno = nblkno;
    1019                 :      846468 :         state->btps_lastoff = last_off;
    1020                 :      846468 : }
    1021                 :             : 
    1022                 :             : /*
    1023                 :             :  * Finalize pending posting list tuple, and add it to the index.  Final tuple
    1024                 :             :  * is based on saved base tuple, and saved list of heap TIDs.
    1025                 :             :  *
    1026                 :             :  * This is almost like _bt_dedup_finish_pending(), but it adds a new tuple
    1027                 :             :  * using _bt_buildadd().
    1028                 :             :  */
    1029                 :             : static void
    1030                 :      678923 : _bt_sort_dedup_finish_pending(BTWriteState *wstate, BTPageState *state,
    1031                 :             :                                                           BTDedupState dstate)
    1032                 :             : {
    1033         [ +  - ]:      678923 :         Assert(dstate->nitems > 0);
    1034                 :             : 
    1035         [ +  + ]:      678923 :         if (dstate->nitems == 1)
    1036                 :      673744 :                 _bt_buildadd(wstate, state, dstate->base, 0);
    1037                 :             :         else
    1038                 :             :         {
    1039                 :        5179 :                 IndexTuple      postingtuple;
    1040                 :        5179 :                 Size            truncextra;
    1041                 :             : 
    1042                 :             :                 /* form a tuple with a posting list */
    1043                 :       10358 :                 postingtuple = _bt_form_posting(dstate->base,
    1044                 :        5179 :                                                                                 dstate->htids,
    1045                 :        5179 :                                                                                 dstate->nhtids);
    1046                 :             :                 /* Calculate posting list overhead */
    1047                 :       10358 :                 truncextra = IndexTupleSize(postingtuple) -
    1048                 :        5179 :                         BTreeTupleGetPostingOffset(postingtuple);
    1049                 :             : 
    1050                 :        5179 :                 _bt_buildadd(wstate, state, postingtuple, truncextra);
    1051                 :        5179 :                 pfree(postingtuple);
    1052                 :        5179 :         }
    1053                 :             : 
    1054                 :      678923 :         dstate->nmaxitems = 0;
    1055                 :      678923 :         dstate->nhtids = 0;
    1056                 :      678923 :         dstate->nitems = 0;
    1057                 :      678923 :         dstate->phystupsize = 0;
    1058                 :      678923 : }
    1059                 :             : 
    1060                 :             : /*
    1061                 :             :  * Finish writing out the completed btree.
    1062                 :             :  */
    1063                 :             : static void
    1064                 :        3695 : _bt_uppershutdown(BTWriteState *wstate, BTPageState *state)
    1065                 :             : {
    1066                 :        3695 :         BTPageState *s;
    1067                 :        3695 :         BlockNumber rootblkno = P_NONE;
    1068                 :        3695 :         uint32          rootlevel = 0;
    1069                 :        3695 :         BulkWriteBuffer metabuf;
    1070                 :             : 
    1071                 :             :         /*
    1072                 :             :          * Each iteration of this loop completes one more level of the tree.
    1073                 :             :          */
    1074         [ +  + ]:        4327 :         for (s = state; s != NULL; s = s->btps_next)
    1075                 :             :         {
    1076                 :         632 :                 BlockNumber blkno;
    1077                 :         632 :                 BTPageOpaque opaque;
    1078                 :             : 
    1079                 :         632 :                 blkno = s->btps_blkno;
    1080                 :         632 :                 opaque = BTPageGetOpaque((Page) s->btps_buf);
    1081                 :             : 
    1082                 :             :                 /*
    1083                 :             :                  * We have to link the last page on this level to somewhere.
    1084                 :             :                  *
    1085                 :             :                  * If we're at the top, it's the root, so attach it to the metapage.
    1086                 :             :                  * Otherwise, add an entry for it to its parent using its low key.
    1087                 :             :                  * This may cause the last page of the parent level to split, but
    1088                 :             :                  * that's not a problem -- we haven't gotten to it yet.
    1089                 :             :                  */
    1090         [ +  + ]:         632 :                 if (s->btps_next == NULL)
    1091                 :             :                 {
    1092                 :         510 :                         opaque->btpo_flags |= BTP_ROOT;
    1093                 :         510 :                         rootblkno = blkno;
    1094                 :         510 :                         rootlevel = s->btps_level;
    1095                 :         510 :                 }
    1096                 :             :                 else
    1097                 :             :                 {
    1098   [ +  -  -  +  :         122 :                         Assert((BTreeTupleGetNAtts(s->btps_lowkey, wstate->index) <=
             +  -  #  # ]
    1099                 :             :                                         IndexRelationGetNumberOfKeyAttributes(wstate->index) &&
    1100                 :             :                                         BTreeTupleGetNAtts(s->btps_lowkey, wstate->index) > 0) ||
    1101                 :             :                                    P_LEFTMOST(opaque));
    1102   [ +  -  +  -  :         122 :                         Assert(BTreeTupleGetNAtts(s->btps_lowkey, wstate->index) == 0 ||
                   -  + ]
    1103                 :             :                                    !P_LEFTMOST(opaque));
    1104                 :         122 :                         BTreeTupleSetDownLink(s->btps_lowkey, blkno);
    1105                 :         122 :                         _bt_buildadd(wstate, s->btps_next, s->btps_lowkey, 0);
    1106                 :         122 :                         pfree(s->btps_lowkey);
    1107                 :         122 :                         s->btps_lowkey = NULL;
    1108                 :             :                 }
    1109                 :             : 
    1110                 :             :                 /*
    1111                 :             :                  * This is the rightmost page, so the ItemId array needs to be slid
    1112                 :             :                  * back one slot.  Then we can dump out the page.
    1113                 :             :                  */
    1114                 :         632 :                 _bt_slideleft((Page) s->btps_buf);
    1115                 :         632 :                 _bt_blwritepage(wstate, s->btps_buf, s->btps_blkno);
    1116                 :         632 :                 s->btps_buf = NULL;          /* writepage took ownership of the buffer */
    1117                 :         632 :         }
    1118                 :             : 
    1119                 :             :         /*
    1120                 :             :          * As the last step in the process, construct the metapage and make it
    1121                 :             :          * point to the new root (unless we had no data at all, in which case it's
    1122                 :             :          * set to point to "P_NONE").  This changes the index to the "valid" state
    1123                 :             :          * by filling in a valid magic number in the metapage.
    1124                 :             :          */
    1125                 :        3695 :         metabuf = smgr_bulk_get_buf(wstate->bulkstate);
    1126                 :        7390 :         _bt_initmetapage((Page) metabuf, rootblkno, rootlevel,
    1127                 :        3695 :                                          wstate->inskey->allequalimage);
    1128                 :        3695 :         _bt_blwritepage(wstate, metabuf, BTREE_METAPAGE);
    1129                 :        3695 : }
    1130                 :             : 
    1131                 :             : /*
    1132                 :             :  * Read tuples in correct sort order from tuplesort, and load them into
    1133                 :             :  * btree leaves.
    1134                 :             :  */
    1135                 :             : static void
    1136                 :        3695 : _bt_load(BTWriteState *wstate, BTSpool *btspool, BTSpool *btspool2)
    1137                 :             : {
    1138                 :        3695 :         BTPageState *state = NULL;
    1139                 :        3695 :         bool            merge = (btspool2 != NULL);
    1140                 :        3695 :         IndexTuple      itup,
    1141                 :        3695 :                                 itup2 = NULL;
    1142                 :        3695 :         bool            load1;
    1143                 :        3695 :         TupleDesc       tupdes = RelationGetDescr(wstate->index);
    1144                 :        3695 :         int                     i,
    1145                 :        3695 :                                 keysz = IndexRelationGetNumberOfKeyAttributes(wstate->index);
    1146                 :        3695 :         SortSupport sortKeys;
    1147                 :        3695 :         int64           tuples_done = 0;
    1148                 :        3695 :         bool            deduplicate;
    1149                 :             : 
    1150                 :        3695 :         wstate->bulkstate = smgr_bulk_start_rel(wstate->index, MAIN_FORKNUM);
    1151                 :             : 
    1152   [ +  +  +  + ]:        4752 :         deduplicate = wstate->inskey->allequalimage && !btspool->isunique &&
    1153   [ +  -  +  + ]:        1057 :                 BTGetDeduplicateItems(wstate->index);
    1154                 :             : 
    1155         [ +  + ]:        3695 :         if (merge)
    1156                 :             :         {
    1157                 :             :                 /*
    1158                 :             :                  * Another BTSpool for dead tuples exists. Now we have to merge
    1159                 :             :                  * btspool and btspool2.
    1160                 :             :                  */
    1161                 :             : 
    1162                 :             :                 /* the preparation of merge */
    1163                 :           2 :                 itup = tuplesort_getindextuple(btspool->sortstate, true);
    1164                 :           2 :                 itup2 = tuplesort_getindextuple(btspool2->sortstate, true);
    1165                 :             : 
    1166                 :             :                 /* Prepare SortSupport data for each column */
    1167                 :           2 :                 sortKeys = palloc0_array(SortSupportData, keysz);
    1168                 :             : 
    1169         [ +  + ]:           4 :                 for (i = 0; i < keysz; i++)
    1170                 :             :                 {
    1171                 :           2 :                         SortSupport sortKey = sortKeys + i;
    1172                 :           2 :                         ScanKey         scanKey = wstate->inskey->scankeys + i;
    1173                 :           2 :                         bool            reverse;
    1174                 :             : 
    1175                 :           2 :                         sortKey->ssup_cxt = CurrentMemoryContext;
    1176                 :           2 :                         sortKey->ssup_collation = scanKey->sk_collation;
    1177                 :           2 :                         sortKey->ssup_nulls_first =
    1178                 :           2 :                                 (scanKey->sk_flags & SK_BT_NULLS_FIRST) != 0;
    1179                 :           2 :                         sortKey->ssup_attno = scanKey->sk_attno;
    1180                 :             :                         /* Abbreviation is not supported here */
    1181                 :           2 :                         sortKey->abbreviate = false;
    1182                 :             : 
    1183         [ +  - ]:           2 :                         Assert(sortKey->ssup_attno != 0);
    1184                 :             : 
    1185                 :           2 :                         reverse = (scanKey->sk_flags & SK_BT_DESC) != 0;
    1186                 :             : 
    1187                 :           2 :                         PrepareSortSupportFromIndexRel(wstate->index, reverse, sortKey);
    1188                 :           2 :                 }
    1189                 :             : 
    1190                 :          20 :                 for (;;)
    1191                 :             :                 {
    1192                 :          20 :                         load1 = true;           /* load BTSpool next ? */
    1193         [ +  + ]:          20 :                         if (itup2 == NULL)
    1194                 :             :                         {
    1195         [ +  + ]:           4 :                                 if (itup == NULL)
    1196                 :           2 :                                         break;
    1197                 :           2 :                         }
    1198         [ +  + ]:          16 :                         else if (itup != NULL)
    1199                 :             :                         {
    1200                 :          15 :                                 int32           compare = 0;
    1201                 :             : 
    1202         [ +  + ]:          21 :                                 for (i = 1; i <= keysz; i++)
    1203                 :             :                                 {
    1204                 :          15 :                                         SortSupport entry;
    1205                 :          15 :                                         Datum           attrDatum1,
    1206                 :             :                                                                 attrDatum2;
    1207                 :          15 :                                         bool            isNull1,
    1208                 :             :                                                                 isNull2;
    1209                 :             : 
    1210                 :          15 :                                         entry = sortKeys + i - 1;
    1211                 :          15 :                                         attrDatum1 = index_getattr(itup, i, tupdes, &isNull1);
    1212                 :          15 :                                         attrDatum2 = index_getattr(itup2, i, tupdes, &isNull2);
    1213                 :             : 
    1214                 :          30 :                                         compare = ApplySortComparator(attrDatum1, isNull1,
    1215                 :          15 :                                                                                                   attrDatum2, isNull2,
    1216                 :          15 :                                                                                                   entry);
    1217         [ +  + ]:          15 :                                         if (compare > 0)
    1218                 :             :                                         {
    1219                 :           1 :                                                 load1 = false;
    1220                 :           1 :                                                 break;
    1221                 :             :                                         }
    1222         [ +  + ]:          14 :                                         else if (compare < 0)
    1223                 :           8 :                                                 break;
    1224      [ -  +  + ]:          15 :                                 }
    1225                 :             : 
    1226                 :             :                                 /*
    1227                 :             :                                  * If key values are equal, we sort on ItemPointer.  This is
    1228                 :             :                                  * required for btree indexes, since heap TID is treated as an
    1229                 :             :                                  * implicit last key attribute in order to ensure that all
    1230                 :             :                                  * keys in the index are physically unique.
    1231                 :             :                                  */
    1232         [ +  + ]:          15 :                                 if (compare == 0)
    1233                 :             :                                 {
    1234                 :           6 :                                         compare = ItemPointerCompare(&itup->t_tid, &itup2->t_tid);
    1235         [ -  + ]:           6 :                                         Assert(compare != 0);
    1236         [ +  + ]:           6 :                                         if (compare > 0)
    1237                 :           5 :                                                 load1 = false;
    1238                 :           6 :                                 }
    1239                 :          15 :                         }
    1240                 :             :                         else
    1241                 :           1 :                                 load1 = false;
    1242                 :             : 
    1243                 :             :                         /* When we see first tuple, create first index page */
    1244         [ +  + ]:          18 :                         if (state == NULL)
    1245                 :           2 :                                 state = _bt_pagestate(wstate, 0);
    1246                 :             : 
    1247         [ +  + ]:          18 :                         if (load1)
    1248                 :             :                         {
    1249                 :          11 :                                 _bt_buildadd(wstate, state, itup, 0);
    1250                 :          11 :                                 itup = tuplesort_getindextuple(btspool->sortstate, true);
    1251                 :          11 :                         }
    1252                 :             :                         else
    1253                 :             :                         {
    1254                 :           7 :                                 _bt_buildadd(wstate, state, itup2, 0);
    1255                 :           7 :                                 itup2 = tuplesort_getindextuple(btspool2->sortstate, true);
    1256                 :             :                         }
    1257                 :             : 
    1258                 :             :                         /* Report progress */
    1259                 :          18 :                         pgstat_progress_update_param(PROGRESS_CREATEIDX_TUPLES_DONE,
    1260                 :          18 :                                                                                  ++tuples_done);
    1261                 :             :                 }
    1262                 :           2 :                 pfree(sortKeys);
    1263                 :           2 :         }
    1264         [ +  + ]:        3693 :         else if (deduplicate)
    1265                 :             :         {
    1266                 :             :                 /* merge is unnecessary, deduplicate into posting lists */
    1267                 :        1057 :                 BTDedupState dstate;
    1268                 :             : 
    1269                 :        1057 :                 dstate = palloc_object(BTDedupStateData);
    1270                 :        1057 :                 dstate->deduplicate = true; /* unused */
    1271                 :        1057 :                 dstate->nmaxitems = 0;       /* unused */
    1272                 :        1057 :                 dstate->maxpostingsize = 0; /* set later */
    1273                 :             :                 /* Metadata about base tuple of current pending posting list */
    1274                 :        1057 :                 dstate->base = NULL;
    1275                 :        1057 :                 dstate->baseoff = InvalidOffsetNumber;       /* unused */
    1276                 :        1057 :                 dstate->basetupsize = 0;
    1277                 :             :                 /* Metadata about current pending posting list TIDs */
    1278                 :        1057 :                 dstate->htids = NULL;
    1279                 :        1057 :                 dstate->nhtids = 0;
    1280                 :        1057 :                 dstate->nitems = 0;
    1281                 :        1057 :                 dstate->phystupsize = 0;     /* unused */
    1282                 :        1057 :                 dstate->nintervals = 0; /* unused */
    1283                 :             : 
    1284   [ +  +  +  + ]:      991903 :                 while ((itup = tuplesort_getindextuple(btspool->sortstate,
    1285                 :      991903 :                                                                                            true)) != NULL)
    1286                 :             :                 {
    1287                 :             :                         /* When we see first tuple, create first index page */
    1288         [ +  + ]:      990846 :                         if (state == NULL)
    1289                 :             :                         {
    1290                 :         280 :                                 state = _bt_pagestate(wstate, 0);
    1291                 :             : 
    1292                 :             :                                 /*
    1293                 :             :                                  * Limit size of posting list tuples to 1/10 space we want to
    1294                 :             :                                  * leave behind on the page, plus space for final item's line
    1295                 :             :                                  * pointer.  This is equal to the space that we'd like to
    1296                 :             :                                  * leave behind on each leaf page when fillfactor is 90,
    1297                 :             :                                  * allowing us to get close to fillfactor% space utilization
    1298                 :             :                                  * when there happen to be a great many duplicates.  (This
    1299                 :             :                                  * makes higher leaf fillfactor settings ineffective when
    1300                 :             :                                  * building indexes that have many duplicates, but packing
    1301                 :             :                                  * leaf pages full with few very large tuples doesn't seem
    1302                 :             :                                  * like a useful goal.)
    1303                 :             :                                  */
    1304                 :         280 :                                 dstate->maxpostingsize = MAXALIGN_DOWN((BLCKSZ * 10 / 100)) -
    1305                 :             :                                         sizeof(ItemIdData);
    1306         [ +  - ]:         280 :                                 Assert(dstate->maxpostingsize <= BTMaxItemSize &&
    1307                 :             :                                            dstate->maxpostingsize <= INDEX_SIZE_MASK);
    1308                 :         280 :                                 dstate->htids = palloc(dstate->maxpostingsize);
    1309                 :             : 
    1310                 :             :                                 /* start new pending posting list with itup copy */
    1311                 :         280 :                                 _bt_dedup_start_pending(dstate, CopyIndexTuple(itup),
    1312                 :             :                                                                                 InvalidOffsetNumber);
    1313                 :         280 :                         }
    1314                 :     1981132 :                         else if (_bt_keep_natts_fast(wstate->index, dstate->base,
    1315   [ +  +  +  +  :     1981132 :                                                                                  itup) > keysz &&
                   +  + ]
    1316                 :      313930 :                                          _bt_dedup_save_htid(dstate, itup))
    1317                 :             :                         {
    1318                 :             :                                 /*
    1319                 :             :                                  * Tuple is equal to base tuple of pending posting list.  Heap
    1320                 :             :                                  * TID from itup has been saved in state.
    1321                 :             :                                  */
    1322                 :      311923 :                         }
    1323                 :             :                         else
    1324                 :             :                         {
    1325                 :             :                                 /*
    1326                 :             :                                  * Tuple is not equal to pending posting list tuple, or
    1327                 :             :                                  * _bt_dedup_save_htid() opted to not merge current item into
    1328                 :             :                                  * pending posting list.
    1329                 :             :                                  */
    1330                 :      678643 :                                 _bt_sort_dedup_finish_pending(wstate, state, dstate);
    1331                 :      678643 :                                 pfree(dstate->base);
    1332                 :             : 
    1333                 :             :                                 /* start new pending posting list with itup copy */
    1334                 :      678643 :                                 _bt_dedup_start_pending(dstate, CopyIndexTuple(itup),
    1335                 :             :                                                                                 InvalidOffsetNumber);
    1336                 :             :                         }
    1337                 :             : 
    1338                 :             :                         /* Report progress */
    1339                 :      990846 :                         pgstat_progress_update_param(PROGRESS_CREATEIDX_TUPLES_DONE,
    1340                 :      990846 :                                                                                  ++tuples_done);
    1341                 :             :                 }
    1342                 :             : 
    1343         [ +  + ]:        1057 :                 if (state)
    1344                 :             :                 {
    1345                 :             :                         /*
    1346                 :             :                          * Handle the last item (there must be a last item when the
    1347                 :             :                          * tuplesort returned one or more tuples)
    1348                 :             :                          */
    1349                 :         280 :                         _bt_sort_dedup_finish_pending(wstate, state, dstate);
    1350                 :         280 :                         pfree(dstate->base);
    1351                 :         280 :                         pfree(dstate->htids);
    1352                 :         280 :                 }
    1353                 :             : 
    1354                 :        1057 :                 pfree(dstate);
    1355                 :        1057 :         }
    1356                 :             :         else
    1357                 :             :         {
    1358                 :             :                 /* merging and deduplication are both unnecessary */
    1359   [ +  +  +  + ]:      166692 :                 while ((itup = tuplesort_getindextuple(btspool->sortstate,
    1360                 :      166692 :                                                                                            true)) != NULL)
    1361                 :             :                 {
    1362                 :             :                         /* When we see first tuple, create first index page */
    1363         [ +  + ]:      164056 :                         if (state == NULL)
    1364                 :         228 :                                 state = _bt_pagestate(wstate, 0);
    1365                 :             : 
    1366                 :      164056 :                         _bt_buildadd(wstate, state, itup, 0);
    1367                 :             : 
    1368                 :             :                         /* Report progress */
    1369                 :      164056 :                         pgstat_progress_update_param(PROGRESS_CREATEIDX_TUPLES_DONE,
    1370                 :      164056 :                                                                                  ++tuples_done);
    1371                 :             :                 }
    1372                 :             :         }
    1373                 :             : 
    1374                 :             :         /* Close down final pages and write the metapage */
    1375                 :        3695 :         _bt_uppershutdown(wstate, state);
    1376                 :        3695 :         smgr_bulk_finish(wstate->bulkstate);
    1377                 :        3695 : }
    1378                 :             : 
    1379                 :             : /*
    1380                 :             :  * Create parallel context, and launch workers for leader.
    1381                 :             :  *
    1382                 :             :  * buildstate argument should be initialized (with the exception of the
    1383                 :             :  * tuplesort state in spools, which may later be created based on shared
    1384                 :             :  * state initially set up here).
    1385                 :             :  *
    1386                 :             :  * isconcurrent indicates if operation is CREATE INDEX CONCURRENTLY.
    1387                 :             :  *
    1388                 :             :  * request is the target number of parallel worker processes to launch.
    1389                 :             :  *
    1390                 :             :  * Sets buildstate's BTLeader, which caller must use to shut down parallel
    1391                 :             :  * mode by passing it to _bt_end_parallel() at the very end of its index
    1392                 :             :  * build.  If not even a single worker process can be launched, this is
    1393                 :             :  * never set, and caller should proceed with a serial index build.
    1394                 :             :  */
    1395                 :             : static void
    1396                 :          27 : _bt_begin_parallel(BTBuildState *buildstate, bool isconcurrent, int request)
    1397                 :             : {
    1398                 :          27 :         ParallelContext *pcxt;
    1399                 :          27 :         int                     scantuplesortstates;
    1400                 :          27 :         Snapshot        snapshot;
    1401                 :          27 :         Size            estbtshared;
    1402                 :          27 :         Size            estsort;
    1403                 :          27 :         BTShared   *btshared;
    1404                 :          27 :         Sharedsort *sharedsort;
    1405                 :          27 :         Sharedsort *sharedsort2;
    1406                 :          27 :         BTSpool    *btspool = buildstate->spool;
    1407                 :          27 :         BTLeader   *btleader = palloc0_object(BTLeader);
    1408                 :          27 :         WalUsage   *walusage;
    1409                 :          27 :         BufferUsage *bufferusage;
    1410                 :          27 :         bool            leaderparticipates = true;
    1411                 :          27 :         int                     querylen;
    1412                 :             : 
    1413                 :             : #ifdef DISABLE_LEADER_PARTICIPATION
    1414                 :             :         leaderparticipates = false;
    1415                 :             : #endif
    1416                 :             : 
    1417                 :             :         /*
    1418                 :             :          * Enter parallel mode, and create context for parallel build of btree
    1419                 :             :          * index
    1420                 :             :          */
    1421                 :          27 :         EnterParallelMode();
    1422         [ +  - ]:          27 :         Assert(request > 0);
    1423                 :          27 :         pcxt = CreateParallelContext("postgres", "_bt_parallel_build_main",
    1424                 :          27 :                                                                  request);
    1425                 :             : 
    1426         [ +  - ]:          27 :         scantuplesortstates = leaderparticipates ? request + 1 : request;
    1427                 :             : 
    1428                 :             :         /*
    1429                 :             :          * Prepare for scan of the base relation.  In a normal index build, we use
    1430                 :             :          * SnapshotAny because we must retrieve all tuples and do our own time
    1431                 :             :          * qual checks (because we have to index RECENTLY_DEAD tuples).  In a
    1432                 :             :          * concurrent build, we take a regular MVCC snapshot and index whatever's
    1433                 :             :          * live according to that.
    1434                 :             :          */
    1435         [ -  + ]:          27 :         if (!isconcurrent)
    1436                 :          27 :                 snapshot = SnapshotAny;
    1437                 :             :         else
    1438                 :           0 :                 snapshot = RegisterSnapshot(GetTransactionSnapshot());
    1439                 :             : 
    1440                 :             :         /*
    1441                 :             :          * Estimate size for our own PARALLEL_KEY_BTREE_SHARED workspace, and
    1442                 :             :          * PARALLEL_KEY_TUPLESORT tuplesort workspace
    1443                 :             :          */
    1444                 :          27 :         estbtshared = _bt_parallel_estimate_shared(btspool->heap, snapshot);
    1445                 :          27 :         shm_toc_estimate_chunk(&pcxt->estimator, estbtshared);
    1446                 :          27 :         estsort = tuplesort_estimate_shared(scantuplesortstates);
    1447                 :          27 :         shm_toc_estimate_chunk(&pcxt->estimator, estsort);
    1448                 :             : 
    1449                 :             :         /*
    1450                 :             :          * Unique case requires a second spool, and so we may have to account for
    1451                 :             :          * another shared workspace for that -- PARALLEL_KEY_TUPLESORT_SPOOL2
    1452                 :             :          */
    1453         [ +  + ]:          27 :         if (!btspool->isunique)
    1454                 :          14 :                 shm_toc_estimate_keys(&pcxt->estimator, 2);
    1455                 :             :         else
    1456                 :             :         {
    1457                 :          13 :                 shm_toc_estimate_chunk(&pcxt->estimator, estsort);
    1458                 :          13 :                 shm_toc_estimate_keys(&pcxt->estimator, 3);
    1459                 :             :         }
    1460                 :             : 
    1461                 :             :         /*
    1462                 :             :          * Estimate space for WalUsage and BufferUsage -- PARALLEL_KEY_WAL_USAGE
    1463                 :             :          * and PARALLEL_KEY_BUFFER_USAGE.
    1464                 :             :          *
    1465                 :             :          * If there are no extensions loaded that care, we could skip this.  We
    1466                 :             :          * have no way of knowing whether anyone's looking at pgWalUsage or
    1467                 :             :          * pgBufferUsage, so do it unconditionally.
    1468                 :             :          */
    1469                 :          27 :         shm_toc_estimate_chunk(&pcxt->estimator,
    1470                 :             :                                                    mul_size(sizeof(WalUsage), pcxt->nworkers));
    1471                 :          27 :         shm_toc_estimate_keys(&pcxt->estimator, 1);
    1472                 :          27 :         shm_toc_estimate_chunk(&pcxt->estimator,
    1473                 :             :                                                    mul_size(sizeof(BufferUsage), pcxt->nworkers));
    1474                 :          27 :         shm_toc_estimate_keys(&pcxt->estimator, 1);
    1475                 :             : 
    1476                 :             :         /* Finally, estimate PARALLEL_KEY_QUERY_TEXT space */
    1477         [ +  - ]:          27 :         if (debug_query_string)
    1478                 :             :         {
    1479                 :          27 :                 querylen = strlen(debug_query_string);
    1480                 :          27 :                 shm_toc_estimate_chunk(&pcxt->estimator, querylen + 1);
    1481                 :          27 :                 shm_toc_estimate_keys(&pcxt->estimator, 1);
    1482                 :          27 :         }
    1483                 :             :         else
    1484                 :           0 :                 querylen = 0;                   /* keep compiler quiet */
    1485                 :             : 
    1486                 :             :         /* Everyone's had a chance to ask for space, so now create the DSM */
    1487                 :          27 :         InitializeParallelDSM(pcxt);
    1488                 :             : 
    1489                 :             :         /* If no DSM segment was available, back out (do serial build) */
    1490         [ +  - ]:          27 :         if (pcxt->seg == NULL)
    1491                 :             :         {
    1492   [ #  #  #  # ]:           0 :                 if (IsMVCCSnapshot(snapshot))
    1493                 :           0 :                         UnregisterSnapshot(snapshot);
    1494                 :           0 :                 DestroyParallelContext(pcxt);
    1495                 :           0 :                 ExitParallelMode();
    1496                 :           0 :                 return;
    1497                 :             :         }
    1498                 :             : 
    1499                 :             :         /* Store shared build state, for which we reserved space */
    1500                 :          27 :         btshared = (BTShared *) shm_toc_allocate(pcxt->toc, estbtshared);
    1501                 :             :         /* Initialize immutable state */
    1502                 :          27 :         btshared->heaprelid = RelationGetRelid(btspool->heap);
    1503                 :          27 :         btshared->indexrelid = RelationGetRelid(btspool->index);
    1504                 :          27 :         btshared->isunique = btspool->isunique;
    1505                 :          27 :         btshared->nulls_not_distinct = btspool->nulls_not_distinct;
    1506                 :          27 :         btshared->isconcurrent = isconcurrent;
    1507                 :          27 :         btshared->scantuplesortstates = scantuplesortstates;
    1508                 :          27 :         btshared->queryid = pgstat_get_my_query_id();
    1509                 :          27 :         ConditionVariableInit(&btshared->workersdonecv);
    1510                 :          27 :         SpinLockInit(&btshared->mutex);
    1511                 :             :         /* Initialize mutable state */
    1512                 :          27 :         btshared->nparticipantsdone = 0;
    1513                 :          27 :         btshared->reltuples = 0.0;
    1514                 :          27 :         btshared->havedead = false;
    1515                 :          27 :         btshared->indtuples = 0.0;
    1516                 :          27 :         btshared->brokenhotchain = false;
    1517                 :          54 :         table_parallelscan_initialize(btspool->heap,
    1518                 :          27 :                                                                   ParallelTableScanFromBTShared(btshared),
    1519                 :          27 :                                                                   snapshot);
    1520                 :             : 
    1521                 :             :         /*
    1522                 :             :          * Store shared tuplesort-private state, for which we reserved space.
    1523                 :             :          * Then, initialize opaque state using tuplesort routine.
    1524                 :             :          */
    1525                 :          27 :         sharedsort = (Sharedsort *) shm_toc_allocate(pcxt->toc, estsort);
    1526                 :          54 :         tuplesort_initialize_shared(sharedsort, scantuplesortstates,
    1527                 :          27 :                                                                 pcxt->seg);
    1528                 :             : 
    1529                 :          27 :         shm_toc_insert(pcxt->toc, PARALLEL_KEY_BTREE_SHARED, btshared);
    1530                 :          27 :         shm_toc_insert(pcxt->toc, PARALLEL_KEY_TUPLESORT, sharedsort);
    1531                 :             : 
    1532                 :             :         /* Unique case requires a second spool, and associated shared state */
    1533         [ +  + ]:          27 :         if (!btspool->isunique)
    1534                 :          14 :                 sharedsort2 = NULL;
    1535                 :             :         else
    1536                 :             :         {
    1537                 :             :                 /*
    1538                 :             :                  * Store additional shared tuplesort-private state, for which we
    1539                 :             :                  * reserved space.  Then, initialize opaque state using tuplesort
    1540                 :             :                  * routine.
    1541                 :             :                  */
    1542                 :          13 :                 sharedsort2 = (Sharedsort *) shm_toc_allocate(pcxt->toc, estsort);
    1543                 :          26 :                 tuplesort_initialize_shared(sharedsort2, scantuplesortstates,
    1544                 :          13 :                                                                         pcxt->seg);
    1545                 :             : 
    1546                 :          13 :                 shm_toc_insert(pcxt->toc, PARALLEL_KEY_TUPLESORT_SPOOL2, sharedsort2);
    1547                 :             :         }
    1548                 :             : 
    1549                 :             :         /* Store query string for workers */
    1550         [ -  + ]:          27 :         if (debug_query_string)
    1551                 :             :         {
    1552                 :          27 :                 char       *sharedquery;
    1553                 :             : 
    1554                 :          27 :                 sharedquery = (char *) shm_toc_allocate(pcxt->toc, querylen + 1);
    1555                 :          27 :                 memcpy(sharedquery, debug_query_string, querylen + 1);
    1556                 :          27 :                 shm_toc_insert(pcxt->toc, PARALLEL_KEY_QUERY_TEXT, sharedquery);
    1557                 :          27 :         }
    1558                 :             : 
    1559                 :             :         /*
    1560                 :             :          * Allocate space for each worker's WalUsage and BufferUsage; no need to
    1561                 :             :          * initialize.
    1562                 :             :          */
    1563                 :          54 :         walusage = shm_toc_allocate(pcxt->toc,
    1564                 :          27 :                                                                 mul_size(sizeof(WalUsage), pcxt->nworkers));
    1565                 :          27 :         shm_toc_insert(pcxt->toc, PARALLEL_KEY_WAL_USAGE, walusage);
    1566                 :          54 :         bufferusage = shm_toc_allocate(pcxt->toc,
    1567                 :          27 :                                                                    mul_size(sizeof(BufferUsage), pcxt->nworkers));
    1568                 :          27 :         shm_toc_insert(pcxt->toc, PARALLEL_KEY_BUFFER_USAGE, bufferusage);
    1569                 :             : 
    1570                 :             :         /* Launch workers, saving status for leader/caller */
    1571                 :          27 :         LaunchParallelWorkers(pcxt);
    1572                 :          27 :         btleader->pcxt = pcxt;
    1573                 :          27 :         btleader->nparticipanttuplesorts = pcxt->nworkers_launched;
    1574         [ -  + ]:          27 :         if (leaderparticipates)
    1575                 :          27 :                 btleader->nparticipanttuplesorts++;
    1576                 :          27 :         btleader->btshared = btshared;
    1577                 :          27 :         btleader->sharedsort = sharedsort;
    1578                 :          27 :         btleader->sharedsort2 = sharedsort2;
    1579                 :          27 :         btleader->snapshot = snapshot;
    1580                 :          27 :         btleader->walusage = walusage;
    1581                 :          27 :         btleader->bufferusage = bufferusage;
    1582                 :             : 
    1583                 :             :         /* If no workers were successfully launched, back out (do serial build) */
    1584         [ +  - ]:          27 :         if (pcxt->nworkers_launched == 0)
    1585                 :             :         {
    1586                 :           0 :                 _bt_end_parallel(btleader);
    1587                 :           0 :                 return;
    1588                 :             :         }
    1589                 :             : 
    1590                 :             :         /* Save leader state now that it's clear build will be parallel */
    1591                 :          27 :         buildstate->btleader = btleader;
    1592                 :             : 
    1593                 :             :         /* Join heap scan ourselves */
    1594         [ -  + ]:          27 :         if (leaderparticipates)
    1595                 :          27 :                 _bt_leader_participate_as_worker(buildstate);
    1596                 :             : 
    1597                 :             :         /*
    1598                 :             :          * Caller needs to wait for all launched workers when we return.  Make
    1599                 :             :          * sure that the failure-to-start case will not hang forever.
    1600                 :             :          */
    1601                 :          27 :         WaitForParallelWorkersToAttach(pcxt);
    1602         [ -  + ]:          27 : }
    1603                 :             : 
    1604                 :             : /*
    1605                 :             :  * Shut down workers, destroy parallel context, and end parallel mode.
    1606                 :             :  */
    1607                 :             : static void
    1608                 :          27 : _bt_end_parallel(BTLeader *btleader)
    1609                 :             : {
    1610                 :          27 :         int                     i;
    1611                 :             : 
    1612                 :             :         /* Shutdown worker processes */
    1613                 :          27 :         WaitForParallelWorkersToFinish(btleader->pcxt);
    1614                 :             : 
    1615                 :             :         /*
    1616                 :             :          * Next, accumulate WAL usage.  (This must wait for the workers to finish,
    1617                 :             :          * or we might get incomplete data.)
    1618                 :             :          */
    1619         [ +  + ]:          54 :         for (i = 0; i < btleader->pcxt->nworkers_launched; i++)
    1620                 :          27 :                 InstrAccumParallelQuery(&btleader->bufferusage[i], &btleader->walusage[i]);
    1621                 :             : 
    1622                 :             :         /* Free last reference to MVCC snapshot, if one was used */
    1623   [ +  -  -  + ]:          27 :         if (IsMVCCSnapshot(btleader->snapshot))
    1624                 :           0 :                 UnregisterSnapshot(btleader->snapshot);
    1625                 :          27 :         DestroyParallelContext(btleader->pcxt);
    1626                 :          27 :         ExitParallelMode();
    1627                 :          27 : }
    1628                 :             : 
    1629                 :             : /*
    1630                 :             :  * Returns size of shared memory required to store state for a parallel
    1631                 :             :  * btree index build based on the snapshot its parallel scan will use.
    1632                 :             :  */
    1633                 :             : static Size
    1634                 :          27 : _bt_parallel_estimate_shared(Relation heap, Snapshot snapshot)
    1635                 :             : {
    1636                 :             :         /* c.f. shm_toc_allocate as to why BUFFERALIGN is used */
    1637                 :          27 :         return add_size(BUFFERALIGN(sizeof(BTShared)),
    1638                 :          27 :                                         table_parallelscan_estimate(heap, snapshot));
    1639                 :             : }
    1640                 :             : 
    1641                 :             : /*
    1642                 :             :  * Within leader, wait for end of heap scan.
    1643                 :             :  *
    1644                 :             :  * When called, parallel heap scan started by _bt_begin_parallel() will
    1645                 :             :  * already be underway within worker processes (when leader participates
    1646                 :             :  * as a worker, we should end up here just as workers are finishing).
    1647                 :             :  *
    1648                 :             :  * Fills in fields needed for ambuild statistics, and lets caller set
    1649                 :             :  * field indicating that some worker encountered a broken HOT chain.
    1650                 :             :  *
    1651                 :             :  * Returns the total number of heap tuples scanned.
    1652                 :             :  */
    1653                 :             : static double
    1654                 :          27 : _bt_parallel_heapscan(BTBuildState *buildstate, bool *brokenhotchain)
    1655                 :             : {
    1656                 :          27 :         BTShared   *btshared = buildstate->btleader->btshared;
    1657                 :          27 :         int                     nparticipanttuplesorts;
    1658                 :          27 :         double          reltuples;
    1659                 :             : 
    1660                 :          27 :         nparticipanttuplesorts = buildstate->btleader->nparticipanttuplesorts;
    1661                 :          71 :         for (;;)
    1662                 :             :         {
    1663         [ -  + ]:          71 :                 SpinLockAcquire(&btshared->mutex);
    1664         [ +  + ]:          71 :                 if (btshared->nparticipantsdone == nparticipanttuplesorts)
    1665                 :             :                 {
    1666                 :          27 :                         buildstate->havedead = btshared->havedead;
    1667                 :          27 :                         buildstate->indtuples = btshared->indtuples;
    1668                 :          27 :                         *brokenhotchain = btshared->brokenhotchain;
    1669                 :          27 :                         reltuples = btshared->reltuples;
    1670                 :          27 :                         SpinLockRelease(&btshared->mutex);
    1671                 :          27 :                         break;
    1672                 :             :                 }
    1673                 :          44 :                 SpinLockRelease(&btshared->mutex);
    1674                 :             : 
    1675                 :          44 :                 ConditionVariableSleep(&btshared->workersdonecv,
    1676                 :             :                                                            WAIT_EVENT_PARALLEL_CREATE_INDEX_SCAN);
    1677                 :             :         }
    1678                 :             : 
    1679                 :          27 :         ConditionVariableCancelSleep();
    1680                 :             : 
    1681                 :          54 :         return reltuples;
    1682                 :          27 : }
    1683                 :             : 
    1684                 :             : /*
    1685                 :             :  * Within leader, participate as a parallel worker.
    1686                 :             :  */
    1687                 :             : static void
    1688                 :          27 : _bt_leader_participate_as_worker(BTBuildState *buildstate)
    1689                 :             : {
    1690                 :          27 :         BTLeader   *btleader = buildstate->btleader;
    1691                 :          27 :         BTSpool    *leaderworker;
    1692                 :          27 :         BTSpool    *leaderworker2;
    1693                 :          27 :         int                     sortmem;
    1694                 :             : 
    1695                 :             :         /* Allocate memory and initialize private spool */
    1696                 :          27 :         leaderworker = palloc0_object(BTSpool);
    1697                 :          27 :         leaderworker->heap = buildstate->spool->heap;
    1698                 :          27 :         leaderworker->index = buildstate->spool->index;
    1699                 :          27 :         leaderworker->isunique = buildstate->spool->isunique;
    1700                 :          27 :         leaderworker->nulls_not_distinct = buildstate->spool->nulls_not_distinct;
    1701                 :             : 
    1702                 :             :         /* Initialize second spool, if required */
    1703         [ +  + ]:          27 :         if (!btleader->btshared->isunique)
    1704                 :          14 :                 leaderworker2 = NULL;
    1705                 :             :         else
    1706                 :             :         {
    1707                 :             :                 /* Allocate memory for worker's own private secondary spool */
    1708                 :          13 :                 leaderworker2 = palloc0_object(BTSpool);
    1709                 :             : 
    1710                 :             :                 /* Initialize worker's own secondary spool */
    1711                 :          13 :                 leaderworker2->heap = leaderworker->heap;
    1712                 :          13 :                 leaderworker2->index = leaderworker->index;
    1713                 :          13 :                 leaderworker2->isunique = false;
    1714                 :             :         }
    1715                 :             : 
    1716                 :             :         /*
    1717                 :             :          * Might as well use reliable figure when doling out maintenance_work_mem
    1718                 :             :          * (when requested number of workers were not launched, this will be
    1719                 :             :          * somewhat higher than it is for other workers).
    1720                 :             :          */
    1721                 :          27 :         sortmem = maintenance_work_mem / btleader->nparticipanttuplesorts;
    1722                 :             : 
    1723                 :             :         /* Perform work common to all participants */
    1724                 :          54 :         _bt_parallel_scan_and_sort(leaderworker, leaderworker2, btleader->btshared,
    1725                 :          27 :                                                            btleader->sharedsort, btleader->sharedsort2,
    1726                 :          27 :                                                            sortmem, true);
    1727                 :             : 
    1728                 :             : #ifdef BTREE_BUILD_STATS
    1729                 :             :         if (log_btree_build_stats)
    1730                 :             :         {
    1731                 :             :                 ShowUsage("BTREE BUILD (Leader Partial Spool) STATISTICS");
    1732                 :             :                 ResetUsage();
    1733                 :             :         }
    1734                 :             : #endif                                                  /* BTREE_BUILD_STATS */
    1735                 :          27 : }
    1736                 :             : 
    1737                 :             : /*
    1738                 :             :  * Perform work within a launched parallel process.
    1739                 :             :  */
    1740                 :             : void
    1741                 :          27 : _bt_parallel_build_main(dsm_segment *seg, shm_toc *toc)
    1742                 :             : {
    1743                 :          27 :         char       *sharedquery;
    1744                 :          27 :         BTSpool    *btspool;
    1745                 :          27 :         BTSpool    *btspool2;
    1746                 :          27 :         BTShared   *btshared;
    1747                 :          27 :         Sharedsort *sharedsort;
    1748                 :          27 :         Sharedsort *sharedsort2;
    1749                 :          27 :         Relation        heapRel;
    1750                 :          27 :         Relation        indexRel;
    1751                 :          27 :         LOCKMODE        heapLockmode;
    1752                 :          27 :         LOCKMODE        indexLockmode;
    1753                 :          27 :         WalUsage   *walusage;
    1754                 :          27 :         BufferUsage *bufferusage;
    1755                 :          27 :         int                     sortmem;
    1756                 :             : 
    1757                 :             : #ifdef BTREE_BUILD_STATS
    1758                 :             :         if (log_btree_build_stats)
    1759                 :             :                 ResetUsage();
    1760                 :             : #endif                                                  /* BTREE_BUILD_STATS */
    1761                 :             : 
    1762                 :             :         /*
    1763                 :             :          * The only possible status flag that can be set to the parallel worker is
    1764                 :             :          * PROC_IN_SAFE_IC.
    1765                 :             :          */
    1766   [ -  +  #  # ]:          27 :         Assert((MyProc->statusFlags == 0) ||
    1767                 :             :                    (MyProc->statusFlags == PROC_IN_SAFE_IC));
    1768                 :             : 
    1769                 :             :         /* Set debug_query_string for individual workers first */
    1770                 :          27 :         sharedquery = shm_toc_lookup(toc, PARALLEL_KEY_QUERY_TEXT, true);
    1771                 :          27 :         debug_query_string = sharedquery;
    1772                 :             : 
    1773                 :             :         /* Report the query string from leader */
    1774                 :          27 :         pgstat_report_activity(STATE_RUNNING, debug_query_string);
    1775                 :             : 
    1776                 :             :         /* Look up nbtree shared state */
    1777                 :          27 :         btshared = shm_toc_lookup(toc, PARALLEL_KEY_BTREE_SHARED, false);
    1778                 :             : 
    1779                 :             :         /* Open relations using lock modes known to be obtained by index.c */
    1780         [ -  + ]:          27 :         if (!btshared->isconcurrent)
    1781                 :             :         {
    1782                 :          27 :                 heapLockmode = ShareLock;
    1783                 :          27 :                 indexLockmode = AccessExclusiveLock;
    1784                 :          27 :         }
    1785                 :             :         else
    1786                 :             :         {
    1787                 :           0 :                 heapLockmode = ShareUpdateExclusiveLock;
    1788                 :           0 :                 indexLockmode = RowExclusiveLock;
    1789                 :             :         }
    1790                 :             : 
    1791                 :             :         /* Track query ID */
    1792                 :          27 :         pgstat_report_query_id(btshared->queryid, false);
    1793                 :             : 
    1794                 :             :         /* Open relations within worker */
    1795                 :          27 :         heapRel = table_open(btshared->heaprelid, heapLockmode);
    1796                 :          27 :         indexRel = index_open(btshared->indexrelid, indexLockmode);
    1797                 :             : 
    1798                 :             :         /* Initialize worker's own spool */
    1799                 :          27 :         btspool = palloc0_object(BTSpool);
    1800                 :          27 :         btspool->heap = heapRel;
    1801                 :          27 :         btspool->index = indexRel;
    1802                 :          27 :         btspool->isunique = btshared->isunique;
    1803                 :          27 :         btspool->nulls_not_distinct = btshared->nulls_not_distinct;
    1804                 :             : 
    1805                 :             :         /* Look up shared state private to tuplesort.c */
    1806                 :          27 :         sharedsort = shm_toc_lookup(toc, PARALLEL_KEY_TUPLESORT, false);
    1807                 :          27 :         tuplesort_attach_shared(sharedsort, seg);
    1808         [ +  + ]:          27 :         if (!btshared->isunique)
    1809                 :             :         {
    1810                 :          14 :                 btspool2 = NULL;
    1811                 :          14 :                 sharedsort2 = NULL;
    1812                 :          14 :         }
    1813                 :             :         else
    1814                 :             :         {
    1815                 :             :                 /* Allocate memory for worker's own private secondary spool */
    1816                 :          13 :                 btspool2 = palloc0_object(BTSpool);
    1817                 :             : 
    1818                 :             :                 /* Initialize worker's own secondary spool */
    1819                 :          13 :                 btspool2->heap = btspool->heap;
    1820                 :          13 :                 btspool2->index = btspool->index;
    1821                 :          13 :                 btspool2->isunique = false;
    1822                 :             :                 /* Look up shared state private to tuplesort.c */
    1823                 :          13 :                 sharedsort2 = shm_toc_lookup(toc, PARALLEL_KEY_TUPLESORT_SPOOL2, false);
    1824                 :          13 :                 tuplesort_attach_shared(sharedsort2, seg);
    1825                 :             :         }
    1826                 :             : 
    1827                 :             :         /* Prepare to track buffer usage during parallel execution */
    1828                 :          27 :         InstrStartParallelQuery();
    1829                 :             : 
    1830                 :             :         /* Perform sorting of spool, and possibly a spool2 */
    1831                 :          27 :         sortmem = maintenance_work_mem / btshared->scantuplesortstates;
    1832                 :          54 :         _bt_parallel_scan_and_sort(btspool, btspool2, btshared, sharedsort,
    1833                 :          27 :                                                            sharedsort2, sortmem, false);
    1834                 :             : 
    1835                 :             :         /* Report WAL/buffer usage during parallel execution */
    1836                 :          27 :         bufferusage = shm_toc_lookup(toc, PARALLEL_KEY_BUFFER_USAGE, false);
    1837                 :          27 :         walusage = shm_toc_lookup(toc, PARALLEL_KEY_WAL_USAGE, false);
    1838                 :          54 :         InstrEndParallelQuery(&bufferusage[ParallelWorkerNumber],
    1839                 :          27 :                                                   &walusage[ParallelWorkerNumber]);
    1840                 :             : 
    1841                 :             : #ifdef BTREE_BUILD_STATS
    1842                 :             :         if (log_btree_build_stats)
    1843                 :             :         {
    1844                 :             :                 ShowUsage("BTREE BUILD (Worker Partial Spool) STATISTICS");
    1845                 :             :                 ResetUsage();
    1846                 :             :         }
    1847                 :             : #endif                                                  /* BTREE_BUILD_STATS */
    1848                 :             : 
    1849                 :          27 :         index_close(indexRel, indexLockmode);
    1850                 :          27 :         table_close(heapRel, heapLockmode);
    1851                 :          27 : }
    1852                 :             : 
    1853                 :             : /*
    1854                 :             :  * Perform a worker's portion of a parallel sort.
    1855                 :             :  *
    1856                 :             :  * This generates a tuplesort for passed btspool, and a second tuplesort
    1857                 :             :  * state if a second btspool is need (i.e. for unique index builds).  All
    1858                 :             :  * other spool fields should already be set when this is called.
    1859                 :             :  *
    1860                 :             :  * sortmem is the amount of working memory to use within each worker,
    1861                 :             :  * expressed in KBs.
    1862                 :             :  *
    1863                 :             :  * When this returns, workers are done, and need only release resources.
    1864                 :             :  */
    1865                 :             : static void
    1866                 :          54 : _bt_parallel_scan_and_sort(BTSpool *btspool, BTSpool *btspool2,
    1867                 :             :                                                    BTShared *btshared, Sharedsort *sharedsort,
    1868                 :             :                                                    Sharedsort *sharedsort2, int sortmem, bool progress)
    1869                 :             : {
    1870                 :          54 :         SortCoordinate coordinate;
    1871                 :          54 :         BTBuildState buildstate;
    1872                 :          54 :         TableScanDesc scan;
    1873                 :          54 :         double          reltuples;
    1874                 :          54 :         IndexInfo  *indexInfo;
    1875                 :             : 
    1876                 :             :         /* Initialize local tuplesort coordination state */
    1877                 :          54 :         coordinate = palloc0_object(SortCoordinateData);
    1878                 :          54 :         coordinate->isWorker = true;
    1879                 :          54 :         coordinate->nParticipants = -1;
    1880                 :          54 :         coordinate->sharedsort = sharedsort;
    1881                 :             : 
    1882                 :             :         /* Begin "partial" tuplesort */
    1883                 :         108 :         btspool->sortstate = tuplesort_begin_index_btree(btspool->heap,
    1884                 :          54 :                                                                                                          btspool->index,
    1885                 :          54 :                                                                                                          btspool->isunique,
    1886                 :          54 :                                                                                                          btspool->nulls_not_distinct,
    1887                 :          54 :                                                                                                          sortmem, coordinate,
    1888                 :             :                                                                                                          TUPLESORT_NONE);
    1889                 :             : 
    1890                 :             :         /*
    1891                 :             :          * Just as with serial case, there may be a second spool.  If so, a
    1892                 :             :          * second, dedicated spool2 partial tuplesort is required.
    1893                 :             :          */
    1894         [ +  + ]:          54 :         if (btspool2)
    1895                 :             :         {
    1896                 :          26 :                 SortCoordinate coordinate2;
    1897                 :             : 
    1898                 :             :                 /*
    1899                 :             :                  * We expect that the second one (for dead tuples) won't get very
    1900                 :             :                  * full, so we give it only work_mem (unless sortmem is less for
    1901                 :             :                  * worker).  Worker processes are generally permitted to allocate
    1902                 :             :                  * work_mem independently.
    1903                 :             :                  */
    1904                 :          26 :                 coordinate2 = palloc0_object(SortCoordinateData);
    1905                 :          26 :                 coordinate2->isWorker = true;
    1906                 :          26 :                 coordinate2->nParticipants = -1;
    1907                 :          26 :                 coordinate2->sharedsort = sharedsort2;
    1908                 :          26 :                 btspool2->sortstate =
    1909                 :          52 :                         tuplesort_begin_index_btree(btspool->heap, btspool->index, false, false,
    1910         [ -  + ]:          26 :                                                                                 Min(sortmem, work_mem), coordinate2,
    1911                 :             :                                                                                 false);
    1912                 :          26 :         }
    1913                 :             : 
    1914                 :             :         /* Fill in buildstate for _bt_build_callback() */
    1915                 :          54 :         buildstate.isunique = btshared->isunique;
    1916                 :          54 :         buildstate.nulls_not_distinct = btshared->nulls_not_distinct;
    1917                 :          54 :         buildstate.havedead = false;
    1918                 :          54 :         buildstate.heap = btspool->heap;
    1919                 :          54 :         buildstate.spool = btspool;
    1920                 :          54 :         buildstate.spool2 = btspool2;
    1921                 :          54 :         buildstate.indtuples = 0;
    1922                 :          54 :         buildstate.btleader = NULL;
    1923                 :             : 
    1924                 :             :         /* Join parallel scan */
    1925                 :          54 :         indexInfo = BuildIndexInfo(btspool->index);
    1926                 :          54 :         indexInfo->ii_Concurrent = btshared->isconcurrent;
    1927                 :         108 :         scan = table_beginscan_parallel(btspool->heap,
    1928                 :          54 :                                                                         ParallelTableScanFromBTShared(btshared));
    1929                 :         108 :         reltuples = table_index_build_scan(btspool->heap, btspool->index, indexInfo,
    1930                 :          54 :                                                                            true, progress, _bt_build_callback,
    1931                 :          54 :                                                                            &buildstate, scan);
    1932                 :             : 
    1933                 :             :         /* Execute this worker's part of the sort */
    1934         [ +  + ]:          54 :         if (progress)
    1935                 :          27 :                 pgstat_progress_update_param(PROGRESS_CREATEIDX_SUBPHASE,
    1936                 :             :                                                                          PROGRESS_BTREE_PHASE_PERFORMSORT_1);
    1937                 :          54 :         tuplesort_performsort(btspool->sortstate);
    1938         [ +  + ]:          54 :         if (btspool2)
    1939                 :             :         {
    1940         [ +  + ]:          26 :                 if (progress)
    1941                 :          13 :                         pgstat_progress_update_param(PROGRESS_CREATEIDX_SUBPHASE,
    1942                 :             :                                                                                  PROGRESS_BTREE_PHASE_PERFORMSORT_2);
    1943                 :          26 :                 tuplesort_performsort(btspool2->sortstate);
    1944                 :          26 :         }
    1945                 :             : 
    1946                 :             :         /*
    1947                 :             :          * Done.  Record ambuild statistics, and whether we encountered a broken
    1948                 :             :          * HOT chain.
    1949                 :             :          */
    1950         [ -  + ]:          54 :         SpinLockAcquire(&btshared->mutex);
    1951                 :          54 :         btshared->nparticipantsdone++;
    1952                 :          54 :         btshared->reltuples += reltuples;
    1953         [ +  - ]:          54 :         if (buildstate.havedead)
    1954                 :           0 :                 btshared->havedead = true;
    1955                 :          54 :         btshared->indtuples += buildstate.indtuples;
    1956         [ +  - ]:          54 :         if (indexInfo->ii_BrokenHotChain)
    1957                 :           0 :                 btshared->brokenhotchain = true;
    1958                 :          54 :         SpinLockRelease(&btshared->mutex);
    1959                 :             : 
    1960                 :             :         /* Notify leader */
    1961                 :          54 :         ConditionVariableSignal(&btshared->workersdonecv);
    1962                 :             : 
    1963                 :             :         /* We can end tuplesorts immediately */
    1964                 :          54 :         tuplesort_end(btspool->sortstate);
    1965         [ +  + ]:          54 :         if (btspool2)
    1966                 :          26 :                 tuplesort_end(btspool2->sortstate);
    1967                 :          54 : }
        

Generated by: LCOV version 2.3.2-1