Branch data Line data Source code
1 : : /*----------------------------------------------------------------------
2 : : *
3 : : * tableam.c
4 : : * Table access method routines too big to be inline functions.
5 : : *
6 : : * Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group
7 : : * Portions Copyright (c) 1994, Regents of the University of California
8 : : *
9 : : *
10 : : * IDENTIFICATION
11 : : * src/backend/access/table/tableam.c
12 : : *
13 : : * NOTES
14 : : * Note that most functions in here are documented in tableam.h, rather than
15 : : * here. That's because there's a lot of inline functions in tableam.h and
16 : : * here. That's because there are a lot of inline functions in tableam.h and
17 : : *
18 : : *----------------------------------------------------------------------
19 : : */
20 : : #include "postgres.h"
21 : :
22 : : #include <math.h>
23 : :
24 : : #include "access/syncscan.h"
25 : : #include "access/tableam.h"
26 : : #include "access/xact.h"
27 : : #include "optimizer/optimizer.h"
28 : : #include "optimizer/plancat.h"
29 : : #include "port/pg_bitutils.h"
30 : : #include "storage/bufmgr.h"
31 : : #include "storage/shmem.h"
32 : : #include "storage/smgr.h"
33 : :
34 : : /*
35 : : * Constants to control the behavior of block allocation to parallel workers
36 : : * during a parallel seqscan. Technically these values do not need to be
37 : : * powers of 2, but having them as powers of 2 makes the math more efficient
38 : : * and makes the ramp-down stepping more even.
39 : : */
40 : :
41 : : /* The number of I/O chunks we try to break a parallel seqscan down into */
42 : : #define PARALLEL_SEQSCAN_NCHUNKS 2048
43 : : /* Ramp down the size of allocations once only this many chunks remain */
44 : : #define PARALLEL_SEQSCAN_RAMPDOWN_CHUNKS 64
45 : : /* Cap the size of parallel I/O chunks to this number of blocks */
46 : : #define PARALLEL_SEQSCAN_MAX_CHUNK_SIZE 8192
47 : :
48 : : /* GUC variables */
49 : : char *default_table_access_method = DEFAULT_TABLE_ACCESS_METHOD;
50 : : bool synchronize_seqscans = true;
51 : :
52 : :
53 : : /* ----------------------------------------------------------------------------
54 : : * Slot functions.
55 : : * ----------------------------------------------------------------------------
56 : : */
57 : :
58 : : const TupleTableSlotOps *
59 : 3621620 : table_slot_callbacks(Relation relation)
60 : : {
61 : 3621620 : const TupleTableSlotOps *tts_cb;
62 : :
63 [ + + ]: 3621620 : if (relation->rd_tableam)
64 : 3620701 : tts_cb = relation->rd_tableam->slot_callbacks(relation);
65 [ - + ]: 919 : else if (relation->rd_rel->relkind == RELKIND_FOREIGN_TABLE)
66 : : {
67 : : /*
68 : : * Historically FDWs expect to store heap tuples in slots. Continue
69 : : * handing them one, to make it less painful to adapt FDWs to new
70 : : * versions. The cost of a heap slot over a virtual slot is pretty
71 : : * small.
72 : : */
73 : 0 : tts_cb = &TTSOpsHeapTuple;
74 : 0 : }
75 : : else
76 : : {
77 : : /*
78 : : * These need to be supported, as some parts of the code (like COPY)
79 : : * need to create slots for such relations too. It seems better to
80 : : * centralize the knowledge that a heap slot is the right thing in
81 : : * centralize the knowledge that a virtual slot is the right thing in
82 : : */
83 [ + + + - ]: 919 : Assert(relation->rd_rel->relkind == RELKIND_VIEW ||
84 : : relation->rd_rel->relkind == RELKIND_PARTITIONED_TABLE);
85 : 919 : tts_cb = &TTSOpsVirtual;
86 : : }
87 : :
88 : 7243240 : return tts_cb;
89 : 3621620 : }
90 : :
91 : : TupleTableSlot *
92 : 3172484 : table_slot_create(Relation relation, List **reglist)
93 : : {
94 : 3172484 : const TupleTableSlotOps *tts_cb;
95 : 3172484 : TupleTableSlot *slot;
96 : :
97 : 3172484 : tts_cb = table_slot_callbacks(relation);
98 : 3172484 : slot = MakeSingleTupleTableSlot(RelationGetDescr(relation), tts_cb);
99 : :
100 [ + + ]: 3172484 : if (reglist)
101 : 410533 : *reglist = lappend(*reglist, slot);
102 : :
103 : 6344968 : return slot;
104 : 3172484 : }
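A minimal usage sketch (not part of tableam.c): create a slot whose ops match the relation's table AM, use it, and drop it. The helper name is illustrative.

#include "postgres.h"

#include "access/tableam.h"
#include "executor/tuptable.h"
#include "utils/rel.h"

/* Illustrative helper: obtain and release an AM-appropriate slot. */
static void
example_use_table_slot(Relation rel)
{
	/* Slot ops come from rel's AM, or the heap/virtual fallbacks above. */
	TupleTableSlot *slot = table_slot_create(rel, NULL);

	/* ... store a tuple into the slot, or fetch one from the table ... */

	ExecDropSingleTupleTableSlot(slot);
}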
105 : :
106 : :
107 : : /* ----------------------------------------------------------------------------
108 : : * Table scan functions.
109 : : * ----------------------------------------------------------------------------
110 : : */
111 : :
112 : : TableScanDesc
113 : 6127 : table_beginscan_catalog(Relation relation, int nkeys, ScanKeyData *key)
114 : : {
115 : 6127 : uint32 flags = SO_TYPE_SEQSCAN |
116 : : SO_ALLOW_STRAT | SO_ALLOW_SYNC | SO_ALLOW_PAGEMODE | SO_TEMP_SNAPSHOT;
117 : 6127 : Oid relid = RelationGetRelid(relation);
118 : 6127 : Snapshot snapshot = RegisterSnapshot(GetCatalogSnapshot(relid));
119 : :
120 : 18381 : return relation->rd_tableam->scan_begin(relation, snapshot, nkeys, key,
121 : 6127 : NULL, flags);
122 : 6127 : }
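As a hedged illustration of the scan pattern this entry point enables (the function name and counter are illustrative; the tableam calls are the ones used above):

#include "postgres.h"

#include "access/sdir.h"
#include "access/tableam.h"
#include "executor/tuptable.h"
#include "utils/rel.h"

/* Illustrative: count tuples visible under a fresh catalog snapshot. */
static uint64
example_count_catalog_tuples(Relation rel)
{
	TableScanDesc scan = table_beginscan_catalog(rel, 0, NULL);
	TupleTableSlot *slot = table_slot_create(rel, NULL);
	uint64		ntuples = 0;

	while (table_scan_getnextslot(scan, ForwardScanDirection, slot))
		ntuples++;

	/* SO_TEMP_SNAPSHOT (set above) has the AM unregister the snapshot here. */
	table_endscan(scan);
	ExecDropSingleTupleTableSlot(slot);

	return ntuples;
}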
123 : :
124 : :
125 : : /* ----------------------------------------------------------------------------
126 : : * Parallel table scan related functions.
127 : : * ----------------------------------------------------------------------------
128 : : */
129 : :
130 : : Size
131 : 183 : table_parallelscan_estimate(Relation rel, Snapshot snapshot)
132 : : {
133 : 183 : Size sz = 0;
134 : :
135 [ + + - + ]: 183 : if (IsMVCCSnapshot(snapshot))
136 : 155 : sz = add_size(sz, EstimateSnapshotSpace(snapshot));
137 : : else
138 [ + - ]: 28 : Assert(snapshot == SnapshotAny);
139 : :
140 : 183 : sz = add_size(sz, rel->rd_tableam->parallelscan_estimate(rel));
141 : :
142 : 366 : return sz;
143 : 183 : }
144 : :
145 : : void
146 : 183 : table_parallelscan_initialize(Relation rel, ParallelTableScanDesc pscan,
147 : : Snapshot snapshot)
148 : : {
149 : 183 : Size snapshot_off = rel->rd_tableam->parallelscan_initialize(rel, pscan);
150 : :
151 : 183 : pscan->phs_snapshot_off = snapshot_off;
152 : :
153 [ + + - + ]: 183 : if (IsMVCCSnapshot(snapshot))
154 : : {
155 : 155 : SerializeSnapshot(snapshot, (char *) pscan + pscan->phs_snapshot_off);
156 : 155 : pscan->phs_snapshot_any = false;
157 : 155 : }
158 : : else
159 : : {
160 [ + - ]: 28 : Assert(snapshot == SnapshotAny);
161 : 28 : pscan->phs_snapshot_any = true;
162 : : }
163 : 183 : }
164 : :
165 : : TableScanDesc
166 : 659 : table_beginscan_parallel(Relation relation, ParallelTableScanDesc pscan)
167 : : {
168 : 659 : Snapshot snapshot;
169 : 659 : uint32 flags = SO_TYPE_SEQSCAN |
170 : : SO_ALLOW_STRAT | SO_ALLOW_SYNC | SO_ALLOW_PAGEMODE;
171 : :
172 [ + - ]: 659 : Assert(RelFileLocatorEquals(relation->rd_locator, pscan->phs_locator));
173 : :
174 [ + + ]: 659 : if (!pscan->phs_snapshot_any)
175 : : {
176 : : /* Snapshot was serialized -- restore it */
177 : 603 : snapshot = RestoreSnapshot((char *) pscan + pscan->phs_snapshot_off);
178 : 603 : RegisterSnapshot(snapshot);
179 : 603 : flags |= SO_TEMP_SNAPSHOT;
180 : 603 : }
181 : : else
182 : : {
183 : : /* SnapshotAny passed by caller (not serialized) */
184 : 56 : snapshot = SnapshotAny;
185 : : }
186 : :
187 : 1977 : return relation->rd_tableam->scan_begin(relation, snapshot, 0, NULL,
188 : 659 : pscan, flags);
189 : 659 : }
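A hedged outline of the leader/worker handshake the functions above implement; the shared-memory allocation is elided and the example_* names are illustrative.

#include "postgres.h"

#include "access/sdir.h"
#include "access/tableam.h"
#include "executor/tuptable.h"
#include "utils/rel.h"
#include "utils/snapshot.h"

/*
 * Illustrative leader side: 'pscan' must point at shared memory of at least
 * table_parallelscan_estimate(rel, snapshot) bytes (e.g. carved out of the
 * parallel DSM segment).
 */
static void
example_parallel_scan_setup(Relation rel, Snapshot snapshot,
							ParallelTableScanDesc pscan)
{
	table_parallelscan_initialize(rel, pscan, snapshot);
}

/* Illustrative worker (or leader) side: attach to the shared state and scan. */
static void
example_parallel_scan_worker(Relation rel, ParallelTableScanDesc pscan)
{
	TableScanDesc scan = table_beginscan_parallel(rel, pscan);
	TupleTableSlot *slot = table_slot_create(rel, NULL);

	while (table_scan_getnextslot(scan, ForwardScanDirection, slot))
	{
		/* ... per-tuple work ... */
	}

	table_endscan(scan);
	ExecDropSingleTupleTableSlot(slot);
}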
190 : :
191 : : TableScanDesc
192 : 20 : table_beginscan_parallel_tidrange(Relation relation,
193 : : ParallelTableScanDesc pscan)
194 : : {
195 : 20 : Snapshot snapshot;
196 : 20 : uint32 flags = SO_TYPE_TIDRANGESCAN | SO_ALLOW_PAGEMODE;
197 : 20 : TableScanDesc sscan;
198 : :
199 [ + - ]: 20 : Assert(RelFileLocatorEquals(relation->rd_locator, pscan->phs_locator));
200 : :
201 : : /* disable syncscan in parallel tid range scan. */
202 : 20 : pscan->phs_syncscan = false;
203 : :
204 [ - + ]: 20 : if (!pscan->phs_snapshot_any)
205 : : {
206 : : /* Snapshot was serialized -- restore it */
207 : 20 : snapshot = RestoreSnapshot((char *) pscan + pscan->phs_snapshot_off);
208 : 20 : RegisterSnapshot(snapshot);
209 : 20 : flags |= SO_TEMP_SNAPSHOT;
210 : 20 : }
211 : : else
212 : : {
213 : : /* SnapshotAny passed by caller (not serialized) */
214 : 0 : snapshot = SnapshotAny;
215 : : }
216 : :
217 : 40 : sscan = relation->rd_tableam->scan_begin(relation, snapshot, 0, NULL,
218 : 20 : pscan, flags);
219 : 40 : return sscan;
220 : 20 : }
221 : :
222 : :
223 : : /* ----------------------------------------------------------------------------
224 : : * Index scan related functions.
225 : : * ----------------------------------------------------------------------------
226 : : */
227 : :
228 : : /*
229 : : * To perform this check (whether any tuple version reachable via 'tid' is visible
230 : : * to 'snapshot'), simply start an index scan, create the necessary slot, do the
231 : : * heap lookup, and shut everything down again. This could be optimized, but is
232 : : * unlikely to matter from a performance POV. If live index pointers also matching
233 : : * a unique index key are frequent, the CPU overhead of this routine is unlikely to matter.
234 : : *
235 : : * Note that *tid may be modified when we return true if the AM supports
236 : : * storing multiple row versions reachable via a single index entry (like
237 : : * heap's HOT).
238 : : */
239 : : bool
240 : 1847860 : table_index_fetch_tuple_check(Relation rel,
241 : : ItemPointer tid,
242 : : Snapshot snapshot,
243 : : bool *all_dead)
244 : : {
245 : 1847860 : IndexFetchTableData *scan;
246 : 1847860 : TupleTableSlot *slot;
247 : 1847860 : bool call_again = false;
248 : 1847860 : bool found;
249 : :
250 : 1847860 : slot = table_slot_create(rel, NULL);
251 : 1847860 : scan = table_index_fetch_begin(rel);
252 : 3695720 : found = table_index_fetch_tuple(scan, tid, snapshot, slot, &call_again,
253 : 1847860 : all_dead);
254 : 1847860 : table_index_fetch_end(scan);
255 : 1847860 : ExecDropSingleTupleTableSlot(slot);
256 : :
257 : 3695720 : return found;
258 : 1847860 : }
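A hedged sketch of the kind of caller this routine serves: checking whether a TID obtained from an index still points at something visible. GetActiveSnapshot() assumes a snapshot has been pushed; the helper name is illustrative.

#include "postgres.h"

#include "access/tableam.h"
#include "storage/itemptr.h"
#include "utils/rel.h"
#include "utils/snapmgr.h"

/* Illustrative: does any version reachable via 'tid' satisfy our snapshot? */
static bool
example_tid_has_visible_tuple(Relation rel, ItemPointer tid)
{
	bool		all_dead;

	/* On true, *tid may have been updated to the visible version (HOT). */
	return table_index_fetch_tuple_check(rel, tid, GetActiveSnapshot(),
										 &all_dead);
}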
259 : :
260 : :
261 : : /* ------------------------------------------------------------------------
262 : : * Functions for non-modifying operations on individual tuples
263 : : * ------------------------------------------------------------------------
264 : : */
265 : :
266 : : void
267 : 51 : table_tuple_get_latest_tid(TableScanDesc scan, ItemPointer tid)
268 : : {
269 : 51 : Relation rel = scan->rs_rd;
270 : 51 : const TableAmRoutine *tableam = rel->rd_tableam;
271 : :
272 : : /*
273 : : * We don't expect direct calls to table_tuple_get_latest_tid with valid
274 : : * CheckXidAlive for catalog or regular tables. See detailed comments in
275 : : * xact.c where these variables are declared.
276 : : */
277 [ + - + - ]: 51 : if (unlikely(TransactionIdIsValid(CheckXidAlive) && !bsysscan))
278 [ # # # # ]: 0 : elog(ERROR, "unexpected table_tuple_get_latest_tid call during logical decoding");
279 : :
280 : : /*
281 : : * Since this can be called with user-supplied TID, don't trust the input
282 : : * too much.
283 : : */
284 [ + + ]: 51 : if (!tableam->tuple_tid_valid(scan, tid))
285 [ + - + - ]: 2 : ereport(ERROR,
286 : : (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
287 : : errmsg("tid (%u, %u) is not valid for relation \"%s\"",
288 : : ItemPointerGetBlockNumberNoCheck(tid),
289 : : ItemPointerGetOffsetNumberNoCheck(tid),
290 : : RelationGetRelationName(rel))));
291 : :
292 : 49 : tableam->tuple_get_latest_tid(scan, tid);
293 : 49 : }
294 : :
295 : :
296 : : /* ----------------------------------------------------------------------------
297 : : * Functions to make modifications a bit simpler.
298 : : * ----------------------------------------------------------------------------
299 : : */
300 : :
301 : : /*
302 : : * simple_table_tuple_insert - insert a tuple
303 : : *
304 : : * Currently, this routine differs from table_tuple_insert only in supplying a
305 : : * default command ID and not allowing access to the speedup options.
306 : : */
307 : : void
308 : 0 : simple_table_tuple_insert(Relation rel, TupleTableSlot *slot)
309 : : {
310 : 0 : table_tuple_insert(rel, slot, GetCurrentCommandId(true), 0, NULL);
311 : 0 : }
312 : :
313 : : /*
314 : : * simple_table_tuple_delete - delete a tuple
315 : : *
316 : : * This routine may be used to delete a tuple when concurrent updates of
317 : : * the target tuple are not expected (for example, because we have a lock
318 : : * on the relation associated with the tuple). Any failure is reported
319 : : * via ereport().
320 : : */
321 : : void
322 : 0 : simple_table_tuple_delete(Relation rel, ItemPointer tid, Snapshot snapshot)
323 : : {
324 : 0 : TM_Result result;
325 : 0 : TM_FailureData tmfd;
326 : :
327 : 0 : result = table_tuple_delete(rel, tid,
328 : 0 : GetCurrentCommandId(true),
329 : 0 : snapshot, InvalidSnapshot,
330 : : true /* wait for commit */ ,
331 : : &tmfd, false /* changingPart */ );
332 : :
333 [ # # # # : 0 : switch (result)
# ]
334 : : {
335 : : case TM_SelfModified:
336 : : /* Tuple was already updated in current command? */
337 [ # # # # ]: 0 : elog(ERROR, "tuple already updated by self");
338 : 0 : break;
339 : :
340 : : case TM_Ok:
341 : : /* done successfully */
342 : : break;
343 : :
344 : : case TM_Updated:
345 [ # # # # ]: 0 : elog(ERROR, "tuple concurrently updated");
346 : 0 : break;
347 : :
348 : : case TM_Deleted:
349 [ # # # # ]: 0 : elog(ERROR, "tuple concurrently deleted");
350 : 0 : break;
351 : :
352 : : default:
353 [ # # # # ]: 0 : elog(ERROR, "unrecognized table_tuple_delete status: %u", result);
354 : 0 : break;
355 : : }
356 : 0 : }
357 : :
358 : : /*
359 : : * simple_table_tuple_update - replace a tuple
360 : : *
361 : : * This routine may be used to update a tuple when concurrent updates of
362 : : * the target tuple are not expected (for example, because we have a lock
363 : : * on the relation associated with the tuple). Any failure is reported
364 : : * via ereport().
365 : : */
366 : : void
367 : 0 : simple_table_tuple_update(Relation rel, ItemPointer otid,
368 : : TupleTableSlot *slot,
369 : : Snapshot snapshot,
370 : : TU_UpdateIndexes *update_indexes)
371 : : {
372 : 0 : TM_Result result;
373 : 0 : TM_FailureData tmfd;
374 : 0 : LockTupleMode lockmode;
375 : :
376 : 0 : result = table_tuple_update(rel, otid, slot,
377 : 0 : GetCurrentCommandId(true),
378 : 0 : snapshot, InvalidSnapshot,
379 : : true /* wait for commit */ ,
380 : 0 : &tmfd, &lockmode, update_indexes);
381 : :
382 [ # # # # : 0 : switch (result)
# ]
383 : : {
384 : : case TM_SelfModified:
385 : : /* Tuple was already updated in current command? */
386 [ # # # # ]: 0 : elog(ERROR, "tuple already updated by self");
387 : 0 : break;
388 : :
389 : : case TM_Ok:
390 : : /* done successfully */
391 : : break;
392 : :
393 : : case TM_Updated:
394 [ # # # # ]: 0 : elog(ERROR, "tuple concurrently updated");
395 : 0 : break;
396 : :
397 : : case TM_Deleted:
398 [ # # # # ]: 0 : elog(ERROR, "tuple concurrently deleted");
399 : 0 : break;
400 : :
401 : : default:
402 [ # # # # ]: 0 : elog(ERROR, "unrecognized table_tuple_update status: %u", result);
403 : 0 : break;
404 : : }
405 : 0 : }
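A hedged sketch of how these simple_* wrappers are typically combined when concurrent updates are ruled out (for example by a suitable lock); the snapshot choice and names are illustrative, and index maintenance is deliberately omitted.

#include "postgres.h"

#include "access/tableam.h"
#include "executor/tuptable.h"
#include "storage/itemptr.h"
#include "utils/rel.h"
#include "utils/snapmgr.h"

/*
 * Illustrative: insert one row and delete another, relying on the wrappers
 * to ereport() on any unexpected TM_Result.  Callers remain responsible for
 * index maintenance (e.g. ExecInsertIndexTuples) and triggers.
 */
static void
example_simple_modifications(Relation rel, TupleTableSlot *newslot,
							 ItemPointer oldtid)
{
	simple_table_tuple_insert(rel, newslot);
	simple_table_tuple_delete(rel, oldtid, GetLatestSnapshot());
}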
406 : :
407 : :
408 : : /* ----------------------------------------------------------------------------
409 : : * Helper functions to implement parallel scans for block oriented AMs.
410 : : * ----------------------------------------------------------------------------
411 : : */
412 : :
413 : : Size
414 : 183 : table_block_parallelscan_estimate(Relation rel)
415 : : {
416 : 183 : return sizeof(ParallelBlockTableScanDescData);
417 : : }
418 : :
419 : : Size
420 : 183 : table_block_parallelscan_initialize(Relation rel, ParallelTableScanDesc pscan)
421 : : {
422 : 183 : ParallelBlockTableScanDesc bpscan = (ParallelBlockTableScanDesc) pscan;
423 : :
424 : 183 : bpscan->base.phs_locator = rel->rd_locator;
425 : 183 : bpscan->phs_nblocks = RelationGetNumberOfBlocks(rel);
426 : : /* compare phs_syncscan initialization to similar logic in initscan */
427 [ + - ]: 366 : bpscan->base.phs_syncscan = synchronize_seqscans &&
428 [ - + ]: 183 : !RelationUsesLocalBuffers(rel) &&
429 : 183 : bpscan->phs_nblocks > NBuffers / 4;
430 : 183 : SpinLockInit(&bpscan->phs_mutex);
431 : 183 : bpscan->phs_startblock = InvalidBlockNumber;
432 : 183 : bpscan->phs_numblock = InvalidBlockNumber;
433 : 183 : pg_atomic_init_u64(&bpscan->phs_nallocated, 0);
434 : :
435 : 183 : return sizeof(ParallelBlockTableScanDescData);
436 : 183 : }
437 : :
438 : : void
439 : 38 : table_block_parallelscan_reinitialize(Relation rel, ParallelTableScanDesc pscan)
440 : : {
441 : 38 : ParallelBlockTableScanDesc bpscan = (ParallelBlockTableScanDesc) pscan;
442 : :
443 : 38 : pg_atomic_write_u64(&bpscan->phs_nallocated, 0);
444 : 38 : }
445 : :
446 : : /*
447 : : * find and set the scan's startblock
448 : : *
449 : : * Determine where the parallel seq scan should start. This function may be
450 : : * called many times, once by each parallel worker. We must be careful only
451 : : * to set the phs_startblock and phs_numblock fields once.
452 : : *
453 : : * Callers may optionally specify a non-InvalidBlockNumber value for
454 : : * 'startblock' to force the scan to start at the given page. Likewise,
455 : : * 'numblocks' can be specified as a non-InvalidBlockNumber to limit the
456 : : * scan to that many blocks.
457 : : */
458 : : void
459 : 519 : table_block_parallelscan_startblock_init(Relation rel,
460 : : ParallelBlockTableScanWorker pbscanwork,
461 : : ParallelBlockTableScanDesc pbscan,
462 : : BlockNumber startblock,
463 : : BlockNumber numblocks)
464 : : {
465 : : StaticAssertDecl(MaxBlockNumber <= 0xFFFFFFFE,
466 : : "pg_nextpower2_32 may be too small for non-standard BlockNumber width");
467 : :
468 : 519 : BlockNumber sync_startpage = InvalidBlockNumber;
469 : 519 : BlockNumber scan_nblocks;
470 : :
471 : : /* Reset the state we use for controlling allocation size. */
472 : 519 : memset(pbscanwork, 0, sizeof(*pbscanwork));
473 : :
474 : : retry:
475 : : /* Grab the spinlock. */
476 [ - + ]: 519 : SpinLockAcquire(&pbscan->phs_mutex);
477 : :
478 : : /*
479 : : * When the caller specified a limit on the number of blocks to scan, set
480 : : * that in the ParallelBlockTableScanDesc, if it's not been done by
481 : : * another worker already.
482 : : */
483 [ + + + + ]: 519 : if (numblocks != InvalidBlockNumber &&
484 : 20 : pbscan->phs_numblock == InvalidBlockNumber)
485 : : {
486 : 4 : pbscan->phs_numblock = numblocks;
487 : 4 : }
488 : :
489 : : /*
490 : : * If the scan's phs_startblock has not yet been initialized, we must do
491 : : * so now. If a startblock was specified, start there, otherwise if this
492 : : * is not a synchronized scan, we just start at block 0, but if it is a
493 : : * synchronized scan, we must get the starting position from the
494 : : * synchronized scan machinery. We can't hold the spinlock while doing
495 : : * that, though, so release the spinlock, get the information we need, and
496 : : * retry. If nobody else has initialized the scan in the meantime, we'll
497 : : * fill in the value we fetched on the second time through.
498 : : */
499 [ + + ]: 519 : if (pbscan->phs_startblock == InvalidBlockNumber)
500 : : {
501 [ + + ]: 179 : if (startblock != InvalidBlockNumber)
502 : 4 : pbscan->phs_startblock = startblock;
503 [ + - ]: 175 : else if (!pbscan->base.phs_syncscan)
504 : 175 : pbscan->phs_startblock = 0;
505 [ # # ]: 0 : else if (sync_startpage != InvalidBlockNumber)
506 : 0 : pbscan->phs_startblock = sync_startpage;
507 : : else
508 : : {
509 : 0 : SpinLockRelease(&pbscan->phs_mutex);
510 : 0 : sync_startpage = ss_get_location(rel, pbscan->phs_nblocks);
511 : 0 : goto retry;
512 : : }
513 : 179 : }
514 : 519 : SpinLockRelease(&pbscan->phs_mutex);
515 : :
516 : : /*
517 : : * Figure out how many blocks we're going to scan; either all of them, or
518 : : * just phs_numblock's worth, if a limit has been imposed.
519 : : */
520 [ + + ]: 519 : if (pbscan->phs_numblock == InvalidBlockNumber)
521 : 499 : scan_nblocks = pbscan->phs_nblocks;
522 : : else
523 : 20 : scan_nblocks = pbscan->phs_numblock;
524 : :
525 : : /*
526 : : * We determine the chunk size based on scan_nblocks. First we split
527 : : * scan_nblocks into PARALLEL_SEQSCAN_NCHUNKS chunks, then round the result
528 : : * up to the next power of 2. This means we split the
529 : : * blocks we're scanning into somewhere between PARALLEL_SEQSCAN_NCHUNKS
530 : : * and PARALLEL_SEQSCAN_NCHUNKS / 2 chunks.
531 : : */
532 [ - + ]: 519 : pbscanwork->phsw_chunk_size = pg_nextpower2_32(Max(scan_nblocks /
533 : : PARALLEL_SEQSCAN_NCHUNKS, 1));
534 : :
535 : : /*
536 : : * Ensure we don't go over the maximum chunk size with larger tables. This
537 : : * means we may get many more chunks than PARALLEL_SEQSCAN_NCHUNKS for larger
538 : : * tables. Too large a chunk size has been shown to be detrimental to
539 : : * sequential scan performance.
540 : : */
541 [ + - ]: 519 : pbscanwork->phsw_chunk_size = Min(pbscanwork->phsw_chunk_size,
542 : : PARALLEL_SEQSCAN_MAX_CHUNK_SIZE);
543 : 519 : }
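As a worked example of the sizing above (table sizes hypothetical): for a 1,000,000-block scan, 1000000 / PARALLEL_SEQSCAN_NCHUNKS = 488 by integer division and pg_nextpower2_32(488) = 512, so chunks start at 512 blocks; for a 100,000,000-block scan the same math gives 65536, which the Min() against PARALLEL_SEQSCAN_MAX_CHUNK_SIZE caps at 8192.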
544 : :
545 : : /*
546 : : * get the next page to scan
547 : : *
548 : : * Get the next page to scan. Even if there are no pages left to scan,
549 : : * another backend could have grabbed a page to scan and not yet finished
550 : : * looking at it, so it doesn't follow that the scan is done when the first
551 : : * backend gets an InvalidBlockNumber return.
552 : : */
553 : : BlockNumber
554 : 30515 : table_block_parallelscan_nextpage(Relation rel,
555 : : ParallelBlockTableScanWorker pbscanwork,
556 : : ParallelBlockTableScanDesc pbscan)
557 : : {
558 : 30515 : BlockNumber scan_nblocks;
559 : 30515 : BlockNumber page;
560 : 30515 : uint64 nallocated;
561 : :
562 : : /*
563 : : * The logic below allocates block numbers out to parallel workers in a
564 : : * way that each worker will receive a set of consecutive block numbers to
565 : : * scan. Earlier versions of this would allocate the next highest block
566 : : * number to the next worker to call this function. This would generally
567 : : * result in workers never receiving consecutive block numbers. Some
568 : : * operating systems would not detect the sequential I/O pattern due to
569 : : * each backend being a different process, which could result in poor
570 : : * performance due to inefficient or no readahead. To work around this
571 : : * issue, we now allocate a range of block numbers for each worker and
572 : : * when they come back for another block, we give them the next one in
573 : : * that range until the range is complete. When the worker completes the
574 : : * range of blocks we then allocate another range for it and return the
575 : : * first block number from that range.
576 : : *
577 : : * Here we name these ranges of blocks "chunks". The initial size of
578 : : * these chunks is determined in table_block_parallelscan_startblock_init
579 : : * based on the number of blocks to scan. Towards the end of the scan, we
580 : : * start making reductions in the size of the chunks in order to attempt
581 : : * to divide the remaining work over all the workers as evenly as
582 : : * possible.
583 : : *
584 : : * Here pbscanwork is local worker memory. phsw_chunk_remaining tracks
585 : : * the number of blocks remaining in the chunk. When that reaches 0 then
586 : : * we must allocate a new chunk for the worker.
587 : : *
588 : : * phs_nallocated tracks how many blocks have been allocated to workers
589 : : * already. When phs_nallocated >= scan_nblocks, all blocks have been
590 : : * allocated.
591 : : *
592 : : * Because we use an atomic fetch-and-add to fetch the current value, the
593 : : * phs_nallocated counter will exceed scan_nblocks, because workers will
594 : : * still increment the value, when they try to allocate the next block but
595 : : * all blocks have been allocated already. The counter must be 64 bits
596 : : * wide because of that, to avoid wrapping around when scan_nblocks is
597 : : * close to 2^32.
598 : : *
599 : : * The actual block to return is calculated by adding the counter to the
600 : : * starting block number, modulo phs_nblocks.
601 : : */
602 : :
603 : : /* First, figure out how many blocks we're planning on scanning */
604 [ + + ]: 30515 : if (pbscan->phs_numblock == InvalidBlockNumber)
605 : 30412 : scan_nblocks = pbscan->phs_nblocks;
606 : : else
607 : 103 : scan_nblocks = pbscan->phs_numblock;
608 : :
609 : : /*
610 : : * Now check if we have any remaining blocks in a previous chunk for this
611 : : * worker. We must consume all of the blocks from that before we allocate
612 : : * a new chunk to the worker.
613 : : */
614 [ - + ]: 30515 : if (pbscanwork->phsw_chunk_remaining > 0)
615 : : {
616 : : /*
617 : : * Give them the next block in the range and update the remaining
618 : : * number of blocks.
619 : : */
620 : 0 : nallocated = ++pbscanwork->phsw_nallocated;
621 : 0 : pbscanwork->phsw_chunk_remaining--;
622 : 0 : }
623 : : else
624 : : {
625 : : /*
626 : : * When we've only got PARALLEL_SEQSCAN_RAMPDOWN_CHUNKS chunks
627 : : * remaining in the scan, we half the chunk size. Since we reduce the
628 : : * chunk size here, we'll hit this again after doing
629 : : * PARALLEL_SEQSCAN_RAMPDOWN_CHUNKS at the new size. After a few
630 : : * iterations of this, we'll end up doing the last few blocks with the
631 : : * chunk size set to 1.
632 : : */
633 [ - + # # ]: 30515 : if (pbscanwork->phsw_chunk_size > 1 &&
634 : 0 : pbscanwork->phsw_nallocated > scan_nblocks -
635 : 0 : (pbscanwork->phsw_chunk_size * PARALLEL_SEQSCAN_RAMPDOWN_CHUNKS))
636 : 0 : pbscanwork->phsw_chunk_size >>= 1;
637 : :
638 : 30515 : nallocated = pbscanwork->phsw_nallocated =
639 : 61030 : pg_atomic_fetch_add_u64(&pbscan->phs_nallocated,
640 : 30515 : pbscanwork->phsw_chunk_size);
641 : :
642 : : /*
643 : : * Set the remaining number of blocks in this chunk so that subsequent
644 : : * calls from this worker continue on with this chunk until it's done.
645 : : */
646 : 30515 : pbscanwork->phsw_chunk_remaining = pbscanwork->phsw_chunk_size - 1;
647 : : }
648 : :
649 : : /* Check if we've run out of blocks to scan */
650 [ + + ]: 30515 : if (nallocated >= scan_nblocks)
651 : 519 : page = InvalidBlockNumber; /* all blocks have been allocated */
652 : : else
653 : 29996 : page = (nallocated + pbscan->phs_startblock) % pbscan->phs_nblocks;
654 : :
655 : : /*
656 : : * Report scan location. Normally, we report the current page number.
657 : : * When we reach the end of the scan, though, we report the starting page,
658 : : * not the ending page, just so the starting positions for later scans
659 : : * don't slew backwards. We only report the position at the end of the
660 : : * scan once, though: subsequent callers will report nothing.
661 : : */
662 [ + - ]: 30515 : if (pbscan->base.phs_syncscan)
663 : : {
664 [ # # ]: 0 : if (page != InvalidBlockNumber)
665 : 0 : ss_report_location(rel, page);
666 [ # # ]: 0 : else if (nallocated == pbscan->phs_nblocks)
667 : 0 : ss_report_location(rel, pbscan->phs_startblock);
668 : 0 : }
669 : :
670 : 61030 : return page;
671 : 30515 : }
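A hedged sketch of the per-worker loop these two helpers are designed for, roughly what a block-oriented AM's parallel scan code does; the example_ names are illustrative and no start-block or block-count limit is imposed.

#include "postgres.h"

#include "access/tableam.h"
#include "storage/block.h"
#include "utils/rel.h"

/* Illustrative worker loop: claim chunks of blocks until the scan is done. */
static void
example_parallel_block_worker(Relation rel,
							  ParallelBlockTableScanWorker workspace,
							  ParallelBlockTableScanDesc pbscan)
{
	BlockNumber blkno;

	table_block_parallelscan_startblock_init(rel, workspace, pbscan,
											 InvalidBlockNumber,
											 InvalidBlockNumber);

	while ((blkno = table_block_parallelscan_nextpage(rel, workspace, pbscan))
		   != InvalidBlockNumber)
	{
		/* ... read block 'blkno' and emit its visible tuples ... */
	}
}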
672 : :
673 : : /* ----------------------------------------------------------------------------
674 : : * Helper functions to implement relation sizing for block oriented AMs.
675 : : * ----------------------------------------------------------------------------
676 : : */
677 : :
678 : : /*
679 : : * table_block_relation_size
680 : : *
681 : : * If a table AM uses the various relation forks as the sole place where data
682 : : * is stored, and if it uses them in the expected manner (e.g. the actual data
683 : : * is in the main fork rather than some other), it can use this implementation
684 : : * of the relation_size callback rather than implementing its own.
685 : : */
686 : : uint64
687 : 405815 : table_block_relation_size(Relation rel, ForkNumber forkNumber)
688 : : {
689 : 405815 : uint64 nblocks = 0;
690 : :
691 : : /* InvalidForkNumber indicates returning the size for all forks */
692 [ - + ]: 405815 : if (forkNumber == InvalidForkNumber)
693 : : {
694 [ # # ]: 0 : for (int i = 0; i < MAX_FORKNUM; i++)
695 : 0 : nblocks += smgrnblocks(RelationGetSmgr(rel), i);
696 : 0 : }
697 : : else
698 : 405815 : nblocks = smgrnblocks(RelationGetSmgr(rel), forkNumber);
699 : :
700 : 811630 : return nblocks * BLCKSZ;
701 : 405815 : }
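A hedged sketch of how a block-based AM can adopt this helper directly as its relation_size callback (most fields of the routine are elided; the struct name is illustrative):

#include "postgres.h"

#include "access/tableam.h"
#include "nodes/nodes.h"

/* Illustrative fragment of a block-based AM's method table. */
static const TableAmRoutine example_block_am_methods = {
	.type = T_TableAmRoutine,
	/* ... scan, fetch and DML callbacks elided ... */
	.relation_size = table_block_relation_size,
};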
702 : :
703 : : /*
704 : : * table_block_relation_estimate_size
705 : : *
706 : : * This function can't be directly used as the implementation of the
707 : : * relation_estimate_size callback, because it has a few additional parameters.
708 : : * Instead, it is intended to be used as a helper function; the caller can
709 : : * pass through the arguments to its relation_estimate_size function plus the
710 : : * additional values required here.
711 : : *
712 : : * overhead_bytes_per_tuple should contain the approximate number of bytes
713 : : * of storage required to store a tuple above and beyond what is required for
714 : : * the tuple data proper. Typically, this would include things like the
715 : : * size of the tuple header and item pointer. This is only used for query
716 : : * planning, so a table AM where the value is not constant could choose to
717 : : * pass a "best guess".
718 : : *
719 : : * usable_bytes_per_page should contain the approximate number of bytes per
720 : : * page usable for tuple data, excluding the page header and any anticipated
721 : : * special space.
722 : : */
723 : : void
724 : 47801 : table_block_relation_estimate_size(Relation rel, int32 *attr_widths,
725 : : BlockNumber *pages, double *tuples,
726 : : double *allvisfrac,
727 : : Size overhead_bytes_per_tuple,
728 : : Size usable_bytes_per_page)
729 : : {
730 : 47801 : BlockNumber curpages;
731 : 47801 : BlockNumber relpages;
732 : 47801 : double reltuples;
733 : 47801 : BlockNumber relallvisible;
734 : 47801 : double density;
735 : :
736 : : /* it should have storage, so we can call the smgr */
737 : 47801 : curpages = RelationGetNumberOfBlocks(rel);
738 : :
739 : : /* coerce values in pg_class to more desirable types */
740 : 47801 : relpages = (BlockNumber) rel->rd_rel->relpages;
741 : 47801 : reltuples = (double) rel->rd_rel->reltuples;
742 : 47801 : relallvisible = (BlockNumber) rel->rd_rel->relallvisible;
743 : :
744 : : /*
745 : : * HACK: if the relation has never yet been vacuumed, use a minimum size
746 : : * estimate of 10 pages. The idea here is to avoid assuming a
747 : : * newly-created table is really small, even if it currently is, because
748 : : * that may not be true once some data gets loaded into it. Once a vacuum
749 : : * or analyze cycle has been done on it, it's more reasonable to believe
750 : : * the size is somewhat stable.
751 : : *
752 : : * (Note that this is only an issue if the plan gets cached and used again
753 : : * after the table has been filled. What we're trying to avoid is using a
754 : : * nestloop-type plan on a table that has grown substantially since the
755 : : * plan was made. Normally, autovacuum/autoanalyze will occur once enough
756 : : * inserts have happened and cause cached-plan invalidation; but that
757 : : * doesn't happen instantaneously, and it won't happen at all for cases
758 : : * such as temporary tables.)
759 : : *
760 : : * We test "never vacuumed" by seeing whether reltuples < 0.
761 : : *
762 : : * If the table has inheritance children, we don't apply this heuristic.
763 : : * Totally empty parent tables are quite common, so we should be willing
764 : : * to believe that they are empty.
765 : : */
766 [ + + ]: 47801 : if (curpages < 10 &&
767 [ + + + + ]: 30258 : reltuples < 0 &&
768 : 15652 : !rel->rd_rel->relhassubclass)
769 : 15253 : curpages = 10;
770 : :
771 : : /* report estimated # pages */
772 : 47801 : *pages = curpages;
773 : : /* quick exit if rel is clearly empty */
774 [ + + ]: 47801 : if (curpages == 0)
775 : : {
776 : 1135 : *tuples = 0;
777 : 1135 : *allvisfrac = 0;
778 : 1135 : return;
779 : : }
780 : :
781 : : /* estimate number of tuples from previous tuple density */
782 [ + + + + ]: 46666 : if (reltuples >= 0 && relpages > 0)
783 : 25669 : density = reltuples / (double) relpages;
784 : : else
785 : : {
786 : : /*
787 : : * When we have no data because the relation was never yet vacuumed,
788 : : * estimate tuple width from attribute datatypes. We assume here that
789 : : * the pages are completely full, which is OK for tables but is
790 : : * probably an overestimate for indexes. Fortunately
791 : : * get_relation_info() can clamp the overestimate to the parent
792 : : * table's size.
793 : : *
794 : : * Note: this code intentionally disregards alignment considerations,
795 : : * because (a) that would be gilding the lily considering how crude
796 : : * the estimate is, (b) it creates platform dependencies in the
797 : : * default plans which are kind of a headache for regression testing,
798 : : * and (c) different table AMs might use different padding schemes.
799 : : */
800 : 20997 : int32 tuple_width;
801 : 20997 : int fillfactor;
802 : :
803 : : /*
804 : : * Without reltuples/relpages, we also need to consider fillfactor.
805 : : * The other branch considers it implicitly by calculating density
806 : : * from actual relpages/reltuples statistics.
807 : : */
808 [ + + ]: 20997 : fillfactor = RelationGetFillFactor(rel, HEAP_DEFAULT_FILLFACTOR);
809 : :
810 : 20997 : tuple_width = get_rel_data_width(rel, attr_widths);
811 : 20997 : tuple_width += overhead_bytes_per_tuple;
812 : : /* note: integer division is intentional here */
813 : 20997 : density = (usable_bytes_per_page * fillfactor / 100) / tuple_width;
814 : : /* There's at least one row on the page, even with low fillfactor. */
815 : 20997 : density = clamp_row_est(density);
816 : 20997 : }
817 : 46666 : *tuples = rint(density * (double) curpages);
818 : :
819 : : /*
820 : : * We use relallvisible as-is, rather than scaling it up like we do for
821 : : * the pages and tuples counts, on the theory that any pages added since
822 : : * the last VACUUM are most likely not marked all-visible. But costsize.c
823 : : * wants it converted to a fraction.
824 : : */
825 [ + + - + ]: 46666 : if (relallvisible == 0 || curpages <= 0)
826 : 27861 : *allvisfrac = 0;
827 [ + + ]: 18805 : else if ((double) relallvisible >= curpages)
828 : 6891 : *allvisfrac = 1;
829 : : else
830 : 11914 : *allvisfrac = (double) relallvisible / curpages;
831 [ - + ]: 47801 : }
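A hedged sketch of the intended calling pattern: an AM's relation_estimate_size callback passes its own per-tuple overhead and usable page space through to this helper. The constants below are illustrative for a hypothetical block-based AM, not heapam's exact values.

#include "postgres.h"

#include "access/tableam.h"
#include "storage/block.h"
#include "storage/bufpage.h"
#include "utils/rel.h"

/* Illustrative relation_estimate_size callback for a hypothetical AM. */
static void
example_estimate_rel_size(Relation rel, int32 *attr_widths,
						  BlockNumber *pages, double *tuples,
						  double *allvisfrac)
{
	table_block_relation_estimate_size(rel, attr_widths, pages, tuples,
									   allvisfrac,
									   24,	/* assumed per-tuple overhead */
									   BLCKSZ - SizeOfPageHeaderData);
}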