LCOV - code coverage report
Current view: top level - src/backend/storage/buffer - bufmgr.c (source / functions) Coverage Total Hit
Test: Code coverage Lines: 72.3 % 2992 2164
Test Date: 2026-01-26 10:56:24 Functions: 81.3 % 139 113
Legend: Lines:     hit not hit
Branches: + taken - not taken # not executed
Branches: 50.8 % 1881 956

             Branch data     Line data    Source code
       1                 :             : /*-------------------------------------------------------------------------
       2                 :             :  *
       3                 :             :  * bufmgr.c
       4                 :             :  *        buffer manager interface routines
       5                 :             :  *
       6                 :             :  * Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group
       7                 :             :  * Portions Copyright (c) 1994, Regents of the University of California
       8                 :             :  *
       9                 :             :  *
      10                 :             :  * IDENTIFICATION
      11                 :             :  *        src/backend/storage/buffer/bufmgr.c
      12                 :             :  *
      13                 :             :  *-------------------------------------------------------------------------
      14                 :             :  */
      15                 :             : /*
      16                 :             :  * Principal entry points:
      17                 :             :  *
      18                 :             :  * ReadBuffer() -- find or create a buffer holding the requested page,
      19                 :             :  *              and pin it so that no one can destroy it while this process
      20                 :             :  *              is using it.
      21                 :             :  *
      22                 :             :  * StartReadBuffer() -- as above, with separate wait step
      23                 :             :  * StartReadBuffers() -- multiple block version
      24                 :             :  * WaitReadBuffers() -- second step of above
      25                 :             :  *
      26                 :             :  * ReleaseBuffer() -- unpin a buffer
      27                 :             :  *
      28                 :             :  * MarkBufferDirty() -- mark a pinned buffer's contents as "dirty".
      29                 :             :  *              The disk write is delayed until buffer replacement or checkpoint.
      30                 :             :  *
      31                 :             :  * See also these files:
      32                 :             :  *              freelist.c -- chooses victim for buffer replacement
      33                 :             :  *              buf_table.c -- manages the buffer lookup table
      34                 :             :  */
      35                 :             : #include "postgres.h"
      36                 :             : 
      37                 :             : #include <sys/file.h>
      38                 :             : #include <unistd.h>
      39                 :             : 
      40                 :             : #include "access/tableam.h"
      41                 :             : #include "access/xloginsert.h"
      42                 :             : #include "access/xlogutils.h"
      43                 :             : #ifdef USE_ASSERT_CHECKING
      44                 :             : #include "catalog/pg_tablespace_d.h"
      45                 :             : #endif
      46                 :             : #include "catalog/storage.h"
      47                 :             : #include "catalog/storage_xlog.h"
      48                 :             : #include "executor/instrument.h"
      49                 :             : #include "lib/binaryheap.h"
      50                 :             : #include "miscadmin.h"
      51                 :             : #include "pg_trace.h"
      52                 :             : #include "pgstat.h"
      53                 :             : #include "postmaster/bgwriter.h"
      54                 :             : #include "storage/aio.h"
      55                 :             : #include "storage/buf_internals.h"
      56                 :             : #include "storage/bufmgr.h"
      57                 :             : #include "storage/fd.h"
      58                 :             : #include "storage/ipc.h"
      59                 :             : #include "storage/lmgr.h"
      60                 :             : #include "storage/proc.h"
      61                 :             : #include "storage/proclist.h"
      62                 :             : #include "storage/read_stream.h"
      63                 :             : #include "storage/smgr.h"
      64                 :             : #include "storage/standby.h"
      65                 :             : #include "utils/memdebug.h"
      66                 :             : #include "utils/ps_status.h"
      67                 :             : #include "utils/rel.h"
      68                 :             : #include "utils/resowner.h"
      69                 :             : #include "utils/timestamp.h"
      70                 :             : 
      71                 :             : 
      72                 :             : /* Note: these two macros only work on shared buffers, not local ones! */
      73                 :             : #define BufHdrGetBlock(bufHdr)  ((Block) (BufferBlocks + ((Size) (bufHdr)->buf_id) * BLCKSZ))
      74                 :             : #define BufferGetLSN(bufHdr)    (PageGetLSN(BufHdrGetBlock(bufHdr)))
      75                 :             : 
      76                 :             : /* Note: this macro only works on local buffers, not shared ones! */
      77                 :             : #define LocalBufHdrGetBlock(bufHdr) \
      78                 :             :         LocalBufferBlockPointers[-((bufHdr)->buf_id + 2)]
      79                 :             : 
      80                 :             : /* Bits in SyncOneBuffer's return value */
      81                 :             : #define BUF_WRITTEN                             0x01
      82                 :             : #define BUF_REUSABLE                    0x02
      83                 :             : 
      84                 :             : #define RELS_BSEARCH_THRESHOLD          20
      85                 :             : 
      86                 :             : /*
      87                 :             :  * This is the size (in the number of blocks) above which we scan the
      88                 :             :  * entire buffer pool to remove the buffers for all the pages of relation
      89                 :             :  * being dropped. For the relations with size below this threshold, we find
      90                 :             :  * the buffers by doing lookups in BufMapping table.
      91                 :             :  */
      92                 :             : #define BUF_DROP_FULL_SCAN_THRESHOLD            (uint64) (NBuffers / 32)
      93                 :             : 
      94                 :             : /*
      95                 :             :  * This is separated out from PrivateRefCountEntry to allow for copying all
      96                 :             :  * the data members via struct assignment.
      97                 :             :  */
      98                 :             : typedef struct PrivateRefCountData
      99                 :             : {
     100                 :             :         /*
     101                 :             :          * How many times has the buffer been pinned by this backend.
     102                 :             :          */
     103                 :             :         int32           refcount;
     104                 :             : 
     105                 :             :         /*
     106                 :             :          * Is the buffer locked by this backend? BUFFER_LOCK_UNLOCK indicates that
     107                 :             :          * the buffer is not locked.
     108                 :             :          */
     109                 :             :         BufferLockMode lockmode;
     110                 :             : } PrivateRefCountData;
     111                 :             : 
     112                 :             : typedef struct PrivateRefCountEntry
     113                 :             : {
     114                 :             :         /*
     115                 :             :          * Note that this needs to be same as the entry's corresponding
     116                 :             :          * PrivateRefCountArrayKeys[i], if the entry is stored in the array. We
     117                 :             :          * store it in both places as this is used for the hashtable key and
     118                 :             :          * because it is more convenient (passing around a PrivateRefCountEntry
     119                 :             :          * suffices to identify the buffer) and faster (checking the keys array is
     120                 :             :          * faster when checking many entries, checking the entry is faster if just
     121                 :             :          * checking a single entry).
     122                 :             :          */
     123                 :             :         Buffer          buffer;
     124                 :             : 
     125                 :             :         PrivateRefCountData data;
     126                 :             : } PrivateRefCountEntry;
     127                 :             : 
     128                 :             : /* 64 bytes, about the size of a cache line on common systems */
     129                 :             : #define REFCOUNT_ARRAY_ENTRIES 8
     130                 :             : 
     131                 :             : /*
     132                 :             :  * Status of buffers to checkpoint for a particular tablespace, used
     133                 :             :  * internally in BufferSync.
     134                 :             :  */
     135                 :             : typedef struct CkptTsStatus
     136                 :             : {
     137                 :             :         /* oid of the tablespace */
     138                 :             :         Oid                     tsId;
     139                 :             : 
     140                 :             :         /*
     141                 :             :          * Checkpoint progress for this tablespace. To make progress comparable
     142                 :             :          * between tablespaces the progress is, for each tablespace, measured as a
     143                 :             :          * number between 0 and the total number of to-be-checkpointed pages. Each
     144                 :             :          * page checkpointed in this tablespace increments this space's progress
     145                 :             :          * by progress_slice.
     146                 :             :          */
     147                 :             :         float8          progress;
     148                 :             :         float8          progress_slice;
     149                 :             : 
     150                 :             :         /* number of to-be checkpointed pages in this tablespace */
     151                 :             :         int                     num_to_scan;
     152                 :             :         /* already processed pages in this tablespace */
     153                 :             :         int                     num_scanned;
     154                 :             : 
     155                 :             :         /* current offset in CkptBufferIds for this tablespace */
     156                 :             :         int                     index;
     157                 :             : } CkptTsStatus;
     158                 :             : 
     159                 :             : /*
     160                 :             :  * Type for array used to sort SMgrRelations
     161                 :             :  *
     162                 :             :  * FlushRelationsAllBuffers shares the same comparator function with
     163                 :             :  * DropRelationsAllBuffers. Pointer to this struct and RelFileLocator must be
     164                 :             :  * compatible.
     165                 :             :  */
     166                 :             : typedef struct SMgrSortArray
     167                 :             : {
     168                 :             :         RelFileLocator rlocator;        /* This must be the first member */
     169                 :             :         SMgrRelation srel;
     170                 :             : } SMgrSortArray;
     171                 :             : 
     172                 :             : /* GUC variables */
     173                 :             : bool            zero_damaged_pages = false;
     174                 :             : int                     bgwriter_lru_maxpages = 100;
     175                 :             : double          bgwriter_lru_multiplier = 2.0;
     176                 :             : bool            track_io_timing = false;
     177                 :             : 
     178                 :             : /*
     179                 :             :  * How many buffers PrefetchBuffer callers should try to stay ahead of their
     180                 :             :  * ReadBuffer calls by.  Zero means "never prefetch".  This value is only used
     181                 :             :  * for buffers not belonging to tablespaces that have their
     182                 :             :  * effective_io_concurrency parameter set.
     183                 :             :  */
     184                 :             : int                     effective_io_concurrency = DEFAULT_EFFECTIVE_IO_CONCURRENCY;
     185                 :             : 
     186                 :             : /*
     187                 :             :  * Like effective_io_concurrency, but used by maintenance code paths that might
     188                 :             :  * benefit from a higher setting because they work on behalf of many sessions.
     189                 :             :  * Overridden by the tablespace setting of the same name.
     190                 :             :  */
     191                 :             : int                     maintenance_io_concurrency = DEFAULT_MAINTENANCE_IO_CONCURRENCY;
     192                 :             : 
     193                 :             : /*
     194                 :             :  * Limit on how many blocks should be handled in single I/O operations.
     195                 :             :  * StartReadBuffers() callers should respect it, as should other operations
     196                 :             :  * that call smgr APIs directly.  It is computed as the minimum of underlying
     197                 :             :  * GUCs io_combine_limit_guc and io_max_combine_limit.
     198                 :             :  */
     199                 :             : int                     io_combine_limit = DEFAULT_IO_COMBINE_LIMIT;
     200                 :             : int                     io_combine_limit_guc = DEFAULT_IO_COMBINE_LIMIT;
     201                 :             : int                     io_max_combine_limit = DEFAULT_IO_COMBINE_LIMIT;
     202                 :             : 
     203                 :             : /*
     204                 :             :  * GUC variables about triggering kernel writeback for buffers written; OS
     205                 :             :  * dependent defaults are set via the GUC mechanism.
     206                 :             :  */
     207                 :             : int                     checkpoint_flush_after = DEFAULT_CHECKPOINT_FLUSH_AFTER;
     208                 :             : int                     bgwriter_flush_after = DEFAULT_BGWRITER_FLUSH_AFTER;
     209                 :             : int                     backend_flush_after = DEFAULT_BACKEND_FLUSH_AFTER;
     210                 :             : 
     211                 :             : /* local state for LockBufferForCleanup */
     212                 :             : static BufferDesc *PinCountWaitBuf = NULL;
     213                 :             : 
     214                 :             : /*
     215                 :             :  * Backend-Private refcount management:
     216                 :             :  *
     217                 :             :  * Each buffer also has a private refcount that keeps track of the number of
     218                 :             :  * times the buffer is pinned in the current process.  This is so that the
     219                 :             :  * shared refcount needs to be modified only once if a buffer is pinned more
     220                 :             :  * than once by an individual backend.  It's also used to check that no
     221                 :             :  * buffers are still pinned at the end of transactions and when exiting. We
     222                 :             :  * also use this mechanism to track whether this backend has a buffer locked,
     223                 :             :  * and, if so, in what mode.
     224                 :             :  *
     225                 :             :  *
     226                 :             :  * To avoid - as we used to - requiring an array with NBuffers entries to keep
     227                 :             :  * track of local buffers, we use a small sequentially searched array
     228                 :             :  * (PrivateRefCountArrayKeys, with the corresponding data stored in
     229                 :             :  * PrivateRefCountArray) and an overflow hash table (PrivateRefCountHash) to
     230                 :             :  * keep track of backend local pins.
     231                 :             :  *
     232                 :             :  * Until no more than REFCOUNT_ARRAY_ENTRIES buffers are pinned at once, all
     233                 :             :  * refcounts are kept track of in the array; after that, new array entries
     234                 :             :  * displace old ones into the hash table. That way a frequently used entry
     235                 :             :  * can't get "stuck" in the hashtable while infrequent ones clog the array.
     236                 :             :  *
     237                 :             :  * Note that in most scenarios the number of pinned buffers will not exceed
     238                 :             :  * REFCOUNT_ARRAY_ENTRIES.
     239                 :             :  *
     240                 :             :  *
     241                 :             :  * To enter a buffer into the refcount tracking mechanism first reserve a free
     242                 :             :  * entry using ReservePrivateRefCountEntry() and then later, if necessary,
     243                 :             :  * fill it with NewPrivateRefCountEntry(). That split lets us avoid doing
     244                 :             :  * memory allocations in NewPrivateRefCountEntry() which can be important
     245                 :             :  * because in some scenarios it's called with a spinlock held...
     246                 :             :  */
     247                 :             : static Buffer PrivateRefCountArrayKeys[REFCOUNT_ARRAY_ENTRIES];
     248                 :             : static struct PrivateRefCountEntry PrivateRefCountArray[REFCOUNT_ARRAY_ENTRIES];
     249                 :             : static HTAB *PrivateRefCountHash = NULL;
     250                 :             : static int32 PrivateRefCountOverflowed = 0;
     251                 :             : static uint32 PrivateRefCountClock = 0;
     252                 :             : static int      ReservedRefCountSlot = -1;
     253                 :             : static int      PrivateRefCountEntryLast = -1;
     254                 :             : 
     255                 :             : static uint32 MaxProportionalPins;
     256                 :             : 
     257                 :             : static void ReservePrivateRefCountEntry(void);
     258                 :             : static PrivateRefCountEntry *NewPrivateRefCountEntry(Buffer buffer);
     259                 :             : static PrivateRefCountEntry *GetPrivateRefCountEntry(Buffer buffer, bool do_move);
     260                 :             : static inline int32 GetPrivateRefCount(Buffer buffer);
     261                 :             : static void ForgetPrivateRefCountEntry(PrivateRefCountEntry *ref);
     262                 :             : 
     263                 :             : /* ResourceOwner callbacks to hold in-progress I/Os and buffer pins */
     264                 :             : static void ResOwnerReleaseBufferIO(Datum res);
     265                 :             : static char *ResOwnerPrintBufferIO(Datum res);
     266                 :             : static void ResOwnerReleaseBuffer(Datum res);
     267                 :             : static char *ResOwnerPrintBuffer(Datum res);
     268                 :             : 
     269                 :             : const ResourceOwnerDesc buffer_io_resowner_desc =
     270                 :             : {
     271                 :             :         .name = "buffer io",
     272                 :             :         .release_phase = RESOURCE_RELEASE_BEFORE_LOCKS,
     273                 :             :         .release_priority = RELEASE_PRIO_BUFFER_IOS,
     274                 :             :         .ReleaseResource = ResOwnerReleaseBufferIO,
     275                 :             :         .DebugPrint = ResOwnerPrintBufferIO
     276                 :             : };
     277                 :             : 
     278                 :             : const ResourceOwnerDesc buffer_resowner_desc =
     279                 :             : {
     280                 :             :         .name = "buffer",
     281                 :             :         .release_phase = RESOURCE_RELEASE_BEFORE_LOCKS,
     282                 :             :         .release_priority = RELEASE_PRIO_BUFFER_PINS,
     283                 :             :         .ReleaseResource = ResOwnerReleaseBuffer,
     284                 :             :         .DebugPrint = ResOwnerPrintBuffer
     285                 :             : };
     286                 :             : 
     287                 :             : /*
     288                 :             :  * Ensure that the PrivateRefCountArray has sufficient space to store one more
     289                 :             :  * entry. This has to be called before using NewPrivateRefCountEntry() to fill
     290                 :             :  * a new entry - but it's perfectly fine to not use a reserved entry.
     291                 :             :  */
     292                 :             : static void
     293                 :    11430862 : ReservePrivateRefCountEntry(void)
     294                 :             : {
     295                 :             :         /* Already reserved (or freed), nothing to do */
     296         [ +  + ]:    11430862 :         if (ReservedRefCountSlot != -1)
     297                 :    10001149 :                 return;
     298                 :             : 
     299                 :             :         /*
     300                 :             :          * First search for a free entry the array, that'll be sufficient in the
     301                 :             :          * majority of cases.
     302                 :             :          */
     303                 :             :         {
     304                 :     1429713 :                 int                     i;
     305                 :             : 
     306         [ +  + ]:    12867417 :                 for (i = 0; i < REFCOUNT_ARRAY_ENTRIES; i++)
     307                 :             :                 {
     308         [ +  + ]:    11437704 :                         if (PrivateRefCountArrayKeys[i] == InvalidBuffer)
     309                 :             :                         {
     310                 :     8900598 :                                 ReservedRefCountSlot = i;
     311                 :             : 
     312                 :             :                                 /*
     313                 :             :                                  * We could return immediately, but iterating till the end of
     314                 :             :                                  * the array allows compiler-autovectorization.
     315                 :             :                                  */
     316                 :     8900598 :                         }
     317                 :    11437704 :                 }
     318                 :             : 
     319         [ +  + ]:     1429713 :                 if (ReservedRefCountSlot != -1)
     320                 :     1421696 :                         return;
     321      [ -  +  + ]:     1429713 :         }
     322                 :             : 
     323                 :             :         /*
     324                 :             :          * No luck. All array entries are full. Move one array entry into the hash
     325                 :             :          * table.
     326                 :             :          */
     327                 :             :         {
     328                 :             :                 /*
     329                 :             :                  * Move entry from the current clock position in the array into the
     330                 :             :                  * hashtable. Use that slot.
     331                 :             :                  */
     332                 :        8017 :                 int                     victim_slot;
     333                 :        8017 :                 PrivateRefCountEntry *victim_entry;
     334                 :        8017 :                 PrivateRefCountEntry *hashent;
     335                 :        8017 :                 bool            found;
     336                 :             : 
     337                 :             :                 /* select victim slot */
     338                 :        8017 :                 victim_slot = PrivateRefCountClock++ % REFCOUNT_ARRAY_ENTRIES;
     339                 :        8017 :                 victim_entry = &PrivateRefCountArray[victim_slot];
     340                 :        8017 :                 ReservedRefCountSlot = victim_slot;
     341                 :             : 
     342                 :             :                 /* Better be used, otherwise we shouldn't get here. */
     343         [ +  - ]:        8017 :                 Assert(PrivateRefCountArrayKeys[victim_slot] != InvalidBuffer);
     344         [ +  - ]:        8017 :                 Assert(PrivateRefCountArray[victim_slot].buffer != InvalidBuffer);
     345         [ +  - ]:        8017 :                 Assert(PrivateRefCountArrayKeys[victim_slot] == PrivateRefCountArray[victim_slot].buffer);
     346                 :             : 
     347                 :             :                 /* enter victim array entry into hashtable */
     348                 :       16034 :                 hashent = hash_search(PrivateRefCountHash,
     349                 :        8017 :                                                           &PrivateRefCountArrayKeys[victim_slot],
     350                 :             :                                                           HASH_ENTER,
     351                 :             :                                                           &found);
     352         [ +  - ]:        8017 :                 Assert(!found);
     353                 :             :                 /* move data from the entry in the array to the hash entry */
     354                 :        8017 :                 hashent->data = victim_entry->data;
     355                 :             : 
     356                 :             :                 /* clear the now free array slot */
     357                 :        8017 :                 PrivateRefCountArrayKeys[victim_slot] = InvalidBuffer;
     358                 :        8017 :                 victim_entry->buffer = InvalidBuffer;
     359                 :             : 
     360                 :             :                 /* clear the whole data member, just for future proofing */
     361                 :        8017 :                 memset(&victim_entry->data, 0, sizeof(victim_entry->data));
     362                 :        8017 :                 victim_entry->data.refcount = 0;
     363                 :        8017 :                 victim_entry->data.lockmode = BUFFER_LOCK_UNLOCK;
     364                 :             : 
     365                 :        8017 :                 PrivateRefCountOverflowed++;
     366                 :        8017 :         }
     367                 :    11430862 : }
     368                 :             : 
     369                 :             : /*
     370                 :             :  * Fill a previously reserved refcount entry.
     371                 :             :  */
     372                 :             : static PrivateRefCountEntry *
     373                 :    10641775 : NewPrivateRefCountEntry(Buffer buffer)
     374                 :             : {
     375                 :    10641775 :         PrivateRefCountEntry *res;
     376                 :             : 
     377                 :             :         /* only allowed to be called when a reservation has been made */
     378         [ +  - ]:    10641775 :         Assert(ReservedRefCountSlot != -1);
     379                 :             : 
     380                 :             :         /* use up the reserved entry */
     381                 :    10641775 :         res = &PrivateRefCountArray[ReservedRefCountSlot];
     382                 :             : 
     383                 :             :         /* and fill it */
     384                 :    10641775 :         PrivateRefCountArrayKeys[ReservedRefCountSlot] = buffer;
     385                 :    10641775 :         res->buffer = buffer;
     386                 :    10641775 :         res->data.refcount = 0;
     387                 :    10641775 :         res->data.lockmode = BUFFER_LOCK_UNLOCK;
     388                 :             : 
     389                 :             :         /* update cache for the next lookup */
     390                 :    10641775 :         PrivateRefCountEntryLast = ReservedRefCountSlot;
     391                 :             : 
     392                 :    10641775 :         ReservedRefCountSlot = -1;
     393                 :             : 
     394                 :    21283550 :         return res;
     395                 :    10641775 : }
     396                 :             : 
     397                 :             : /*
     398                 :             :  * Slow-path for GetPrivateRefCountEntry(). This is big enough to not be worth
     399                 :             :  * inlining. This particularly seems to be true if the compiler is capable of
     400                 :             :  * auto-vectorizing the code, as that imposes additional stack-alignment
     401                 :             :  * requirements etc.
     402                 :             :  */
     403                 :             : static pg_noinline PrivateRefCountEntry *
     404                 :    16052330 : GetPrivateRefCountEntrySlow(Buffer buffer, bool do_move)
     405                 :             : {
     406                 :    16052330 :         PrivateRefCountEntry *res;
     407                 :    16052330 :         int                     match = -1;
     408                 :    16052330 :         int                     i;
     409                 :             : 
     410                 :             :         /*
     411                 :             :          * First search for references in the array, that'll be sufficient in the
     412                 :             :          * majority of cases.
     413                 :             :          */
     414         [ +  + ]:   144470970 :         for (i = 0; i < REFCOUNT_ARRAY_ENTRIES; i++)
     415                 :             :         {
     416         [ +  + ]:   128418640 :                 if (PrivateRefCountArrayKeys[i] == buffer)
     417                 :             :                 {
     418                 :     5385536 :                         match = i;
     419                 :             :                         /* see ReservePrivateRefCountEntry() for why we don't return */
     420                 :     5385536 :                 }
     421                 :   128418640 :         }
     422                 :             : 
     423         [ +  + ]:    16052330 :         if (likely(match != -1))
     424                 :             :         {
     425                 :             :                 /* update cache for the next lookup */
     426                 :     5385536 :                 PrivateRefCountEntryLast = match;
     427                 :             : 
     428                 :     5385536 :                 return &PrivateRefCountArray[match];
     429                 :             :         }
     430                 :             : 
     431                 :             :         /*
     432                 :             :          * By here we know that the buffer, if already pinned, isn't residing in
     433                 :             :          * the array.
     434                 :             :          *
     435                 :             :          * Only look up the buffer in the hashtable if we've previously overflowed
     436                 :             :          * into it.
     437                 :             :          */
     438         [ +  + ]:    10666794 :         if (PrivateRefCountOverflowed == 0)
     439                 :    10595830 :                 return NULL;
     440                 :             : 
     441                 :       70964 :         res = hash_search(PrivateRefCountHash, &buffer, HASH_FIND, NULL);
     442                 :             : 
     443         [ +  + ]:       70964 :         if (res == NULL)
     444                 :        5408 :                 return NULL;
     445         [ +  + ]:       65556 :         else if (!do_move)
     446                 :             :         {
     447                 :             :                 /* caller doesn't want us to move the hash entry into the array */
     448                 :       64209 :                 return res;
     449                 :             :         }
     450                 :             :         else
     451                 :             :         {
     452                 :             :                 /* move buffer from hashtable into the free array slot */
     453                 :        1347 :                 bool            found;
     454                 :        1347 :                 PrivateRefCountEntry *free;
     455                 :             : 
     456                 :             :                 /* Ensure there's a free array slot */
     457                 :        1347 :                 ReservePrivateRefCountEntry();
     458                 :             : 
     459                 :             :                 /* Use up the reserved slot */
     460         [ +  - ]:        1347 :                 Assert(ReservedRefCountSlot != -1);
     461                 :        1347 :                 free = &PrivateRefCountArray[ReservedRefCountSlot];
     462         [ +  - ]:        1347 :                 Assert(PrivateRefCountArrayKeys[ReservedRefCountSlot] == free->buffer);
     463         [ +  - ]:        1347 :                 Assert(free->buffer == InvalidBuffer);
     464                 :             : 
     465                 :             :                 /* and fill it */
     466                 :        1347 :                 free->buffer = buffer;
     467                 :        1347 :                 free->data = res->data;
     468                 :        1347 :                 PrivateRefCountArrayKeys[ReservedRefCountSlot] = buffer;
     469                 :             :                 /* update cache for the next lookup */
     470                 :        1347 :                 PrivateRefCountEntryLast = match;
     471                 :             : 
     472                 :        1347 :                 ReservedRefCountSlot = -1;
     473                 :             : 
     474                 :             : 
     475                 :             :                 /* delete from hashtable */
     476                 :        1347 :                 hash_search(PrivateRefCountHash, &buffer, HASH_REMOVE, &found);
     477         [ +  - ]:        1347 :                 Assert(found);
     478         [ +  - ]:        1347 :                 Assert(PrivateRefCountOverflowed > 0);
     479                 :        1347 :                 PrivateRefCountOverflowed--;
     480                 :             : 
     481                 :        1347 :                 return free;
     482                 :        1347 :         }
     483                 :    16052330 : }
     484                 :             : 
     485                 :             : /*
     486                 :             :  * Return the PrivateRefCount entry for the passed buffer.
     487                 :             :  *
     488                 :             :  * Returns NULL if a buffer doesn't have a refcount entry. Otherwise, if
     489                 :             :  * do_move is true, and the entry resides in the hashtable the entry is
     490                 :             :  * optimized for frequent access by moving it to the array.
     491                 :             :  */
     492                 :             : static inline PrivateRefCountEntry *
     493                 :   166226020 : GetPrivateRefCountEntry(Buffer buffer, bool do_move)
     494                 :             : {
     495         [ +  - ]:   166226020 :         Assert(BufferIsValid(buffer));
     496         [ +  - ]:   166226020 :         Assert(!BufferIsLocal(buffer));
     497                 :             : 
     498                 :             :         /*
     499                 :             :          * It's very common to look up the same buffer repeatedly. To make that
     500                 :             :          * fast, we have a one-entry cache.
     501                 :             :          *
     502                 :             :          * In contrast to the loop in GetPrivateRefCountEntrySlow(), here it
     503                 :             :          * faster to check PrivateRefCountArray[].buffer, as in the case of a hit
     504                 :             :          * fewer addresses are computed and fewer cachelines are accessed. Whereas
     505                 :             :          * in GetPrivateRefCountEntrySlow()'s case, checking
     506                 :             :          * PrivateRefCountArrayKeys saves a lot of memory accesses.
     507                 :             :          */
     508   [ +  +  +  + ]:   166226020 :         if (likely(PrivateRefCountEntryLast != -1) &&
     509                 :   166223882 :                 likely(PrivateRefCountArray[PrivateRefCountEntryLast].buffer == buffer))
     510                 :             :         {
     511                 :   150173690 :                 return &PrivateRefCountArray[PrivateRefCountEntryLast];
     512                 :             :         }
     513                 :             : 
     514                 :             :         /*
     515                 :             :          * The code for the cached lookup is small enough to be worth inlining
     516                 :             :          * into the caller. In the miss case however, that empirically doesn't
     517                 :             :          * seem worth it.
     518                 :             :          */
     519                 :    16052330 :         return GetPrivateRefCountEntrySlow(buffer, do_move);
     520                 :   166226020 : }
     521                 :             : 
     522                 :             : /*
     523                 :             :  * Returns how many times the passed buffer is pinned by this backend.
     524                 :             :  *
     525                 :             :  * Only works for shared memory buffers!
     526                 :             :  */
     527                 :             : static inline int32
     528                 :    87161037 : GetPrivateRefCount(Buffer buffer)
     529                 :             : {
     530                 :    87161037 :         PrivateRefCountEntry *ref;
     531                 :             : 
     532         [ +  - ]:    87161037 :         Assert(BufferIsValid(buffer));
     533         [ +  - ]:    87161037 :         Assert(!BufferIsLocal(buffer));
     534                 :             : 
     535                 :             :         /*
     536                 :             :          * Not moving the entry - that's ok for the current users, but we might
     537                 :             :          * want to change this one day.
     538                 :             :          */
     539                 :    87161037 :         ref = GetPrivateRefCountEntry(buffer, false);
     540                 :             : 
     541         [ +  - ]:    87161037 :         if (ref == NULL)
     542                 :           0 :                 return 0;
     543                 :    87161037 :         return ref->data.refcount;
     544                 :    87161037 : }
     545                 :             : 
     546                 :             : /*
     547                 :             :  * Release resources used to track the reference count of a buffer which we no
     548                 :             :  * longer have pinned and don't want to pin again immediately.
     549                 :             :  */
     550                 :             : static void
     551                 :    10641775 : ForgetPrivateRefCountEntry(PrivateRefCountEntry *ref)
     552                 :             : {
     553         [ +  - ]:    10641775 :         Assert(ref->data.refcount == 0);
     554         [ +  - ]:    10641775 :         Assert(ref->data.lockmode == BUFFER_LOCK_UNLOCK);
     555                 :             : 
     556   [ +  -  +  + ]:    10641775 :         if (ref >= &PrivateRefCountArray[0] &&
     557                 :    10641775 :                 ref < &PrivateRefCountArray[REFCOUNT_ARRAY_ENTRIES])
     558                 :             :         {
     559                 :    10635105 :                 ref->buffer = InvalidBuffer;
     560                 :    10635105 :                 PrivateRefCountArrayKeys[ref - PrivateRefCountArray] = InvalidBuffer;
     561                 :             : 
     562                 :             : 
     563                 :             :                 /*
     564                 :             :                  * Mark the just used entry as reserved - in many scenarios that
     565                 :             :                  * allows us to avoid ever having to search the array/hash for free
     566                 :             :                  * entries.
     567                 :             :                  */
     568                 :    10635105 :                 ReservedRefCountSlot = ref - PrivateRefCountArray;
     569                 :    10635105 :         }
     570                 :             :         else
     571                 :             :         {
     572                 :        6670 :                 bool            found;
     573                 :        6670 :                 Buffer          buffer = ref->buffer;
     574                 :             : 
     575                 :        6670 :                 hash_search(PrivateRefCountHash, &buffer, HASH_REMOVE, &found);
     576         [ +  - ]:        6670 :                 Assert(found);
     577         [ +  - ]:        6670 :                 Assert(PrivateRefCountOverflowed > 0);
     578                 :        6670 :                 PrivateRefCountOverflowed--;
     579                 :        6670 :         }
     580                 :    10641775 : }
     581                 :             : 
     582                 :             : /*
     583                 :             :  * BufferIsPinned
     584                 :             :  *              True iff the buffer is pinned (also checks for valid buffer number).
     585                 :             :  *
     586                 :             :  *              NOTE: what we check here is that *this* backend holds a pin on
     587                 :             :  *              the buffer.  We do not care whether some other backend does.
     588                 :             :  */
     589                 :             : #define BufferIsPinned(bufnum) \
     590                 :             : ( \
     591                 :             :         !BufferIsValid(bufnum) ? \
     592                 :             :                 false \
     593                 :             :         : \
     594                 :             :                 BufferIsLocal(bufnum) ? \
     595                 :             :                         (LocalRefCount[-(bufnum) - 1] > 0) \
     596                 :             :                 : \
     597                 :             :         (GetPrivateRefCount(bufnum) > 0) \
     598                 :             : )
     599                 :             : 
     600                 :             : 
     601                 :             : static Buffer ReadBuffer_common(Relation rel,
     602                 :             :                                                                 SMgrRelation smgr, char smgr_persistence,
     603                 :             :                                                                 ForkNumber forkNum, BlockNumber blockNum,
     604                 :             :                                                                 ReadBufferMode mode, BufferAccessStrategy strategy);
     605                 :             : static BlockNumber ExtendBufferedRelCommon(BufferManagerRelation bmr,
     606                 :             :                                                                                    ForkNumber fork,
     607                 :             :                                                                                    BufferAccessStrategy strategy,
     608                 :             :                                                                                    uint32 flags,
     609                 :             :                                                                                    uint32 extend_by,
     610                 :             :                                                                                    BlockNumber extend_upto,
     611                 :             :                                                                                    Buffer *buffers,
     612                 :             :                                                                                    uint32 *extended_by);
     613                 :             : static BlockNumber ExtendBufferedRelShared(BufferManagerRelation bmr,
     614                 :             :                                                                                    ForkNumber fork,
     615                 :             :                                                                                    BufferAccessStrategy strategy,
     616                 :             :                                                                                    uint32 flags,
     617                 :             :                                                                                    uint32 extend_by,
     618                 :             :                                                                                    BlockNumber extend_upto,
     619                 :             :                                                                                    Buffer *buffers,
     620                 :             :                                                                                    uint32 *extended_by);
     621                 :             : static bool PinBuffer(BufferDesc *buf, BufferAccessStrategy strategy,
     622                 :             :                                           bool skip_if_not_valid);
     623                 :             : static void PinBuffer_Locked(BufferDesc *buf);
     624                 :             : static void UnpinBuffer(BufferDesc *buf);
     625                 :             : static void UnpinBufferNoOwner(BufferDesc *buf);
     626                 :             : static void BufferSync(int flags);
     627                 :             : static int      SyncOneBuffer(int buf_id, bool skip_recently_used,
     628                 :             :                                                   WritebackContext *wb_context);
     629                 :             : static void WaitIO(BufferDesc *buf);
     630                 :             : static void AbortBufferIO(Buffer buffer);
     631                 :             : static void shared_buffer_write_error_callback(void *arg);
     632                 :             : static void local_buffer_write_error_callback(void *arg);
     633                 :             : static inline BufferDesc *BufferAlloc(SMgrRelation smgr,
     634                 :             :                                                                           char relpersistence,
     635                 :             :                                                                           ForkNumber forkNum,
     636                 :             :                                                                           BlockNumber blockNum,
     637                 :             :                                                                           BufferAccessStrategy strategy,
     638                 :             :                                                                           bool *foundPtr, IOContext io_context);
     639                 :             : static bool AsyncReadBuffers(ReadBuffersOperation *operation, int *nblocks_progress);
     640                 :             : static void CheckReadBuffersOperation(ReadBuffersOperation *operation, bool is_complete);
     641                 :             : static Buffer GetVictimBuffer(BufferAccessStrategy strategy, IOContext io_context);
     642                 :             : static void FlushUnlockedBuffer(BufferDesc *buf, SMgrRelation reln,
     643                 :             :                                                                 IOObject io_object, IOContext io_context);
     644                 :             : static void FlushBuffer(BufferDesc *buf, SMgrRelation reln,
     645                 :             :                                                 IOObject io_object, IOContext io_context);
     646                 :             : static void FindAndDropRelationBuffers(RelFileLocator rlocator,
     647                 :             :                                                                            ForkNumber forkNum,
     648                 :             :                                                                            BlockNumber nForkBlock,
     649                 :             :                                                                            BlockNumber firstDelBlock);
     650                 :             : static void RelationCopyStorageUsingBuffer(RelFileLocator srclocator,
     651                 :             :                                                                                    RelFileLocator dstlocator,
     652                 :             :                                                                                    ForkNumber forkNum, bool permanent);
     653                 :             : static void AtProcExit_Buffers(int code, Datum arg);
     654                 :             : static void CheckForBufferLeaks(void);
     655                 :             : #ifdef USE_ASSERT_CHECKING
     656                 :             : static void AssertNotCatalogBufferLock(Buffer buffer, BufferLockMode mode);
     657                 :             : #endif
     658                 :             : static int      rlocator_comparator(const void *p1, const void *p2);
     659                 :             : static inline int buffertag_comparator(const BufferTag *ba, const BufferTag *bb);
     660                 :             : static inline int ckpt_buforder_comparator(const CkptSortItem *a, const CkptSortItem *b);
     661                 :             : static int      ts_ckpt_progress_comparator(Datum a, Datum b, void *arg);
     662                 :             : 
     663                 :             : static void BufferLockAcquire(Buffer buffer, BufferDesc *buf_hdr, BufferLockMode mode);
     664                 :             : static void BufferLockUnlock(Buffer buffer, BufferDesc *buf_hdr);
     665                 :             : static bool BufferLockConditional(Buffer buffer, BufferDesc *buf_hdr, BufferLockMode mode);
     666                 :             : static bool BufferLockHeldByMeInMode(BufferDesc *buf_hdr, BufferLockMode mode);
     667                 :             : static bool BufferLockHeldByMe(BufferDesc *buf_hdr);
     668                 :             : static inline void BufferLockDisown(Buffer buffer, BufferDesc *buf_hdr);
     669                 :             : static inline int BufferLockDisownInternal(Buffer buffer, BufferDesc *buf_hdr);
     670                 :             : static inline bool BufferLockAttempt(BufferDesc *buf_hdr, BufferLockMode mode);
     671                 :             : static void BufferLockQueueSelf(BufferDesc *buf_hdr, BufferLockMode mode);
     672                 :             : static void BufferLockDequeueSelf(BufferDesc *buf_hdr);
     673                 :             : static void BufferLockWakeup(BufferDesc *buf_hdr, bool unlocked);
     674                 :             : static void BufferLockProcessRelease(BufferDesc *buf_hdr, BufferLockMode mode, uint64 lockstate);
     675                 :             : static inline uint64 BufferLockReleaseSub(BufferLockMode mode);
     676                 :             : 
     677                 :             : 
     678                 :             : /*
     679                 :             :  * Implementation of PrefetchBuffer() for shared buffers.
     680                 :             :  */
     681                 :             : PrefetchBufferResult
     682                 :        3645 : PrefetchSharedBuffer(SMgrRelation smgr_reln,
     683                 :             :                                          ForkNumber forkNum,
     684                 :             :                                          BlockNumber blockNum)
     685                 :             : {
     686                 :        3645 :         PrefetchBufferResult result = {InvalidBuffer, false};
     687                 :        3645 :         BufferTag       newTag;                 /* identity of requested block */
     688                 :        3645 :         uint32          newHash;                /* hash value for newTag */
     689                 :        3645 :         LWLock     *newPartitionLock;   /* buffer partition lock for it */
     690                 :        3645 :         int                     buf_id;
     691                 :             : 
     692         [ +  - ]:        3645 :         Assert(BlockNumberIsValid(blockNum));
     693                 :             : 
     694                 :             :         /* create a tag so we can lookup the buffer */
     695                 :        7290 :         InitBufferTag(&newTag, &smgr_reln->smgr_rlocator.locator,
     696                 :        3645 :                                   forkNum, blockNum);
     697                 :             : 
     698                 :             :         /* determine its hash code and partition lock ID */
     699                 :        3645 :         newHash = BufTableHashCode(&newTag);
     700                 :        3645 :         newPartitionLock = BufMappingPartitionLock(newHash);
     701                 :             : 
     702                 :             :         /* see if the block is in the buffer pool already */
     703                 :        3645 :         LWLockAcquire(newPartitionLock, LW_SHARED);
     704                 :        3645 :         buf_id = BufTableLookup(&newTag, newHash);
     705                 :        3645 :         LWLockRelease(newPartitionLock);
     706                 :             : 
     707                 :             :         /* If not in buffers, initiate prefetch */
     708         [ +  - ]:        3645 :         if (buf_id < 0)
     709                 :             :         {
     710                 :             : #ifdef USE_PREFETCH
     711                 :             :                 /*
     712                 :             :                  * Try to initiate an asynchronous read.  This returns false in
     713                 :             :                  * recovery if the relation file doesn't exist.
     714                 :             :                  */
     715   [ #  #  #  # ]:           0 :                 if ((io_direct_flags & IO_DIRECT_DATA) == 0 &&
     716                 :           0 :                         smgrprefetch(smgr_reln, forkNum, blockNum, 1))
     717                 :             :                 {
     718                 :           0 :                         result.initiated_io = true;
     719                 :           0 :                 }
     720                 :             : #endif                                                  /* USE_PREFETCH */
     721                 :           0 :         }
     722                 :             :         else
     723                 :             :         {
     724                 :             :                 /*
     725                 :             :                  * Report the buffer it was in at that time.  The caller may be able
     726                 :             :                  * to avoid a buffer table lookup, but it's not pinned and it must be
     727                 :             :                  * rechecked!
     728                 :             :                  */
     729                 :        3645 :                 result.recent_buffer = buf_id + 1;
     730                 :             :         }
     731                 :             : 
     732                 :             :         /*
     733                 :             :          * If the block *is* in buffers, we do nothing.  This is not really ideal:
     734                 :             :          * the block might be just about to be evicted, which would be stupid
     735                 :             :          * since we know we are going to need it soon.  But the only easy answer
     736                 :             :          * is to bump the usage_count, which does not seem like a great solution:
     737                 :             :          * when the caller does ultimately touch the block, usage_count would get
     738                 :             :          * bumped again, resulting in too much favoritism for blocks that are
     739                 :             :          * involved in a prefetch sequence. A real fix would involve some
     740                 :             :          * additional per-buffer state, and it's not clear that there's enough of
     741                 :             :          * a problem to justify that.
     742                 :             :          */
     743                 :             : 
     744                 :             :         return result;
     745                 :        3645 : }
     746                 :             : 
     747                 :             : /*
     748                 :             :  * PrefetchBuffer -- initiate asynchronous read of a block of a relation
     749                 :             :  *
     750                 :             :  * This is named by analogy to ReadBuffer but doesn't actually allocate a
     751                 :             :  * buffer.  Instead it tries to ensure that a future ReadBuffer for the given
     752                 :             :  * block will not be delayed by the I/O.  Prefetching is optional.
     753                 :             :  *
     754                 :             :  * There are three possible outcomes:
     755                 :             :  *
     756                 :             :  * 1.  If the block is already cached, the result includes a valid buffer that
     757                 :             :  * could be used by the caller to avoid the need for a later buffer lookup, but
     758                 :             :  * it's not pinned, so the caller must recheck it.
     759                 :             :  *
     760                 :             :  * 2.  If the kernel has been asked to initiate I/O, the initiated_io member is
     761                 :             :  * true.  Currently there is no way to know if the data was already cached by
     762                 :             :  * the kernel and therefore didn't really initiate I/O, and no way to know when
     763                 :             :  * the I/O completes other than using synchronous ReadBuffer().
     764                 :             :  *
     765                 :             :  * 3.  Otherwise, the buffer wasn't already cached by PostgreSQL, and
     766                 :             :  * USE_PREFETCH is not defined (this build doesn't support prefetching due to
     767                 :             :  * lack of a kernel facility), direct I/O is enabled, or the underlying
     768                 :             :  * relation file wasn't found and we are in recovery.  (If the relation file
     769                 :             :  * wasn't found and we are not in recovery, an error is raised).
     770                 :             :  */
     771                 :             : PrefetchBufferResult
     772                 :        3895 : PrefetchBuffer(Relation reln, ForkNumber forkNum, BlockNumber blockNum)
     773                 :             : {
     774         [ +  - ]:        3895 :         Assert(RelationIsValid(reln));
     775         [ +  - ]:        3895 :         Assert(BlockNumberIsValid(blockNum));
     776                 :             : 
     777         [ +  + ]:        3895 :         if (RelationUsesLocalBuffers(reln))
     778                 :             :         {
     779                 :             :                 /* see comments in ReadBufferExtended */
     780   [ +  -  +  - ]:         250 :                 if (RELATION_IS_OTHER_TEMP(reln))
     781   [ #  #  #  # ]:           0 :                         ereport(ERROR,
     782                 :             :                                         (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
     783                 :             :                                          errmsg("cannot access temporary tables of other sessions")));
     784                 :             : 
     785                 :             :                 /* pass it off to localbuf.c */
     786                 :         250 :                 return PrefetchLocalBuffer(RelationGetSmgr(reln), forkNum, blockNum);
     787                 :             :         }
     788                 :             :         else
     789                 :             :         {
     790                 :             :                 /* pass it to the shared buffer version */
     791                 :        3645 :                 return PrefetchSharedBuffer(RelationGetSmgr(reln), forkNum, blockNum);
     792                 :             :         }
     793                 :        3895 : }
     794                 :             : 
     795                 :             : /*
     796                 :             :  * ReadRecentBuffer -- try to pin a block in a recently observed buffer
     797                 :             :  *
     798                 :             :  * Compared to ReadBuffer(), this avoids a buffer mapping lookup when it's
     799                 :             :  * successful.  Return true if the buffer is valid and still has the expected
     800                 :             :  * tag.  In that case, the buffer is pinned and the usage count is bumped.
     801                 :             :  */
     802                 :             : bool
     803                 :           0 : ReadRecentBuffer(RelFileLocator rlocator, ForkNumber forkNum, BlockNumber blockNum,
     804                 :             :                                  Buffer recent_buffer)
     805                 :             : {
     806                 :           0 :         BufferDesc *bufHdr;
     807                 :           0 :         BufferTag       tag;
     808                 :           0 :         uint64          buf_state;
     809                 :             : 
     810         [ #  # ]:           0 :         Assert(BufferIsValid(recent_buffer));
     811                 :             : 
     812                 :           0 :         ResourceOwnerEnlarge(CurrentResourceOwner);
     813                 :           0 :         ReservePrivateRefCountEntry();
     814                 :           0 :         InitBufferTag(&tag, &rlocator, forkNum, blockNum);
     815                 :             : 
     816         [ #  # ]:           0 :         if (BufferIsLocal(recent_buffer))
     817                 :             :         {
     818                 :           0 :                 int                     b = -recent_buffer - 1;
     819                 :             : 
     820                 :           0 :                 bufHdr = GetLocalBufferDescriptor(b);
     821                 :           0 :                 buf_state = pg_atomic_read_u64(&bufHdr->state);
     822                 :             : 
     823                 :             :                 /* Is it still valid and holding the right tag? */
     824   [ #  #  #  # ]:           0 :                 if ((buf_state & BM_VALID) && BufferTagsEqual(&tag, &bufHdr->tag))
     825                 :             :                 {
     826                 :           0 :                         PinLocalBuffer(bufHdr, true);
     827                 :             : 
     828                 :           0 :                         pgBufferUsage.local_blks_hit++;
     829                 :             : 
     830                 :           0 :                         return true;
     831                 :             :                 }
     832         [ #  # ]:           0 :         }
     833                 :             :         else
     834                 :             :         {
     835                 :           0 :                 bufHdr = GetBufferDescriptor(recent_buffer - 1);
     836                 :             : 
     837                 :             :                 /*
     838                 :             :                  * Is it still valid and holding the right tag?  We do an unlocked tag
     839                 :             :                  * comparison first, to make it unlikely that we'll increment the
     840                 :             :                  * usage counter of the wrong buffer, if someone calls us with a very
     841                 :             :                  * out of date recent_buffer.  Then we'll check it again if we get the
     842                 :             :                  * pin.
     843                 :             :                  */
     844   [ #  #  #  # ]:           0 :                 if (BufferTagsEqual(&tag, &bufHdr->tag) &&
     845                 :           0 :                         PinBuffer(bufHdr, NULL, true))
     846                 :             :                 {
     847         [ #  # ]:           0 :                         if (BufferTagsEqual(&tag, &bufHdr->tag))
     848                 :             :                         {
     849                 :           0 :                                 pgBufferUsage.shared_blks_hit++;
     850                 :           0 :                                 return true;
     851                 :             :                         }
     852                 :           0 :                         UnpinBuffer(bufHdr);
     853                 :           0 :                 }
     854                 :             :         }
     855                 :             : 
     856                 :           0 :         return false;
     857                 :           0 : }
     858                 :             : 
     859                 :             : /*
     860                 :             :  * ReadBuffer -- a shorthand for ReadBufferExtended, for reading from main
     861                 :             :  *              fork with RBM_NORMAL mode and default strategy.
     862                 :             :  */
     863                 :             : Buffer
     864                 :     8891053 : ReadBuffer(Relation reln, BlockNumber blockNum)
     865                 :             : {
     866                 :     8891053 :         return ReadBufferExtended(reln, MAIN_FORKNUM, blockNum, RBM_NORMAL, NULL);
     867                 :             : }
     868                 :             : 
     869                 :             : /*
     870                 :             :  * ReadBufferExtended -- returns a buffer containing the requested
     871                 :             :  *              block of the requested relation.  If the blknum
     872                 :             :  *              requested is P_NEW, extend the relation file and
     873                 :             :  *              allocate a new block.  (Caller is responsible for
     874                 :             :  *              ensuring that only one backend tries to extend a
     875                 :             :  *              relation at the same time!)
     876                 :             :  *
     877                 :             :  * Returns: the buffer number for the buffer containing
     878                 :             :  *              the block read.  The returned buffer has been pinned.
     879                 :             :  *              Does not return on error --- elog's instead.
     880                 :             :  *
     881                 :             :  * Assume when this function is called, that reln has been opened already.
     882                 :             :  *
     883                 :             :  * In RBM_NORMAL mode, the page is read from disk, and the page header is
     884                 :             :  * validated.  An error is thrown if the page header is not valid.  (But
     885                 :             :  * note that an all-zero page is considered "valid"; see
     886                 :             :  * PageIsVerified().)
     887                 :             :  *
     888                 :             :  * RBM_ZERO_ON_ERROR is like the normal mode, but if the page header is not
     889                 :             :  * valid, the page is zeroed instead of throwing an error. This is intended
     890                 :             :  * for non-critical data, where the caller is prepared to repair errors.
     891                 :             :  *
     892                 :             :  * In RBM_ZERO_AND_LOCK mode, if the page isn't in buffer cache already, it's
     893                 :             :  * filled with zeros instead of reading it from disk.  Useful when the caller
     894                 :             :  * is going to fill the page from scratch, since this saves I/O and avoids
     895                 :             :  * unnecessary failure if the page-on-disk has corrupt page headers.
     896                 :             :  * The page is returned locked to ensure that the caller has a chance to
     897                 :             :  * initialize the page before it's made visible to others.
     898                 :             :  * Caution: do not use this mode to read a page that is beyond the relation's
     899                 :             :  * current physical EOF; that is likely to cause problems in md.c when
     900                 :             :  * the page is modified and written out. P_NEW is OK, though.
     901                 :             :  *
     902                 :             :  * RBM_ZERO_AND_CLEANUP_LOCK is the same as RBM_ZERO_AND_LOCK, but acquires
     903                 :             :  * a cleanup-strength lock on the page.
     904                 :             :  *
     905                 :             :  * RBM_NORMAL_NO_LOG mode is treated the same as RBM_NORMAL here.
     906                 :             :  *
     907                 :             :  * If strategy is not NULL, a nondefault buffer access strategy is used.
     908                 :             :  * See buffer/README for details.
     909                 :             :  */
     910                 :             : inline Buffer
     911                 :    10793784 : ReadBufferExtended(Relation reln, ForkNumber forkNum, BlockNumber blockNum,
     912                 :             :                                    ReadBufferMode mode, BufferAccessStrategy strategy)
     913                 :             : {
     914                 :    10793784 :         Buffer          buf;
     915                 :             : 
     916                 :             :         /*
     917                 :             :          * Reject attempts to read non-local temporary relations; we would be
     918                 :             :          * likely to get wrong data since we have no visibility into the owning
     919                 :             :          * session's local buffers.
     920                 :             :          */
     921   [ +  +  +  - ]:    10793784 :         if (RELATION_IS_OTHER_TEMP(reln))
     922   [ #  #  #  # ]:           0 :                 ereport(ERROR,
     923                 :             :                                 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
     924                 :             :                                  errmsg("cannot access temporary tables of other sessions")));
     925                 :             : 
     926                 :             :         /*
     927                 :             :          * Read the buffer, and update pgstat counters to reflect a cache hit or
     928                 :             :          * miss.
     929                 :             :          */
     930                 :    21587568 :         buf = ReadBuffer_common(reln, RelationGetSmgr(reln), 0,
     931                 :    10793784 :                                                         forkNum, blockNum, mode, strategy);
     932                 :             : 
     933                 :    21587568 :         return buf;
     934                 :    10793784 : }
     935                 :             : 
     936                 :             : 
     937                 :             : /*
     938                 :             :  * ReadBufferWithoutRelcache -- like ReadBufferExtended, but doesn't require
     939                 :             :  *              a relcache entry for the relation.
     940                 :             :  *
     941                 :             :  * Pass permanent = true for a RELPERSISTENCE_PERMANENT relation, and
     942                 :             :  * permanent = false for a RELPERSISTENCE_UNLOGGED relation. This function
     943                 :             :  * cannot be used for temporary relations (and making that work might be
     944                 :             :  * difficult, unless we only want to read temporary relations for our own
     945                 :             :  * ProcNumber).
     946                 :             :  */
     947                 :             : Buffer
     948                 :        2904 : ReadBufferWithoutRelcache(RelFileLocator rlocator, ForkNumber forkNum,
     949                 :             :                                                   BlockNumber blockNum, ReadBufferMode mode,
     950                 :             :                                                   BufferAccessStrategy strategy, bool permanent)
     951                 :             : {
     952                 :        2904 :         SMgrRelation smgr = smgropen(rlocator, INVALID_PROC_NUMBER);
     953                 :             : 
     954                 :        8712 :         return ReadBuffer_common(NULL, smgr,
     955                 :        2904 :                                                          permanent ? RELPERSISTENCE_PERMANENT : RELPERSISTENCE_UNLOGGED,
     956                 :        2904 :                                                          forkNum, blockNum,
     957                 :        2904 :                                                          mode, strategy);
     958                 :        2904 : }
     959                 :             : 
     960                 :             : /*
     961                 :             :  * Convenience wrapper around ExtendBufferedRelBy() extending by one block.
     962                 :             :  */
     963                 :             : Buffer
     964                 :        8685 : ExtendBufferedRel(BufferManagerRelation bmr,
     965                 :             :                                   ForkNumber forkNum,
     966                 :             :                                   BufferAccessStrategy strategy,
     967                 :             :                                   uint32 flags)
     968                 :             : {
     969                 :        8685 :         Buffer          buf;
     970                 :        8685 :         uint32          extend_by = 1;
     971                 :             : 
     972                 :        8685 :         ExtendBufferedRelBy(bmr, forkNum, strategy, flags, extend_by,
     973                 :             :                                                 &buf, &extend_by);
     974                 :             : 
     975                 :       17370 :         return buf;
     976                 :        8685 : }
     977                 :             : 
     978                 :             : /*
     979                 :             :  * Extend relation by multiple blocks.
     980                 :             :  *
     981                 :             :  * Tries to extend the relation by extend_by blocks. Depending on the
     982                 :             :  * availability of resources the relation may end up being extended by a
     983                 :             :  * smaller number of pages (unless an error is thrown, always by at least one
     984                 :             :  * page). *extended_by is updated to the number of pages the relation has been
     985                 :             :  * extended to.
     986                 :             :  *
     987                 :             :  * buffers needs to be an array that is at least extend_by long. Upon
     988                 :             :  * completion, the first extend_by array elements will point to a pinned
     989                 :             :  * buffer.
     990                 :             :  *
     991                 :             :  * If EB_LOCK_FIRST is part of flags, the first returned buffer is
     992                 :             :  * locked. This is useful for callers that want a buffer that is guaranteed to
     993                 :             :  * be empty.
     994                 :             :  */
     995                 :             : BlockNumber
     996                 :       26696 : ExtendBufferedRelBy(BufferManagerRelation bmr,
     997                 :             :                                         ForkNumber fork,
     998                 :             :                                         BufferAccessStrategy strategy,
     999                 :             :                                         uint32 flags,
    1000                 :             :                                         uint32 extend_by,
    1001                 :             :                                         Buffer *buffers,
    1002                 :             :                                         uint32 *extended_by)
    1003                 :             : {
    1004         [ +  - ]:       26696 :         Assert((bmr.rel != NULL) != (bmr.smgr != NULL));
    1005   [ -  +  #  # ]:       26696 :         Assert(bmr.smgr == NULL || bmr.relpersistence != '\0');
    1006         [ +  - ]:       26696 :         Assert(extend_by > 0);
    1007                 :             : 
    1008         [ -  + ]:       26696 :         if (bmr.relpersistence == '\0')
    1009                 :       26696 :                 bmr.relpersistence = bmr.rel->rd_rel->relpersistence;
    1010                 :             : 
    1011                 :       53392 :         return ExtendBufferedRelCommon(bmr, fork, strategy, flags,
    1012                 :       26696 :                                                                    extend_by, InvalidBlockNumber,
    1013                 :       26696 :                                                                    buffers, extended_by);
    1014                 :             : }
    1015                 :             : 
    1016                 :             : /*
    1017                 :             :  * Extend the relation so it is at least extend_to blocks large, return buffer
    1018                 :             :  * (extend_to - 1).
    1019                 :             :  *
    1020                 :             :  * This is useful for callers that want to write a specific page, regardless
    1021                 :             :  * of the current size of the relation (e.g. useful for visibilitymap and for
    1022                 :             :  * crash recovery).
    1023                 :             :  */
    1024                 :             : Buffer
    1025                 :         758 : ExtendBufferedRelTo(BufferManagerRelation bmr,
    1026                 :             :                                         ForkNumber fork,
    1027                 :             :                                         BufferAccessStrategy strategy,
    1028                 :             :                                         uint32 flags,
    1029                 :             :                                         BlockNumber extend_to,
    1030                 :             :                                         ReadBufferMode mode)
    1031                 :             : {
    1032                 :         758 :         BlockNumber current_size;
    1033                 :         758 :         uint32          extended_by = 0;
    1034                 :         758 :         Buffer          buffer = InvalidBuffer;
    1035                 :         758 :         Buffer          buffers[64];
    1036                 :             : 
    1037         [ +  - ]:         758 :         Assert((bmr.rel != NULL) != (bmr.smgr != NULL));
    1038   [ -  +  #  # ]:         758 :         Assert(bmr.smgr == NULL || bmr.relpersistence != '\0');
    1039         [ +  - ]:         758 :         Assert(extend_to != InvalidBlockNumber && extend_to > 0);
    1040                 :             : 
    1041         [ -  + ]:         758 :         if (bmr.relpersistence == '\0')
    1042                 :         758 :                 bmr.relpersistence = bmr.rel->rd_rel->relpersistence;
    1043                 :             : 
    1044                 :             :         /*
    1045                 :             :          * If desired, create the file if it doesn't exist.  If
    1046                 :             :          * smgr_cached_nblocks[fork] is positive then it must exist, no need for
    1047                 :             :          * an smgrexists call.
    1048                 :             :          */
    1049         [ -  + ]:         758 :         if ((flags & EB_CREATE_FORK_IF_NEEDED) &&
    1050   [ +  -  +  + ]:         758 :                 (BMR_GET_SMGR(bmr)->smgr_cached_nblocks[fork] == 0 ||
    1051   [ +  -  +  -  :         758 :                  BMR_GET_SMGR(bmr)->smgr_cached_nblocks[fork] == InvalidBlockNumber) &&
                   -  + ]
    1052         [ +  - ]:         754 :                 !smgrexists(BMR_GET_SMGR(bmr), fork))
    1053                 :             :         {
    1054                 :         754 :                 LockRelationForExtension(bmr.rel, ExclusiveLock);
    1055                 :             : 
    1056                 :             :                 /* recheck, fork might have been created concurrently */
    1057   [ +  -  -  + ]:         754 :                 if (!smgrexists(BMR_GET_SMGR(bmr), fork))
    1058         [ +  - ]:         754 :                         smgrcreate(BMR_GET_SMGR(bmr), fork, flags & EB_PERFORMING_RECOVERY);
    1059                 :             : 
    1060                 :         754 :                 UnlockRelationForExtension(bmr.rel, ExclusiveLock);
    1061                 :         754 :         }
    1062                 :             : 
    1063                 :             :         /*
    1064                 :             :          * If requested, invalidate size cache, so that smgrnblocks asks the
    1065                 :             :          * kernel.
    1066                 :             :          */
    1067         [ -  + ]:         758 :         if (flags & EB_CLEAR_SIZE_CACHE)
    1068         [ +  - ]:         758 :                 BMR_GET_SMGR(bmr)->smgr_cached_nblocks[fork] = InvalidBlockNumber;
    1069                 :             : 
    1070                 :             :         /*
    1071                 :             :          * Estimate how many pages we'll need to extend by. This avoids acquiring
    1072                 :             :          * unnecessarily many victim buffers.
    1073                 :             :          */
    1074         [ +  - ]:         758 :         current_size = smgrnblocks(BMR_GET_SMGR(bmr), fork);
    1075                 :             : 
    1076                 :             :         /*
    1077                 :             :          * Since no-one else can be looking at the page contents yet, there is no
    1078                 :             :          * difference between an exclusive lock and a cleanup-strength lock. Note
    1079                 :             :          * that we pass the original mode to ReadBuffer_common() below, when
    1080                 :             :          * falling back to reading the buffer to a concurrent relation extension.
    1081                 :             :          */
    1082   [ +  -  -  + ]:         758 :         if (mode == RBM_ZERO_AND_LOCK || mode == RBM_ZERO_AND_CLEANUP_LOCK)
    1083                 :           0 :                 flags |= EB_LOCK_TARGET;
    1084                 :             : 
    1085         [ +  + ]:        1516 :         while (current_size < extend_to)
    1086                 :             :         {
    1087                 :         758 :                 uint32          num_pages = lengthof(buffers);
    1088                 :         758 :                 BlockNumber first_block;
    1089                 :             : 
    1090         [ -  + ]:         758 :                 if ((uint64) current_size + num_pages > extend_to)
    1091                 :         758 :                         num_pages = extend_to - current_size;
    1092                 :             : 
    1093                 :        1516 :                 first_block = ExtendBufferedRelCommon(bmr, fork, strategy, flags,
    1094                 :         758 :                                                                                           num_pages, extend_to,
    1095                 :         758 :                                                                                           buffers, &extended_by);
    1096                 :             : 
    1097                 :         758 :                 current_size = first_block + extended_by;
    1098   [ -  +  #  # ]:         758 :                 Assert(num_pages != 0 || current_size >= extend_to);
    1099                 :             : 
    1100         [ +  + ]:        2426 :                 for (uint32 i = 0; i < extended_by; i++)
    1101                 :             :                 {
    1102         [ +  + ]:        1668 :                         if (first_block + i != extend_to - 1)
    1103                 :         910 :                                 ReleaseBuffer(buffers[i]);
    1104                 :             :                         else
    1105                 :         758 :                                 buffer = buffers[i];
    1106                 :        1668 :                 }
    1107                 :         758 :         }
    1108                 :             : 
    1109                 :             :         /*
    1110                 :             :          * It's possible that another backend concurrently extended the relation.
    1111                 :             :          * In that case read the buffer.
    1112                 :             :          *
    1113                 :             :          * XXX: Should we control this via a flag?
    1114                 :             :          */
    1115         [ +  - ]:         758 :         if (buffer == InvalidBuffer)
    1116                 :             :         {
    1117         [ #  # ]:           0 :                 Assert(extended_by == 0);
    1118         [ #  # ]:           0 :                 buffer = ReadBuffer_common(bmr.rel, BMR_GET_SMGR(bmr), bmr.relpersistence,
    1119                 :           0 :                                                                    fork, extend_to - 1, mode, strategy);
    1120                 :           0 :         }
    1121                 :             : 
    1122                 :        1516 :         return buffer;
    1123                 :         758 : }
    1124                 :             : 
    1125                 :             : /*
    1126                 :             :  * Lock and optionally zero a buffer, as part of the implementation of
    1127                 :             :  * RBM_ZERO_AND_LOCK or RBM_ZERO_AND_CLEANUP_LOCK.  The buffer must be already
    1128                 :             :  * pinned.  If the buffer is not already valid, it is zeroed and made valid.
    1129                 :             :  */
    1130                 :             : static void
    1131                 :        3095 : ZeroAndLockBuffer(Buffer buffer, ReadBufferMode mode, bool already_valid)
    1132                 :             : {
    1133                 :        3095 :         BufferDesc *bufHdr;
    1134                 :        3095 :         bool            need_to_zero;
    1135                 :        3095 :         bool            isLocalBuf = BufferIsLocal(buffer);
    1136                 :             : 
    1137   [ -  +  #  # ]:        3095 :         Assert(mode == RBM_ZERO_AND_LOCK || mode == RBM_ZERO_AND_CLEANUP_LOCK);
    1138                 :             : 
    1139         [ +  + ]:        3095 :         if (already_valid)
    1140                 :             :         {
    1141                 :             :                 /*
    1142                 :             :                  * If the caller already knew the buffer was valid, we can skip some
    1143                 :             :                  * header interaction.  The caller just wants to lock the buffer.
    1144                 :             :                  */
    1145                 :          12 :                 need_to_zero = false;
    1146                 :          12 :         }
    1147         [ -  + ]:        3083 :         else if (isLocalBuf)
    1148                 :             :         {
    1149                 :             :                 /* Simple case for non-shared buffers. */
    1150                 :           0 :                 bufHdr = GetLocalBufferDescriptor(-buffer - 1);
    1151                 :           0 :                 need_to_zero = StartLocalBufferIO(bufHdr, true, false);
    1152                 :           0 :         }
    1153                 :             :         else
    1154                 :             :         {
    1155                 :             :                 /*
    1156                 :             :                  * Take BM_IO_IN_PROGRESS, or discover that BM_VALID has been set
    1157                 :             :                  * concurrently.  Even though we aren't doing I/O, that ensures that
    1158                 :             :                  * we don't zero a page that someone else has pinned.  An exclusive
    1159                 :             :                  * content lock wouldn't be enough, because readers are allowed to
    1160                 :             :                  * drop the content lock after determining that a tuple is visible
    1161                 :             :                  * (see buffer access rules in README).
    1162                 :             :                  */
    1163                 :        3083 :                 bufHdr = GetBufferDescriptor(buffer - 1);
    1164                 :        3083 :                 need_to_zero = StartBufferIO(bufHdr, true, false);
    1165                 :             :         }
    1166                 :             : 
    1167         [ +  + ]:        3095 :         if (need_to_zero)
    1168                 :             :         {
    1169                 :        3083 :                 memset(BufferGetPage(buffer), 0, BLCKSZ);
    1170                 :             : 
    1171                 :             :                 /*
    1172                 :             :                  * Grab the buffer content lock before marking the page as valid, to
    1173                 :             :                  * make sure that no other backend sees the zeroed page before the
    1174                 :             :                  * caller has had a chance to initialize it.
    1175                 :             :                  *
    1176                 :             :                  * Since no-one else can be looking at the page contents yet, there is
    1177                 :             :                  * no difference between an exclusive lock and a cleanup-strength
    1178                 :             :                  * lock. (Note that we cannot use LockBuffer() or
    1179                 :             :                  * LockBufferForCleanup() here, because they assert that the buffer is
    1180                 :             :                  * already valid.)
    1181                 :             :                  */
    1182         [ -  + ]:        3083 :                 if (!isLocalBuf)
    1183                 :        3083 :                         LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
    1184                 :             : 
    1185                 :             :                 /* Set BM_VALID, terminate IO, and wake up any waiters */
    1186         [ -  + ]:        3083 :                 if (isLocalBuf)
    1187                 :           0 :                         TerminateLocalBufferIO(bufHdr, false, BM_VALID, false);
    1188                 :             :                 else
    1189                 :        3083 :                         TerminateBufferIO(bufHdr, false, BM_VALID, true, false);
    1190                 :        3083 :         }
    1191         [ -  + ]:          12 :         else if (!isLocalBuf)
    1192                 :             :         {
    1193                 :             :                 /*
    1194                 :             :                  * The buffer is valid, so we can't zero it.  The caller still expects
    1195                 :             :                  * the page to be locked on return.
    1196                 :             :                  */
    1197         [ +  - ]:          12 :                 if (mode == RBM_ZERO_AND_LOCK)
    1198                 :          12 :                         LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
    1199                 :             :                 else
    1200                 :           0 :                         LockBufferForCleanup(buffer);
    1201                 :          12 :         }
    1202                 :        3095 : }
    1203                 :             : 
    1204                 :             : /*
    1205                 :             :  * Pin a buffer for a given block.  *foundPtr is set to true if the block was
    1206                 :             :  * already present, or false if more work is required to either read it in or
    1207                 :             :  * zero it.
    1208                 :             :  */
    1209                 :             : static pg_attribute_always_inline Buffer
    1210                 :    11686721 : PinBufferForBlock(Relation rel,
    1211                 :             :                                   SMgrRelation smgr,
    1212                 :             :                                   char persistence,
    1213                 :             :                                   ForkNumber forkNum,
    1214                 :             :                                   BlockNumber blockNum,
    1215                 :             :                                   BufferAccessStrategy strategy,
    1216                 :             :                                   bool *foundPtr)
    1217                 :             : {
    1218                 :    11686721 :         BufferDesc *bufHdr;
    1219                 :    11686721 :         IOContext       io_context;
    1220                 :    11686721 :         IOObject        io_object;
    1221                 :             : 
    1222         [ +  - ]:    11686721 :         Assert(blockNum != P_NEW);
    1223                 :             : 
    1224                 :             :         /* Persistence should be set before */
    1225   [ +  +  +  +  :    11686721 :         Assert((persistence == RELPERSISTENCE_TEMP ||
                   +  - ]
    1226                 :             :                         persistence == RELPERSISTENCE_PERMANENT ||
    1227                 :             :                         persistence == RELPERSISTENCE_UNLOGGED));
    1228                 :             : 
    1229         [ +  + ]:    11686721 :         if (persistence == RELPERSISTENCE_TEMP)
    1230                 :             :         {
    1231                 :      353699 :                 io_context = IOCONTEXT_NORMAL;
    1232                 :      353699 :                 io_object = IOOBJECT_TEMP_RELATION;
    1233                 :      353699 :         }
    1234                 :             :         else
    1235                 :             :         {
    1236                 :    11333022 :                 io_context = IOContextForStrategy(strategy);
    1237                 :    11333022 :                 io_object = IOOBJECT_RELATION;
    1238                 :             :         }
    1239                 :             : 
    1240                 :    11686721 :         TRACE_POSTGRESQL_BUFFER_READ_START(forkNum, blockNum,
    1241                 :             :                                                                            smgr->smgr_rlocator.locator.spcOid,
    1242                 :             :                                                                            smgr->smgr_rlocator.locator.dbOid,
    1243                 :             :                                                                            smgr->smgr_rlocator.locator.relNumber,
    1244                 :             :                                                                            smgr->smgr_rlocator.backend);
    1245                 :             : 
    1246         [ +  + ]:    11686721 :         if (persistence == RELPERSISTENCE_TEMP)
    1247                 :             :         {
    1248                 :      353699 :                 bufHdr = LocalBufferAlloc(smgr, forkNum, blockNum, foundPtr);
    1249         [ +  + ]:      353699 :                 if (*foundPtr)
    1250                 :      350930 :                         pgBufferUsage.local_blks_hit++;
    1251                 :      353699 :         }
    1252                 :             :         else
    1253                 :             :         {
    1254                 :    22666044 :                 bufHdr = BufferAlloc(smgr, persistence, forkNum, blockNum,
    1255                 :    11333022 :                                                          strategy, foundPtr, io_context);
    1256         [ +  + ]:    11333022 :                 if (*foundPtr)
    1257                 :    11321557 :                         pgBufferUsage.shared_blks_hit++;
    1258                 :             :         }
    1259         [ +  + ]:    11686721 :         if (rel)
    1260                 :             :         {
    1261                 :             :                 /*
    1262                 :             :                  * While pgBufferUsage's "read" counter isn't bumped unless we reach
    1263                 :             :                  * WaitReadBuffers() (so, not for hits, and not for buffers that are
    1264                 :             :                  * zeroed instead), the per-relation stats always count them.
    1265                 :             :                  */
    1266   [ +  +  +  +  :    11680955 :                 pgstat_count_buffer_read(rel);
                   +  + ]
    1267         [ +  + ]:    11680955 :                 if (*foundPtr)
    1268   [ +  +  +  +  :    11671495 :                         pgstat_count_buffer_hit(rel);
                   -  + ]
    1269                 :    11680955 :         }
    1270         [ +  + ]:    11686721 :         if (*foundPtr)
    1271                 :             :         {
    1272                 :    11672487 :                 pgstat_count_io_op(io_object, io_context, IOOP_HIT, 1, 0);
    1273         [ +  - ]:    11672487 :                 if (VacuumCostActive)
    1274                 :           0 :                         VacuumCostBalance += VacuumCostPageHit;
    1275                 :             : 
    1276                 :    11672487 :                 TRACE_POSTGRESQL_BUFFER_READ_DONE(forkNum, blockNum,
    1277                 :             :                                                                                   smgr->smgr_rlocator.locator.spcOid,
    1278                 :             :                                                                                   smgr->smgr_rlocator.locator.dbOid,
    1279                 :             :                                                                                   smgr->smgr_rlocator.locator.relNumber,
    1280                 :             :                                                                                   smgr->smgr_rlocator.backend,
    1281                 :             :                                                                                   true);
    1282                 :    11672487 :         }
    1283                 :             : 
    1284                 :    23373442 :         return BufferDescriptorGetBuffer(bufHdr);
    1285                 :    11686721 : }
    1286                 :             : 
    1287                 :             : /*
    1288                 :             :  * ReadBuffer_common -- common logic for all ReadBuffer variants
    1289                 :             :  *
    1290                 :             :  * smgr is required, rel is optional unless using P_NEW.
    1291                 :             :  */
    1292                 :             : static pg_attribute_always_inline Buffer
    1293                 :    10796688 : ReadBuffer_common(Relation rel, SMgrRelation smgr, char smgr_persistence,
    1294                 :             :                                   ForkNumber forkNum,
    1295                 :             :                                   BlockNumber blockNum, ReadBufferMode mode,
    1296                 :             :                                   BufferAccessStrategy strategy)
    1297                 :             : {
    1298                 :    10796688 :         ReadBuffersOperation operation;
    1299                 :    10796688 :         Buffer          buffer;
    1300                 :    10796688 :         int                     flags;
    1301                 :    10796688 :         char            persistence;
    1302                 :             : 
    1303                 :             :         /*
    1304                 :             :          * Backward compatibility path, most code should use ExtendBufferedRel()
    1305                 :             :          * instead, as acquiring the extension lock inside ExtendBufferedRel()
    1306                 :             :          * scales a lot better.
    1307                 :             :          */
    1308         [ +  + ]:    10796688 :         if (unlikely(blockNum == P_NEW))
    1309                 :             :         {
    1310                 :          61 :                 uint32          flags = EB_SKIP_EXTENSION_LOCK;
    1311                 :             : 
    1312                 :             :                 /*
    1313                 :             :                  * Since no-one else can be looking at the page contents yet, there is
    1314                 :             :                  * no difference between an exclusive lock and a cleanup-strength
    1315                 :             :                  * lock.
    1316                 :             :                  */
    1317   [ +  -  -  + ]:          61 :                 if (mode == RBM_ZERO_AND_LOCK || mode == RBM_ZERO_AND_CLEANUP_LOCK)
    1318                 :           0 :                         flags |= EB_LOCK_FIRST;
    1319                 :             : 
    1320                 :          61 :                 return ExtendBufferedRel(BMR_REL(rel), forkNum, strategy, flags);
    1321                 :          61 :         }
    1322                 :             : 
    1323         [ +  + ]:    10796627 :         if (rel)
    1324                 :    10793723 :                 persistence = rel->rd_rel->relpersistence;
    1325                 :             :         else
    1326                 :        2904 :                 persistence = smgr_persistence;
    1327                 :             : 
    1328   [ -  +  +  + ]:    10796627 :         if (unlikely(mode == RBM_ZERO_AND_CLEANUP_LOCK ||
    1329                 :             :                                  mode == RBM_ZERO_AND_LOCK))
    1330                 :             :         {
    1331                 :        3095 :                 bool            found;
    1332                 :             : 
    1333                 :        6190 :                 buffer = PinBufferForBlock(rel, smgr, persistence,
    1334                 :        3095 :                                                                    forkNum, blockNum, strategy, &found);
    1335                 :        3095 :                 ZeroAndLockBuffer(buffer, mode, found);
    1336                 :        3095 :                 return buffer;
    1337                 :        3095 :         }
    1338                 :             : 
    1339                 :             :         /*
    1340                 :             :          * Signal that we are going to immediately wait. If we're immediately
    1341                 :             :          * waiting, there is no benefit in actually executing the IO
    1342                 :             :          * asynchronously, it would just add dispatch overhead.
    1343                 :             :          */
    1344                 :    10793532 :         flags = READ_BUFFERS_SYNCHRONOUSLY;
    1345         [ +  + ]:    10793532 :         if (mode == RBM_ZERO_ON_ERROR)
    1346                 :      461197 :                 flags |= READ_BUFFERS_ZERO_ON_ERROR;
    1347                 :    10793532 :         operation.smgr = smgr;
    1348                 :    10793532 :         operation.rel = rel;
    1349                 :    10793532 :         operation.persistence = persistence;
    1350                 :    10793532 :         operation.forknum = forkNum;
    1351                 :    10793532 :         operation.strategy = strategy;
    1352         [ +  + ]:    10793532 :         if (StartReadBuffer(&operation,
    1353                 :             :                                                 &buffer,
    1354                 :    10793532 :                                                 blockNum,
    1355                 :    10793532 :                                                 flags))
    1356                 :        5966 :                 WaitReadBuffers(&operation);
    1357                 :             : 
    1358                 :    10793532 :         return buffer;
    1359                 :    10796688 : }
    1360                 :             : 
    1361                 :             : static pg_attribute_always_inline bool
    1362                 :    11679530 : StartReadBuffersImpl(ReadBuffersOperation *operation,
    1363                 :             :                                          Buffer *buffers,
    1364                 :             :                                          BlockNumber blockNum,
    1365                 :             :                                          int *nblocks,
    1366                 :             :                                          int flags,
    1367                 :             :                                          bool allow_forwarding)
    1368                 :             : {
    1369                 :    11679530 :         int                     actual_nblocks = *nblocks;
    1370                 :    11679530 :         int                     maxcombine = 0;
    1371                 :    11679530 :         bool            did_start_io;
    1372                 :             : 
    1373   [ +  +  +  - ]:    11679530 :         Assert(*nblocks == 1 || allow_forwarding);
    1374         [ +  - ]:    11679530 :         Assert(*nblocks > 0);
    1375         [ +  - ]:    11679530 :         Assert(*nblocks <= MAX_IO_COMBINE_LIMIT);
    1376                 :             : 
    1377   [ +  +  +  + ]:    23363167 :         for (int i = 0; i < actual_nblocks; ++i)
    1378                 :             :         {
    1379                 :    11683637 :                 bool            found;
    1380                 :             : 
    1381   [ +  +  +  + ]:    11683637 :                 if (allow_forwarding && buffers[i] != InvalidBuffer)
    1382                 :             :                 {
    1383                 :          11 :                         BufferDesc *bufHdr;
    1384                 :             : 
    1385                 :             :                         /*
    1386                 :             :                          * This is a buffer that was pinned by an earlier call to
    1387                 :             :                          * StartReadBuffers(), but couldn't be handled in one operation at
    1388                 :             :                          * that time.  The operation was split, and the caller has passed
    1389                 :             :                          * an already pinned buffer back to us to handle the rest of the
    1390                 :             :                          * operation.  It must continue at the expected block number.
    1391                 :             :                          */
    1392         [ -  + ]:          11 :                         Assert(BufferGetBlockNumber(buffers[i]) == blockNum + i);
    1393                 :             : 
    1394                 :             :                         /*
    1395                 :             :                          * It might be an already valid buffer (a hit) that followed the
    1396                 :             :                          * final contiguous block of an earlier I/O (a miss) marking the
    1397                 :             :                          * end of it, or a buffer that some other backend has since made
    1398                 :             :                          * valid by performing the I/O for us, in which case we can handle
    1399                 :             :                          * it as a hit now.  It is safe to check for a BM_VALID flag with
    1400                 :             :                          * a relaxed load, because we got a fresh view of it while pinning
    1401                 :             :                          * it in the previous call.
    1402                 :             :                          *
    1403                 :             :                          * On the other hand if we don't see BM_VALID yet, it must be an
    1404                 :             :                          * I/O that was split by the previous call and we need to try to
    1405                 :             :                          * start a new I/O from this block.  We're also racing against any
    1406                 :             :                          * other backend that might start the I/O or even manage to mark
    1407                 :             :                          * it BM_VALID after this check, but StartBufferIO() will handle
    1408                 :             :                          * those cases.
    1409                 :             :                          */
    1410         [ +  - ]:          11 :                         if (BufferIsLocal(buffers[i]))
    1411                 :           0 :                                 bufHdr = GetLocalBufferDescriptor(-buffers[i] - 1);
    1412                 :             :                         else
    1413                 :          11 :                                 bufHdr = GetBufferDescriptor(buffers[i] - 1);
    1414         [ -  + ]:          11 :                         Assert(pg_atomic_read_u64(&bufHdr->state) & BM_TAG_VALID);
    1415                 :          11 :                         found = pg_atomic_read_u64(&bufHdr->state) & BM_VALID;
    1416                 :          11 :                 }
    1417                 :             :                 else
    1418                 :             :                 {
    1419                 :    23367252 :                         buffers[i] = PinBufferForBlock(operation->rel,
    1420                 :    11683626 :                                                                                    operation->smgr,
    1421                 :    11683626 :                                                                                    operation->persistence,
    1422                 :    11683626 :                                                                                    operation->forknum,
    1423                 :    11683626 :                                                                                    blockNum + i,
    1424                 :    11683626 :                                                                                    operation->strategy,
    1425                 :             :                                                                                    &found);
    1426                 :             :                 }
    1427                 :             : 
    1428         [ +  + ]:    11683637 :                 if (found)
    1429                 :             :                 {
    1430                 :             :                         /*
    1431                 :             :                          * We have a hit.  If it's the first block in the requested range,
    1432                 :             :                          * we can return it immediately and report that WaitReadBuffers()
    1433                 :             :                          * does not need to be called.  If the initial value of *nblocks
    1434                 :             :                          * was larger, the caller will have to call again for the rest.
    1435                 :             :                          */
    1436         [ +  + ]:    11672486 :                         if (i == 0)
    1437                 :             :                         {
    1438                 :    11672475 :                                 *nblocks = 1;
    1439                 :             : 
    1440                 :             : #ifdef USE_ASSERT_CHECKING
    1441                 :             : 
    1442                 :             :                                 /*
    1443                 :             :                                  * Initialize enough of ReadBuffersOperation to make
    1444                 :             :                                  * CheckReadBuffersOperation() work. Outside of assertions
    1445                 :             :                                  * that's not necessary when no IO is issued.
    1446                 :             :                                  */
    1447                 :    11672475 :                                 operation->buffers = buffers;
    1448                 :    11672475 :                                 operation->blocknum = blockNum;
    1449                 :    11672475 :                                 operation->nblocks = 1;
    1450                 :    11672475 :                                 operation->nblocks_done = 1;
    1451                 :    11672475 :                                 CheckReadBuffersOperation(operation, true);
    1452                 :             : #endif
    1453                 :    11672475 :                                 return false;
    1454                 :             :                         }
    1455                 :             : 
    1456                 :             :                         /*
    1457                 :             :                          * Otherwise we already have an I/O to perform, but this block
    1458                 :             :                          * can't be included as it is already valid.  Split the I/O here.
    1459                 :             :                          * There may or may not be more blocks requiring I/O after this
    1460                 :             :                          * one, we haven't checked, but they can't be contiguous with this
    1461                 :             :                          * one in the way.  We'll leave this buffer pinned, forwarding it
    1462                 :             :                          * to the next call, avoiding the need to unpin it here and re-pin
    1463                 :             :                          * it in the next call.
    1464                 :             :                          */
    1465                 :          11 :                         actual_nblocks = i;
    1466                 :          11 :                         break;
    1467                 :             :                 }
    1468                 :             :                 else
    1469                 :             :                 {
    1470                 :             :                         /*
    1471                 :             :                          * Check how many blocks we can cover with the same IO. The smgr
    1472                 :             :                          * implementation might e.g. be limited due to a segment boundary.
    1473                 :             :                          */
    1474   [ +  +  +  + ]:       11151 :                         if (i == 0 && actual_nblocks > 1)
    1475                 :             :                         {
    1476                 :        1278 :                                 maxcombine = smgrmaxcombine(operation->smgr,
    1477                 :         639 :                                                                                         operation->forknum,
    1478                 :         639 :                                                                                         blockNum);
    1479         [ +  - ]:         639 :                                 if (unlikely(maxcombine < actual_nblocks))
    1480                 :             :                                 {
    1481   [ #  #  #  # ]:           0 :                                         elog(DEBUG2, "limiting nblocks at %u from %u to %u",
    1482                 :             :                                                  blockNum, actual_nblocks, maxcombine);
    1483                 :           0 :                                         actual_nblocks = maxcombine;
    1484                 :           0 :                                 }
    1485                 :         639 :                         }
    1486                 :             :                 }
    1487         [ +  + ]:    11683637 :         }
    1488                 :        7053 :         *nblocks = actual_nblocks;
    1489                 :             : 
    1490                 :             :         /* Populate information needed for I/O. */
    1491                 :        7053 :         operation->buffers = buffers;
    1492                 :        7053 :         operation->blocknum = blockNum;
    1493                 :        7053 :         operation->flags = flags;
    1494                 :        7053 :         operation->nblocks = actual_nblocks;
    1495                 :        7053 :         operation->nblocks_done = 0;
    1496                 :        7053 :         pgaio_wref_clear(&operation->io_wref);
    1497                 :             : 
    1498                 :             :         /*
    1499                 :             :          * When using AIO, start the IO in the background. If not, issue prefetch
    1500                 :             :          * requests if desired by the caller.
    1501                 :             :          *
    1502                 :             :          * The reason we have a dedicated path for IOMETHOD_SYNC here is to
    1503                 :             :          * de-risk the introduction of AIO somewhat. It's a large architectural
    1504                 :             :          * change, with lots of chances for unanticipated performance effects.
    1505                 :             :          *
    1506                 :             :          * Use of IOMETHOD_SYNC already leads to not actually performing IO
    1507                 :             :          * asynchronously, but without the check here we'd execute IO earlier than
    1508                 :             :          * we used to. Eventually this IOMETHOD_SYNC specific path should go away.
    1509                 :             :          */
    1510         [ +  - ]:        7053 :         if (io_method != IOMETHOD_SYNC)
    1511                 :             :         {
    1512                 :             :                 /*
    1513                 :             :                  * Try to start IO asynchronously. It's possible that no IO needs to
    1514                 :             :                  * be started, if another backend already performed the IO.
    1515                 :             :                  *
    1516                 :             :                  * Note that if an IO is started, it might not cover the entire
    1517                 :             :                  * requested range, e.g. because an intermediary block has been read
    1518                 :             :                  * in by another backend.  In that case any "trailing" buffers we
    1519                 :             :                  * already pinned above will be "forwarded" by read_stream.c to the
    1520                 :             :                  * next call to StartReadBuffers().
    1521                 :             :                  *
    1522                 :             :                  * This is signalled to the caller by decrementing *nblocks *and*
    1523                 :             :                  * reducing operation->nblocks. The latter is done here, but not below
    1524                 :             :                  * WaitReadBuffers(), as in WaitReadBuffers() we can't "shorten" the
    1525                 :             :                  * overall read size anymore, we need to retry until done in its
    1526                 :             :                  * entirety or until failed.
    1527                 :             :                  */
    1528                 :        7053 :                 did_start_io = AsyncReadBuffers(operation, nblocks);
    1529                 :             : 
    1530                 :        7053 :                 operation->nblocks = *nblocks;
    1531                 :        7053 :         }
    1532                 :             :         else
    1533                 :             :         {
    1534                 :           0 :                 operation->flags |= READ_BUFFERS_SYNCHRONOUSLY;
    1535                 :             : 
    1536         [ #  # ]:           0 :                 if (flags & READ_BUFFERS_ISSUE_ADVICE)
    1537                 :             :                 {
    1538                 :             :                         /*
    1539                 :             :                          * In theory we should only do this if PinBufferForBlock() had to
    1540                 :             :                          * allocate new buffers above.  That way, if two calls to
    1541                 :             :                          * StartReadBuffers() were made for the same blocks before
    1542                 :             :                          * WaitReadBuffers(), only the first would issue the advice.
    1543                 :             :                          * That'd be a better simulation of true asynchronous I/O, which
    1544                 :             :                          * would only start the I/O once, but isn't done here for
    1545                 :             :                          * simplicity.
    1546                 :             :                          */
    1547                 :           0 :                         smgrprefetch(operation->smgr,
    1548                 :           0 :                                                  operation->forknum,
    1549                 :           0 :                                                  blockNum,
    1550                 :           0 :                                                  actual_nblocks);
    1551                 :           0 :                 }
    1552                 :             : 
    1553                 :             :                 /*
    1554                 :             :                  * Indicate that WaitReadBuffers() should be called. WaitReadBuffers()
    1555                 :             :                  * will initiate the necessary IO.
    1556                 :             :                  */
    1557                 :           0 :                 did_start_io = true;
    1558                 :             :         }
    1559                 :             : 
    1560                 :        7053 :         CheckReadBuffersOperation(operation, !did_start_io);
    1561                 :             : 
    1562                 :        7053 :         return did_start_io;
    1563                 :    11679530 : }
    1564                 :             : 
    1565                 :             : /*
    1566                 :             :  * Begin reading a range of blocks beginning at blockNum and extending for
    1567                 :             :  * *nblocks.  *nblocks and the buffers array are in/out parameters.  On entry,
    1568                 :             :  * the buffers elements covered by *nblocks must hold either InvalidBuffer or
    1569                 :             :  * buffers forwarded by an earlier call to StartReadBuffers() that was split
    1570                 :             :  * and is now being continued.  On return, *nblocks holds the number of blocks
    1571                 :             :  * accepted by this operation.  If it is less than the original number then
    1572                 :             :  * this operation has been split, but buffer elements up to the original
    1573                 :             :  * requested size may hold forwarded buffers to be used for a continuing
    1574                 :             :  * operation.  The caller must either start a new I/O beginning at the block
    1575                 :             :  * immediately following the blocks accepted by this call and pass those
    1576                 :             :  * buffers back in, or release them if it chooses not to.  It shouldn't make
    1577                 :             :  * any other use of or assumptions about forwarded buffers.
    1578                 :             :  *
    1579                 :             :  * If false is returned, no I/O is necessary and the buffers covered by
    1580                 :             :  * *nblocks on exit are valid and ready to be accessed.  If true is returned,
    1581                 :             :  * an I/O has been started, and WaitReadBuffers() must be called with the same
    1582                 :             :  * operation object before the buffers covered by *nblocks on exit can be
    1583                 :             :  * accessed.  Along with the operation object, the caller-supplied array of
    1584                 :             :  * buffers must remain valid until WaitReadBuffers() is called, and any
    1585                 :             :  * forwarded buffers must also be preserved for a continuing call unless
    1586                 :             :  * they are explicitly released.
    1587                 :             :  */
    1588                 :             : bool
    1589                 :      401903 : StartReadBuffers(ReadBuffersOperation *operation,
    1590                 :             :                                  Buffer *buffers,
    1591                 :             :                                  BlockNumber blockNum,
    1592                 :             :                                  int *nblocks,
    1593                 :             :                                  int flags)
    1594                 :             : {
    1595                 :      401903 :         return StartReadBuffersImpl(operation, buffers, blockNum, nblocks, flags,
    1596                 :             :                                                                 true /* expect forwarded buffers */ );
    1597                 :             : }
    1598                 :             : 
    1599                 :             : /*
    1600                 :             :  * Single block version of the StartReadBuffers().  This might save a few
    1601                 :             :  * instructions when called from another translation unit, because it is
    1602                 :             :  * specialized for nblocks == 1.
    1603                 :             :  *
    1604                 :             :  * This version does not support "forwarded" buffers: they cannot be created
    1605                 :             :  * by reading only one block and *buffer is ignored on entry.
    1606                 :             :  */
    1607                 :             : bool
    1608                 :    11277627 : StartReadBuffer(ReadBuffersOperation *operation,
    1609                 :             :                                 Buffer *buffer,
    1610                 :             :                                 BlockNumber blocknum,
    1611                 :             :                                 int flags)
    1612                 :             : {
    1613                 :    11277627 :         int                     nblocks = 1;
    1614                 :    11277627 :         bool            result;
    1615                 :             : 
    1616                 :    11277627 :         result = StartReadBuffersImpl(operation, buffer, blocknum, &nblocks, flags,
    1617                 :             :                                                                   false /* single block, no forwarding */ );
    1618         [ +  - ]:    11277627 :         Assert(nblocks == 1);           /* single block can't be short */
    1619                 :             : 
    1620                 :    22555254 :         return result;
    1621                 :    11277627 : }
    1622                 :             : 
    1623                 :             : /*
    1624                 :             :  * Perform sanity checks on the ReadBuffersOperation.
    1625                 :             :  */
    1626                 :             : static void
    1627                 :    11693436 : CheckReadBuffersOperation(ReadBuffersOperation *operation, bool is_complete)
    1628                 :             : {
    1629                 :             : #ifdef USE_ASSERT_CHECKING
    1630         [ +  - ]:    11693436 :         Assert(operation->nblocks_done <= operation->nblocks);
    1631   [ +  +  +  - ]:    11693436 :         Assert(!is_complete || operation->nblocks == operation->nblocks_done);
    1632                 :             : 
    1633         [ +  + ]:    23399164 :         for (int i = 0; i < operation->nblocks; i++)
    1634                 :             :         {
    1635                 :    11705728 :                 Buffer          buffer = operation->buffers[i];
    1636         [ +  + ]:    11705728 :                 BufferDesc *buf_hdr = BufferIsLocal(buffer) ?
    1637                 :      359039 :                         GetLocalBufferDescriptor(-buffer - 1) :
    1638                 :    11346689 :                         GetBufferDescriptor(buffer - 1);
    1639                 :             : 
    1640         [ +  - ]:    11705728 :                 Assert(BufferGetBlockNumber(buffer) == operation->blocknum + i);
    1641         [ +  - ]:    11705728 :                 Assert(pg_atomic_read_u64(&buf_hdr->state) & BM_TAG_VALID);
    1642                 :             : 
    1643         [ +  + ]:    11705728 :                 if (i < operation->nblocks_done)
    1644         [ -  + ]:    11683527 :                         Assert(pg_atomic_read_u64(&buf_hdr->state) & BM_VALID);
    1645                 :    11705728 :         }
    1646                 :             : #endif
    1647                 :    11693436 : }
    1648                 :             : 
    1649                 :             : /* helper for ReadBuffersCanStartIO(), to avoid repetition */
    1650                 :             : static inline bool
    1651                 :       11151 : ReadBuffersCanStartIOOnce(Buffer buffer, bool nowait)
    1652                 :             : {
    1653         [ +  + ]:       11151 :         if (BufferIsLocal(buffer))
    1654                 :        5538 :                 return StartLocalBufferIO(GetLocalBufferDescriptor(-buffer - 1),
    1655                 :        2769 :                                                                   true, nowait);
    1656                 :             :         else
    1657                 :        8382 :                 return StartBufferIO(GetBufferDescriptor(buffer - 1), true, nowait);
    1658                 :       11151 : }
    1659                 :             : 
    1660                 :             : /*
    1661                 :             :  * Helper for AsyncReadBuffers that tries to get the buffer ready for IO.
    1662                 :             :  */
    1663                 :             : static inline bool
    1664                 :       11151 : ReadBuffersCanStartIO(Buffer buffer, bool nowait)
    1665                 :             : {
    1666                 :             :         /*
    1667                 :             :          * If this backend currently has staged IO, we need to submit the pending
    1668                 :             :          * IO before waiting for the right to issue IO, to avoid the potential for
    1669                 :             :          * deadlocks (and, more commonly, unnecessary delays for other backends).
    1670                 :             :          */
    1671   [ +  +  +  + ]:       11151 :         if (!nowait && pgaio_have_staged())
    1672                 :             :         {
    1673         [ +  - ]:           4 :                 if (ReadBuffersCanStartIOOnce(buffer, true))
    1674                 :           4 :                         return true;
    1675                 :             : 
    1676                 :             :                 /*
    1677                 :             :                  * Unfortunately StartBufferIO() returning false doesn't allow to
    1678                 :             :                  * distinguish between the buffer already being valid and IO already
    1679                 :             :                  * being in progress. Since IO already being in progress is quite
    1680                 :             :                  * rare, this approach seems fine.
    1681                 :             :                  */
    1682                 :           0 :                 pgaio_submit_staged();
    1683                 :           0 :         }
    1684                 :             : 
    1685                 :       11147 :         return ReadBuffersCanStartIOOnce(buffer, nowait);
    1686                 :       11151 : }
    1687                 :             : 
    1688                 :             : /*
    1689                 :             :  * Helper for WaitReadBuffers() that processes the results of a readv
    1690                 :             :  * operation, raising an error if necessary.
    1691                 :             :  */
    1692                 :             : static void
    1693                 :        6954 : ProcessReadBuffersResult(ReadBuffersOperation *operation)
    1694                 :             : {
    1695                 :        6954 :         PgAioReturn *aio_ret = &operation->io_return;
    1696                 :        6954 :         PgAioResultStatus rs = aio_ret->result.status;
    1697                 :        6954 :         int                     newly_read_blocks = 0;
    1698                 :             : 
    1699         [ +  - ]:        6954 :         Assert(pgaio_wref_valid(&operation->io_wref));
    1700         [ +  - ]:        6954 :         Assert(aio_ret->result.status != PGAIO_RS_UNKNOWN);
    1701                 :             : 
    1702                 :             :         /*
    1703                 :             :          * SMGR reports the number of blocks successfully read as the result of
    1704                 :             :          * the IO operation. Thus we can simply add that to ->nblocks_done.
    1705                 :             :          */
    1706                 :             : 
    1707         [ -  + ]:        6954 :         if (likely(rs != PGAIO_RS_ERROR))
    1708                 :        6954 :                 newly_read_blocks = aio_ret->result.result;
    1709                 :             : 
    1710   [ +  -  -  + ]:        6954 :         if (rs == PGAIO_RS_ERROR || rs == PGAIO_RS_WARNING)
    1711                 :           0 :                 pgaio_result_report(aio_ret->result, &aio_ret->target_data,
    1712                 :           0 :                                                         rs == PGAIO_RS_ERROR ? ERROR : WARNING);
    1713         [ +  - ]:        6954 :         else if (aio_ret->result.status == PGAIO_RS_PARTIAL)
    1714                 :             :         {
    1715                 :             :                 /*
    1716                 :             :                  * We'll retry, so we just emit a debug message to the server log (or
    1717                 :             :                  * not even that in prod scenarios).
    1718                 :             :                  */
    1719                 :           0 :                 pgaio_result_report(aio_ret->result, &aio_ret->target_data, DEBUG1);
    1720   [ #  #  #  # ]:           0 :                 elog(DEBUG3, "partial read, will retry");
    1721                 :           0 :         }
    1722                 :             : 
    1723         [ +  - ]:        6954 :         Assert(newly_read_blocks > 0);
    1724         [ +  - ]:        6954 :         Assert(newly_read_blocks <= MAX_IO_COMBINE_LIMIT);
    1725                 :             : 
    1726                 :        6954 :         operation->nblocks_done += newly_read_blocks;
    1727                 :             : 
    1728         [ +  - ]:        6954 :         Assert(operation->nblocks_done <= operation->nblocks);
    1729                 :        6954 : }
    1730                 :             : 
    1731                 :             : void
    1732                 :        6954 : WaitReadBuffers(ReadBuffersOperation *operation)
    1733                 :             : {
    1734                 :        6954 :         PgAioReturn *aio_ret = &operation->io_return;
    1735                 :        6954 :         IOContext       io_context;
    1736                 :        6954 :         IOObject        io_object;
    1737                 :             : 
    1738         [ +  + ]:        6954 :         if (operation->persistence == RELPERSISTENCE_TEMP)
    1739                 :             :         {
    1740                 :         484 :                 io_context = IOCONTEXT_NORMAL;
    1741                 :         484 :                 io_object = IOOBJECT_TEMP_RELATION;
    1742                 :         484 :         }
    1743                 :             :         else
    1744                 :             :         {
    1745                 :        6470 :                 io_context = IOContextForStrategy(operation->strategy);
    1746                 :        6470 :                 io_object = IOOBJECT_RELATION;
    1747                 :             :         }
    1748                 :             : 
    1749                 :             :         /*
    1750                 :             :          * If we get here without an IO operation having been issued, the
    1751                 :             :          * io_method == IOMETHOD_SYNC path must have been used. Otherwise the
    1752                 :             :          * caller should not have called WaitReadBuffers().
    1753                 :             :          *
    1754                 :             :          * In the case of IOMETHOD_SYNC, we start - as we used to before the
    1755                 :             :          * introducing of AIO - the IO in WaitReadBuffers(). This is done as part
    1756                 :             :          * of the retry logic below, no extra code is required.
    1757                 :             :          *
    1758                 :             :          * This path is expected to eventually go away.
    1759                 :             :          */
    1760   [ -  +  #  # ]:        6954 :         if (!pgaio_wref_valid(&operation->io_wref) && io_method != IOMETHOD_SYNC)
    1761   [ #  #  #  # ]:           0 :                 elog(ERROR, "waiting for read operation that didn't read");
    1762                 :             : 
    1763                 :             :         /*
    1764                 :             :          * To handle partial reads, and IOMETHOD_SYNC, we re-issue IO until we're
    1765                 :             :          * done. We may need multiple retries, not just because we could get
    1766                 :             :          * multiple partial reads, but also because some of the remaining
    1767                 :             :          * to-be-read buffers may have been read in by other backends, limiting
    1768                 :             :          * the IO size.
    1769                 :             :          */
    1770                 :        6954 :         while (true)
    1771                 :             :         {
    1772                 :        6954 :                 int                     ignored_nblocks_progress;
    1773                 :             : 
    1774                 :        6954 :                 CheckReadBuffersOperation(operation, false);
    1775                 :             : 
    1776                 :             :                 /*
    1777                 :             :                  * If there is an IO associated with the operation, we may need to
    1778                 :             :                  * wait for it.
    1779                 :             :                  */
    1780         [ -  + ]:        6954 :                 if (pgaio_wref_valid(&operation->io_wref))
    1781                 :             :                 {
    1782                 :             :                         /*
    1783                 :             :                          * Track the time spent waiting for the IO to complete. As
    1784                 :             :                          * tracking a wait even if we don't actually need to wait
    1785                 :             :                          *
    1786                 :             :                          * a) is not cheap, due to the timestamping overhead
    1787                 :             :                          *
    1788                 :             :                          * b) reports some time as waiting, even if we never waited
    1789                 :             :                          *
    1790                 :             :                          * we first check if we already know the IO is complete.
    1791                 :             :                          */
    1792   [ +  +  +  + ]:        6954 :                         if (aio_ret->result.status == PGAIO_RS_UNKNOWN &&
    1793                 :         658 :                                 !pgaio_wref_check_done(&operation->io_wref))
    1794                 :             :                         {
    1795                 :         593 :                                 instr_time      io_start = pgstat_prepare_io_time(track_io_timing);
    1796                 :             : 
    1797                 :         593 :                                 pgaio_wref_wait(&operation->io_wref);
    1798                 :             : 
    1799                 :             :                                 /*
    1800                 :             :                                  * The IO operation itself was already counted earlier, in
    1801                 :             :                                  * AsyncReadBuffers(), this just accounts for the wait time.
    1802                 :             :                                  */
    1803                 :         593 :                                 pgstat_count_io_op_time(io_object, io_context, IOOP_READ,
    1804                 :             :                                                                                 io_start, 0, 0);
    1805                 :         593 :                         }
    1806                 :             :                         else
    1807                 :             :                         {
    1808         [ -  + ]:        6361 :                                 Assert(pgaio_wref_check_done(&operation->io_wref));
    1809                 :             :                         }
    1810                 :             : 
    1811                 :             :                         /*
    1812                 :             :                          * We now are sure the IO completed. Check the results. This
    1813                 :             :                          * includes reporting on errors if there were any.
    1814                 :             :                          */
    1815                 :        6954 :                         ProcessReadBuffersResult(operation);
    1816                 :        6954 :                 }
    1817                 :             : 
    1818                 :             :                 /*
    1819                 :             :                  * Most of the time, the one IO we already started, will read in
    1820                 :             :                  * everything.  But we need to deal with partial reads and buffers not
    1821                 :             :                  * needing IO anymore.
    1822                 :             :                  */
    1823         [ +  - ]:        6954 :                 if (operation->nblocks_done == operation->nblocks)
    1824                 :        6954 :                         break;
    1825                 :             : 
    1826         [ #  # ]:           0 :                 CHECK_FOR_INTERRUPTS();
    1827                 :             : 
    1828                 :             :                 /*
    1829                 :             :                  * This may only complete the IO partially, either because some
    1830                 :             :                  * buffers were already valid, or because of a partial read.
    1831                 :             :                  *
    1832                 :             :                  * NB: In contrast to after the AsyncReadBuffers() call in
    1833                 :             :                  * StartReadBuffers(), we do *not* reduce
    1834                 :             :                  * ReadBuffersOperation->nblocks here, callers expect the full
    1835                 :             :                  * operation to be completed at this point (as more operations may
    1836                 :             :                  * have been queued).
    1837                 :             :                  */
    1838                 :           0 :                 AsyncReadBuffers(operation, &ignored_nblocks_progress);
    1839      [ -  -  + ]:        6954 :         }
    1840                 :             : 
    1841                 :        6954 :         CheckReadBuffersOperation(operation, true);
    1842                 :             : 
    1843                 :             :         /* NB: READ_DONE tracepoint was already executed in completion callback */
    1844                 :        6954 : }
    1845                 :             : 
    1846                 :             : /*
    1847                 :             :  * Initiate IO for the ReadBuffersOperation
    1848                 :             :  *
    1849                 :             :  * This function only starts a single IO at a time. The size of the IO may be
    1850                 :             :  * limited to below the to-be-read blocks, if one of the buffers has
    1851                 :             :  * concurrently been read in. If the first to-be-read buffer is already valid,
    1852                 :             :  * no IO will be issued.
    1853                 :             :  *
    1854                 :             :  * To support retries after partial reads, the first operation->nblocks_done
    1855                 :             :  * buffers are skipped.
    1856                 :             :  *
    1857                 :             :  * On return *nblocks_progress is updated to reflect the number of buffers
    1858                 :             :  * affected by the call. If the first buffer is valid, *nblocks_progress is
    1859                 :             :  * set to 1 and operation->nblocks_done is incremented.
    1860                 :             :  *
    1861                 :             :  * Returns true if IO was initiated, false if no IO was necessary.
    1862                 :             :  */
    1863                 :             : static bool
    1864                 :        7053 : AsyncReadBuffers(ReadBuffersOperation *operation, int *nblocks_progress)
    1865                 :             : {
    1866                 :        7053 :         Buffer     *buffers = &operation->buffers[0];
    1867                 :        7053 :         int                     flags = operation->flags;
    1868                 :        7053 :         BlockNumber blocknum = operation->blocknum;
    1869                 :        7053 :         ForkNumber      forknum = operation->forknum;
    1870                 :        7053 :         char            persistence = operation->persistence;
    1871                 :        7053 :         int16           nblocks_done = operation->nblocks_done;
    1872                 :        7053 :         Buffer     *io_buffers = &operation->buffers[nblocks_done];
    1873                 :        7053 :         int                     io_buffers_len = 0;
    1874                 :        7053 :         PgAioHandle *ioh;
    1875                 :        7053 :         uint32          ioh_flags = 0;
    1876                 :        7053 :         void       *io_pages[MAX_IO_COMBINE_LIMIT];
    1877                 :        7053 :         IOContext       io_context;
    1878                 :        7053 :         IOObject        io_object;
    1879                 :        7053 :         bool            did_start_io;
    1880                 :             : 
    1881                 :             :         /*
    1882                 :             :          * When this IO is executed synchronously, either because the caller will
    1883                 :             :          * immediately block waiting for the IO or because IOMETHOD_SYNC is used,
    1884                 :             :          * the AIO subsystem needs to know.
    1885                 :             :          */
    1886         [ +  + ]:        7053 :         if (flags & READ_BUFFERS_SYNCHRONOUSLY)
    1887                 :        5967 :                 ioh_flags |= PGAIO_HF_SYNCHRONOUS;
    1888                 :             : 
    1889         [ +  + ]:        7053 :         if (persistence == RELPERSISTENCE_TEMP)
    1890                 :             :         {
    1891                 :         582 :                 io_context = IOCONTEXT_NORMAL;
    1892                 :         582 :                 io_object = IOOBJECT_TEMP_RELATION;
    1893                 :         582 :                 ioh_flags |= PGAIO_HF_REFERENCES_LOCAL;
    1894                 :         582 :         }
    1895                 :             :         else
    1896                 :             :         {
    1897                 :        6471 :                 io_context = IOContextForStrategy(operation->strategy);
    1898                 :        6471 :                 io_object = IOOBJECT_RELATION;
    1899                 :             :         }
    1900                 :             : 
    1901                 :             :         /*
    1902                 :             :          * If zero_damaged_pages is enabled, add the READ_BUFFERS_ZERO_ON_ERROR
    1903                 :             :          * flag. The reason for that is that, hopefully, zero_damaged_pages isn't
    1904                 :             :          * set globally, but on a per-session basis. The completion callback,
    1905                 :             :          * which may be run in other processes, e.g. in IO workers, may have a
    1906                 :             :          * different value of the zero_damaged_pages GUC.
    1907                 :             :          *
    1908                 :             :          * XXX: We probably should eventually use a different flag for
    1909                 :             :          * zero_damaged_pages, so we can report different log levels / error codes
    1910                 :             :          * for zero_damaged_pages and ZERO_ON_ERROR.
    1911                 :             :          */
    1912         [ +  - ]:        7053 :         if (zero_damaged_pages)
    1913                 :           0 :                 flags |= READ_BUFFERS_ZERO_ON_ERROR;
    1914                 :             : 
    1915                 :             :         /*
    1916                 :             :          * For the same reason as with zero_damaged_pages we need to use this
    1917                 :             :          * backend's ignore_checksum_failure value.
    1918                 :             :          */
    1919         [ +  - ]:        7053 :         if (ignore_checksum_failure)
    1920                 :           0 :                 flags |= READ_BUFFERS_IGNORE_CHECKSUM_FAILURES;
    1921                 :             : 
    1922                 :             : 
    1923                 :             :         /*
    1924                 :             :          * To be allowed to report stats in the local completion callback we need
    1925                 :             :          * to prepare to report stats now. This ensures we can safely report the
    1926                 :             :          * checksum failure even in a critical section.
    1927                 :             :          */
    1928                 :        7053 :         pgstat_prepare_report_checksum_failure(operation->smgr->smgr_rlocator.locator.dbOid);
    1929                 :             : 
    1930                 :             :         /*
    1931                 :             :          * Get IO handle before ReadBuffersCanStartIO(), as pgaio_io_acquire()
    1932                 :             :          * might block, which we don't want after setting IO_IN_PROGRESS.
    1933                 :             :          *
    1934                 :             :          * If we need to wait for IO before we can get a handle, submit
    1935                 :             :          * already-staged IO first, so that other backends don't need to wait.
    1936                 :             :          * There wouldn't be a deadlock risk, as pgaio_io_acquire() just needs to
    1937                 :             :          * wait for already submitted IO, which doesn't require additional locks,
    1938                 :             :          * but it could still cause undesirable waits.
    1939                 :             :          *
    1940                 :             :          * A secondary benefit is that this would allow us to measure the time in
    1941                 :             :          * pgaio_io_acquire() without causing undue timer overhead in the common,
    1942                 :             :          * non-blocking, case.  However, currently the pgstats infrastructure
    1943                 :             :          * doesn't really allow that, as it a) asserts that an operation can't
    1944                 :             :          * have time without operations b) doesn't have an API to report
    1945                 :             :          * "accumulated" time.
    1946                 :             :          */
    1947                 :        7053 :         ioh = pgaio_io_acquire_nb(CurrentResourceOwner, &operation->io_return);
    1948         [ +  - ]:        7053 :         if (unlikely(!ioh))
    1949                 :             :         {
    1950                 :           0 :                 pgaio_submit_staged();
    1951                 :             : 
    1952                 :           0 :                 ioh = pgaio_io_acquire(CurrentResourceOwner, &operation->io_return);
    1953                 :           0 :         }
    1954                 :             : 
    1955                 :             :         /*
    1956                 :             :          * Check if we can start IO on the first to-be-read buffer.
    1957                 :             :          *
    1958                 :             :          * If an I/O is already in progress in another backend, we want to wait
    1959                 :             :          * for the outcome: either done, or something went wrong and we will
    1960                 :             :          * retry.
    1961                 :             :          */
    1962         [ +  + ]:        7053 :         if (!ReadBuffersCanStartIO(buffers[nblocks_done], false))
    1963                 :             :         {
    1964                 :             :                 /*
    1965                 :             :                  * Someone else has already completed this block, we're done.
    1966                 :             :                  *
    1967                 :             :                  * When IO is necessary, ->nblocks_done is updated in
    1968                 :             :                  * ProcessReadBuffersResult(), but that is not called if no IO is
    1969                 :             :                  * necessary. Thus update here.
    1970                 :             :                  */
    1971                 :           1 :                 operation->nblocks_done += 1;
    1972                 :           1 :                 *nblocks_progress = 1;
    1973                 :             : 
    1974                 :           1 :                 pgaio_io_release(ioh);
    1975                 :           1 :                 pgaio_wref_clear(&operation->io_wref);
    1976                 :           1 :                 did_start_io = false;
    1977                 :             : 
    1978                 :             :                 /*
    1979                 :             :                  * Report and track this as a 'hit' for this backend, even though it
    1980                 :             :                  * must have started out as a miss in PinBufferForBlock(). The other
    1981                 :             :                  * backend will track this as a 'read'.
    1982                 :             :                  */
    1983                 :           1 :                 TRACE_POSTGRESQL_BUFFER_READ_DONE(forknum, blocknum + operation->nblocks_done,
    1984                 :             :                                                                                   operation->smgr->smgr_rlocator.locator.spcOid,
    1985                 :             :                                                                                   operation->smgr->smgr_rlocator.locator.dbOid,
    1986                 :             :                                                                                   operation->smgr->smgr_rlocator.locator.relNumber,
    1987                 :             :                                                                                   operation->smgr->smgr_rlocator.backend,
    1988                 :             :                                                                                   true);
    1989                 :             : 
    1990         [ -  + ]:           1 :                 if (persistence == RELPERSISTENCE_TEMP)
    1991                 :           0 :                         pgBufferUsage.local_blks_hit += 1;
    1992                 :             :                 else
    1993                 :           1 :                         pgBufferUsage.shared_blks_hit += 1;
    1994                 :             : 
    1995         [ -  + ]:           1 :                 if (operation->rel)
    1996   [ +  -  +  -  :           1 :                         pgstat_count_buffer_hit(operation->rel);
                   #  # ]
    1997                 :             : 
    1998                 :           1 :                 pgstat_count_io_op(io_object, io_context, IOOP_HIT, 1, 0);
    1999                 :             : 
    2000         [ +  - ]:           1 :                 if (VacuumCostActive)
    2001                 :           0 :                         VacuumCostBalance += VacuumCostPageHit;
    2002                 :           1 :         }
    2003                 :             :         else
    2004                 :             :         {
    2005                 :        7052 :                 instr_time      io_start;
    2006                 :             : 
    2007                 :             :                 /* We found a buffer that we need to read in. */
    2008         [ +  - ]:        7052 :                 Assert(io_buffers[0] == buffers[nblocks_done]);
    2009                 :        7052 :                 io_pages[0] = BufferGetBlock(buffers[nblocks_done]);
    2010                 :        7052 :                 io_buffers_len = 1;
    2011                 :             : 
    2012                 :             :                 /*
    2013                 :             :                  * How many neighboring-on-disk blocks can we scatter-read into other
    2014                 :             :                  * buffers at the same time?  In this case we don't wait if we see an
    2015                 :             :                  * I/O already in progress.  We already set BM_IO_IN_PROGRESS for the
    2016                 :             :                  * head block, so we should get on with that I/O as soon as possible.
    2017                 :             :                  */
    2018         [ +  + ]:       11150 :                 for (int i = nblocks_done + 1; i < operation->nblocks; i++)
    2019                 :             :                 {
    2020         [ -  + ]:        4098 :                         if (!ReadBuffersCanStartIO(buffers[i], true))
    2021                 :           0 :                                 break;
    2022                 :             :                         /* Must be consecutive block numbers. */
    2023         [ +  - ]:        4098 :                         Assert(BufferGetBlockNumber(buffers[i - 1]) ==
    2024                 :             :                                    BufferGetBlockNumber(buffers[i]) - 1);
    2025         [ +  - ]:        4098 :                         Assert(io_buffers[io_buffers_len] == buffers[i]);
    2026                 :             : 
    2027                 :        4098 :                         io_pages[io_buffers_len++] = BufferGetBlock(buffers[i]);
    2028                 :        4098 :                 }
    2029                 :             : 
    2030                 :             :                 /* get a reference to wait for in WaitReadBuffers() */
    2031                 :        7052 :                 pgaio_io_get_wref(ioh, &operation->io_wref);
    2032                 :             : 
    2033                 :             :                 /* provide the list of buffers to the completion callbacks */
    2034                 :        7052 :                 pgaio_io_set_handle_data_32(ioh, (uint32 *) io_buffers, io_buffers_len);
    2035                 :             : 
    2036                 :       14104 :                 pgaio_io_register_callbacks(ioh,
    2037                 :        7052 :                                                                         persistence == RELPERSISTENCE_TEMP ?
    2038                 :             :                                                                         PGAIO_HCB_LOCAL_BUFFER_READV :
    2039                 :             :                                                                         PGAIO_HCB_SHARED_BUFFER_READV,
    2040                 :        7052 :                                                                         flags);
    2041                 :             : 
    2042                 :        7052 :                 pgaio_io_set_flag(ioh, ioh_flags);
    2043                 :             : 
    2044                 :             :                 /* ---
    2045                 :             :                  * Even though we're trying to issue IO asynchronously, track the time
    2046                 :             :                  * in smgrstartreadv():
    2047                 :             :                  * - if io_method == IOMETHOD_SYNC, we will always perform the IO
    2048                 :             :                  *   immediately
    2049                 :             :                  * - the io method might not support the IO (e.g. worker IO for a temp
    2050                 :             :                  *   table)
    2051                 :             :                  * ---
    2052                 :             :                  */
    2053                 :        7052 :                 io_start = pgstat_prepare_io_time(track_io_timing);
    2054                 :       14104 :                 smgrstartreadv(ioh, operation->smgr, forknum,
    2055                 :        7052 :                                            blocknum + nblocks_done,
    2056                 :        7052 :                                            io_pages, io_buffers_len);
    2057                 :       14104 :                 pgstat_count_io_op_time(io_object, io_context, IOOP_READ,
    2058                 :        7052 :                                                                 io_start, 1, io_buffers_len * BLCKSZ);
    2059                 :             : 
    2060         [ +  + ]:        7052 :                 if (persistence == RELPERSISTENCE_TEMP)
    2061                 :         582 :                         pgBufferUsage.local_blks_read += io_buffers_len;
    2062                 :             :                 else
    2063                 :        6470 :                         pgBufferUsage.shared_blks_read += io_buffers_len;
    2064                 :             : 
    2065                 :             :                 /*
    2066                 :             :                  * Track vacuum cost when issuing IO, not after waiting for it.
    2067                 :             :                  * Otherwise we could end up issuing a lot of IO in a short timespan,
    2068                 :             :                  * despite a low cost limit.
    2069                 :             :                  */
    2070         [ +  - ]:        7052 :                 if (VacuumCostActive)
    2071                 :           0 :                         VacuumCostBalance += VacuumCostPageMiss * io_buffers_len;
    2072                 :             : 
    2073                 :        7052 :                 *nblocks_progress = io_buffers_len;
    2074                 :        7052 :                 did_start_io = true;
    2075                 :        7052 :         }
    2076                 :             : 
    2077                 :       14106 :         return did_start_io;
    2078                 :        7053 : }
    2079                 :             : 
    2080                 :             : /*
    2081                 :             :  * BufferAlloc -- subroutine for PinBufferForBlock.  Handles lookup of a shared
    2082                 :             :  *              buffer.  If no buffer exists already, selects a replacement victim and
    2083                 :             :  *              evicts the old page, but does NOT read in new page.
    2084                 :             :  *
    2085                 :             :  * "strategy" can be a buffer replacement strategy object, or NULL for
    2086                 :             :  * the default strategy.  The selected buffer's usage_count is advanced when
    2087                 :             :  * using the default strategy, but otherwise possibly not (see PinBuffer).
    2088                 :             :  *
    2089                 :             :  * The returned buffer is pinned and is already marked as holding the
    2090                 :             :  * desired page.  If it already did have the desired page, *foundPtr is
    2091                 :             :  * set true.  Otherwise, *foundPtr is set false.
    2092                 :             :  *
    2093                 :             :  * io_context is passed as an output parameter to avoid calling
    2094                 :             :  * IOContextForStrategy() when there is a shared buffers hit and no IO
    2095                 :             :  * statistics need be captured.
    2096                 :             :  *
    2097                 :             :  * No locks are held either at entry or exit.
    2098                 :             :  */
    2099                 :             : static pg_attribute_always_inline BufferDesc *
    2100                 :    11333022 : BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
    2101                 :             :                         BlockNumber blockNum,
    2102                 :             :                         BufferAccessStrategy strategy,
    2103                 :             :                         bool *foundPtr, IOContext io_context)
    2104                 :             : {
    2105                 :    11333022 :         BufferTag       newTag;                 /* identity of requested block */
    2106                 :    11333022 :         uint32          newHash;                /* hash value for newTag */
    2107                 :    11333022 :         LWLock     *newPartitionLock;   /* buffer partition lock for it */
    2108                 :    11333022 :         int                     existing_buf_id;
    2109                 :    11333022 :         Buffer          victim_buffer;
    2110                 :    11333022 :         BufferDesc *victim_buf_hdr;
    2111                 :    11333022 :         uint64          victim_buf_state;
    2112                 :    11333022 :         uint64          set_bits = 0;
    2113                 :             : 
    2114                 :             :         /* Make sure we will have room to remember the buffer pin */
    2115                 :    11333022 :         ResourceOwnerEnlarge(CurrentResourceOwner);
    2116                 :    11333022 :         ReservePrivateRefCountEntry();
    2117                 :             : 
    2118                 :             :         /* create a tag so we can lookup the buffer */
    2119                 :    11333022 :         InitBufferTag(&newTag, &smgr->smgr_rlocator.locator, forkNum, blockNum);
    2120                 :             : 
    2121                 :             :         /* determine its hash code and partition lock ID */
    2122                 :    11333022 :         newHash = BufTableHashCode(&newTag);
    2123                 :    11333022 :         newPartitionLock = BufMappingPartitionLock(newHash);
    2124                 :             : 
    2125                 :             :         /* see if the block is in the buffer pool already */
    2126                 :    11333022 :         LWLockAcquire(newPartitionLock, LW_SHARED);
    2127                 :    11333022 :         existing_buf_id = BufTableLookup(&newTag, newHash);
    2128         [ +  + ]:    11333022 :         if (existing_buf_id >= 0)
    2129                 :             :         {
    2130                 :    11321558 :                 BufferDesc *buf;
    2131                 :    11321558 :                 bool            valid;
    2132                 :             : 
    2133                 :             :                 /*
    2134                 :             :                  * Found it.  Now, pin the buffer so no one can steal it from the
    2135                 :             :                  * buffer pool, and check to see if the correct data has been loaded
    2136                 :             :                  * into the buffer.
    2137                 :             :                  */
    2138                 :    11321558 :                 buf = GetBufferDescriptor(existing_buf_id);
    2139                 :             : 
    2140                 :    11321558 :                 valid = PinBuffer(buf, strategy, false);
    2141                 :             : 
    2142                 :             :                 /* Can release the mapping lock as soon as we've pinned it */
    2143                 :    11321558 :                 LWLockRelease(newPartitionLock);
    2144                 :             : 
    2145                 :    11321558 :                 *foundPtr = true;
    2146                 :             : 
    2147         [ +  + ]:    11321558 :                 if (!valid)
    2148                 :             :                 {
    2149                 :             :                         /*
    2150                 :             :                          * We can only get here if (a) someone else is still reading in
    2151                 :             :                          * the page, (b) a previous read attempt failed, or (c) someone
    2152                 :             :                          * called StartReadBuffers() but not yet WaitReadBuffers().
    2153                 :             :                          */
    2154                 :           1 :                         *foundPtr = false;
    2155                 :           1 :                 }
    2156                 :             : 
    2157                 :    11321558 :                 return buf;
    2158                 :    11321558 :         }
    2159                 :             : 
    2160                 :             :         /*
    2161                 :             :          * Didn't find it in the buffer pool.  We'll have to initialize a new
    2162                 :             :          * buffer.  Remember to unlock the mapping lock while doing the work.
    2163                 :             :          */
    2164                 :       11464 :         LWLockRelease(newPartitionLock);
    2165                 :             : 
    2166                 :             :         /*
    2167                 :             :          * Acquire a victim buffer. Somebody else might try to do the same, we
    2168                 :             :          * don't hold any conflicting locks. If so we'll have to undo our work
    2169                 :             :          * later.
    2170                 :             :          */
    2171                 :       11464 :         victim_buffer = GetVictimBuffer(strategy, io_context);
    2172                 :       11464 :         victim_buf_hdr = GetBufferDescriptor(victim_buffer - 1);
    2173                 :             : 
    2174                 :             :         /*
    2175                 :             :          * Try to make a hashtable entry for the buffer under its new tag. If
    2176                 :             :          * somebody else inserted another buffer for the tag, we'll release the
    2177                 :             :          * victim buffer we acquired and use the already inserted one.
    2178                 :             :          */
    2179                 :       11464 :         LWLockAcquire(newPartitionLock, LW_EXCLUSIVE);
    2180                 :       11464 :         existing_buf_id = BufTableInsert(&newTag, newHash, victim_buf_hdr->buf_id);
    2181         [ -  + ]:       11464 :         if (existing_buf_id >= 0)
    2182                 :             :         {
    2183                 :           0 :                 BufferDesc *existing_buf_hdr;
    2184                 :           0 :                 bool            valid;
    2185                 :             : 
    2186                 :             :                 /*
    2187                 :             :                  * Got a collision. Someone has already done what we were about to do.
    2188                 :             :                  * We'll just handle this as if it were found in the buffer pool in
    2189                 :             :                  * the first place.  First, give up the buffer we were planning to
    2190                 :             :                  * use.
    2191                 :             :                  *
    2192                 :             :                  * We could do this after releasing the partition lock, but then we'd
    2193                 :             :                  * have to call ResourceOwnerEnlarge() & ReservePrivateRefCountEntry()
    2194                 :             :                  * before acquiring the lock, for the rare case of such a collision.
    2195                 :             :                  */
    2196                 :           0 :                 UnpinBuffer(victim_buf_hdr);
    2197                 :             : 
    2198                 :             :                 /* remaining code should match code at top of routine */
    2199                 :             : 
    2200                 :           0 :                 existing_buf_hdr = GetBufferDescriptor(existing_buf_id);
    2201                 :             : 
    2202                 :           0 :                 valid = PinBuffer(existing_buf_hdr, strategy, false);
    2203                 :             : 
    2204                 :             :                 /* Can release the mapping lock as soon as we've pinned it */
    2205                 :           0 :                 LWLockRelease(newPartitionLock);
    2206                 :             : 
    2207                 :           0 :                 *foundPtr = true;
    2208                 :             : 
    2209         [ #  # ]:           0 :                 if (!valid)
    2210                 :             :                 {
    2211                 :             :                         /*
    2212                 :             :                          * We can only get here if (a) someone else is still reading in
    2213                 :             :                          * the page, (b) a previous read attempt failed, or (c) someone
    2214                 :             :                          * called StartReadBuffers() but not yet WaitReadBuffers().
    2215                 :             :                          */
    2216                 :           0 :                         *foundPtr = false;
    2217                 :           0 :                 }
    2218                 :             : 
    2219                 :           0 :                 return existing_buf_hdr;
    2220                 :           0 :         }
    2221                 :             : 
    2222                 :             :         /*
    2223                 :             :          * Need to lock the buffer header too in order to change its tag.
    2224                 :             :          */
    2225                 :       11464 :         victim_buf_state = LockBufHdr(victim_buf_hdr);
    2226                 :             : 
    2227                 :             :         /* some sanity checks while we hold the buffer header lock */
    2228         [ +  - ]:       11464 :         Assert(BUF_STATE_GET_REFCOUNT(victim_buf_state) == 1);
    2229         [ +  - ]:       11464 :         Assert(!(victim_buf_state & (BM_TAG_VALID | BM_VALID | BM_DIRTY | BM_IO_IN_PROGRESS)));
    2230                 :             : 
    2231                 :       11464 :         victim_buf_hdr->tag = newTag;
    2232                 :             : 
    2233                 :             :         /*
    2234                 :             :          * Make sure BM_PERMANENT is set for buffers that must be written at every
    2235                 :             :          * checkpoint.  Unlogged buffers only need to be written at shutdown
    2236                 :             :          * checkpoints, except for their "init" forks, which need to be treated
    2237                 :             :          * just like permanent relations.
    2238                 :             :          */
    2239                 :       11464 :         set_bits |= BM_TAG_VALID | BUF_USAGECOUNT_ONE;
    2240   [ +  +  -  + ]:       11464 :         if (relpersistence == RELPERSISTENCE_PERMANENT || forkNum == INIT_FORKNUM)
    2241                 :       11463 :                 set_bits |= BM_PERMANENT;
    2242                 :             : 
    2243                 :       22928 :         UnlockBufHdrExt(victim_buf_hdr, victim_buf_state,
    2244                 :       11464 :                                         set_bits, 0, 0);
    2245                 :             : 
    2246                 :       11464 :         LWLockRelease(newPartitionLock);
    2247                 :             : 
    2248                 :             :         /*
    2249                 :             :          * Buffer contents are currently invalid.
    2250                 :             :          */
    2251                 :       11464 :         *foundPtr = false;
    2252                 :             : 
    2253                 :       11464 :         return victim_buf_hdr;
    2254                 :    11333022 : }
    2255                 :             : 
    2256                 :             : /*
    2257                 :             :  * InvalidateBuffer -- mark a shared buffer invalid.
    2258                 :             :  *
    2259                 :             :  * The buffer header spinlock must be held at entry.  We drop it before
    2260                 :             :  * returning.  (This is sane because the caller must have locked the
    2261                 :             :  * buffer in order to be sure it should be dropped.)
    2262                 :             :  *
    2263                 :             :  * This is used only in contexts such as dropping a relation.  We assume
    2264                 :             :  * that no other backend could possibly be interested in using the page,
    2265                 :             :  * so the only reason the buffer might be pinned is if someone else is
    2266                 :             :  * trying to write it out.  We have to let them finish before we can
    2267                 :             :  * reclaim the buffer.
    2268                 :             :  *
    2269                 :             :  * The buffer could get reclaimed by someone else while we are waiting
    2270                 :             :  * to acquire the necessary locks; if so, don't mess it up.
    2271                 :             :  */
    2272                 :             : static void
    2273                 :       19008 : InvalidateBuffer(BufferDesc *buf)
    2274                 :             : {
    2275                 :       19008 :         BufferTag       oldTag;
    2276                 :       19008 :         uint32          oldHash;                /* hash value for oldTag */
    2277                 :       19008 :         LWLock     *oldPartitionLock;   /* buffer partition lock for it */
    2278                 :       19008 :         uint32          oldFlags;
    2279                 :       19008 :         uint64          buf_state;
    2280                 :             : 
    2281                 :             :         /* Save the original buffer tag before dropping the spinlock */
    2282                 :       19008 :         oldTag = buf->tag;
    2283                 :             : 
    2284                 :       19008 :         UnlockBufHdr(buf);
    2285                 :             : 
    2286                 :             :         /*
    2287                 :             :          * Need to compute the old tag's hashcode and partition lock ID. XXX is it
    2288                 :             :          * worth storing the hashcode in BufferDesc so we need not recompute it
    2289                 :             :          * here?  Probably not.
    2290                 :             :          */
    2291                 :       19008 :         oldHash = BufTableHashCode(&oldTag);
    2292                 :       19008 :         oldPartitionLock = BufMappingPartitionLock(oldHash);
    2293                 :             : 
    2294                 :             : retry:
    2295                 :             : 
    2296                 :             :         /*
    2297                 :             :          * Acquire exclusive mapping lock in preparation for changing the buffer's
    2298                 :             :          * association.
    2299                 :             :          */
    2300                 :       19008 :         LWLockAcquire(oldPartitionLock, LW_EXCLUSIVE);
    2301                 :             : 
    2302                 :             :         /* Re-lock the buffer header */
    2303                 :       19008 :         buf_state = LockBufHdr(buf);
    2304                 :             : 
    2305                 :             :         /* If it's changed while we were waiting for lock, do nothing */
    2306         [ +  - ]:       19008 :         if (!BufferTagsEqual(&buf->tag, &oldTag))
    2307                 :             :         {
    2308                 :           0 :                 UnlockBufHdr(buf);
    2309                 :           0 :                 LWLockRelease(oldPartitionLock);
    2310                 :           0 :                 return;
    2311                 :             :         }
    2312                 :             : 
    2313                 :             :         /*
    2314                 :             :          * We assume the reason for it to be pinned is that either we were
    2315                 :             :          * asynchronously reading the page in before erroring out or someone else
    2316                 :             :          * is flushing the page out.  Wait for the IO to finish.  (This could be
    2317                 :             :          * an infinite loop if the refcount is messed up... it would be nice to
    2318                 :             :          * time out after awhile, but there seems no way to be sure how many loops
    2319                 :             :          * may be needed.  Note that if the other guy has pinned the buffer but
    2320                 :             :          * not yet done StartBufferIO, WaitIO will fall through and we'll
    2321                 :             :          * effectively be busy-looping here.)
    2322                 :             :          */
    2323         [ -  + ]:       19008 :         if (BUF_STATE_GET_REFCOUNT(buf_state) != 0)
    2324                 :             :         {
    2325                 :           0 :                 UnlockBufHdr(buf);
    2326                 :           0 :                 LWLockRelease(oldPartitionLock);
    2327                 :             :                 /* safety check: should definitely not be our *own* pin */
    2328         [ #  # ]:           0 :                 if (GetPrivateRefCount(BufferDescriptorGetBuffer(buf)) > 0)
    2329   [ #  #  #  # ]:           0 :                         elog(ERROR, "buffer is pinned in InvalidateBuffer");
    2330                 :           0 :                 WaitIO(buf);
    2331                 :           0 :                 goto retry;
    2332                 :             :         }
    2333                 :             : 
    2334                 :             :         /*
    2335                 :             :          * An invalidated buffer should not have any backends waiting to lock the
    2336                 :             :          * buffer, therefore BM_LOCK_WAKE_IN_PROGRESS should not be set.
    2337                 :             :          */
    2338         [ +  - ]:       19008 :         Assert(!(buf_state & BM_LOCK_WAKE_IN_PROGRESS));
    2339                 :             : 
    2340                 :             :         /*
    2341                 :             :          * Clear out the buffer's tag and flags.  We must do this to ensure that
    2342                 :             :          * linear scans of the buffer array don't think the buffer is valid.
    2343                 :             :          */
    2344                 :       19008 :         oldFlags = buf_state & BUF_FLAG_MASK;
    2345                 :       19008 :         ClearBufferTag(&buf->tag);
    2346                 :             : 
    2347                 :       19008 :         UnlockBufHdrExt(buf, buf_state,
    2348                 :             :                                         0,
    2349                 :             :                                         BUF_FLAG_MASK | BUF_USAGECOUNT_MASK,
    2350                 :             :                                         0);
    2351                 :             : 
    2352                 :             :         /*
    2353                 :             :          * Remove the buffer from the lookup hashtable, if it was in there.
    2354                 :             :          */
    2355         [ -  + ]:       19008 :         if (oldFlags & BM_TAG_VALID)
    2356                 :       19008 :                 BufTableDelete(&oldTag, oldHash);
    2357                 :             : 
    2358                 :             :         /*
    2359                 :             :          * Done with mapping lock.
    2360                 :             :          */
    2361                 :       19008 :         LWLockRelease(oldPartitionLock);
    2362         [ -  + ]:       19008 : }
    2363                 :             : 
    2364                 :             : /*
    2365                 :             :  * Helper routine for GetVictimBuffer()
    2366                 :             :  *
    2367                 :             :  * Needs to be called on a buffer with a valid tag, pinned, but without the
    2368                 :             :  * buffer header spinlock held.
    2369                 :             :  *
    2370                 :             :  * Returns true if the buffer can be reused, in which case the buffer is only
    2371                 :             :  * pinned by this backend and marked as invalid, false otherwise.
    2372                 :             :  */
    2373                 :             : static bool
    2374                 :        1912 : InvalidateVictimBuffer(BufferDesc *buf_hdr)
    2375                 :             : {
    2376                 :        1912 :         uint64          buf_state;
    2377                 :        1912 :         uint32          hash;
    2378                 :        1912 :         LWLock     *partition_lock;
    2379                 :        1912 :         BufferTag       tag;
    2380                 :             : 
    2381         [ +  - ]:        1912 :         Assert(GetPrivateRefCount(BufferDescriptorGetBuffer(buf_hdr)) == 1);
    2382                 :             : 
    2383                 :             :         /* have buffer pinned, so it's safe to read tag without lock */
    2384                 :        1912 :         tag = buf_hdr->tag;
    2385                 :             : 
    2386                 :        1912 :         hash = BufTableHashCode(&tag);
    2387                 :        1912 :         partition_lock = BufMappingPartitionLock(hash);
    2388                 :             : 
    2389                 :        1912 :         LWLockAcquire(partition_lock, LW_EXCLUSIVE);
    2390                 :             : 
    2391                 :             :         /* lock the buffer header */
    2392                 :        1912 :         buf_state = LockBufHdr(buf_hdr);
    2393                 :             : 
    2394                 :             :         /*
    2395                 :             :          * We have the buffer pinned nobody else should have been able to unset
    2396                 :             :          * this concurrently.
    2397                 :             :          */
    2398         [ +  - ]:        1912 :         Assert(buf_state & BM_TAG_VALID);
    2399         [ +  - ]:        1912 :         Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
    2400         [ +  - ]:        1912 :         Assert(BufferTagsEqual(&buf_hdr->tag, &tag));
    2401                 :             : 
    2402                 :             :         /*
    2403                 :             :          * If somebody else pinned the buffer since, or even worse, dirtied it,
    2404                 :             :          * give up on this buffer: It's clearly in use.
    2405                 :             :          */
    2406   [ +  -  -  + ]:        1912 :         if (BUF_STATE_GET_REFCOUNT(buf_state) != 1 || (buf_state & BM_DIRTY))
    2407                 :             :         {
    2408         [ #  # ]:           0 :                 Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
    2409                 :             : 
    2410                 :           0 :                 UnlockBufHdr(buf_hdr);
    2411                 :           0 :                 LWLockRelease(partition_lock);
    2412                 :             : 
    2413                 :           0 :                 return false;
    2414                 :             :         }
    2415                 :             : 
    2416                 :             :         /*
    2417                 :             :          * An invalidated buffer should not have any backends waiting to lock the
    2418                 :             :          * buffer, therefore BM_LOCK_WAKE_IN_PROGRESS should not be set.
    2419                 :             :          */
    2420         [ +  - ]:        1912 :         Assert(!(buf_state & BM_LOCK_WAKE_IN_PROGRESS));
    2421                 :             : 
    2422                 :             :         /*
    2423                 :             :          * Clear out the buffer's tag and flags and usagecount.  This is not
    2424                 :             :          * strictly required, as BM_TAG_VALID/BM_VALID needs to be checked before
    2425                 :             :          * doing anything with the buffer. But currently it's beneficial, as the
    2426                 :             :          * cheaper pre-check for several linear scans of shared buffers use the
    2427                 :             :          * tag (see e.g. FlushDatabaseBuffers()).
    2428                 :             :          */
    2429                 :        1912 :         ClearBufferTag(&buf_hdr->tag);
    2430                 :        1912 :         UnlockBufHdrExt(buf_hdr, buf_state,
    2431                 :             :                                         0,
    2432                 :             :                                         BUF_FLAG_MASK | BUF_USAGECOUNT_MASK,
    2433                 :             :                                         0);
    2434                 :             : 
    2435         [ +  - ]:        1912 :         Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
    2436                 :             : 
    2437                 :             :         /* finally delete buffer from the buffer mapping table */
    2438                 :        1912 :         BufTableDelete(&tag, hash);
    2439                 :             : 
    2440                 :        1912 :         LWLockRelease(partition_lock);
    2441                 :             : 
    2442                 :        1912 :         buf_state = pg_atomic_read_u64(&buf_hdr->state);
    2443         [ +  - ]:        1912 :         Assert(!(buf_state & (BM_DIRTY | BM_VALID | BM_TAG_VALID)));
    2444         [ +  - ]:        1912 :         Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
    2445         [ +  - ]:        1912 :         Assert(BUF_STATE_GET_REFCOUNT(pg_atomic_read_u64(&buf_hdr->state)) > 0);
    2446                 :             : 
    2447                 :        1912 :         return true;
    2448                 :        1912 : }
    2449                 :             : 
    2450                 :             : static Buffer
    2451                 :       40537 : GetVictimBuffer(BufferAccessStrategy strategy, IOContext io_context)
    2452                 :             : {
    2453                 :       40537 :         BufferDesc *buf_hdr;
    2454                 :       40537 :         Buffer          buf;
    2455                 :       40537 :         uint64          buf_state;
    2456                 :       40537 :         bool            from_ring;
    2457                 :             : 
    2458                 :             :         /*
    2459                 :             :          * Ensure, before we pin a victim buffer, that there's a free refcount
    2460                 :             :          * entry and resource owner slot for the pin.
    2461                 :             :          */
    2462                 :       40537 :         ReservePrivateRefCountEntry();
    2463                 :       40537 :         ResourceOwnerEnlarge(CurrentResourceOwner);
    2464                 :             : 
    2465                 :             :         /* we return here if a prospective victim buffer gets used concurrently */
    2466                 :             : again:
    2467                 :             : 
    2468                 :             :         /*
    2469                 :             :          * Select a victim buffer.  The buffer is returned pinned and owned by
    2470                 :             :          * this backend.
    2471                 :             :          */
    2472                 :       40537 :         buf_hdr = StrategyGetBuffer(strategy, &buf_state, &from_ring);
    2473                 :       40537 :         buf = BufferDescriptorGetBuffer(buf_hdr);
    2474                 :             : 
    2475                 :             :         /*
    2476                 :             :          * We shouldn't have any other pins for this buffer.
    2477                 :             :          */
    2478                 :       40537 :         CheckBufferIsPinnedOnce(buf);
    2479                 :             : 
    2480                 :             :         /*
    2481                 :             :          * If the buffer was dirty, try to write it out.  There is a race
    2482                 :             :          * condition here, in that someone might dirty it after we released the
    2483                 :             :          * buffer header lock above, or even while we are writing it out (since
    2484                 :             :          * our share-lock won't prevent hint-bit updates).  We will recheck the
    2485                 :             :          * dirty bit after re-locking the buffer header.
    2486                 :             :          */
    2487         [ +  - ]:       40537 :         if (buf_state & BM_DIRTY)
    2488                 :             :         {
    2489         [ #  # ]:           0 :                 Assert(buf_state & BM_TAG_VALID);
    2490         [ #  # ]:           0 :                 Assert(buf_state & BM_VALID);
    2491                 :             : 
    2492                 :             :                 /*
    2493                 :             :                  * We need a share-lock on the buffer contents to write it out (else
    2494                 :             :                  * we might write invalid data, eg because someone else is compacting
    2495                 :             :                  * the page contents while we write).  We must use a conditional lock
    2496                 :             :                  * acquisition here to avoid deadlock.  Even though the buffer was not
    2497                 :             :                  * pinned (and therefore surely not locked) when StrategyGetBuffer
    2498                 :             :                  * returned it, someone else could have pinned and exclusive-locked it
    2499                 :             :                  * by the time we get here. If we try to get the lock unconditionally,
    2500                 :             :                  * we'd block waiting for them; if they later block waiting for us,
    2501                 :             :                  * deadlock ensues. (This has been observed to happen when two
    2502                 :             :                  * backends are both trying to split btree index pages, and the second
    2503                 :             :                  * one just happens to be trying to split the page the first one got
    2504                 :             :                  * from StrategyGetBuffer.)
    2505                 :             :                  */
    2506         [ #  # ]:           0 :                 if (!BufferLockConditional(buf, buf_hdr, BUFFER_LOCK_SHARE))
    2507                 :             :                 {
    2508                 :             :                         /*
    2509                 :             :                          * Someone else has locked the buffer, so give it up and loop back
    2510                 :             :                          * to get another one.
    2511                 :             :                          */
    2512                 :           0 :                         UnpinBuffer(buf_hdr);
    2513                 :           0 :                         goto again;
    2514                 :             :                 }
    2515                 :             : 
    2516                 :             :                 /*
    2517                 :             :                  * If using a nondefault strategy, and writing the buffer would
    2518                 :             :                  * require a WAL flush, let the strategy decide whether to go ahead
    2519                 :             :                  * and write/reuse the buffer or to choose another victim.  We need a
    2520                 :             :                  * lock to inspect the page LSN, so this can't be done inside
    2521                 :             :                  * StrategyGetBuffer.
    2522                 :             :                  */
    2523         [ #  # ]:           0 :                 if (strategy != NULL)
    2524                 :             :                 {
    2525                 :           0 :                         XLogRecPtr      lsn;
    2526                 :             : 
    2527                 :             :                         /* Read the LSN while holding buffer header lock */
    2528                 :           0 :                         buf_state = LockBufHdr(buf_hdr);
    2529                 :           0 :                         lsn = BufferGetLSN(buf_hdr);
    2530                 :           0 :                         UnlockBufHdr(buf_hdr);
    2531                 :             : 
    2532                 :           0 :                         if (XLogNeedsFlush(lsn)
    2533   [ #  #  #  # ]:           0 :                                 && StrategyRejectBuffer(strategy, buf_hdr, from_ring))
    2534                 :             :                         {
    2535                 :           0 :                                 LockBuffer(buf, BUFFER_LOCK_UNLOCK);
    2536                 :           0 :                                 UnpinBuffer(buf_hdr);
    2537                 :           0 :                                 goto again;
    2538                 :             :                         }
    2539      [ #  #  # ]:           0 :                 }
    2540                 :             : 
    2541                 :             :                 /* OK, do the I/O */
    2542                 :           0 :                 FlushBuffer(buf_hdr, NULL, IOOBJECT_RELATION, io_context);
    2543                 :           0 :                 LockBuffer(buf, BUFFER_LOCK_UNLOCK);
    2544                 :             : 
    2545                 :           0 :                 ScheduleBufferTagForWriteback(&BackendWritebackContext, io_context,
    2546                 :           0 :                                                                           &buf_hdr->tag);
    2547                 :           0 :         }
    2548                 :             : 
    2549                 :             : 
    2550         [ +  + ]:       40537 :         if (buf_state & BM_VALID)
    2551                 :             :         {
    2552                 :             :                 /*
    2553                 :             :                  * When a BufferAccessStrategy is in use, blocks evicted from shared
    2554                 :             :                  * buffers are counted as IOOP_EVICT in the corresponding context
    2555                 :             :                  * (e.g. IOCONTEXT_BULKWRITE). Shared buffers are evicted by a
    2556                 :             :                  * strategy in two cases: 1) while initially claiming buffers for the
    2557                 :             :                  * strategy ring 2) to replace an existing strategy ring buffer
    2558                 :             :                  * because it is pinned or in use and cannot be reused.
    2559                 :             :                  *
    2560                 :             :                  * Blocks evicted from buffers already in the strategy ring are
    2561                 :             :                  * counted as IOOP_REUSE in the corresponding strategy context.
    2562                 :             :                  *
    2563                 :             :                  * At this point, we can accurately count evictions and reuses,
    2564                 :             :                  * because we have successfully claimed the valid buffer. Previously,
    2565                 :             :                  * we may have been forced to release the buffer due to concurrent
    2566                 :             :                  * pinners or erroring out.
    2567                 :             :                  */
    2568                 :        3824 :                 pgstat_count_io_op(IOOBJECT_RELATION, io_context,
    2569                 :        1912 :                                                    from_ring ? IOOP_REUSE : IOOP_EVICT, 1, 0);
    2570                 :        1912 :         }
    2571                 :             : 
    2572                 :             :         /*
    2573                 :             :          * If the buffer has an entry in the buffer mapping table, delete it. This
    2574                 :             :          * can fail because another backend could have pinned or dirtied the
    2575                 :             :          * buffer.
    2576                 :             :          */
    2577   [ +  +  -  + ]:       40537 :         if ((buf_state & BM_TAG_VALID) && !InvalidateVictimBuffer(buf_hdr))
    2578                 :             :         {
    2579                 :           0 :                 UnpinBuffer(buf_hdr);
    2580                 :           0 :                 goto again;
    2581                 :             :         }
    2582                 :             : 
    2583                 :             :         /* a final set of sanity checks */
    2584                 :             : #ifdef USE_ASSERT_CHECKING
    2585                 :       40537 :         buf_state = pg_atomic_read_u64(&buf_hdr->state);
    2586                 :             : 
    2587         [ +  - ]:       40537 :         Assert(BUF_STATE_GET_REFCOUNT(buf_state) == 1);
    2588         [ +  - ]:       40537 :         Assert(!(buf_state & (BM_TAG_VALID | BM_VALID | BM_DIRTY)));
    2589                 :             : 
    2590                 :       40537 :         CheckBufferIsPinnedOnce(buf);
    2591                 :             : #endif
    2592                 :             : 
    2593                 :       81074 :         return buf;
    2594                 :       40537 : }
    2595                 :             : 
    2596                 :             : /*
    2597                 :             :  * Return the maximum number of buffers that a backend should try to pin once,
    2598                 :             :  * to avoid exceeding its fair share.  This is the highest value that
    2599                 :             :  * GetAdditionalPinLimit() could ever return.  Note that it may be zero on a
    2600                 :             :  * system with a very small buffer pool relative to max_connections.
    2601                 :             :  */
    2602                 :             : uint32
    2603                 :      346145 : GetPinLimit(void)
    2604                 :             : {
    2605                 :      346145 :         return MaxProportionalPins;
    2606                 :             : }
    2607                 :             : 
    2608                 :             : /*
    2609                 :             :  * Return the maximum number of additional buffers that this backend should
    2610                 :             :  * pin if it wants to stay under the per-backend limit, considering the number
    2611                 :             :  * of buffers it has already pinned.  Unlike LimitAdditionalPins(), the limit
    2612                 :             :  * return by this function can be zero.
    2613                 :             :  */
    2614                 :             : uint32
    2615                 :      796971 : GetAdditionalPinLimit(void)
    2616                 :             : {
    2617                 :      796971 :         uint32          estimated_pins_held;
    2618                 :             : 
    2619                 :             :         /*
    2620                 :             :          * We get the number of "overflowed" pins for free, but don't know the
    2621                 :             :          * number of pins in PrivateRefCountArray.  The cost of calculating that
    2622                 :             :          * exactly doesn't seem worth it, so just assume the max.
    2623                 :             :          */
    2624                 :      796971 :         estimated_pins_held = PrivateRefCountOverflowed + REFCOUNT_ARRAY_ENTRIES;
    2625                 :             : 
    2626                 :             :         /* Is this backend already holding more than its fair share? */
    2627         [ -  + ]:      796971 :         if (estimated_pins_held > MaxProportionalPins)
    2628                 :           0 :                 return 0;
    2629                 :             : 
    2630                 :      796971 :         return MaxProportionalPins - estimated_pins_held;
    2631                 :      796971 : }
    2632                 :             : 
    2633                 :             : /*
    2634                 :             :  * Limit the number of pins a batch operation may additionally acquire, to
    2635                 :             :  * avoid running out of pinnable buffers.
    2636                 :             :  *
    2637                 :             :  * One additional pin is always allowed, on the assumption that the operation
    2638                 :             :  * requires at least one to make progress.
    2639                 :             :  */
    2640                 :             : void
    2641                 :       24322 : LimitAdditionalPins(uint32 *additional_pins)
    2642                 :             : {
    2643                 :       24322 :         uint32          limit;
    2644                 :             : 
    2645         [ +  + ]:       24322 :         if (*additional_pins <= 1)
    2646                 :       23645 :                 return;
    2647                 :             : 
    2648                 :         677 :         limit = GetAdditionalPinLimit();
    2649         [ +  - ]:         677 :         limit = Max(limit, 1);
    2650         [ +  - ]:         677 :         if (limit < *additional_pins)
    2651                 :           0 :                 *additional_pins = limit;
    2652         [ -  + ]:       24322 : }
    2653                 :             : 
    2654                 :             : /*
    2655                 :             :  * Logic shared between ExtendBufferedRelBy(), ExtendBufferedRelTo(). Just to
    2656                 :             :  * avoid duplicating the tracing and relpersistence related logic.
    2657                 :             :  */
    2658                 :             : static BlockNumber
    2659                 :       27454 : ExtendBufferedRelCommon(BufferManagerRelation bmr,
    2660                 :             :                                                 ForkNumber fork,
    2661                 :             :                                                 BufferAccessStrategy strategy,
    2662                 :             :                                                 uint32 flags,
    2663                 :             :                                                 uint32 extend_by,
    2664                 :             :                                                 BlockNumber extend_upto,
    2665                 :             :                                                 Buffer *buffers,
    2666                 :             :                                                 uint32 *extended_by)
    2667                 :             : {
    2668                 :       27454 :         BlockNumber first_block;
    2669                 :             : 
    2670                 :       27454 :         TRACE_POSTGRESQL_BUFFER_EXTEND_START(fork,
    2671                 :             :                                                                                  BMR_GET_SMGR(bmr)->smgr_rlocator.locator.spcOid,
    2672                 :             :                                                                                  BMR_GET_SMGR(bmr)->smgr_rlocator.locator.dbOid,
    2673                 :             :                                                                                  BMR_GET_SMGR(bmr)->smgr_rlocator.locator.relNumber,
    2674                 :             :                                                                                  BMR_GET_SMGR(bmr)->smgr_rlocator.backend,
    2675                 :             :                                                                                  extend_by);
    2676                 :             : 
    2677         [ +  + ]:       27454 :         if (bmr.relpersistence == RELPERSISTENCE_TEMP)
    2678                 :        6264 :                 first_block = ExtendBufferedRelLocal(bmr, fork, flags,
    2679                 :        3132 :                                                                                          extend_by, extend_upto,
    2680                 :        3132 :                                                                                          buffers, &extend_by);
    2681                 :             :         else
    2682                 :       48644 :                 first_block = ExtendBufferedRelShared(bmr, fork, strategy, flags,
    2683                 :       24322 :                                                                                           extend_by, extend_upto,
    2684                 :       24322 :                                                                                           buffers, &extend_by);
    2685                 :       27454 :         *extended_by = extend_by;
    2686                 :             : 
    2687                 :       27454 :         TRACE_POSTGRESQL_BUFFER_EXTEND_DONE(fork,
    2688                 :             :                                                                                 BMR_GET_SMGR(bmr)->smgr_rlocator.locator.spcOid,
    2689                 :             :                                                                                 BMR_GET_SMGR(bmr)->smgr_rlocator.locator.dbOid,
    2690                 :             :                                                                                 BMR_GET_SMGR(bmr)->smgr_rlocator.locator.relNumber,
    2691                 :             :                                                                                 BMR_GET_SMGR(bmr)->smgr_rlocator.backend,
    2692                 :             :                                                                                 *extended_by,
    2693                 :             :                                                                                 first_block);
    2694                 :             : 
    2695                 :       54908 :         return first_block;
    2696                 :       27454 : }
    2697                 :             : 
    2698                 :             : /*
    2699                 :             :  * Implementation of ExtendBufferedRelBy() and ExtendBufferedRelTo() for
    2700                 :             :  * shared buffers.
    2701                 :             :  */
    2702                 :             : static BlockNumber
    2703                 :       24322 : ExtendBufferedRelShared(BufferManagerRelation bmr,
    2704                 :             :                                                 ForkNumber fork,
    2705                 :             :                                                 BufferAccessStrategy strategy,
    2706                 :             :                                                 uint32 flags,
    2707                 :             :                                                 uint32 extend_by,
    2708                 :             :                                                 BlockNumber extend_upto,
    2709                 :             :                                                 Buffer *buffers,
    2710                 :             :                                                 uint32 *extended_by)
    2711                 :             : {
    2712                 :       24322 :         BlockNumber first_block;
    2713                 :       24322 :         IOContext       io_context = IOContextForStrategy(strategy);
    2714                 :       24322 :         instr_time      io_start;
    2715                 :             : 
    2716                 :       24322 :         LimitAdditionalPins(&extend_by);
    2717                 :             : 
    2718                 :             :         /*
    2719                 :             :          * Acquire victim buffers for extension without holding extension lock.
    2720                 :             :          * Writing out victim buffers is the most expensive part of extending the
    2721                 :             :          * relation, particularly when doing so requires WAL flushes. Zeroing out
    2722                 :             :          * the buffers is also quite expensive, so do that before holding the
    2723                 :             :          * extension lock as well.
    2724                 :             :          *
    2725                 :             :          * These pages are pinned by us and not valid. While we hold the pin they
    2726                 :             :          * can't be acquired as victim buffers by another backend.
    2727                 :             :          */
    2728         [ +  + ]:       53395 :         for (uint32 i = 0; i < extend_by; i++)
    2729                 :             :         {
    2730                 :       29073 :                 Block           buf_block;
    2731                 :             : 
    2732                 :       29073 :                 buffers[i] = GetVictimBuffer(strategy, io_context);
    2733                 :       29073 :                 buf_block = BufHdrGetBlock(GetBufferDescriptor(buffers[i] - 1));
    2734                 :             : 
    2735                 :             :                 /* new buffers are zero-filled */
    2736   [ +  -  +  -  :       29073 :                 MemSet(buf_block, 0, BLCKSZ);
          +  -  +  -  #  
                      # ]
    2737                 :       29073 :         }
    2738                 :             : 
    2739                 :             :         /*
    2740                 :             :          * Lock relation against concurrent extensions, unless requested not to.
    2741                 :             :          *
    2742                 :             :          * We use the same extension lock for all forks. That's unnecessarily
    2743                 :             :          * restrictive, but currently extensions for forks don't happen often
    2744                 :             :          * enough to make it worth locking more granularly.
    2745                 :             :          *
    2746                 :             :          * Note that another backend might have extended the relation by the time
    2747                 :             :          * we get the lock.
    2748                 :             :          */
    2749         [ +  + ]:       24322 :         if (!(flags & EB_SKIP_EXTENSION_LOCK))
    2750                 :       22646 :                 LockRelationForExtension(bmr.rel, ExclusiveLock);
    2751                 :             : 
    2752                 :             :         /*
    2753                 :             :          * If requested, invalidate size cache, so that smgrnblocks asks the
    2754                 :             :          * kernel.
    2755                 :             :          */
    2756         [ +  + ]:       24322 :         if (flags & EB_CLEAR_SIZE_CACHE)
    2757         [ +  - ]:         715 :                 BMR_GET_SMGR(bmr)->smgr_cached_nblocks[fork] = InvalidBlockNumber;
    2758                 :             : 
    2759         [ +  - ]:       24322 :         first_block = smgrnblocks(BMR_GET_SMGR(bmr), fork);
    2760                 :             : 
    2761                 :             :         /*
    2762                 :             :          * Now that we have the accurate relation size, check if the caller wants
    2763                 :             :          * us to extend to only up to a specific size. If there were concurrent
    2764                 :             :          * extensions, we might have acquired too many buffers and need to release
    2765                 :             :          * them.
    2766                 :             :          */
    2767         [ +  + ]:       24322 :         if (extend_upto != InvalidBlockNumber)
    2768                 :             :         {
    2769                 :         715 :                 uint32          orig_extend_by = extend_by;
    2770                 :             : 
    2771         [ -  + ]:         715 :                 if (first_block > extend_upto)
    2772                 :           0 :                         extend_by = 0;
    2773         [ +  - ]:         715 :                 else if ((uint64) first_block + extend_by > extend_upto)
    2774                 :           0 :                         extend_by = extend_upto - first_block;
    2775                 :             : 
    2776         [ -  + ]:         715 :                 for (uint32 i = extend_by; i < orig_extend_by; i++)
    2777                 :             :                 {
    2778                 :           0 :                         BufferDesc *buf_hdr = GetBufferDescriptor(buffers[i] - 1);
    2779                 :             : 
    2780                 :           0 :                         UnpinBuffer(buf_hdr);
    2781                 :           0 :                 }
    2782                 :             : 
    2783         [ +  - ]:         715 :                 if (extend_by == 0)
    2784                 :             :                 {
    2785         [ #  # ]:           0 :                         if (!(flags & EB_SKIP_EXTENSION_LOCK))
    2786                 :           0 :                                 UnlockRelationForExtension(bmr.rel, ExclusiveLock);
    2787                 :           0 :                         *extended_by = extend_by;
    2788                 :           0 :                         return first_block;
    2789                 :             :                 }
    2790         [ -  + ]:         715 :         }
    2791                 :             : 
    2792                 :             :         /* Fail if relation is already at maximum possible length */
    2793         [ +  - ]:       24322 :         if ((uint64) first_block + extend_by >= MaxBlockNumber)
    2794   [ #  #  #  #  :           0 :                 ereport(ERROR,
          #  #  #  #  #  
                #  #  # ]
    2795                 :             :                                 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
    2796                 :             :                                  errmsg("cannot extend relation %s beyond %u blocks",
    2797                 :             :                                                 relpath(BMR_GET_SMGR(bmr)->smgr_rlocator, fork).str,
    2798                 :             :                                                 MaxBlockNumber)));
    2799                 :             : 
    2800                 :             :         /*
    2801                 :             :          * Insert buffers into buffer table, mark as IO_IN_PROGRESS.
    2802                 :             :          *
    2803                 :             :          * This needs to happen before we extend the relation, because as soon as
    2804                 :             :          * we do, other backends can start to read in those pages.
    2805                 :             :          */
    2806         [ +  + ]:       53395 :         for (uint32 i = 0; i < extend_by; i++)
    2807                 :             :         {
    2808                 :       29073 :                 Buffer          victim_buf = buffers[i];
    2809                 :       29073 :                 BufferDesc *victim_buf_hdr = GetBufferDescriptor(victim_buf - 1);
    2810                 :       29073 :                 BufferTag       tag;
    2811                 :       29073 :                 uint32          hash;
    2812                 :       29073 :                 LWLock     *partition_lock;
    2813                 :       29073 :                 int                     existing_id;
    2814                 :             : 
    2815                 :             :                 /* in case we need to pin an existing buffer below */
    2816                 :       29073 :                 ResourceOwnerEnlarge(CurrentResourceOwner);
    2817                 :       29073 :                 ReservePrivateRefCountEntry();
    2818                 :             : 
    2819         [ +  - ]:       29073 :                 InitBufferTag(&tag, &BMR_GET_SMGR(bmr)->smgr_rlocator.locator, fork,
    2820                 :       29073 :                                           first_block + i);
    2821                 :       29073 :                 hash = BufTableHashCode(&tag);
    2822                 :       29073 :                 partition_lock = BufMappingPartitionLock(hash);
    2823                 :             : 
    2824                 :       29073 :                 LWLockAcquire(partition_lock, LW_EXCLUSIVE);
    2825                 :             : 
    2826                 :       29073 :                 existing_id = BufTableInsert(&tag, hash, victim_buf_hdr->buf_id);
    2827                 :             : 
    2828                 :             :                 /*
    2829                 :             :                  * We get here only in the corner case where we are trying to extend
    2830                 :             :                  * the relation but we found a pre-existing buffer. This can happen
    2831                 :             :                  * because a prior attempt at extending the relation failed, and
    2832                 :             :                  * because mdread doesn't complain about reads beyond EOF (when
    2833                 :             :                  * zero_damaged_pages is ON) and so a previous attempt to read a block
    2834                 :             :                  * beyond EOF could have left a "valid" zero-filled buffer.
    2835                 :             :                  *
    2836                 :             :                  * This has also been observed when relation was overwritten by
    2837                 :             :                  * external process. Since the legitimate cases should always have
    2838                 :             :                  * left a zero-filled buffer, complain if not PageIsNew.
    2839                 :             :                  */
    2840         [ -  + ]:       29073 :                 if (existing_id >= 0)
    2841                 :             :                 {
    2842                 :           0 :                         BufferDesc *existing_hdr = GetBufferDescriptor(existing_id);
    2843                 :           0 :                         Block           buf_block;
    2844                 :           0 :                         bool            valid;
    2845                 :             : 
    2846                 :             :                         /*
    2847                 :             :                          * Pin the existing buffer before releasing the partition lock,
    2848                 :             :                          * preventing it from being evicted.
    2849                 :             :                          */
    2850                 :           0 :                         valid = PinBuffer(existing_hdr, strategy, false);
    2851                 :             : 
    2852                 :           0 :                         LWLockRelease(partition_lock);
    2853                 :           0 :                         UnpinBuffer(victim_buf_hdr);
    2854                 :             : 
    2855                 :           0 :                         buffers[i] = BufferDescriptorGetBuffer(existing_hdr);
    2856                 :           0 :                         buf_block = BufHdrGetBlock(existing_hdr);
    2857                 :             : 
    2858   [ #  #  #  # ]:           0 :                         if (valid && !PageIsNew((Page) buf_block))
    2859   [ #  #  #  #  :           0 :                                 ereport(ERROR,
          #  #  #  #  #  
                #  #  # ]
    2860                 :             :                                                 (errmsg("unexpected data beyond EOF in block %u of relation \"%s\"",
    2861                 :             :                                                                 existing_hdr->tag.blockNum,
    2862                 :             :                                                                 relpath(BMR_GET_SMGR(bmr)->smgr_rlocator, fork).str)));
    2863                 :             : 
    2864                 :             :                         /*
    2865                 :             :                          * We *must* do smgr[zero]extend before succeeding, else the page
    2866                 :             :                          * will not be reserved by the kernel, and the next P_NEW call
    2867                 :             :                          * will decide to return the same page.  Clear the BM_VALID bit,
    2868                 :             :                          * do StartBufferIO() and proceed.
    2869                 :             :                          *
    2870                 :             :                          * Loop to handle the very small possibility that someone re-sets
    2871                 :             :                          * BM_VALID between our clearing it and StartBufferIO inspecting
    2872                 :             :                          * it.
    2873                 :             :                          */
    2874                 :           0 :                         do
    2875                 :             :                         {
    2876                 :           0 :                                 pg_atomic_fetch_and_u64(&existing_hdr->state, ~BM_VALID);
    2877         [ #  # ]:           0 :                         } while (!StartBufferIO(existing_hdr, true, false));
    2878                 :           0 :                 }
    2879                 :             :                 else
    2880                 :             :                 {
    2881                 :       29073 :                         uint64          buf_state;
    2882                 :       29073 :                         uint64          set_bits = 0;
    2883                 :             : 
    2884                 :       29073 :                         buf_state = LockBufHdr(victim_buf_hdr);
    2885                 :             : 
    2886                 :             :                         /* some sanity checks while we hold the buffer header lock */
    2887         [ +  - ]:       29073 :                         Assert(!(buf_state & (BM_VALID | BM_TAG_VALID | BM_DIRTY | BM_JUST_DIRTIED)));
    2888         [ -  + ]:       29073 :                         Assert(BUF_STATE_GET_REFCOUNT(buf_state) == 1);
    2889                 :             : 
    2890                 :       29073 :                         victim_buf_hdr->tag = tag;
    2891                 :             : 
    2892                 :       29073 :                         set_bits |= BM_TAG_VALID | BUF_USAGECOUNT_ONE;
    2893   [ +  +  +  + ]:       29073 :                         if (bmr.relpersistence == RELPERSISTENCE_PERMANENT || fork == INIT_FORKNUM)
    2894                 :       29029 :                                 set_bits |= BM_PERMANENT;
    2895                 :             : 
    2896                 :       58146 :                         UnlockBufHdrExt(victim_buf_hdr, buf_state,
    2897                 :       29073 :                                                         set_bits, 0,
    2898                 :             :                                                         0);
    2899                 :             : 
    2900                 :       29073 :                         LWLockRelease(partition_lock);
    2901                 :             : 
    2902                 :             :                         /* XXX: could combine the locked operations in it with the above */
    2903                 :       29073 :                         StartBufferIO(victim_buf_hdr, true, false);
    2904                 :       29073 :                 }
    2905                 :       29073 :         }
    2906                 :             : 
    2907                 :       24322 :         io_start = pgstat_prepare_io_time(track_io_timing);
    2908                 :             : 
    2909                 :             :         /*
    2910                 :             :          * Note: if smgrzeroextend fails, we will end up with buffers that are
    2911                 :             :          * allocated but not marked BM_VALID.  The next relation extension will
    2912                 :             :          * still select the same block number (because the relation didn't get any
    2913                 :             :          * longer on disk) and so future attempts to extend the relation will find
    2914                 :             :          * the same buffers (if they have not been recycled) but come right back
    2915                 :             :          * here to try smgrzeroextend again.
    2916                 :             :          *
    2917                 :             :          * We don't need to set checksum for all-zero pages.
    2918                 :             :          */
    2919         [ +  - ]:       24322 :         smgrzeroextend(BMR_GET_SMGR(bmr), fork, first_block, extend_by, false);
    2920                 :             : 
    2921                 :             :         /*
    2922                 :             :          * Release the file-extension lock; it's now OK for someone else to extend
    2923                 :             :          * the relation some more.
    2924                 :             :          *
    2925                 :             :          * We remove IO_IN_PROGRESS after this, as waking up waiting backends can
    2926                 :             :          * take noticeable time.
    2927                 :             :          */
    2928         [ +  + ]:       24322 :         if (!(flags & EB_SKIP_EXTENSION_LOCK))
    2929                 :       22646 :                 UnlockRelationForExtension(bmr.rel, ExclusiveLock);
    2930                 :             : 
    2931                 :       48644 :         pgstat_count_io_op_time(IOOBJECT_RELATION, io_context, IOOP_EXTEND,
    2932                 :       24322 :                                                         io_start, 1, extend_by * BLCKSZ);
    2933                 :             : 
    2934                 :             :         /* Set BM_VALID, terminate IO, and wake up any waiters */
    2935         [ +  + ]:       53395 :         for (uint32 i = 0; i < extend_by; i++)
    2936                 :             :         {
    2937                 :       29073 :                 Buffer          buf = buffers[i];
    2938                 :       29073 :                 BufferDesc *buf_hdr = GetBufferDescriptor(buf - 1);
    2939                 :       29073 :                 bool            lock = false;
    2940                 :             : 
    2941   [ +  +  +  + ]:       29073 :                 if (flags & EB_LOCK_FIRST && i == 0)
    2942                 :       23546 :                         lock = true;
    2943         [ +  - ]:        5527 :                 else if (flags & EB_LOCK_TARGET)
    2944                 :             :                 {
    2945         [ #  # ]:           0 :                         Assert(extend_upto != InvalidBlockNumber);
    2946         [ #  # ]:           0 :                         if (first_block + i + 1 == extend_upto)
    2947                 :           0 :                                 lock = true;
    2948                 :           0 :                 }
    2949                 :             : 
    2950         [ +  + ]:       29073 :                 if (lock)
    2951                 :       23546 :                         LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
    2952                 :             : 
    2953                 :       29073 :                 TerminateBufferIO(buf_hdr, false, BM_VALID, true, false);
    2954                 :       29073 :         }
    2955                 :             : 
    2956                 :       24322 :         pgBufferUsage.shared_blks_written += extend_by;
    2957                 :             : 
    2958                 :       24322 :         *extended_by = extend_by;
    2959                 :             : 
    2960                 :       24322 :         return first_block;
    2961                 :       24322 : }
    2962                 :             : 
    2963                 :             : /*
    2964                 :             :  * BufferIsLockedByMe
    2965                 :             :  *
    2966                 :             :  *      Checks if this backend has the buffer locked in any mode.
    2967                 :             :  *
    2968                 :             :  * Buffer must be pinned.
    2969                 :             :  */
    2970                 :             : bool
    2971                 :     1862163 : BufferIsLockedByMe(Buffer buffer)
    2972                 :             : {
    2973                 :     1862163 :         BufferDesc *bufHdr;
    2974                 :             : 
    2975   [ -  +  #  #  :     1862163 :         Assert(BufferIsPinned(buffer));
                   -  + ]
    2976                 :             : 
    2977         [ +  - ]:     1862163 :         if (BufferIsLocal(buffer))
    2978                 :             :         {
    2979                 :             :                 /* Content locks are not maintained for local buffers. */
    2980                 :           0 :                 return true;
    2981                 :             :         }
    2982                 :             :         else
    2983                 :             :         {
    2984                 :     1862163 :                 bufHdr = GetBufferDescriptor(buffer - 1);
    2985                 :     1862163 :                 return BufferLockHeldByMe(bufHdr);
    2986                 :             :         }
    2987                 :     1862163 : }
    2988                 :             : 
    2989                 :             : /*
    2990                 :             :  * BufferIsLockedByMeInMode
    2991                 :             :  *
    2992                 :             :  *      Checks if this backend has the buffer locked in the specified mode.
    2993                 :             :  *
    2994                 :             :  * Buffer must be pinned.
    2995                 :             :  */
    2996                 :             : bool
    2997                 :     8621338 : BufferIsLockedByMeInMode(Buffer buffer, BufferLockMode mode)
    2998                 :             : {
    2999                 :     8621338 :         BufferDesc *bufHdr;
    3000                 :             : 
    3001   [ -  +  #  #  :     8621338 :         Assert(BufferIsPinned(buffer));
                   +  + ]
    3002                 :             : 
    3003         [ +  + ]:     8621338 :         if (BufferIsLocal(buffer))
    3004                 :             :         {
    3005                 :             :                 /* Content locks are not maintained for local buffers. */
    3006                 :         253 :                 return true;
    3007                 :             :         }
    3008                 :             :         else
    3009                 :             :         {
    3010                 :     8621085 :                 bufHdr = GetBufferDescriptor(buffer - 1);
    3011                 :     8621085 :                 return BufferLockHeldByMeInMode(bufHdr, mode);
    3012                 :             :         }
    3013                 :     8621338 : }
    3014                 :             : 
    3015                 :             : /*
    3016                 :             :  * BufferIsDirty
    3017                 :             :  *
    3018                 :             :  *              Checks if buffer is already dirty.
    3019                 :             :  *
    3020                 :             :  * Buffer must be pinned and exclusive-locked.  (Without an exclusive lock,
    3021                 :             :  * the result may be stale before it's returned.)
    3022                 :             :  */
    3023                 :             : bool
    3024                 :     2592503 : BufferIsDirty(Buffer buffer)
    3025                 :             : {
    3026                 :     2592503 :         BufferDesc *bufHdr;
    3027                 :             : 
    3028   [ -  +  #  #  :     2592503 :         Assert(BufferIsPinned(buffer));
                   -  + ]
    3029                 :             : 
    3030         [ +  - ]:     2592503 :         if (BufferIsLocal(buffer))
    3031                 :             :         {
    3032                 :           0 :                 int                     bufid = -buffer - 1;
    3033                 :             : 
    3034                 :           0 :                 bufHdr = GetLocalBufferDescriptor(bufid);
    3035                 :             :                 /* Content locks are not maintained for local buffers. */
    3036                 :           0 :         }
    3037                 :             :         else
    3038                 :             :         {
    3039                 :     2592503 :                 bufHdr = GetBufferDescriptor(buffer - 1);
    3040         [ +  - ]:     2592503 :                 Assert(BufferIsLockedByMeInMode(buffer, BUFFER_LOCK_EXCLUSIVE));
    3041                 :             :         }
    3042                 :             : 
    3043                 :     5185006 :         return pg_atomic_read_u64(&bufHdr->state) & BM_DIRTY;
    3044                 :     2592503 : }
    3045                 :             : 
    3046                 :             : /*
    3047                 :             :  * MarkBufferDirty
    3048                 :             :  *
    3049                 :             :  *              Marks buffer contents as dirty (actual write happens later).
    3050                 :             :  *
    3051                 :             :  * Buffer must be pinned and exclusive-locked.  (If caller does not hold
    3052                 :             :  * exclusive lock, then somebody could be in process of writing the buffer,
    3053                 :             :  * leading to risk of bad data written to disk.)
    3054                 :             :  */
    3055                 :             : void
    3056                 :     3777801 : MarkBufferDirty(Buffer buffer)
    3057                 :             : {
    3058                 :     3777801 :         BufferDesc *bufHdr;
    3059                 :     3777801 :         uint64          buf_state;
    3060                 :     3777801 :         uint64          old_buf_state;
    3061                 :             : 
    3062         [ +  - ]:     3777801 :         if (!BufferIsValid(buffer))
    3063   [ #  #  #  # ]:           0 :                 elog(ERROR, "bad buffer ID: %d", buffer);
    3064                 :             : 
    3065         [ +  + ]:     3777801 :         if (BufferIsLocal(buffer))
    3066                 :             :         {
    3067                 :      340295 :                 MarkLocalBufferDirty(buffer);
    3068                 :      340295 :                 return;
    3069                 :             :         }
    3070                 :             : 
    3071                 :     3437506 :         bufHdr = GetBufferDescriptor(buffer - 1);
    3072                 :             : 
    3073   [ -  +  #  #  :     3437506 :         Assert(BufferIsPinned(buffer));
                   -  + ]
    3074         [ +  - ]:     3437506 :         Assert(BufferIsLockedByMeInMode(buffer, BUFFER_LOCK_EXCLUSIVE));
    3075                 :             : 
    3076                 :             :         /*
    3077                 :             :          * NB: We have to wait for the buffer header spinlock to be not held, as
    3078                 :             :          * TerminateBufferIO() relies on the spinlock.
    3079                 :             :          */
    3080                 :     3437506 :         old_buf_state = pg_atomic_read_u64(&bufHdr->state);
    3081                 :     3437611 :         for (;;)
    3082                 :             :         {
    3083         [ +  + ]:     3437611 :                 if (old_buf_state & BM_LOCKED)
    3084                 :         103 :                         old_buf_state = WaitBufHdrUnlocked(bufHdr);
    3085                 :             : 
    3086                 :     3437611 :                 buf_state = old_buf_state;
    3087                 :             : 
    3088         [ +  - ]:     3437611 :                 Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
    3089                 :     3437611 :                 buf_state |= BM_DIRTY | BM_JUST_DIRTIED;
    3090                 :             : 
    3091   [ +  +  +  + ]:     6875222 :                 if (pg_atomic_compare_exchange_u64(&bufHdr->state, &old_buf_state,
    3092                 :     3437611 :                                                                                    buf_state))
    3093                 :     3437506 :                         break;
    3094                 :             :         }
    3095                 :             : 
    3096                 :             :         /*
    3097                 :             :          * If the buffer was not dirty already, do vacuum accounting.
    3098                 :             :          */
    3099         [ +  + ]:     3437506 :         if (!(old_buf_state & BM_DIRTY))
    3100                 :             :         {
    3101                 :       38155 :                 pgBufferUsage.shared_blks_dirtied++;
    3102         [ +  - ]:       38155 :                 if (VacuumCostActive)
    3103                 :           0 :                         VacuumCostBalance += VacuumCostPageDirty;
    3104                 :       38155 :         }
    3105         [ -  + ]:     3777801 : }
    3106                 :             : 
    3107                 :             : /*
    3108                 :             :  * ReleaseAndReadBuffer -- combine ReleaseBuffer() and ReadBuffer()
    3109                 :             :  *
    3110                 :             :  * Formerly, this saved one cycle of acquiring/releasing the BufMgrLock
    3111                 :             :  * compared to calling the two routines separately.  Now it's mainly just
    3112                 :             :  * a convenience function.  However, if the passed buffer is valid and
    3113                 :             :  * already contains the desired block, we just return it as-is; and that
    3114                 :             :  * does save considerable work compared to a full release and reacquire.
    3115                 :             :  *
    3116                 :             :  * Note: it is OK to pass buffer == InvalidBuffer, indicating that no old
    3117                 :             :  * buffer actually needs to be released.  This case is the same as ReadBuffer,
    3118                 :             :  * but can save some tests in the caller.
    3119                 :             :  */
    3120                 :             : Buffer
    3121                 :     5393321 : ReleaseAndReadBuffer(Buffer buffer,
    3122                 :             :                                          Relation relation,
    3123                 :             :                                          BlockNumber blockNum)
    3124                 :             : {
    3125                 :     5393321 :         ForkNumber      forkNum = MAIN_FORKNUM;
    3126                 :     5393321 :         BufferDesc *bufHdr;
    3127                 :             : 
    3128         [ +  + ]:     5393321 :         if (BufferIsValid(buffer))
    3129                 :             :         {
    3130   [ -  +  #  #  :     2815268 :                 Assert(BufferIsPinned(buffer));
                   +  + ]
    3131         [ +  + ]:     2815268 :                 if (BufferIsLocal(buffer))
    3132                 :             :                 {
    3133                 :       12277 :                         bufHdr = GetLocalBufferDescriptor(-buffer - 1);
    3134         [ +  + ]:       12277 :                         if (bufHdr->tag.blockNum == blockNum &&
    3135   [ +  -  -  + ]:        1166 :                                 BufTagMatchesRelFileLocator(&bufHdr->tag, &relation->rd_locator) &&
    3136                 :        1166 :                                 BufTagGetForkNum(&bufHdr->tag) == forkNum)
    3137                 :        1166 :                                 return buffer;
    3138                 :       11111 :                         UnpinLocalBuffer(buffer);
    3139                 :       11111 :                 }
    3140                 :             :                 else
    3141                 :             :                 {
    3142                 :     2802991 :                         bufHdr = GetBufferDescriptor(buffer - 1);
    3143                 :             :                         /* we have pin, so it's ok to examine tag without spinlock */
    3144         [ +  + ]:     2802991 :                         if (bufHdr->tag.blockNum == blockNum &&
    3145   [ +  -  -  + ]:      877924 :                                 BufTagMatchesRelFileLocator(&bufHdr->tag, &relation->rd_locator) &&
    3146                 :      877924 :                                 BufTagGetForkNum(&bufHdr->tag) == forkNum)
    3147                 :      877924 :                                 return buffer;
    3148                 :     1925067 :                         UnpinBuffer(bufHdr);
    3149                 :             :                 }
    3150                 :     1936178 :         }
    3151                 :             : 
    3152                 :     4514231 :         return ReadBuffer(relation, blockNum);
    3153                 :     5393321 : }
    3154                 :             : 
    3155                 :             : /*
    3156                 :             :  * PinBuffer -- make buffer unavailable for replacement.
    3157                 :             :  *
    3158                 :             :  * For the default access strategy, the buffer's usage_count is incremented
    3159                 :             :  * when we first pin it; for other strategies we just make sure the usage_count
    3160                 :             :  * isn't zero.  (The idea of the latter is that we don't want synchronized
    3161                 :             :  * heap scans to inflate the count, but we need it to not be zero to discourage
    3162                 :             :  * other backends from stealing buffers from our ring.  As long as we cycle
    3163                 :             :  * through the ring faster than the global clock-sweep cycles, buffers in
    3164                 :             :  * our ring won't be chosen as victims for replacement by other backends.)
    3165                 :             :  *
    3166                 :             :  * This should be applied only to shared buffers, never local ones.
    3167                 :             :  *
    3168                 :             :  * Since buffers are pinned/unpinned very frequently, pin buffers without
    3169                 :             :  * taking the buffer header lock; instead update the state variable in loop of
    3170                 :             :  * CAS operations. Hopefully it's just a single CAS.
    3171                 :             :  *
    3172                 :             :  * Note that ResourceOwnerEnlarge() and ReservePrivateRefCountEntry()
    3173                 :             :  * must have been done already.
    3174                 :             :  *
    3175                 :             :  * Returns true if buffer is BM_VALID, else false.  This provision allows
    3176                 :             :  * some callers to avoid an extra spinlock cycle.  If skip_if_not_valid is
    3177                 :             :  * true, then a false return value also indicates that the buffer was
    3178                 :             :  * (recently) invalid and has not been pinned.
    3179                 :             :  */
    3180                 :             : static bool
    3181                 :    11321558 : PinBuffer(BufferDesc *buf, BufferAccessStrategy strategy,
    3182                 :             :                   bool skip_if_not_valid)
    3183                 :             : {
    3184                 :    11321558 :         Buffer          b = BufferDescriptorGetBuffer(buf);
    3185                 :    11321558 :         bool            result;
    3186                 :    11321558 :         PrivateRefCountEntry *ref;
    3187                 :             : 
    3188         [ +  - ]:    11321558 :         Assert(!BufferIsLocal(b));
    3189         [ +  - ]:    11321558 :         Assert(ReservedRefCountSlot != -1);
    3190                 :             : 
    3191                 :    11321558 :         ref = GetPrivateRefCountEntry(b, true);
    3192                 :             : 
    3193         [ +  + ]:    11321558 :         if (ref == NULL)
    3194                 :             :         {
    3195                 :    10594240 :                 uint64          buf_state;
    3196                 :    10594240 :                 uint64          old_buf_state;
    3197                 :             : 
    3198                 :    10594240 :                 old_buf_state = pg_atomic_read_u64(&buf->state);
    3199                 :    10596920 :                 for (;;)
    3200                 :             :                 {
    3201   [ +  -  +  - ]:    10596920 :                         if (unlikely(skip_if_not_valid && !(old_buf_state & BM_VALID)))
    3202                 :           0 :                                 return false;
    3203                 :             : 
    3204                 :             :                         /*
    3205                 :             :                          * We're not allowed to increase the refcount while the buffer
    3206                 :             :                          * header spinlock is held. Wait for the lock to be released.
    3207                 :             :                          */
    3208         [ +  + ]:    10596920 :                         if (old_buf_state & BM_LOCKED)
    3209                 :          92 :                                 old_buf_state = WaitBufHdrUnlocked(buf);
    3210                 :             : 
    3211                 :    10596920 :                         buf_state = old_buf_state;
    3212                 :             : 
    3213                 :             :                         /* increase refcount */
    3214                 :    10596920 :                         buf_state += BUF_REFCOUNT_ONE;
    3215                 :             : 
    3216         [ +  + ]:    10596920 :                         if (strategy == NULL)
    3217                 :             :                         {
    3218                 :             :                                 /* Default case: increase usagecount unless already max. */
    3219         [ +  + ]:    10551331 :                                 if (BUF_STATE_GET_USAGECOUNT(buf_state) < BM_MAX_USAGE_COUNT)
    3220                 :      121648 :                                         buf_state += BUF_USAGECOUNT_ONE;
    3221                 :    10551331 :                         }
    3222                 :             :                         else
    3223                 :             :                         {
    3224                 :             :                                 /*
    3225                 :             :                                  * Ring buffers shouldn't evict others from pool.  Thus we
    3226                 :             :                                  * don't make usagecount more than 1.
    3227                 :             :                                  */
    3228         [ -  + ]:       45589 :                                 if (BUF_STATE_GET_USAGECOUNT(buf_state) == 0)
    3229                 :           0 :                                         buf_state += BUF_USAGECOUNT_ONE;
    3230                 :             :                         }
    3231                 :             : 
    3232   [ +  +  +  + ]:    21193840 :                         if (pg_atomic_compare_exchange_u64(&buf->state, &old_buf_state,
    3233                 :    10596920 :                                                                                            buf_state))
    3234                 :             :                         {
    3235                 :    10594240 :                                 result = (buf_state & BM_VALID) != 0;
    3236                 :             : 
    3237                 :    10594240 :                                 TrackNewBufferPin(b);
    3238                 :    10594240 :                                 break;
    3239                 :             :                         }
    3240                 :             :                 }
    3241         [ -  + ]:    10594240 :         }
    3242                 :             :         else
    3243                 :             :         {
    3244                 :             :                 /*
    3245                 :             :                  * If we previously pinned the buffer, it is likely to be valid, but
    3246                 :             :                  * it may not be if StartReadBuffers() was called and
    3247                 :             :                  * WaitReadBuffers() hasn't been called yet.  We'll check by loading
    3248                 :             :                  * the flags without locking.  This is racy, but it's OK to return
    3249                 :             :                  * false spuriously: when WaitReadBuffers() calls StartBufferIO(),
    3250                 :             :                  * it'll see that it's now valid.
    3251                 :             :                  *
    3252                 :             :                  * Note: We deliberately avoid a Valgrind client request here.
    3253                 :             :                  * Individual access methods can optionally superimpose buffer page
    3254                 :             :                  * client requests on top of our client requests to enforce that
    3255                 :             :                  * buffers are only accessed while locked (and pinned).  It's possible
    3256                 :             :                  * that the buffer page is legitimately non-accessible here.  We
    3257                 :             :                  * cannot meddle with that.
    3258                 :             :                  */
    3259                 :      727318 :                 result = (pg_atomic_read_u64(&buf->state) & BM_VALID) != 0;
    3260                 :             : 
    3261         [ +  - ]:      727318 :                 Assert(ref->data.refcount > 0);
    3262                 :      727318 :                 ref->data.refcount++;
    3263                 :      727318 :                 ResourceOwnerRememberBuffer(CurrentResourceOwner, b);
    3264                 :             :         }
    3265                 :             : 
    3266                 :    11321558 :         return result;
    3267                 :    11321558 : }
    3268                 :             : 
    3269                 :             : /*
    3270                 :             :  * PinBuffer_Locked -- as above, but caller already locked the buffer header.
    3271                 :             :  * The spinlock is released before return.
    3272                 :             :  *
    3273                 :             :  * As this function is called with the spinlock held, the caller has to
    3274                 :             :  * previously call ReservePrivateRefCountEntry() and
    3275                 :             :  * ResourceOwnerEnlarge(CurrentResourceOwner);
    3276                 :             :  *
    3277                 :             :  * Currently, no callers of this function want to modify the buffer's
    3278                 :             :  * usage_count at all, so there's no need for a strategy parameter.
    3279                 :             :  * Also we don't bother with a BM_VALID test (the caller could check that for
    3280                 :             :  * itself).
    3281                 :             :  *
    3282                 :             :  * Also all callers only ever use this function when it's known that the
    3283                 :             :  * buffer can't have a preexisting pin by this backend. That allows us to skip
    3284                 :             :  * searching the private refcount array & hash, which is a boon, because the
    3285                 :             :  * spinlock is still held.
    3286                 :             :  *
    3287                 :             :  * Note: use of this routine is frequently mandatory, not just an optimization
    3288                 :             :  * to save a spin lock/unlock cycle, because we need to pin a buffer before
    3289                 :             :  * its state can change under us.
    3290                 :             :  */
    3291                 :             : static void
    3292                 :        6998 : PinBuffer_Locked(BufferDesc *buf)
    3293                 :             : {
    3294                 :        6998 :         uint64          old_buf_state;
    3295                 :             : 
    3296                 :             :         /*
    3297                 :             :          * As explained, We don't expect any preexisting pins. That allows us to
    3298                 :             :          * manipulate the PrivateRefCount after releasing the spinlock
    3299                 :             :          */
    3300         [ +  - ]:        6998 :         Assert(GetPrivateRefCountEntry(BufferDescriptorGetBuffer(buf), false) == NULL);
    3301                 :             : 
    3302                 :             :         /*
    3303                 :             :          * Since we hold the buffer spinlock, we can update the buffer state and
    3304                 :             :          * release the lock in one operation.
    3305                 :             :          */
    3306                 :        6998 :         old_buf_state = pg_atomic_read_u64(&buf->state);
    3307                 :             : 
    3308                 :        6998 :         UnlockBufHdrExt(buf, old_buf_state,
    3309                 :             :                                         0, 0, 1);
    3310                 :             : 
    3311                 :        6998 :         TrackNewBufferPin(BufferDescriptorGetBuffer(buf));
    3312                 :        6998 : }
    3313                 :             : 
    3314                 :             : /*
    3315                 :             :  * Support for waking up another backend that is waiting for the cleanup lock
    3316                 :             :  * to be released using BM_PIN_COUNT_WAITER.
    3317                 :             :  *
    3318                 :             :  * See LockBufferForCleanup().
    3319                 :             :  *
    3320                 :             :  * Expected to be called just after releasing a buffer pin (in a BufferDesc,
    3321                 :             :  * not just reducing the backend-local pincount for the buffer).
    3322                 :             :  */
    3323                 :             : static void
    3324                 :           0 : WakePinCountWaiter(BufferDesc *buf)
    3325                 :             : {
    3326                 :             :         /*
    3327                 :             :          * Acquire the buffer header lock, re-check that there's a waiter. Another
    3328                 :             :          * backend could have unpinned this buffer, and already woken up the
    3329                 :             :          * waiter.
    3330                 :             :          *
    3331                 :             :          * There's no danger of the buffer being replaced after we unpinned it
    3332                 :             :          * above, as it's pinned by the waiter. The waiter removes
    3333                 :             :          * BM_PIN_COUNT_WAITER if it stops waiting for a reason other than this
    3334                 :             :          * backend waking it up.
    3335                 :             :          */
    3336                 :           0 :         uint64          buf_state = LockBufHdr(buf);
    3337                 :             : 
    3338   [ #  #  #  # ]:           0 :         if ((buf_state & BM_PIN_COUNT_WAITER) &&
    3339                 :           0 :                 BUF_STATE_GET_REFCOUNT(buf_state) == 1)
    3340                 :             :         {
    3341                 :             :                 /* we just released the last pin other than the waiter's */
    3342                 :           0 :                 int                     wait_backend_pgprocno = buf->wait_backend_pgprocno;
    3343                 :             : 
    3344                 :           0 :                 UnlockBufHdrExt(buf, buf_state,
    3345                 :             :                                                 0, BM_PIN_COUNT_WAITER,
    3346                 :             :                                                 0);
    3347                 :           0 :                 ProcSendSignal(wait_backend_pgprocno);
    3348                 :           0 :         }
    3349                 :             :         else
    3350                 :           0 :                 UnlockBufHdr(buf);
    3351                 :           0 : }
    3352                 :             : 
    3353                 :             : /*
    3354                 :             :  * UnpinBuffer -- make buffer available for replacement.
    3355                 :             :  *
    3356                 :             :  * This should be applied only to shared buffers, never local ones.  This
    3357                 :             :  * always adjusts CurrentResourceOwner.
    3358                 :             :  */
    3359                 :             : static void
    3360                 :    13667167 : UnpinBuffer(BufferDesc *buf)
    3361                 :             : {
    3362                 :    13667167 :         Buffer          b = BufferDescriptorGetBuffer(buf);
    3363                 :             : 
    3364                 :    13667167 :         ResourceOwnerForgetBuffer(CurrentResourceOwner, b);
    3365                 :    13667167 :         UnpinBufferNoOwner(buf);
    3366                 :    13667167 : }
    3367                 :             : 
    3368                 :             : static void
    3369                 :    13668372 : UnpinBufferNoOwner(BufferDesc *buf)
    3370                 :             : {
    3371                 :    13668372 :         PrivateRefCountEntry *ref;
    3372                 :    13668372 :         Buffer          b = BufferDescriptorGetBuffer(buf);
    3373                 :             : 
    3374         [ +  - ]:    13668372 :         Assert(!BufferIsLocal(b));
    3375                 :             : 
    3376                 :             :         /* not moving as we're likely deleting it soon anyway */
    3377                 :    13668372 :         ref = GetPrivateRefCountEntry(b, false);
    3378         [ +  - ]:    13668372 :         Assert(ref != NULL);
    3379         [ +  - ]:    13668372 :         Assert(ref->data.refcount > 0);
    3380                 :    13668372 :         ref->data.refcount--;
    3381         [ +  + ]:    13668372 :         if (ref->data.refcount == 0)
    3382                 :             :         {
    3383                 :    10641775 :                 uint64          old_buf_state;
    3384                 :             : 
    3385                 :             :                 /*
    3386                 :             :                  * Mark buffer non-accessible to Valgrind.
    3387                 :             :                  *
    3388                 :             :                  * Note that the buffer may have already been marked non-accessible
    3389                 :             :                  * within access method code that enforces that buffers are only
    3390                 :             :                  * accessed while a buffer lock is held.
    3391                 :             :                  */
    3392                 :    10641775 :                 VALGRIND_MAKE_MEM_NOACCESS(BufHdrGetBlock(buf), BLCKSZ);
    3393                 :             : 
    3394                 :             :                 /*
    3395                 :             :                  * I'd better not still hold the buffer content lock. Can't use
    3396                 :             :                  * BufferIsLockedByMe(), as that asserts the buffer is pinned.
    3397                 :             :                  */
    3398         [ +  - ]:    10641775 :                 Assert(!BufferLockHeldByMe(buf));
    3399                 :             : 
    3400                 :             :                 /* decrement the shared reference count */
    3401                 :    10641775 :                 old_buf_state = pg_atomic_fetch_sub_u64(&buf->state, BUF_REFCOUNT_ONE);
    3402                 :             : 
    3403                 :             :                 /* Support LockBufferForCleanup() */
    3404         [ +  - ]:    10641775 :                 if (old_buf_state & BM_PIN_COUNT_WAITER)
    3405                 :           0 :                         WakePinCountWaiter(buf);
    3406                 :             : 
    3407                 :    10641775 :                 ForgetPrivateRefCountEntry(ref);
    3408                 :    10641775 :         }
    3409                 :    13668372 : }
    3410                 :             : 
    3411                 :             : /*
    3412                 :             :  * Set up backend-local tracking of a buffer pinned the first time by this
    3413                 :             :  * backend.
    3414                 :             :  */
    3415                 :             : inline void
    3416                 :    10641775 : TrackNewBufferPin(Buffer buf)
    3417                 :             : {
    3418                 :    10641775 :         PrivateRefCountEntry *ref;
    3419                 :             : 
    3420                 :    10641775 :         ref = NewPrivateRefCountEntry(buf);
    3421                 :    10641775 :         ref->data.refcount++;
    3422                 :             : 
    3423                 :    10641775 :         ResourceOwnerRememberBuffer(CurrentResourceOwner, buf);
    3424                 :             : 
    3425                 :             :         /*
    3426                 :             :          * This is the first pin for this page by this backend, mark its page as
    3427                 :             :          * defined to valgrind. While the page contents might not actually be
    3428                 :             :          * valid yet, we don't currently guarantee that such pages are marked
    3429                 :             :          * undefined or non-accessible.
    3430                 :             :          *
    3431                 :             :          * It's not necessarily the prettiest to do this here, but otherwise we'd
    3432                 :             :          * need this block of code in multiple places.
    3433                 :             :          */
    3434                 :    10641775 :         VALGRIND_MAKE_MEM_DEFINED(BufHdrGetBlock(GetBufferDescriptor(buf - 1)),
    3435                 :             :                                                           BLCKSZ);
    3436                 :    10641775 : }
    3437                 :             : 
    3438                 :             : #define ST_SORT sort_checkpoint_bufferids
    3439                 :             : #define ST_ELEMENT_TYPE CkptSortItem
    3440                 :             : #define ST_COMPARE(a, b) ckpt_buforder_comparator(a, b)
    3441                 :             : #define ST_SCOPE static
    3442                 :             : #define ST_DEFINE
    3443                 :             : #include "lib/sort_template.h"
    3444                 :             : 
    3445                 :             : /*
    3446                 :             :  * BufferSync -- Write out all dirty buffers in the pool.
    3447                 :             :  *
    3448                 :             :  * This is called at checkpoint time to write out all dirty shared buffers.
    3449                 :             :  * The checkpoint request flags should be passed in.  If CHECKPOINT_FAST is
    3450                 :             :  * set, we disable delays between writes; if CHECKPOINT_IS_SHUTDOWN,
    3451                 :             :  * CHECKPOINT_END_OF_RECOVERY or CHECKPOINT_FLUSH_UNLOGGED is set, we write
    3452                 :             :  * even unlogged buffers, which are otherwise skipped.  The remaining flags
    3453                 :             :  * currently have no effect here.
    3454                 :             :  */
    3455                 :             : static void
    3456                 :           7 : BufferSync(int flags)
    3457                 :             : {
    3458                 :           7 :         uint64          buf_state;
    3459                 :           7 :         int                     buf_id;
    3460                 :           7 :         int                     num_to_scan;
    3461                 :           7 :         int                     num_spaces;
    3462                 :           7 :         int                     num_processed;
    3463                 :           7 :         int                     num_written;
    3464                 :           7 :         CkptTsStatus *per_ts_stat = NULL;
    3465                 :           7 :         Oid                     last_tsid;
    3466                 :           7 :         binaryheap *ts_heap;
    3467                 :           7 :         int                     i;
    3468                 :           7 :         uint64          mask = BM_DIRTY;
    3469                 :           7 :         WritebackContext wb_context;
    3470                 :             : 
    3471                 :             :         /*
    3472                 :             :          * Unless this is a shutdown checkpoint or we have been explicitly told,
    3473                 :             :          * we write only permanent, dirty buffers.  But at shutdown or end of
    3474                 :             :          * recovery, we write all dirty buffers.
    3475                 :             :          */
    3476         [ +  + ]:           7 :         if (!((flags & (CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_END_OF_RECOVERY |
    3477                 :             :                                         CHECKPOINT_FLUSH_UNLOGGED))))
    3478                 :           2 :                 mask |= BM_PERMANENT;
    3479                 :             : 
    3480                 :             :         /*
    3481                 :             :          * Loop over all buffers, and mark the ones that need to be written with
    3482                 :             :          * BM_CHECKPOINT_NEEDED.  Count them as we go (num_to_scan), so that we
    3483                 :             :          * can estimate how much work needs to be done.
    3484                 :             :          *
    3485                 :             :          * This allows us to write only those pages that were dirty when the
    3486                 :             :          * checkpoint began, and not those that get dirtied while it proceeds.
    3487                 :             :          * Whenever a page with BM_CHECKPOINT_NEEDED is written out, either by us
    3488                 :             :          * later in this function, or by normal backends or the bgwriter cleaning
    3489                 :             :          * scan, the flag is cleared.  Any buffer dirtied after this point won't
    3490                 :             :          * have the flag set.
    3491                 :             :          *
    3492                 :             :          * Note that if we fail to write some buffer, we may leave buffers with
    3493                 :             :          * BM_CHECKPOINT_NEEDED still set.  This is OK since any such buffer would
    3494                 :             :          * certainly need to be written for the next checkpoint attempt, too.
    3495                 :             :          */
    3496                 :           7 :         num_to_scan = 0;
    3497         [ +  + ]:      114695 :         for (buf_id = 0; buf_id < NBuffers; buf_id++)
    3498                 :             :         {
    3499                 :      114688 :                 BufferDesc *bufHdr = GetBufferDescriptor(buf_id);
    3500                 :      114688 :                 uint64          set_bits = 0;
    3501                 :             : 
    3502                 :             :                 /*
    3503                 :             :                  * Header spinlock is enough to examine BM_DIRTY, see comment in
    3504                 :             :                  * SyncOneBuffer.
    3505                 :             :                  */
    3506                 :      114688 :                 buf_state = LockBufHdr(bufHdr);
    3507                 :             : 
    3508         [ +  + ]:      114688 :                 if ((buf_state & mask) == mask)
    3509                 :             :                 {
    3510                 :        5230 :                         CkptSortItem *item;
    3511                 :             : 
    3512                 :        5230 :                         set_bits = BM_CHECKPOINT_NEEDED;
    3513                 :             : 
    3514                 :        5230 :                         item = &CkptBufferIds[num_to_scan++];
    3515                 :        5230 :                         item->buf_id = buf_id;
    3516                 :        5230 :                         item->tsId = bufHdr->tag.spcOid;
    3517                 :        5230 :                         item->relNumber = BufTagGetRelNumber(&bufHdr->tag);
    3518                 :        5230 :                         item->forkNum = BufTagGetForkNum(&bufHdr->tag);
    3519                 :        5230 :                         item->blockNum = bufHdr->tag.blockNum;
    3520                 :        5230 :                 }
    3521                 :             : 
    3522                 :      229376 :                 UnlockBufHdrExt(bufHdr, buf_state,
    3523                 :      114688 :                                                 set_bits, 0,
    3524                 :             :                                                 0);
    3525                 :             : 
    3526                 :             :                 /* Check for barrier events in case NBuffers is large. */
    3527         [ +  - ]:      114688 :                 if (ProcSignalBarrierPending)
    3528                 :           0 :                         ProcessProcSignalBarrier();
    3529                 :      114688 :         }
    3530                 :             : 
    3531         [ +  + ]:           7 :         if (num_to_scan == 0)
    3532                 :           2 :                 return;                                 /* nothing to do */
    3533                 :             : 
    3534                 :           5 :         WritebackContextInit(&wb_context, &checkpoint_flush_after);
    3535                 :             : 
    3536                 :           5 :         TRACE_POSTGRESQL_BUFFER_SYNC_START(NBuffers, num_to_scan);
    3537                 :             : 
    3538                 :             :         /*
    3539                 :             :          * Sort buffers that need to be written to reduce the likelihood of random
    3540                 :             :          * IO. The sorting is also important for the implementation of balancing
    3541                 :             :          * writes between tablespaces. Without balancing writes we'd potentially
    3542                 :             :          * end up writing to the tablespaces one-by-one; possibly overloading the
    3543                 :             :          * underlying system.
    3544                 :             :          */
    3545                 :           5 :         sort_checkpoint_bufferids(CkptBufferIds, num_to_scan);
    3546                 :             : 
    3547                 :           5 :         num_spaces = 0;
    3548                 :             : 
    3549                 :             :         /*
    3550                 :             :          * Allocate progress status for each tablespace with buffers that need to
    3551                 :             :          * be flushed. This requires the to-be-flushed array to be sorted.
    3552                 :             :          */
    3553                 :           5 :         last_tsid = InvalidOid;
    3554         [ +  + ]:        5235 :         for (i = 0; i < num_to_scan; i++)
    3555                 :             :         {
    3556                 :        5230 :                 CkptTsStatus *s;
    3557                 :        5230 :                 Oid                     cur_tsid;
    3558                 :             : 
    3559                 :        5230 :                 cur_tsid = CkptBufferIds[i].tsId;
    3560                 :             : 
    3561                 :             :                 /*
    3562                 :             :                  * Grow array of per-tablespace status structs, every time a new
    3563                 :             :                  * tablespace is found.
    3564                 :             :                  */
    3565   [ +  +  +  + ]:        5230 :                 if (last_tsid == InvalidOid || last_tsid != cur_tsid)
    3566                 :             :                 {
    3567                 :           9 :                         Size            sz;
    3568                 :             : 
    3569                 :           9 :                         num_spaces++;
    3570                 :             : 
    3571                 :             :                         /*
    3572                 :             :                          * Not worth adding grow-by-power-of-2 logic here - even with a
    3573                 :             :                          * few hundred tablespaces this should be fine.
    3574                 :             :                          */
    3575                 :           9 :                         sz = sizeof(CkptTsStatus) * num_spaces;
    3576                 :             : 
    3577         [ +  + ]:           9 :                         if (per_ts_stat == NULL)
    3578                 :           5 :                                 per_ts_stat = (CkptTsStatus *) palloc(sz);
    3579                 :             :                         else
    3580                 :           4 :                                 per_ts_stat = (CkptTsStatus *) repalloc(per_ts_stat, sz);
    3581                 :             : 
    3582                 :           9 :                         s = &per_ts_stat[num_spaces - 1];
    3583                 :           9 :                         memset(s, 0, sizeof(*s));
    3584                 :           9 :                         s->tsId = cur_tsid;
    3585                 :             : 
    3586                 :             :                         /*
    3587                 :             :                          * The first buffer in this tablespace. As CkptBufferIds is sorted
    3588                 :             :                          * by tablespace all (s->num_to_scan) buffers in this tablespace
    3589                 :             :                          * will follow afterwards.
    3590                 :             :                          */
    3591                 :           9 :                         s->index = i;
    3592                 :             : 
    3593                 :             :                         /*
    3594                 :             :                          * progress_slice will be determined once we know how many buffers
    3595                 :             :                          * are in each tablespace, i.e. after this loop.
    3596                 :             :                          */
    3597                 :             : 
    3598                 :           9 :                         last_tsid = cur_tsid;
    3599                 :           9 :                 }
    3600                 :             :                 else
    3601                 :             :                 {
    3602                 :        5221 :                         s = &per_ts_stat[num_spaces - 1];
    3603                 :             :                 }
    3604                 :             : 
    3605                 :        5230 :                 s->num_to_scan++;
    3606                 :             : 
    3607                 :             :                 /* Check for barrier events. */
    3608         [ +  - ]:        5230 :                 if (ProcSignalBarrierPending)
    3609                 :           0 :                         ProcessProcSignalBarrier();
    3610                 :        5230 :         }
    3611                 :             : 
    3612         [ +  - ]:           5 :         Assert(num_spaces > 0);
    3613                 :             : 
    3614                 :             :         /*
    3615                 :             :          * Build a min-heap over the write-progress in the individual tablespaces,
    3616                 :             :          * and compute how large a portion of the total progress a single
    3617                 :             :          * processed buffer is.
    3618                 :             :          */
    3619                 :           5 :         ts_heap = binaryheap_allocate(num_spaces,
    3620                 :             :                                                                   ts_ckpt_progress_comparator,
    3621                 :             :                                                                   NULL);
    3622                 :             : 
    3623         [ +  + ]:          14 :         for (i = 0; i < num_spaces; i++)
    3624                 :             :         {
    3625                 :           9 :                 CkptTsStatus *ts_stat = &per_ts_stat[i];
    3626                 :             : 
    3627                 :           9 :                 ts_stat->progress_slice = (float8) num_to_scan / ts_stat->num_to_scan;
    3628                 :             : 
    3629                 :           9 :                 binaryheap_add_unordered(ts_heap, PointerGetDatum(ts_stat));
    3630                 :           9 :         }
    3631                 :             : 
    3632                 :           5 :         binaryheap_build(ts_heap);
    3633                 :             : 
    3634                 :             :         /*
    3635                 :             :          * Iterate through to-be-checkpointed buffers and write the ones (still)
    3636                 :             :          * marked with BM_CHECKPOINT_NEEDED. The writes are balanced between
    3637                 :             :          * tablespaces; otherwise the sorting would lead to only one tablespace
    3638                 :             :          * receiving writes at a time, making inefficient use of the hardware.
    3639                 :             :          */
    3640                 :           5 :         num_processed = 0;
    3641                 :           5 :         num_written = 0;
    3642         [ +  + ]:        5235 :         while (!binaryheap_empty(ts_heap))
    3643                 :             :         {
    3644                 :        5230 :                 BufferDesc *bufHdr = NULL;
    3645                 :       10460 :                 CkptTsStatus *ts_stat = (CkptTsStatus *)
    3646                 :        5230 :                         DatumGetPointer(binaryheap_first(ts_heap));
    3647                 :             : 
    3648                 :        5230 :                 buf_id = CkptBufferIds[ts_stat->index].buf_id;
    3649         [ +  - ]:        5230 :                 Assert(buf_id != -1);
    3650                 :             : 
    3651                 :        5230 :                 bufHdr = GetBufferDescriptor(buf_id);
    3652                 :             : 
    3653                 :        5230 :                 num_processed++;
    3654                 :             : 
    3655                 :             :                 /*
    3656                 :             :                  * We don't need to acquire the lock here, because we're only looking
    3657                 :             :                  * at a single bit. It's possible that someone else writes the buffer
    3658                 :             :                  * and clears the flag right after we check, but that doesn't matter
    3659                 :             :                  * since SyncOneBuffer will then do nothing.  However, there is a
    3660                 :             :                  * further race condition: it's conceivable that between the time we
    3661                 :             :                  * examine the bit here and the time SyncOneBuffer acquires the lock,
    3662                 :             :                  * someone else not only wrote the buffer but replaced it with another
    3663                 :             :                  * page and dirtied it.  In that improbable case, SyncOneBuffer will
    3664                 :             :                  * write the buffer though we didn't need to.  It doesn't seem worth
    3665                 :             :                  * guarding against this, though.
    3666                 :             :                  */
    3667         [ -  + ]:        5230 :                 if (pg_atomic_read_u64(&bufHdr->state) & BM_CHECKPOINT_NEEDED)
    3668                 :             :                 {
    3669         [ -  + ]:        5230 :                         if (SyncOneBuffer(buf_id, false, &wb_context) & BUF_WRITTEN)
    3670                 :             :                         {
    3671                 :        5230 :                                 TRACE_POSTGRESQL_BUFFER_SYNC_WRITTEN(buf_id);
    3672                 :        5230 :                                 PendingCheckpointerStats.buffers_written++;
    3673                 :        5230 :                                 num_written++;
    3674                 :        5230 :                         }
    3675                 :        5230 :                 }
    3676                 :             : 
    3677                 :             :                 /*
    3678                 :             :                  * Measure progress independent of actually having to flush the buffer
    3679                 :             :                  * - otherwise writing become unbalanced.
    3680                 :             :                  */
    3681                 :        5230 :                 ts_stat->progress += ts_stat->progress_slice;
    3682                 :        5230 :                 ts_stat->num_scanned++;
    3683                 :        5230 :                 ts_stat->index++;
    3684                 :             : 
    3685                 :             :                 /* Have all the buffers from the tablespace been processed? */
    3686         [ +  + ]:        5230 :                 if (ts_stat->num_scanned == ts_stat->num_to_scan)
    3687                 :             :                 {
    3688                 :           9 :                         binaryheap_remove_first(ts_heap);
    3689                 :           9 :                 }
    3690                 :             :                 else
    3691                 :             :                 {
    3692                 :             :                         /* update heap with the new progress */
    3693                 :        5221 :                         binaryheap_replace_first(ts_heap, PointerGetDatum(ts_stat));
    3694                 :             :                 }
    3695                 :             : 
    3696                 :             :                 /*
    3697                 :             :                  * Sleep to throttle our I/O rate.
    3698                 :             :                  *
    3699                 :             :                  * (This will check for barrier events even if it doesn't sleep.)
    3700                 :             :                  */
    3701                 :        5230 :                 CheckpointWriteDelay(flags, (double) num_processed / num_to_scan);
    3702                 :        5230 :         }
    3703                 :             : 
    3704                 :             :         /*
    3705                 :             :          * Issue all pending flushes. Only checkpointer calls BufferSync(), so
    3706                 :             :          * IOContext will always be IOCONTEXT_NORMAL.
    3707                 :             :          */
    3708                 :           5 :         IssuePendingWritebacks(&wb_context, IOCONTEXT_NORMAL);
    3709                 :             : 
    3710                 :           5 :         pfree(per_ts_stat);
    3711                 :           5 :         per_ts_stat = NULL;
    3712                 :           5 :         binaryheap_free(ts_heap);
    3713                 :             : 
    3714                 :             :         /*
    3715                 :             :          * Update checkpoint statistics. As noted above, this doesn't include
    3716                 :             :          * buffers written by other backends or bgwriter scan.
    3717                 :             :          */
    3718                 :           5 :         CheckpointStats.ckpt_bufs_written += num_written;
    3719                 :             : 
    3720                 :           5 :         TRACE_POSTGRESQL_BUFFER_SYNC_DONE(NBuffers, num_written, num_to_scan);
    3721         [ -  + ]:           7 : }
    3722                 :             : 
    3723                 :             : /*
    3724                 :             :  * BgBufferSync -- Write out some dirty buffers in the pool.
    3725                 :             :  *
    3726                 :             :  * This is called periodically by the background writer process.
    3727                 :             :  *
    3728                 :             :  * Returns true if it's appropriate for the bgwriter process to go into
    3729                 :             :  * low-power hibernation mode.  (This happens if the strategy clock-sweep
    3730                 :             :  * has been "lapped" and no buffer allocations have occurred recently,
    3731                 :             :  * or if the bgwriter has been effectively disabled by setting
    3732                 :             :  * bgwriter_lru_maxpages to 0.)
    3733                 :             :  */
    3734                 :             : bool
    3735                 :          24 : BgBufferSync(WritebackContext *wb_context)
    3736                 :             : {
    3737                 :             :         /* info obtained from freelist.c */
    3738                 :          24 :         int                     strategy_buf_id;
    3739                 :          24 :         uint32          strategy_passes;
    3740                 :          24 :         uint32          recent_alloc;
    3741                 :             : 
    3742                 :             :         /*
    3743                 :             :          * Information saved between calls so we can determine the strategy
    3744                 :             :          * point's advance rate and avoid scanning already-cleaned buffers.
    3745                 :             :          */
    3746                 :             :         static bool saved_info_valid = false;
    3747                 :             :         static int      prev_strategy_buf_id;
    3748                 :             :         static uint32 prev_strategy_passes;
    3749                 :             :         static int      next_to_clean;
    3750                 :             :         static uint32 next_passes;
    3751                 :             : 
    3752                 :             :         /* Moving averages of allocation rate and clean-buffer density */
    3753                 :             :         static float smoothed_alloc = 0;
    3754                 :             :         static float smoothed_density = 10.0;
    3755                 :             : 
    3756                 :             :         /* Potentially these could be tunables, but for now, not */
    3757                 :          24 :         float           smoothing_samples = 16;
    3758                 :          24 :         float           scan_whole_pool_milliseconds = 120000.0;
    3759                 :             : 
    3760                 :             :         /* Used to compute how far we scan ahead */
    3761                 :          24 :         long            strategy_delta;
    3762                 :          24 :         int                     bufs_to_lap;
    3763                 :          24 :         int                     bufs_ahead;
    3764                 :          24 :         float           scans_per_alloc;
    3765                 :          24 :         int                     reusable_buffers_est;
    3766                 :          24 :         int                     upcoming_alloc_est;
    3767                 :          24 :         int                     min_scan_buffers;
    3768                 :             : 
    3769                 :             :         /* Variables for the scanning loop proper */
    3770                 :          24 :         int                     num_to_scan;
    3771                 :          24 :         int                     num_written;
    3772                 :          24 :         int                     reusable_buffers;
    3773                 :             : 
    3774                 :             :         /* Variables for final smoothed_density update */
    3775                 :          24 :         long            new_strategy_delta;
    3776                 :          24 :         uint32          new_recent_alloc;
    3777                 :             : 
    3778                 :             :         /*
    3779                 :             :          * Find out where the clock-sweep currently is, and how many buffer
    3780                 :             :          * allocations have happened since our last call.
    3781                 :             :          */
    3782                 :          24 :         strategy_buf_id = StrategySyncStart(&strategy_passes, &recent_alloc);
    3783                 :             : 
    3784                 :             :         /* Report buffer alloc counts to pgstat */
    3785                 :          24 :         PendingBgWriterStats.buf_alloc += recent_alloc;
    3786                 :             : 
    3787                 :             :         /*
    3788                 :             :          * If we're not running the LRU scan, just stop after doing the stats
    3789                 :             :          * stuff.  We mark the saved state invalid so that we can recover sanely
    3790                 :             :          * if LRU scan is turned back on later.
    3791                 :             :          */
    3792         [ -  + ]:          24 :         if (bgwriter_lru_maxpages <= 0)
    3793                 :             :         {
    3794                 :           0 :                 saved_info_valid = false;
    3795                 :           0 :                 return true;
    3796                 :             :         }
    3797                 :             : 
    3798                 :             :         /*
    3799                 :             :          * Compute strategy_delta = how many buffers have been scanned by the
    3800                 :             :          * clock-sweep since last time.  If first time through, assume none. Then
    3801                 :             :          * see if we are still ahead of the clock-sweep, and if so, how many
    3802                 :             :          * buffers we could scan before we'd catch up with it and "lap" it. Note:
    3803                 :             :          * weird-looking coding of xxx_passes comparisons are to avoid bogus
    3804                 :             :          * behavior when the passes counts wrap around.
    3805                 :             :          */
    3806         [ +  + ]:          24 :         if (saved_info_valid)
    3807                 :             :         {
    3808                 :          23 :                 int32           passes_delta = strategy_passes - prev_strategy_passes;
    3809                 :             : 
    3810                 :          23 :                 strategy_delta = strategy_buf_id - prev_strategy_buf_id;
    3811                 :          23 :                 strategy_delta += (long) passes_delta * NBuffers;
    3812                 :             : 
    3813         [ +  - ]:          23 :                 Assert(strategy_delta >= 0);
    3814                 :             : 
    3815         [ +  + ]:          23 :                 if ((int32) (next_passes - strategy_passes) > 0)
    3816                 :             :                 {
    3817                 :             :                         /* we're one pass ahead of the strategy point */
    3818                 :          17 :                         bufs_to_lap = strategy_buf_id - next_to_clean;
    3819                 :             : #ifdef BGW_DEBUG
    3820                 :             :                         elog(DEBUG2, "bgwriter ahead: bgw %u-%u strategy %u-%u delta=%ld lap=%d",
    3821                 :             :                                  next_passes, next_to_clean,
    3822                 :             :                                  strategy_passes, strategy_buf_id,
    3823                 :             :                                  strategy_delta, bufs_to_lap);
    3824                 :             : #endif
    3825                 :          17 :                 }
    3826   [ +  -  +  + ]:           6 :                 else if (next_passes == strategy_passes &&
    3827                 :           6 :                                  next_to_clean >= strategy_buf_id)
    3828                 :             :                 {
    3829                 :             :                         /* on same pass, but ahead or at least not behind */
    3830                 :           5 :                         bufs_to_lap = NBuffers - (next_to_clean - strategy_buf_id);
    3831                 :             : #ifdef BGW_DEBUG
    3832                 :             :                         elog(DEBUG2, "bgwriter ahead: bgw %u-%u strategy %u-%u delta=%ld lap=%d",
    3833                 :             :                                  next_passes, next_to_clean,
    3834                 :             :                                  strategy_passes, strategy_buf_id,
    3835                 :             :                                  strategy_delta, bufs_to_lap);
    3836                 :             : #endif
    3837                 :           5 :                 }
    3838                 :             :                 else
    3839                 :             :                 {
    3840                 :             :                         /*
    3841                 :             :                          * We're behind, so skip forward to the strategy point and start
    3842                 :             :                          * cleaning from there.
    3843                 :             :                          */
    3844                 :             : #ifdef BGW_DEBUG
    3845                 :             :                         elog(DEBUG2, "bgwriter behind: bgw %u-%u strategy %u-%u delta=%ld",
    3846                 :             :                                  next_passes, next_to_clean,
    3847                 :             :                                  strategy_passes, strategy_buf_id,
    3848                 :             :                                  strategy_delta);
    3849                 :             : #endif
    3850                 :           1 :                         next_to_clean = strategy_buf_id;
    3851                 :           1 :                         next_passes = strategy_passes;
    3852                 :           1 :                         bufs_to_lap = NBuffers;
    3853                 :             :                 }
    3854                 :          23 :         }
    3855                 :             :         else
    3856                 :             :         {
    3857                 :             :                 /*
    3858                 :             :                  * Initializing at startup or after LRU scanning had been off. Always
    3859                 :             :                  * start at the strategy point.
    3860                 :             :                  */
    3861                 :             : #ifdef BGW_DEBUG
    3862                 :             :                 elog(DEBUG2, "bgwriter initializing: strategy %u-%u",
    3863                 :             :                          strategy_passes, strategy_buf_id);
    3864                 :             : #endif
    3865                 :           1 :                 strategy_delta = 0;
    3866                 :           1 :                 next_to_clean = strategy_buf_id;
    3867                 :           1 :                 next_passes = strategy_passes;
    3868                 :           1 :                 bufs_to_lap = NBuffers;
    3869                 :             :         }
    3870                 :             : 
    3871                 :             :         /* Update saved info for next time */
    3872                 :          24 :         prev_strategy_buf_id = strategy_buf_id;
    3873                 :          24 :         prev_strategy_passes = strategy_passes;
    3874                 :          24 :         saved_info_valid = true;
    3875                 :             : 
    3876                 :             :         /*
    3877                 :             :          * Compute how many buffers had to be scanned for each new allocation, ie,
    3878                 :             :          * 1/density of reusable buffers, and track a moving average of that.
    3879                 :             :          *
    3880                 :             :          * If the strategy point didn't move, we don't update the density estimate
    3881                 :             :          */
    3882   [ +  +  -  + ]:          24 :         if (strategy_delta > 0 && recent_alloc > 0)
    3883                 :             :         {
    3884                 :          13 :                 scans_per_alloc = (float) strategy_delta / (float) recent_alloc;
    3885                 :          26 :                 smoothed_density += (scans_per_alloc - smoothed_density) /
    3886                 :          13 :                         smoothing_samples;
    3887                 :          13 :         }
    3888                 :             : 
    3889                 :             :         /*
    3890                 :             :          * Estimate how many reusable buffers there are between the current
    3891                 :             :          * strategy point and where we've scanned ahead to, based on the smoothed
    3892                 :             :          * density estimate.
    3893                 :             :          */
    3894                 :          24 :         bufs_ahead = NBuffers - bufs_to_lap;
    3895                 :          24 :         reusable_buffers_est = (float) bufs_ahead / smoothed_density;
    3896                 :             : 
    3897                 :             :         /*
    3898                 :             :          * Track a moving average of recent buffer allocations.  Here, rather than
    3899                 :             :          * a true average we want a fast-attack, slow-decline behavior: we
    3900                 :             :          * immediately follow any increase.
    3901                 :             :          */
    3902         [ +  + ]:          24 :         if (smoothed_alloc <= (float) recent_alloc)
    3903                 :           3 :                 smoothed_alloc = recent_alloc;
    3904                 :             :         else
    3905                 :          42 :                 smoothed_alloc += ((float) recent_alloc - smoothed_alloc) /
    3906                 :          21 :                         smoothing_samples;
    3907                 :             : 
    3908                 :             :         /* Scale the estimate by a GUC to allow more aggressive tuning. */
    3909                 :          24 :         upcoming_alloc_est = (int) (smoothed_alloc * bgwriter_lru_multiplier);
    3910                 :             : 
    3911                 :             :         /*
    3912                 :             :          * If recent_alloc remains at zero for many cycles, smoothed_alloc will
    3913                 :             :          * eventually underflow to zero, and the underflows produce annoying
    3914                 :             :          * kernel warnings on some platforms.  Once upcoming_alloc_est has gone to
    3915                 :             :          * zero, there's no point in tracking smaller and smaller values of
    3916                 :             :          * smoothed_alloc, so just reset it to exactly zero to avoid this
    3917                 :             :          * syndrome.  It will pop back up as soon as recent_alloc increases.
    3918                 :             :          */
    3919         [ +  + ]:          24 :         if (upcoming_alloc_est == 0)
    3920                 :           2 :                 smoothed_alloc = 0;
    3921                 :             : 
    3922                 :             :         /*
    3923                 :             :          * Even in cases where there's been little or no buffer allocation
    3924                 :             :          * activity, we want to make a small amount of progress through the buffer
    3925                 :             :          * cache so that as many reusable buffers as possible are clean after an
    3926                 :             :          * idle period.
    3927                 :             :          *
    3928                 :             :          * (scan_whole_pool_milliseconds / BgWriterDelay) computes how many times
    3929                 :             :          * the BGW will be called during the scan_whole_pool time; slice the
    3930                 :             :          * buffer pool into that many sections.
    3931                 :             :          */
    3932                 :          24 :         min_scan_buffers = (int) (NBuffers / (scan_whole_pool_milliseconds / BgWriterDelay));
    3933                 :             : 
    3934         [ +  + ]:          24 :         if (upcoming_alloc_est < (min_scan_buffers + reusable_buffers_est))
    3935                 :             :         {
    3936                 :             : #ifdef BGW_DEBUG
    3937                 :             :                 elog(DEBUG2, "bgwriter: alloc_est=%d too small, using min=%d + reusable_est=%d",
    3938                 :             :                          upcoming_alloc_est, min_scan_buffers, reusable_buffers_est);
    3939                 :             : #endif
    3940                 :          17 :                 upcoming_alloc_est = min_scan_buffers + reusable_buffers_est;
    3941                 :          17 :         }
    3942                 :             : 
    3943                 :             :         /*
    3944                 :             :          * Now write out dirty reusable buffers, working forward from the
    3945                 :             :          * next_to_clean point, until we have lapped the strategy scan, or cleaned
    3946                 :             :          * enough buffers to match our estimate of the next cycle's allocation
    3947                 :             :          * requirements, or hit the bgwriter_lru_maxpages limit.
    3948                 :             :          */
    3949                 :             : 
    3950                 :          24 :         num_to_scan = bufs_to_lap;
    3951                 :          24 :         num_written = 0;
    3952                 :          24 :         reusable_buffers = reusable_buffers_est;
    3953                 :             : 
    3954                 :             :         /* Execute the LRU scan */
    3955   [ +  +  +  + ]:       19373 :         while (num_to_scan > 0 && reusable_buffers < upcoming_alloc_est)
    3956                 :             :         {
    3957                 :       38698 :                 int                     sync_state = SyncOneBuffer(next_to_clean, true,
    3958                 :       19349 :                                                                                            wb_context);
    3959                 :             : 
    3960         [ +  + ]:       19349 :                 if (++next_to_clean >= NBuffers)
    3961                 :             :                 {
    3962                 :           1 :                         next_to_clean = 0;
    3963                 :           1 :                         next_passes++;
    3964                 :           1 :                 }
    3965                 :       19349 :                 num_to_scan--;
    3966                 :             : 
    3967         [ -  + ]:       19349 :                 if (sync_state & BUF_WRITTEN)
    3968                 :             :                 {
    3969                 :           0 :                         reusable_buffers++;
    3970         [ #  # ]:           0 :                         if (++num_written >= bgwriter_lru_maxpages)
    3971                 :             :                         {
    3972                 :           0 :                                 PendingBgWriterStats.maxwritten_clean++;
    3973                 :           0 :                                 break;
    3974                 :             :                         }
    3975                 :           0 :                 }
    3976         [ +  + ]:       19349 :                 else if (sync_state & BUF_REUSABLE)
    3977                 :       14161 :                         reusable_buffers++;
    3978      [ -  -  + ]:       19349 :         }
    3979                 :             : 
    3980                 :          24 :         PendingBgWriterStats.buf_written_clean += num_written;
    3981                 :             : 
    3982                 :             : #ifdef BGW_DEBUG
    3983                 :             :         elog(DEBUG1, "bgwriter: recent_alloc=%u smoothed=%.2f delta=%ld ahead=%d density=%.2f reusable_est=%d upcoming_est=%d scanned=%d wrote=%d reusable=%d",
    3984                 :             :                  recent_alloc, smoothed_alloc, strategy_delta, bufs_ahead,
    3985                 :             :                  smoothed_density, reusable_buffers_est, upcoming_alloc_est,
    3986                 :             :                  bufs_to_lap - num_to_scan,
    3987                 :             :                  num_written,
    3988                 :             :                  reusable_buffers - reusable_buffers_est);
    3989                 :             : #endif
    3990                 :             : 
    3991                 :             :         /*
    3992                 :             :          * Consider the above scan as being like a new allocation scan.
    3993                 :             :          * Characterize its density and update the smoothed one based on it. This
    3994                 :             :          * effectively halves the moving average period in cases where both the
    3995                 :             :          * strategy and the background writer are doing some useful scanning,
    3996                 :             :          * which is helpful because a long memory isn't as desirable on the
    3997                 :             :          * density estimates.
    3998                 :             :          */
    3999                 :          24 :         new_strategy_delta = bufs_to_lap - num_to_scan;
    4000                 :          24 :         new_recent_alloc = reusable_buffers - reusable_buffers_est;
    4001   [ +  +  +  + ]:          24 :         if (new_strategy_delta > 0 && new_recent_alloc > 0)
    4002                 :             :         {
    4003                 :           7 :                 scans_per_alloc = (float) new_strategy_delta / (float) new_recent_alloc;
    4004                 :          14 :                 smoothed_density += (scans_per_alloc - smoothed_density) /
    4005                 :           7 :                         smoothing_samples;
    4006                 :             : 
    4007                 :             : #ifdef BGW_DEBUG
    4008                 :             :                 elog(DEBUG2, "bgwriter: cleaner density alloc=%u scan=%ld density=%.2f new smoothed=%.2f",
    4009                 :             :                          new_recent_alloc, new_strategy_delta,
    4010                 :             :                          scans_per_alloc, smoothed_density);
    4011                 :             : #endif
    4012                 :           7 :         }
    4013                 :             : 
    4014                 :             :         /* Return true if OK to hibernate */
    4015         [ +  + ]:          24 :         return (bufs_to_lap == 0 && recent_alloc == 0);
    4016                 :          24 : }
    4017                 :             : 
    4018                 :             : /*
    4019                 :             :  * SyncOneBuffer -- process a single buffer during syncing.
    4020                 :             :  *
    4021                 :             :  * If skip_recently_used is true, we don't write currently-pinned buffers, nor
    4022                 :             :  * buffers marked recently used, as these are not replacement candidates.
    4023                 :             :  *
    4024                 :             :  * Returns a bitmask containing the following flag bits:
    4025                 :             :  *      BUF_WRITTEN: we wrote the buffer.
    4026                 :             :  *      BUF_REUSABLE: buffer is available for replacement, ie, it has
    4027                 :             :  *              pin count 0 and usage count 0.
    4028                 :             :  *
    4029                 :             :  * (BUF_WRITTEN could be set in error if FlushBuffer finds the buffer clean
    4030                 :             :  * after locking it, but we don't care all that much.)
    4031                 :             :  */
    4032                 :             : static int
    4033                 :       24579 : SyncOneBuffer(int buf_id, bool skip_recently_used, WritebackContext *wb_context)
    4034                 :             : {
    4035                 :       24579 :         BufferDesc *bufHdr = GetBufferDescriptor(buf_id);
    4036                 :       24579 :         int                     result = 0;
    4037                 :       24579 :         uint64          buf_state;
    4038                 :       24579 :         BufferTag       tag;
    4039                 :             : 
    4040                 :             :         /* Make sure we can handle the pin */
    4041                 :       24579 :         ReservePrivateRefCountEntry();
    4042                 :       24579 :         ResourceOwnerEnlarge(CurrentResourceOwner);
    4043                 :             : 
    4044                 :             :         /*
    4045                 :             :          * Check whether buffer needs writing.
    4046                 :             :          *
    4047                 :             :          * We can make this check without taking the buffer content lock so long
    4048                 :             :          * as we mark pages dirty in access methods *before* logging changes with
    4049                 :             :          * XLogInsert(): if someone marks the buffer dirty just after our check we
    4050                 :             :          * don't worry because our checkpoint.redo points before log record for
    4051                 :             :          * upcoming changes and so we are not required to write such dirty buffer.
    4052                 :             :          */
    4053                 :       24579 :         buf_state = LockBufHdr(bufHdr);
    4054                 :             : 
    4055   [ +  +  +  + ]:       24579 :         if (BUF_STATE_GET_REFCOUNT(buf_state) == 0 &&
    4056                 :       24569 :                 BUF_STATE_GET_USAGECOUNT(buf_state) == 0)
    4057                 :             :         {
    4058                 :       14161 :                 result |= BUF_REUSABLE;
    4059                 :       14161 :         }
    4060         [ +  + ]:       10418 :         else if (skip_recently_used)
    4061                 :             :         {
    4062                 :             :                 /* Caller told us not to write recently-used buffers */
    4063                 :        5188 :                 UnlockBufHdr(bufHdr);
    4064                 :        5188 :                 return result;
    4065                 :             :         }
    4066                 :             : 
    4067   [ +  +  -  + ]:       19391 :         if (!(buf_state & BM_VALID) || !(buf_state & BM_DIRTY))
    4068                 :             :         {
    4069                 :             :                 /* It's clean, so nothing to do */
    4070                 :       14161 :                 UnlockBufHdr(bufHdr);
    4071                 :       14161 :                 return result;
    4072                 :             :         }
    4073                 :             : 
    4074                 :             :         /*
    4075                 :             :          * Pin it, share-lock it, write it.  (FlushBuffer will do nothing if the
    4076                 :             :          * buffer is clean by the time we've locked it.)
    4077                 :             :          */
    4078                 :        5230 :         PinBuffer_Locked(bufHdr);
    4079                 :             : 
    4080                 :        5230 :         FlushUnlockedBuffer(bufHdr, NULL, IOOBJECT_RELATION, IOCONTEXT_NORMAL);
    4081                 :             : 
    4082                 :        5230 :         tag = bufHdr->tag;
    4083                 :             : 
    4084                 :        5230 :         UnpinBuffer(bufHdr);
    4085                 :             : 
    4086                 :             :         /*
    4087                 :             :          * SyncOneBuffer() is only called by checkpointer and bgwriter, so
    4088                 :             :          * IOContext will always be IOCONTEXT_NORMAL.
    4089                 :             :          */
    4090                 :        5230 :         ScheduleBufferTagForWriteback(wb_context, IOCONTEXT_NORMAL, &tag);
    4091                 :             : 
    4092                 :        5230 :         return result | BUF_WRITTEN;
    4093                 :       24579 : }
    4094                 :             : 
    4095                 :             : /*
    4096                 :             :  *              AtEOXact_Buffers - clean up at end of transaction.
    4097                 :             :  *
    4098                 :             :  *              As of PostgreSQL 8.0, buffer pins should get released by the
    4099                 :             :  *              ResourceOwner mechanism.  This routine is just a debugging
    4100                 :             :  *              cross-check that no pins remain.
    4101                 :             :  */
    4102                 :             : void
    4103                 :       57914 : AtEOXact_Buffers(bool isCommit)
    4104                 :             : {
    4105                 :       57914 :         CheckForBufferLeaks();
    4106                 :             : 
    4107                 :       57914 :         AtEOXact_LocalBuffers(isCommit);
    4108                 :             : 
    4109         [ +  - ]:       57914 :         Assert(PrivateRefCountOverflowed == 0);
    4110                 :       57914 : }
    4111                 :             : 
    4112                 :             : /*
    4113                 :             :  * Initialize access to shared buffer pool
    4114                 :             :  *
    4115                 :             :  * This is called during backend startup (whether standalone or under the
    4116                 :             :  * postmaster).  It sets up for this backend's access to the already-existing
    4117                 :             :  * buffer pool.
    4118                 :             :  */
    4119                 :             : void
    4120                 :         806 : InitBufferManagerAccess(void)
    4121                 :             : {
    4122                 :         806 :         HASHCTL         hash_ctl;
    4123                 :             : 
    4124                 :             :         /*
    4125                 :             :          * An advisory limit on the number of pins each backend should hold, based
    4126                 :             :          * on shared_buffers and the maximum number of connections possible.
    4127                 :             :          * That's very pessimistic, but outside toy-sized shared_buffers it should
    4128                 :             :          * allow plenty of pins.  LimitAdditionalPins() and
    4129                 :             :          * GetAdditionalPinLimit() can be used to check the remaining balance.
    4130                 :             :          */
    4131                 :         806 :         MaxProportionalPins = NBuffers / (MaxBackends + NUM_AUXILIARY_PROCS);
    4132                 :             : 
    4133                 :         806 :         memset(&PrivateRefCountArray, 0, sizeof(PrivateRefCountArray));
    4134                 :         806 :         memset(&PrivateRefCountArrayKeys, 0, sizeof(PrivateRefCountArrayKeys));
    4135                 :             : 
    4136                 :         806 :         hash_ctl.keysize = sizeof(Buffer);
    4137                 :         806 :         hash_ctl.entrysize = sizeof(PrivateRefCountEntry);
    4138                 :             : 
    4139                 :         806 :         PrivateRefCountHash = hash_create("PrivateRefCount", 100, &hash_ctl,
    4140                 :             :                                                                           HASH_ELEM | HASH_BLOBS);
    4141                 :             : 
    4142                 :             :         /*
    4143                 :             :          * AtProcExit_Buffers needs LWLock access, and thereby has to be called at
    4144                 :             :          * the corresponding phase of backend shutdown.
    4145                 :             :          */
    4146         [ +  - ]:         806 :         Assert(MyProc != NULL);
    4147                 :         806 :         on_shmem_exit(AtProcExit_Buffers, 0);
    4148                 :         806 : }
    4149                 :             : 
    4150                 :             : /*
    4151                 :             :  * During backend exit, ensure that we released all shared-buffer locks and
    4152                 :             :  * assert that we have no remaining pins.
    4153                 :             :  */
    4154                 :             : static void
    4155                 :         806 : AtProcExit_Buffers(int code, Datum arg)
    4156                 :             : {
    4157                 :         806 :         UnlockBuffers();
    4158                 :             : 
    4159                 :         806 :         CheckForBufferLeaks();
    4160                 :             : 
    4161                 :             :         /* localbuf.c needs a chance too */
    4162                 :         806 :         AtProcExit_LocalBuffers();
    4163                 :         806 : }
    4164                 :             : 
    4165                 :             : /*
    4166                 :             :  *              CheckForBufferLeaks - ensure this backend holds no buffer pins
    4167                 :             :  *
    4168                 :             :  *              As of PostgreSQL 8.0, buffer pins should get released by the
    4169                 :             :  *              ResourceOwner mechanism.  This routine is just a debugging
    4170                 :             :  *              cross-check that no pins remain.
    4171                 :             :  */
    4172                 :             : static void
    4173                 :       58720 : CheckForBufferLeaks(void)
    4174                 :             : {
    4175                 :             : #ifdef USE_ASSERT_CHECKING
    4176                 :       58720 :         int                     RefCountErrors = 0;
    4177                 :       58720 :         PrivateRefCountEntry *res;
    4178                 :       58720 :         int                     i;
    4179                 :       58720 :         char       *s;
    4180                 :             : 
    4181                 :             :         /* check the array */
    4182         [ +  + ]:      528480 :         for (i = 0; i < REFCOUNT_ARRAY_ENTRIES; i++)
    4183                 :             :         {
    4184         [ +  - ]:      469760 :                 if (PrivateRefCountArrayKeys[i] != InvalidBuffer)
    4185                 :             :                 {
    4186                 :           0 :                         res = &PrivateRefCountArray[i];
    4187                 :             : 
    4188                 :           0 :                         s = DebugPrintBufferRefcount(res->buffer);
    4189   [ #  #  #  # ]:           0 :                         elog(WARNING, "buffer refcount leak: %s", s);
    4190                 :           0 :                         pfree(s);
    4191                 :             : 
    4192                 :           0 :                         RefCountErrors++;
    4193                 :           0 :                 }
    4194                 :      469760 :         }
    4195                 :             : 
    4196                 :             :         /* if necessary search the hash */
    4197         [ +  - ]:       58720 :         if (PrivateRefCountOverflowed)
    4198                 :             :         {
    4199                 :           0 :                 HASH_SEQ_STATUS hstat;
    4200                 :             : 
    4201                 :           0 :                 hash_seq_init(&hstat, PrivateRefCountHash);
    4202         [ #  # ]:           0 :                 while ((res = (PrivateRefCountEntry *) hash_seq_search(&hstat)) != NULL)
    4203                 :             :                 {
    4204                 :           0 :                         s = DebugPrintBufferRefcount(res->buffer);
    4205   [ #  #  #  # ]:           0 :                         elog(WARNING, "buffer refcount leak: %s", s);
    4206                 :           0 :                         pfree(s);
    4207                 :           0 :                         RefCountErrors++;
    4208                 :             :                 }
    4209                 :           0 :         }
    4210                 :             : 
    4211         [ +  - ]:       58720 :         Assert(RefCountErrors == 0);
    4212                 :             : #endif
    4213                 :       58720 : }
    4214                 :             : 
    4215                 :             : #ifdef USE_ASSERT_CHECKING
    4216                 :             : /*
    4217                 :             :  * Check for exclusive-locked catalog buffers.  This is the core of
    4218                 :             :  * AssertCouldGetRelation().
    4219                 :             :  *
    4220                 :             :  * A backend would self-deadlock on the content lock if the catalog scan read
    4221                 :             :  * the exclusive-locked buffer.  The main threat is exclusive-locked buffers
    4222                 :             :  * of catalogs used in relcache, because a catcache search on any catalog may
    4223                 :             :  * build that catalog's relcache entry.  We don't have an inventory of
    4224                 :             :  * catalogs relcache uses, so just check buffers of most catalogs.
    4225                 :             :  *
    4226                 :             :  * It's better to minimize waits while holding an exclusive buffer lock, so it
    4227                 :             :  * would be nice to broaden this check not to be catalog-specific.  However,
    4228                 :             :  * bttextcmp() accesses pg_collation, and non-core opclasses might similarly
    4229                 :             :  * read tables.  That is deadlock-free as long as there's no loop in the
    4230                 :             :  * dependency graph: modifying table A may cause an opclass to read table B,
    4231                 :             :  * but it must not cause a read of table A.
    4232                 :             :  */
    4233                 :             : void
    4234                 :    19867411 : AssertBufferLocksPermitCatalogRead(void)
    4235                 :             : {
    4236                 :    19867411 :         PrivateRefCountEntry *res;
    4237                 :             : 
    4238                 :             :         /* check the array */
    4239         [ +  + ]:   178806699 :         for (int i = 0; i < REFCOUNT_ARRAY_ENTRIES; i++)
    4240                 :             :         {
    4241         [ +  + ]:   158939288 :                 if (PrivateRefCountArrayKeys[i] != InvalidBuffer)
    4242                 :             :                 {
    4243                 :     8081887 :                         res = &PrivateRefCountArray[i];
    4244                 :             : 
    4245         [ +  - ]:     8081887 :                         if (res->buffer == InvalidBuffer)
    4246                 :           0 :                                 continue;
    4247                 :             : 
    4248                 :     8081887 :                         AssertNotCatalogBufferLock(res->buffer, res->data.lockmode);
    4249                 :     8081887 :                 }
    4250                 :   158939288 :         }
    4251                 :             : 
    4252                 :             :         /* if necessary search the hash */
    4253         [ +  + ]:    19867411 :         if (PrivateRefCountOverflowed)
    4254                 :             :         {
    4255                 :        2316 :                 HASH_SEQ_STATUS hstat;
    4256                 :             : 
    4257                 :        2316 :                 hash_seq_init(&hstat, PrivateRefCountHash);
    4258         [ +  + ]:        5575 :                 while ((res = (PrivateRefCountEntry *) hash_seq_search(&hstat)) != NULL)
    4259                 :             :                 {
    4260                 :        3259 :                         AssertNotCatalogBufferLock(res->buffer, res->data.lockmode);
    4261                 :             :                 }
    4262                 :        2316 :         }
    4263                 :    19867411 : }
    4264                 :             : 
    4265                 :             : static void
    4266                 :     8085146 : AssertNotCatalogBufferLock(Buffer buffer, BufferLockMode mode)
    4267                 :             : {
    4268                 :     8085146 :         BufferDesc *bufHdr = GetBufferDescriptor(buffer - 1);
    4269                 :     8085146 :         BufferTag       tag;
    4270                 :     8085146 :         Oid                     relid;
    4271                 :             : 
    4272         [ +  + ]:     8085146 :         if (mode != BUFFER_LOCK_EXCLUSIVE)
    4273                 :     8084031 :                 return;
    4274                 :             : 
    4275                 :        1115 :         tag = bufHdr->tag;
    4276                 :             : 
    4277                 :             :         /*
    4278                 :             :          * This relNumber==relid assumption holds until a catalog experiences
    4279                 :             :          * VACUUM FULL or similar.  After a command like that, relNumber will be
    4280                 :             :          * in the normal (non-catalog) range, and we lose the ability to detect
    4281                 :             :          * hazardous access to that catalog.  Calling RelidByRelfilenumber() would
    4282                 :             :          * close that gap, but RelidByRelfilenumber() might then deadlock with a
    4283                 :             :          * held lock.
    4284                 :             :          */
    4285                 :        1115 :         relid = tag.relNumber;
    4286                 :             : 
    4287         [ -  + ]:        1115 :         if (IsCatalogTextUniqueIndexOid(relid)) /* see comments at the callee */
    4288                 :           0 :                 return;
    4289                 :             : 
    4290         [ +  - ]:        1115 :         Assert(!IsCatalogRelationOid(relid));
    4291         [ -  + ]:     8085146 : }
    4292                 :             : #endif
    4293                 :             : 
    4294                 :             : 
    4295                 :             : /*
    4296                 :             :  * Helper routine to issue warnings when a buffer is unexpectedly pinned
    4297                 :             :  */
    4298                 :             : char *
    4299                 :           0 : DebugPrintBufferRefcount(Buffer buffer)
    4300                 :             : {
    4301                 :           0 :         BufferDesc *buf;
    4302                 :           0 :         int32           loccount;
    4303                 :           0 :         char       *result;
    4304                 :           0 :         ProcNumber      backend;
    4305                 :           0 :         uint64          buf_state;
    4306                 :             : 
    4307         [ #  # ]:           0 :         Assert(BufferIsValid(buffer));
    4308         [ #  # ]:           0 :         if (BufferIsLocal(buffer))
    4309                 :             :         {
    4310                 :           0 :                 buf = GetLocalBufferDescriptor(-buffer - 1);
    4311                 :           0 :                 loccount = LocalRefCount[-buffer - 1];
    4312                 :           0 :                 backend = MyProcNumber;
    4313                 :           0 :         }
    4314                 :             :         else
    4315                 :             :         {
    4316                 :           0 :                 buf = GetBufferDescriptor(buffer - 1);
    4317                 :           0 :                 loccount = GetPrivateRefCount(buffer);
    4318                 :           0 :                 backend = INVALID_PROC_NUMBER;
    4319                 :             :         }
    4320                 :             : 
    4321                 :             :         /* theoretically we should lock the bufHdr here */
    4322                 :           0 :         buf_state = pg_atomic_read_u64(&buf->state);
    4323                 :             : 
    4324                 :           0 :         result = psprintf("[%03d] (rel=%s, blockNum=%u, flags=0x%" PRIx64 ", refcount=%u %d)",
    4325                 :           0 :                                           buffer,
    4326                 :           0 :                                           relpathbackend(BufTagGetRelFileLocator(&buf->tag), backend,
    4327                 :           0 :                                                                          BufTagGetForkNum(&buf->tag)).str,
    4328                 :           0 :                                           buf->tag.blockNum, buf_state & BUF_FLAG_MASK,
    4329                 :           0 :                                           BUF_STATE_GET_REFCOUNT(buf_state), loccount);
    4330                 :           0 :         return result;
    4331                 :           0 : }
    4332                 :             : 
    4333                 :             : /*
    4334                 :             :  * CheckPointBuffers
    4335                 :             :  *
    4336                 :             :  * Flush all dirty blocks in buffer pool to disk at checkpoint time.
    4337                 :             :  *
    4338                 :             :  * Note: temporary relations do not participate in checkpoints, so they don't
    4339                 :             :  * need to be flushed.
    4340                 :             :  */
    4341                 :             : void
    4342                 :           7 : CheckPointBuffers(int flags)
    4343                 :             : {
    4344                 :           7 :         BufferSync(flags);
    4345                 :           7 : }
    4346                 :             : 
    4347                 :             : /*
    4348                 :             :  * BufferGetBlockNumber
    4349                 :             :  *              Returns the block number associated with a buffer.
    4350                 :             :  *
    4351                 :             :  * Note:
    4352                 :             :  *              Assumes that the buffer is valid and pinned, else the
    4353                 :             :  *              value may be obsolete immediately...
    4354                 :             :  */
    4355                 :             : BlockNumber
    4356                 :    28623221 : BufferGetBlockNumber(Buffer buffer)
    4357                 :             : {
    4358                 :    28623221 :         BufferDesc *bufHdr;
    4359                 :             : 
    4360   [ -  +  #  #  :    28623221 :         Assert(BufferIsPinned(buffer));
                   +  + ]
    4361                 :             : 
    4362         [ +  + ]:    28623221 :         if (BufferIsLocal(buffer))
    4363                 :     1013794 :                 bufHdr = GetLocalBufferDescriptor(-buffer - 1);
    4364                 :             :         else
    4365                 :    27609427 :                 bufHdr = GetBufferDescriptor(buffer - 1);
    4366                 :             : 
    4367                 :             :         /* pinned, so OK to read tag without spinlock */
    4368                 :    57246442 :         return bufHdr->tag.blockNum;
    4369                 :    28623221 : }
    4370                 :             : 
    4371                 :             : /*
    4372                 :             :  * BufferGetTag
    4373                 :             :  *              Returns the relfilelocator, fork number and block number associated with
    4374                 :             :  *              a buffer.
    4375                 :             :  */
    4376                 :             : void
    4377                 :     2595855 : BufferGetTag(Buffer buffer, RelFileLocator *rlocator, ForkNumber *forknum,
    4378                 :             :                          BlockNumber *blknum)
    4379                 :             : {
    4380                 :     2595855 :         BufferDesc *bufHdr;
    4381                 :             : 
    4382                 :             :         /* Do the same checks as BufferGetBlockNumber. */
    4383   [ -  +  #  #  :     2595855 :         Assert(BufferIsPinned(buffer));
                   -  + ]
    4384                 :             : 
    4385         [ +  - ]:     2595855 :         if (BufferIsLocal(buffer))
    4386                 :           0 :                 bufHdr = GetLocalBufferDescriptor(-buffer - 1);
    4387                 :             :         else
    4388                 :     2595855 :                 bufHdr = GetBufferDescriptor(buffer - 1);
    4389                 :             : 
    4390                 :             :         /* pinned, so OK to read tag without spinlock */
    4391                 :     2595855 :         *rlocator = BufTagGetRelFileLocator(&bufHdr->tag);
    4392                 :     2595855 :         *forknum = BufTagGetForkNum(&bufHdr->tag);
    4393                 :     2595855 :         *blknum = bufHdr->tag.blockNum;
    4394                 :     2595855 : }
    4395                 :             : 
    4396                 :             : /*
    4397                 :             :  * FlushBuffer
    4398                 :             :  *              Physically write out a shared buffer.
    4399                 :             :  *
    4400                 :             :  * NOTE: this actually just passes the buffer contents to the kernel; the
    4401                 :             :  * real write to disk won't happen until the kernel feels like it.  This
    4402                 :             :  * is okay from our point of view since we can redo the changes from WAL.
    4403                 :             :  * However, we will need to force the changes to disk via fsync before
    4404                 :             :  * we can checkpoint WAL.
    4405                 :             :  *
    4406                 :             :  * The caller must hold a pin on the buffer and have share-locked the
    4407                 :             :  * buffer contents.  (Note: a share-lock does not prevent updates of
    4408                 :             :  * hint bits in the buffer, so the page could change while the write
    4409                 :             :  * is in progress, but we assume that that will not invalidate the data
    4410                 :             :  * written.)
    4411                 :             :  *
    4412                 :             :  * If the caller has an smgr reference for the buffer's relation, pass it
    4413                 :             :  * as the second parameter.  If not, pass NULL.
    4414                 :             :  */
    4415                 :             : static void
    4416                 :        6998 : FlushBuffer(BufferDesc *buf, SMgrRelation reln, IOObject io_object,
    4417                 :             :                         IOContext io_context)
    4418                 :             : {
    4419                 :        6998 :         XLogRecPtr      recptr;
    4420                 :        6998 :         ErrorContextCallback errcallback;
    4421                 :        6998 :         instr_time      io_start;
    4422                 :        6998 :         Block           bufBlock;
    4423                 :        6998 :         char       *bufToWrite;
    4424                 :        6998 :         uint64          buf_state;
    4425                 :             : 
    4426                 :             :         /*
    4427                 :             :          * Try to start an I/O operation.  If StartBufferIO returns false, then
    4428                 :             :          * someone else flushed the buffer before we could, so we need not do
    4429                 :             :          * anything.
    4430                 :             :          */
    4431         [ +  - ]:        6998 :         if (!StartBufferIO(buf, false, false))
    4432                 :           0 :                 return;
    4433                 :             : 
    4434                 :             :         /* Setup error traceback support for ereport() */
    4435                 :        6998 :         errcallback.callback = shared_buffer_write_error_callback;
    4436                 :        6998 :         errcallback.arg = buf;
    4437                 :        6998 :         errcallback.previous = error_context_stack;
    4438                 :        6998 :         error_context_stack = &errcallback;
    4439                 :             : 
    4440                 :             :         /* Find smgr relation for buffer */
    4441         [ +  + ]:        6998 :         if (reln == NULL)
    4442                 :        5230 :                 reln = smgropen(BufTagGetRelFileLocator(&buf->tag), INVALID_PROC_NUMBER);
    4443                 :             : 
    4444                 :        6998 :         TRACE_POSTGRESQL_BUFFER_FLUSH_START(BufTagGetForkNum(&buf->tag),
    4445                 :             :                                                                                 buf->tag.blockNum,
    4446                 :             :                                                                                 reln->smgr_rlocator.locator.spcOid,
    4447                 :             :                                                                                 reln->smgr_rlocator.locator.dbOid,
    4448                 :             :                                                                                 reln->smgr_rlocator.locator.relNumber);
    4449                 :             : 
    4450                 :        6998 :         buf_state = LockBufHdr(buf);
    4451                 :             : 
    4452                 :             :         /*
    4453                 :             :          * Run PageGetLSN while holding header lock, since we don't have the
    4454                 :             :          * buffer locked exclusively in all cases.
    4455                 :             :          */
    4456                 :        6998 :         recptr = BufferGetLSN(buf);
    4457                 :             : 
    4458                 :             :         /* To check if block content changes while flushing. - vadim 01/17/97 */
    4459                 :        6998 :         UnlockBufHdrExt(buf, buf_state,
    4460                 :             :                                         0, BM_JUST_DIRTIED,
    4461                 :             :                                         0);
    4462                 :             : 
    4463                 :             :         /*
    4464                 :             :          * Force XLOG flush up to buffer's LSN.  This implements the basic WAL
    4465                 :             :          * rule that log updates must hit disk before any of the data-file changes
    4466                 :             :          * they describe do.
    4467                 :             :          *
    4468                 :             :          * However, this rule does not apply to unlogged relations, which will be
    4469                 :             :          * lost after a crash anyway.  Most unlogged relation pages do not bear
    4470                 :             :          * LSNs since we never emit WAL records for them, and therefore flushing
    4471                 :             :          * up through the buffer LSN would be useless, but harmless.  However,
    4472                 :             :          * GiST indexes use LSNs internally to track page-splits, and therefore
    4473                 :             :          * unlogged GiST pages bear "fake" LSNs generated by
    4474                 :             :          * GetFakeLSNForUnloggedRel.  It is unlikely but possible that the fake
    4475                 :             :          * LSN counter could advance past the WAL insertion point; and if it did
    4476                 :             :          * happen, attempting to flush WAL through that location would fail, with
    4477                 :             :          * disastrous system-wide consequences.  To make sure that can't happen,
    4478                 :             :          * skip the flush if the buffer isn't permanent.
    4479                 :             :          */
    4480         [ +  + ]:        6998 :         if (buf_state & BM_PERMANENT)
    4481                 :        6983 :                 XLogFlush(recptr);
    4482                 :             : 
    4483                 :             :         /*
    4484                 :             :          * Now it's safe to write the buffer to disk. Note that no one else should
    4485                 :             :          * have been able to write it, while we were busy with log flushing,
    4486                 :             :          * because we got the exclusive right to perform I/O by setting the
    4487                 :             :          * BM_IO_IN_PROGRESS bit.
    4488                 :             :          */
    4489                 :        6998 :         bufBlock = BufHdrGetBlock(buf);
    4490                 :             : 
    4491                 :             :         /*
    4492                 :             :          * Update page checksum if desired.  Since we have only shared lock on the
    4493                 :             :          * buffer, other processes might be updating hint bits in it, so we must
    4494                 :             :          * copy the page to private storage if we do checksumming.
    4495                 :             :          */
    4496                 :        6998 :         bufToWrite = PageSetChecksumCopy((Page) bufBlock, buf->tag.blockNum);
    4497                 :             : 
    4498                 :        6998 :         io_start = pgstat_prepare_io_time(track_io_timing);
    4499                 :             : 
    4500                 :             :         /*
    4501                 :             :          * bufToWrite is either the shared buffer or a copy, as appropriate.
    4502                 :             :          */
    4503                 :       13996 :         smgrwrite(reln,
    4504                 :        6998 :                           BufTagGetForkNum(&buf->tag),
    4505                 :        6998 :                           buf->tag.blockNum,
    4506                 :        6998 :                           bufToWrite,
    4507                 :             :                           false);
    4508                 :             : 
    4509                 :             :         /*
    4510                 :             :          * When a strategy is in use, only flushes of dirty buffers already in the
    4511                 :             :          * strategy ring are counted as strategy writes (IOCONTEXT
    4512                 :             :          * [BULKREAD|BULKWRITE|VACUUM] IOOP_WRITE) for the purpose of IO
    4513                 :             :          * statistics tracking.
    4514                 :             :          *
    4515                 :             :          * If a shared buffer initially added to the ring must be flushed before
    4516                 :             :          * being used, this is counted as an IOCONTEXT_NORMAL IOOP_WRITE.
    4517                 :             :          *
    4518                 :             :          * If a shared buffer which was added to the ring later because the
    4519                 :             :          * current strategy buffer is pinned or in use or because all strategy
    4520                 :             :          * buffers were dirty and rejected (for BAS_BULKREAD operations only)
    4521                 :             :          * requires flushing, this is counted as an IOCONTEXT_NORMAL IOOP_WRITE
    4522                 :             :          * (from_ring will be false).
    4523                 :             :          *
    4524                 :             :          * When a strategy is not in use, the write can only be a "regular" write
    4525                 :             :          * of a dirty shared buffer (IOCONTEXT_NORMAL IOOP_WRITE).
    4526                 :             :          */
    4527                 :        6998 :         pgstat_count_io_op_time(IOOBJECT_RELATION, io_context,
    4528                 :             :                                                         IOOP_WRITE, io_start, 1, BLCKSZ);
    4529                 :             : 
    4530                 :        6998 :         pgBufferUsage.shared_blks_written++;
    4531                 :             : 
    4532                 :             :         /*
    4533                 :             :          * Mark the buffer as clean (unless BM_JUST_DIRTIED has become set) and
    4534                 :             :          * end the BM_IO_IN_PROGRESS state.
    4535                 :             :          */
    4536                 :        6998 :         TerminateBufferIO(buf, true, 0, true, false);
    4537                 :             : 
    4538                 :        6998 :         TRACE_POSTGRESQL_BUFFER_FLUSH_DONE(BufTagGetForkNum(&buf->tag),
    4539                 :             :                                                                            buf->tag.blockNum,
    4540                 :             :                                                                            reln->smgr_rlocator.locator.spcOid,
    4541                 :             :                                                                            reln->smgr_rlocator.locator.dbOid,
    4542                 :             :                                                                            reln->smgr_rlocator.locator.relNumber);
    4543                 :             : 
    4544                 :             :         /* Pop the error context stack */
    4545                 :        6998 :         error_context_stack = errcallback.previous;
    4546         [ -  + ]:        6998 : }
    4547                 :             : 
    4548                 :             : /*
    4549                 :             :  * Convenience wrapper around FlushBuffer() that locks/unlocks the buffer
    4550                 :             :  * before/after calling FlushBuffer().
    4551                 :             :  */
    4552                 :             : static void
    4553                 :        6998 : FlushUnlockedBuffer(BufferDesc *buf, SMgrRelation reln,
    4554                 :             :                                         IOObject io_object, IOContext io_context)
    4555                 :             : {
    4556                 :        6998 :         Buffer          buffer = BufferDescriptorGetBuffer(buf);
    4557                 :             : 
    4558                 :        6998 :         BufferLockAcquire(buffer, buf, BUFFER_LOCK_SHARE);
    4559                 :        6998 :         FlushBuffer(buf, reln, IOOBJECT_RELATION, IOCONTEXT_NORMAL);
    4560                 :        6998 :         BufferLockUnlock(buffer, buf);
    4561                 :        6998 : }
    4562                 :             : 
    4563                 :             : /*
    4564                 :             :  * RelationGetNumberOfBlocksInFork
    4565                 :             :  *              Determines the current number of pages in the specified relation fork.
    4566                 :             :  *
    4567                 :             :  * Note that the accuracy of the result will depend on the details of the
    4568                 :             :  * relation's storage. For builtin AMs it'll be accurate, but for external AMs
    4569                 :             :  * it might not be.
    4570                 :             :  */
    4571                 :             : BlockNumber
    4572                 :      488167 : RelationGetNumberOfBlocksInFork(Relation relation, ForkNumber forkNum)
    4573                 :             : {
    4574   [ +  +  +  +  :      488167 :         if (RELKIND_HAS_TABLE_AM(relation->rd_rel->relkind))
                   +  + ]
    4575                 :             :         {
    4576                 :             :                 /*
    4577                 :             :                  * Not every table AM uses BLCKSZ wide fixed size blocks. Therefore
    4578                 :             :                  * tableam returns the size in bytes - but for the purpose of this
    4579                 :             :                  * routine, we want the number of blocks. Therefore divide, rounding
    4580                 :             :                  * up.
    4581                 :             :                  */
    4582                 :      405814 :                 uint64          szbytes;
    4583                 :             : 
    4584                 :      405814 :                 szbytes = table_relation_size(relation, forkNum);
    4585                 :             : 
    4586                 :      405814 :                 return (szbytes + (BLCKSZ - 1)) / BLCKSZ;
    4587                 :      405814 :         }
    4588   [ +  -  +  +  :       82353 :         else if (RELKIND_HAS_STORAGE(relation->rd_rel->relkind))
          -  +  #  #  #  
                      # ]
    4589                 :             :         {
    4590                 :       82353 :                 return smgrnblocks(RelationGetSmgr(relation), forkNum);
    4591                 :             :         }
    4592                 :             :         else
    4593                 :           0 :                 Assert(false);
    4594                 :             : 
    4595                 :           0 :         return 0;                                       /* keep compiler quiet */
    4596                 :      488167 : }
    4597                 :             : 
    4598                 :             : /*
    4599                 :             :  * BufferIsPermanent
    4600                 :             :  *              Determines whether a buffer will potentially still be around after
    4601                 :             :  *              a crash.  Caller must hold a buffer pin.
    4602                 :             :  */
    4603                 :             : bool
    4604                 :     2011239 : BufferIsPermanent(Buffer buffer)
    4605                 :             : {
    4606                 :     2011239 :         BufferDesc *bufHdr;
    4607                 :             : 
    4608                 :             :         /* Local buffers are used only for temp relations. */
    4609         [ +  + ]:     2011239 :         if (BufferIsLocal(buffer))
    4610                 :      194899 :                 return false;
    4611                 :             : 
    4612                 :             :         /* Make sure we've got a real buffer, and that we hold a pin on it. */
    4613         [ +  - ]:     1816340 :         Assert(BufferIsValid(buffer));
    4614   [ -  +  #  #  :     1816340 :         Assert(BufferIsPinned(buffer));
                   -  + ]
    4615                 :             : 
    4616                 :             :         /*
    4617                 :             :          * BM_PERMANENT can't be changed while we hold a pin on the buffer, so we
    4618                 :             :          * need not bother with the buffer header spinlock.  Even if someone else
    4619                 :             :          * changes the buffer header state while we're doing this, the state is
    4620                 :             :          * changed atomically, so we'll read the old value or the new value, but
    4621                 :             :          * not random garbage.
    4622                 :             :          */
    4623                 :     1816340 :         bufHdr = GetBufferDescriptor(buffer - 1);
    4624                 :     1816340 :         return (pg_atomic_read_u64(&bufHdr->state) & BM_PERMANENT) != 0;
    4625                 :     2011239 : }
    4626                 :             : 
    4627                 :             : /*
    4628                 :             :  * BufferGetLSNAtomic
    4629                 :             :  *              Retrieves the LSN of the buffer atomically using a buffer header lock.
    4630                 :             :  *              This is necessary for some callers who may not have an exclusive lock
    4631                 :             :  *              on the buffer.
    4632                 :             :  */
    4633                 :             : XLogRecPtr
    4634                 :      950571 : BufferGetLSNAtomic(Buffer buffer)
    4635                 :             : {
    4636                 :      950571 :         char       *page = BufferGetPage(buffer);
    4637                 :      950571 :         BufferDesc *bufHdr;
    4638                 :      950571 :         XLogRecPtr      lsn;
    4639                 :             : 
    4640                 :             :         /*
    4641                 :             :          * If we don't need locking for correctness, fastpath out.
    4642                 :             :          */
    4643   [ -  +  +  + ]:      950571 :         if (!XLogHintBitIsNeeded() || BufferIsLocal(buffer))
    4644                 :       26769 :                 return PageGetLSN(page);
    4645                 :             : 
    4646                 :             :         /* Make sure we've got a real buffer, and that we hold a pin on it. */
    4647         [ +  - ]:      923802 :         Assert(BufferIsValid(buffer));
    4648   [ -  +  #  #  :      923802 :         Assert(BufferIsPinned(buffer));
                   -  + ]
    4649                 :             : 
    4650                 :      923802 :         bufHdr = GetBufferDescriptor(buffer - 1);
    4651                 :      923802 :         LockBufHdr(bufHdr);
    4652                 :      923802 :         lsn = PageGetLSN(page);
    4653                 :      923802 :         UnlockBufHdr(bufHdr);
    4654                 :             : 
    4655                 :      923802 :         return lsn;
    4656                 :      950571 : }
    4657                 :             : 
    4658                 :             : /* ---------------------------------------------------------------------
    4659                 :             :  *              DropRelationBuffers
    4660                 :             :  *
    4661                 :             :  *              This function removes from the buffer pool all the pages of the
    4662                 :             :  *              specified relation forks that have block numbers >= firstDelBlock.
    4663                 :             :  *              (In particular, with firstDelBlock = 0, all pages are removed.)
    4664                 :             :  *              Dirty pages are simply dropped, without bothering to write them
    4665                 :             :  *              out first.  Therefore, this is NOT rollback-able, and so should be
    4666                 :             :  *              used only with extreme caution!
    4667                 :             :  *
    4668                 :             :  *              Currently, this is called only from smgr.c when the underlying file
    4669                 :             :  *              is about to be deleted or truncated (firstDelBlock is needed for
    4670                 :             :  *              the truncation case).  The data in the affected pages would therefore
    4671                 :             :  *              be deleted momentarily anyway, and there is no point in writing it.
    4672                 :             :  *              It is the responsibility of higher-level code to ensure that the
    4673                 :             :  *              deletion or truncation does not lose any data that could be needed
    4674                 :             :  *              later.  It is also the responsibility of higher-level code to ensure
    4675                 :             :  *              that no other process could be trying to load more pages of the
    4676                 :             :  *              relation into buffers.
    4677                 :             :  * --------------------------------------------------------------------
    4678                 :             :  */
    4679                 :             : void
    4680                 :         155 : DropRelationBuffers(SMgrRelation smgr_reln, ForkNumber *forkNum,
    4681                 :             :                                         int nforks, BlockNumber *firstDelBlock)
    4682                 :             : {
    4683                 :         155 :         int                     i;
    4684                 :         155 :         int                     j;
    4685                 :         155 :         RelFileLocatorBackend rlocator;
    4686                 :         155 :         BlockNumber nForkBlock[MAX_FORKNUM];
    4687                 :         155 :         uint64          nBlocksToInvalidate = 0;
    4688                 :             : 
    4689                 :         155 :         rlocator = smgr_reln->smgr_rlocator;
    4690                 :             : 
    4691                 :             :         /* If it's a local relation, it's localbuf.c's problem. */
    4692         [ +  + ]:         155 :         if (RelFileLocatorBackendIsTemp(rlocator))
    4693                 :             :         {
    4694         [ -  + ]:         123 :                 if (rlocator.backend == MyProcNumber)
    4695                 :         246 :                         DropRelationLocalBuffers(rlocator.locator, forkNum, nforks,
    4696                 :         123 :                                                                          firstDelBlock);
    4697                 :             : 
    4698                 :         123 :                 return;
    4699                 :             :         }
    4700                 :             : 
    4701                 :             :         /*
    4702                 :             :          * To remove all the pages of the specified relation forks from the buffer
    4703                 :             :          * pool, we need to scan the entire buffer pool but we can optimize it by
    4704                 :             :          * finding the buffers from BufMapping table provided we know the exact
    4705                 :             :          * size of each fork of the relation. The exact size is required to ensure
    4706                 :             :          * that we don't leave any buffer for the relation being dropped as
    4707                 :             :          * otherwise the background writer or checkpointer can lead to a PANIC
    4708                 :             :          * error while flushing buffers corresponding to files that don't exist.
    4709                 :             :          *
    4710                 :             :          * To know the exact size, we rely on the size cached for each fork by us
    4711                 :             :          * during recovery which limits the optimization to recovery and on
    4712                 :             :          * standbys but we can easily extend it once we have shared cache for
    4713                 :             :          * relation size.
    4714                 :             :          *
    4715                 :             :          * In recovery, we cache the value returned by the first lseek(SEEK_END)
    4716                 :             :          * and the future writes keeps the cached value up-to-date. See
    4717                 :             :          * smgrextend. It is possible that the value of the first lseek is smaller
    4718                 :             :          * than the actual number of existing blocks in the file due to buggy
    4719                 :             :          * Linux kernels that might not have accounted for the recent write. But
    4720                 :             :          * that should be fine because there must not be any buffers after that
    4721                 :             :          * file size.
    4722                 :             :          */
    4723         [ -  + ]:          32 :         for (i = 0; i < nforks; i++)
    4724                 :             :         {
    4725                 :             :                 /* Get the number of blocks for a relation's fork */
    4726                 :          32 :                 nForkBlock[i] = smgrnblocks_cached(smgr_reln, forkNum[i]);
    4727                 :             : 
    4728         [ +  - ]:          32 :                 if (nForkBlock[i] == InvalidBlockNumber)
    4729                 :             :                 {
    4730                 :          32 :                         nBlocksToInvalidate = InvalidBlockNumber;
    4731                 :          32 :                         break;
    4732                 :             :                 }
    4733                 :             : 
    4734                 :             :                 /* calculate the number of blocks to be invalidated */
    4735                 :           0 :                 nBlocksToInvalidate += (nForkBlock[i] - firstDelBlock[i]);
    4736                 :           0 :         }
    4737                 :             : 
    4738                 :             :         /*
    4739                 :             :          * We apply the optimization iff the total number of blocks to invalidate
    4740                 :             :          * is below the BUF_DROP_FULL_SCAN_THRESHOLD.
    4741                 :             :          */
    4742   [ -  +  #  # ]:          32 :         if (BlockNumberIsValid(nBlocksToInvalidate) &&
    4743                 :           0 :                 nBlocksToInvalidate < BUF_DROP_FULL_SCAN_THRESHOLD)
    4744                 :             :         {
    4745         [ #  # ]:           0 :                 for (j = 0; j < nforks; j++)
    4746                 :           0 :                         FindAndDropRelationBuffers(rlocator.locator, forkNum[j],
    4747                 :           0 :                                                                            nForkBlock[j], firstDelBlock[j]);
    4748                 :           0 :                 return;
    4749                 :             :         }
    4750                 :             : 
    4751         [ +  + ]:      524320 :         for (i = 0; i < NBuffers; i++)
    4752                 :             :         {
    4753                 :      524288 :                 BufferDesc *bufHdr = GetBufferDescriptor(i);
    4754                 :             : 
    4755                 :             :                 /*
    4756                 :             :                  * We can make this a tad faster by prechecking the buffer tag before
    4757                 :             :                  * we attempt to lock the buffer; this saves a lot of lock
    4758                 :             :                  * acquisitions in typical cases.  It should be safe because the
    4759                 :             :                  * caller must have AccessExclusiveLock on the relation, or some other
    4760                 :             :                  * reason to be certain that no one is loading new pages of the rel
    4761                 :             :                  * into the buffer pool.  (Otherwise we might well miss such pages
    4762                 :             :                  * entirely.)  Therefore, while the tag might be changing while we
    4763                 :             :                  * look at it, it can't be changing *to* a value we care about, only
    4764                 :             :                  * *away* from such a value.  So false negatives are impossible, and
    4765                 :             :                  * false positives are safe because we'll recheck after getting the
    4766                 :             :                  * buffer lock.
    4767                 :             :                  *
    4768                 :             :                  * We could check forkNum and blockNum as well as the rlocator, but
    4769                 :             :                  * the incremental win from doing so seems small.
    4770                 :             :                  */
    4771         [ +  + ]:      524288 :                 if (!BufTagMatchesRelFileLocator(&bufHdr->tag, &rlocator.locator))
    4772                 :      522561 :                         continue;
    4773                 :             : 
    4774                 :        1727 :                 LockBufHdr(bufHdr);
    4775                 :             : 
    4776         [ +  + ]:        4530 :                 for (j = 0; j < nforks; j++)
    4777                 :             :                 {
    4778         [ +  - ]:        3151 :                         if (BufTagMatchesRelFileLocator(&bufHdr->tag, &rlocator.locator) &&
    4779   [ +  +  +  + ]:        3151 :                                 BufTagGetForkNum(&bufHdr->tag) == forkNum[j] &&
    4780                 :        1707 :                                 bufHdr->tag.blockNum >= firstDelBlock[j])
    4781                 :             :                         {
    4782                 :         348 :                                 InvalidateBuffer(bufHdr);       /* releases spinlock */
    4783                 :         348 :                                 break;
    4784                 :             :                         }
    4785                 :        2803 :                 }
    4786         [ +  + ]:        1727 :                 if (j >= nforks)
    4787                 :        1379 :                         UnlockBufHdr(bufHdr);
    4788         [ +  + ]:      524288 :         }
    4789                 :         155 : }
    4790                 :             : 
    4791                 :             : /* ---------------------------------------------------------------------
    4792                 :             :  *              DropRelationsAllBuffers
    4793                 :             :  *
    4794                 :             :  *              This function removes from the buffer pool all the pages of all
    4795                 :             :  *              forks of the specified relations.  It's equivalent to calling
    4796                 :             :  *              DropRelationBuffers once per fork per relation with firstDelBlock = 0.
    4797                 :             :  *              --------------------------------------------------------------------
    4798                 :             :  */
    4799                 :             : void
    4800                 :        2704 : DropRelationsAllBuffers(SMgrRelation *smgr_reln, int nlocators)
    4801                 :             : {
    4802                 :        2704 :         int                     i;
    4803                 :        2704 :         int                     n = 0;
    4804                 :        2704 :         SMgrRelation *rels;
    4805                 :        2704 :         BlockNumber (*block)[MAX_FORKNUM + 1];
    4806                 :        2704 :         uint64          nBlocksToInvalidate = 0;
    4807                 :        2704 :         RelFileLocator *locators;
    4808                 :        2704 :         bool            cached = true;
    4809                 :        2704 :         bool            use_bsearch;
    4810                 :             : 
    4811         [ +  - ]:        2704 :         if (nlocators == 0)
    4812                 :           0 :                 return;
    4813                 :             : 
    4814                 :        2704 :         rels = palloc_array(SMgrRelation, nlocators);   /* non-local relations */
    4815                 :             : 
    4816                 :             :         /* If it's a local relation, it's localbuf.c's problem. */
    4817         [ +  + ]:       11628 :         for (i = 0; i < nlocators; i++)
    4818                 :             :         {
    4819         [ +  + ]:        8924 :                 if (RelFileLocatorBackendIsTemp(smgr_reln[i]->smgr_rlocator))
    4820                 :             :                 {
    4821         [ -  + ]:        1012 :                         if (smgr_reln[i]->smgr_rlocator.backend == MyProcNumber)
    4822                 :        1012 :                                 DropRelationAllLocalBuffers(smgr_reln[i]->smgr_rlocator.locator);
    4823                 :        1012 :                 }
    4824                 :             :                 else
    4825                 :        7912 :                         rels[n++] = smgr_reln[i];
    4826                 :        8924 :         }
    4827                 :             : 
    4828                 :             :         /*
    4829                 :             :          * If there are no non-local relations, then we're done. Release the
    4830                 :             :          * memory and return.
    4831                 :             :          */
    4832         [ +  + ]:        2704 :         if (n == 0)
    4833                 :             :         {
    4834                 :         257 :                 pfree(rels);
    4835                 :         257 :                 return;
    4836                 :             :         }
    4837                 :             : 
    4838                 :             :         /*
    4839                 :             :          * This is used to remember the number of blocks for all the relations
    4840                 :             :          * forks.
    4841                 :             :          */
    4842                 :        2447 :         block = (BlockNumber (*)[MAX_FORKNUM + 1])
    4843                 :        2447 :                 palloc(sizeof(BlockNumber) * n * (MAX_FORKNUM + 1));
    4844                 :             : 
    4845                 :             :         /*
    4846                 :             :          * We can avoid scanning the entire buffer pool if we know the exact size
    4847                 :             :          * of each of the given relation forks. See DropRelationBuffers.
    4848                 :             :          */
    4849   [ +  +  +  + ]:        4895 :         for (i = 0; i < n && cached; i++)
    4850                 :             :         {
    4851         [ +  + ]:        4899 :                 for (int j = 0; j <= MAX_FORKNUM; j++)
    4852                 :             :                 {
    4853                 :             :                         /* Get the number of blocks for a relation's fork. */
    4854                 :        2451 :                         block[i][j] = smgrnblocks_cached(rels[i], j);
    4855                 :             : 
    4856                 :             :                         /* We need to only consider the relation forks that exists. */
    4857         [ -  + ]:        2451 :                         if (block[i][j] == InvalidBlockNumber)
    4858                 :             :                         {
    4859         [ +  + ]:        2451 :                                 if (!smgrexists(rels[i], j))
    4860                 :           4 :                                         continue;
    4861                 :        2447 :                                 cached = false;
    4862                 :        2447 :                                 break;
    4863                 :             :                         }
    4864                 :             : 
    4865                 :             :                         /* calculate the total number of blocks to be invalidated */
    4866                 :           0 :                         nBlocksToInvalidate += block[i][j];
    4867                 :           0 :                 }
    4868                 :        2448 :         }
    4869                 :             : 
    4870                 :             :         /*
    4871                 :             :          * We apply the optimization iff the total number of blocks to invalidate
    4872                 :             :          * is below the BUF_DROP_FULL_SCAN_THRESHOLD.
    4873                 :             :          */
    4874   [ -  +  #  # ]:        2447 :         if (cached && nBlocksToInvalidate < BUF_DROP_FULL_SCAN_THRESHOLD)
    4875                 :             :         {
    4876         [ #  # ]:           0 :                 for (i = 0; i < n; i++)
    4877                 :             :                 {
    4878         [ #  # ]:           0 :                         for (int j = 0; j <= MAX_FORKNUM; j++)
    4879                 :             :                         {
    4880                 :             :                                 /* ignore relation forks that doesn't exist */
    4881         [ #  # ]:           0 :                                 if (!BlockNumberIsValid(block[i][j]))
    4882                 :           0 :                                         continue;
    4883                 :             : 
    4884                 :             :                                 /* drop all the buffers for a particular relation fork */
    4885                 :           0 :                                 FindAndDropRelationBuffers(rels[i]->smgr_rlocator.locator,
    4886                 :           0 :                                                                                    j, block[i][j], 0);
    4887                 :           0 :                         }
    4888                 :           0 :                 }
    4889                 :             : 
    4890                 :           0 :                 pfree(block);
    4891                 :           0 :                 pfree(rels);
    4892                 :           0 :                 return;
    4893                 :             :         }
    4894                 :             : 
    4895                 :        2447 :         pfree(block);
    4896                 :        2447 :         locators = palloc_array(RelFileLocator, n); /* non-local relations */
    4897         [ +  + ]:       10359 :         for (i = 0; i < n; i++)
    4898                 :        7912 :                 locators[i] = rels[i]->smgr_rlocator.locator;
    4899                 :             : 
    4900                 :             :         /*
    4901                 :             :          * For low number of relations to drop just use a simple walk through, to
    4902                 :             :          * save the bsearch overhead. The threshold to use is rather a guess than
    4903                 :             :          * an exactly determined value, as it depends on many factors (CPU and RAM
    4904                 :             :          * speeds, amount of shared buffers etc.).
    4905                 :             :          */
    4906                 :        2447 :         use_bsearch = n > RELS_BSEARCH_THRESHOLD;
    4907                 :             : 
    4908                 :             :         /* sort the list of rlocators if necessary */
    4909         [ +  + ]:        2447 :         if (use_bsearch)
    4910                 :          29 :                 qsort(locators, n, sizeof(RelFileLocator), rlocator_comparator);
    4911                 :             : 
    4912         [ +  + ]:    40094095 :         for (i = 0; i < NBuffers; i++)
    4913                 :             :         {
    4914                 :    40091648 :                 RelFileLocator *rlocator = NULL;
    4915                 :    40091648 :                 BufferDesc *bufHdr = GetBufferDescriptor(i);
    4916                 :             : 
    4917                 :             :                 /*
    4918                 :             :                  * As in DropRelationBuffers, an unlocked precheck should be safe and
    4919                 :             :                  * saves some cycles.
    4920                 :             :                  */
    4921                 :             : 
    4922         [ +  + ]:    40091648 :                 if (!use_bsearch)
    4923                 :             :                 {
    4924                 :    39616512 :                         int                     j;
    4925                 :             : 
    4926         [ +  + ]:   152914393 :                         for (j = 0; j < n; j++)
    4927                 :             :                         {
    4928         [ +  + ]:   113315049 :                                 if (BufTagMatchesRelFileLocator(&bufHdr->tag, &locators[j]))
    4929                 :             :                                 {
    4930                 :       17168 :                                         rlocator = &locators[j];
    4931                 :       17168 :                                         break;
    4932                 :             :                                 }
    4933                 :   113297881 :                         }
    4934                 :    39616512 :                 }
    4935                 :             :                 else
    4936                 :             :                 {
    4937                 :      475136 :                         RelFileLocator locator;
    4938                 :             : 
    4939                 :      475136 :                         locator = BufTagGetRelFileLocator(&bufHdr->tag);
    4940                 :      475136 :                         rlocator = bsearch(&locator,
    4941                 :      475136 :                                                            locators, n, sizeof(RelFileLocator),
    4942                 :             :                                                            rlocator_comparator);
    4943                 :      475136 :                 }
    4944                 :             : 
    4945                 :             :                 /* buffer doesn't belong to any of the given relfilelocators; skip it */
    4946         [ +  + ]:    40091648 :                 if (rlocator == NULL)
    4947                 :    40073942 :                         continue;
    4948                 :             : 
    4949                 :       17706 :                 LockBufHdr(bufHdr);
    4950         [ +  - ]:       17706 :                 if (BufTagMatchesRelFileLocator(&bufHdr->tag, rlocator))
    4951                 :       17706 :                         InvalidateBuffer(bufHdr);       /* releases spinlock */
    4952                 :             :                 else
    4953                 :           0 :                         UnlockBufHdr(bufHdr);
    4954         [ +  + ]:    40091648 :         }
    4955                 :             : 
    4956                 :        2447 :         pfree(locators);
    4957                 :        2447 :         pfree(rels);
    4958                 :        2704 : }
    4959                 :             : 
    4960                 :             : /* ---------------------------------------------------------------------
    4961                 :             :  *              FindAndDropRelationBuffers
    4962                 :             :  *
    4963                 :             :  *              This function performs look up in BufMapping table and removes from the
    4964                 :             :  *              buffer pool all the pages of the specified relation fork that has block
    4965                 :             :  *              number >= firstDelBlock. (In particular, with firstDelBlock = 0, all
    4966                 :             :  *              pages are removed.)
    4967                 :             :  * --------------------------------------------------------------------
    4968                 :             :  */
    4969                 :             : static void
    4970                 :           0 : FindAndDropRelationBuffers(RelFileLocator rlocator, ForkNumber forkNum,
    4971                 :             :                                                    BlockNumber nForkBlock,
    4972                 :             :                                                    BlockNumber firstDelBlock)
    4973                 :             : {
    4974                 :           0 :         BlockNumber curBlock;
    4975                 :             : 
    4976         [ #  # ]:           0 :         for (curBlock = firstDelBlock; curBlock < nForkBlock; curBlock++)
    4977                 :             :         {
    4978                 :           0 :                 uint32          bufHash;        /* hash value for tag */
    4979                 :           0 :                 BufferTag       bufTag;         /* identity of requested block */
    4980                 :           0 :                 LWLock     *bufPartitionLock;   /* buffer partition lock for it */
    4981                 :           0 :                 int                     buf_id;
    4982                 :           0 :                 BufferDesc *bufHdr;
    4983                 :             : 
    4984                 :             :                 /* create a tag so we can lookup the buffer */
    4985                 :           0 :                 InitBufferTag(&bufTag, &rlocator, forkNum, curBlock);
    4986                 :             : 
    4987                 :             :                 /* determine its hash code and partition lock ID */
    4988                 :           0 :                 bufHash = BufTableHashCode(&bufTag);
    4989                 :           0 :                 bufPartitionLock = BufMappingPartitionLock(bufHash);
    4990                 :             : 
    4991                 :             :                 /* Check that it is in the buffer pool. If not, do nothing. */
    4992                 :           0 :                 LWLockAcquire(bufPartitionLock, LW_SHARED);
    4993                 :           0 :                 buf_id = BufTableLookup(&bufTag, bufHash);
    4994                 :           0 :                 LWLockRelease(bufPartitionLock);
    4995                 :             : 
    4996         [ #  # ]:           0 :                 if (buf_id < 0)
    4997                 :           0 :                         continue;
    4998                 :             : 
    4999                 :           0 :                 bufHdr = GetBufferDescriptor(buf_id);
    5000                 :             : 
    5001                 :             :                 /*
    5002                 :             :                  * We need to lock the buffer header and recheck if the buffer is
    5003                 :             :                  * still associated with the same block because the buffer could be
    5004                 :             :                  * evicted by some other backend loading blocks for a different
    5005                 :             :                  * relation after we release lock on the BufMapping table.
    5006                 :             :                  */
    5007                 :           0 :                 LockBufHdr(bufHdr);
    5008                 :             : 
    5009         [ #  # ]:           0 :                 if (BufTagMatchesRelFileLocator(&bufHdr->tag, &rlocator) &&
    5010   [ #  #  #  # ]:           0 :                         BufTagGetForkNum(&bufHdr->tag) == forkNum &&
    5011                 :           0 :                         bufHdr->tag.blockNum >= firstDelBlock)
    5012                 :           0 :                         InvalidateBuffer(bufHdr);       /* releases spinlock */
    5013                 :             :                 else
    5014                 :           0 :                         UnlockBufHdr(bufHdr);
    5015      [ #  #  # ]:           0 :         }
    5016                 :           0 : }
    5017                 :             : 
    5018                 :             : /* ---------------------------------------------------------------------
    5019                 :             :  *              DropDatabaseBuffers
    5020                 :             :  *
    5021                 :             :  *              This function removes all the buffers in the buffer cache for a
    5022                 :             :  *              particular database.  Dirty pages are simply dropped, without
    5023                 :             :  *              bothering to write them out first.  This is used when we destroy a
    5024                 :             :  *              database, to avoid trying to flush data to disk when the directory
    5025                 :             :  *              tree no longer exists.  Implementation is pretty similar to
    5026                 :             :  *              DropRelationBuffers() which is for destroying just one relation.
    5027                 :             :  * --------------------------------------------------------------------
    5028                 :             :  */
    5029                 :             : void
    5030                 :           3 : DropDatabaseBuffers(Oid dbid)
    5031                 :             : {
    5032                 :           3 :         int                     i;
    5033                 :             : 
    5034                 :             :         /*
    5035                 :             :          * We needn't consider local buffers, since by assumption the target
    5036                 :             :          * database isn't our own.
    5037                 :             :          */
    5038                 :             : 
    5039         [ +  + ]:       49155 :         for (i = 0; i < NBuffers; i++)
    5040                 :             :         {
    5041                 :       49152 :                 BufferDesc *bufHdr = GetBufferDescriptor(i);
    5042                 :             : 
    5043                 :             :                 /*
    5044                 :             :                  * As in DropRelationBuffers, an unlocked precheck should be safe and
    5045                 :             :                  * saves some cycles.
    5046                 :             :                  */
    5047         [ +  + ]:       49152 :                 if (bufHdr->tag.dbOid != dbid)
    5048                 :       48198 :                         continue;
    5049                 :             : 
    5050                 :         954 :                 LockBufHdr(bufHdr);
    5051         [ +  - ]:         954 :                 if (bufHdr->tag.dbOid == dbid)
    5052                 :         954 :                         InvalidateBuffer(bufHdr);       /* releases spinlock */
    5053                 :             :                 else
    5054                 :           0 :                         UnlockBufHdr(bufHdr);
    5055      [ -  +  + ]:       49152 :         }
    5056                 :           3 : }
    5057                 :             : 
    5058                 :             : /* ---------------------------------------------------------------------
    5059                 :             :  *              FlushRelationBuffers
    5060                 :             :  *
    5061                 :             :  *              This function writes all dirty pages of a relation out to disk
    5062                 :             :  *              (or more accurately, out to kernel disk buffers), ensuring that the
    5063                 :             :  *              kernel has an up-to-date view of the relation.
    5064                 :             :  *
    5065                 :             :  *              Generally, the caller should be holding AccessExclusiveLock on the
    5066                 :             :  *              target relation to ensure that no other backend is busy dirtying
    5067                 :             :  *              more blocks of the relation; the effects can't be expected to last
    5068                 :             :  *              after the lock is released.
    5069                 :             :  *
    5070                 :             :  *              XXX currently it sequentially searches the buffer pool, should be
    5071                 :             :  *              changed to more clever ways of searching.  This routine is not
    5072                 :             :  *              used in any performance-critical code paths, so it's not worth
    5073                 :             :  *              adding additional overhead to normal paths to make it go faster.
    5074                 :             :  * --------------------------------------------------------------------
    5075                 :             :  */
    5076                 :             : void
    5077                 :          39 : FlushRelationBuffers(Relation rel)
    5078                 :             : {
    5079                 :          39 :         int                     i;
    5080                 :          39 :         BufferDesc *bufHdr;
    5081                 :          39 :         SMgrRelation srel = RelationGetSmgr(rel);
    5082                 :             : 
    5083         [ +  + ]:          39 :         if (RelationUsesLocalBuffers(rel))
    5084                 :             :         {
    5085         [ +  + ]:         303 :                 for (i = 0; i < NLocBuffer; i++)
    5086                 :             :                 {
    5087                 :         300 :                         uint64          buf_state;
    5088                 :             : 
    5089                 :         300 :                         bufHdr = GetLocalBufferDescriptor(i);
    5090   [ +  +  -  + ]:         300 :                         if (BufTagMatchesRelFileLocator(&bufHdr->tag, &rel->rd_locator) &&
    5091                 :         100 :                                 ((buf_state = pg_atomic_read_u64(&bufHdr->state)) &
    5092                 :         100 :                                  (BM_VALID | BM_DIRTY)) == (BM_VALID | BM_DIRTY))
    5093                 :             :                         {
    5094                 :         100 :                                 ErrorContextCallback errcallback;
    5095                 :             : 
    5096                 :             :                                 /* Setup error traceback support for ereport() */
    5097                 :         100 :                                 errcallback.callback = local_buffer_write_error_callback;
    5098                 :         100 :                                 errcallback.arg = bufHdr;
    5099                 :         100 :                                 errcallback.previous = error_context_stack;
    5100                 :         100 :                                 error_context_stack = &errcallback;
    5101                 :             : 
    5102                 :             :                                 /* Make sure we can handle the pin */
    5103                 :         100 :                                 ReservePrivateRefCountEntry();
    5104                 :         100 :                                 ResourceOwnerEnlarge(CurrentResourceOwner);
    5105                 :             : 
    5106                 :             :                                 /*
    5107                 :             :                                  * Pin/unpin mostly to make valgrind work, but it also seems
    5108                 :             :                                  * like the right thing to do.
    5109                 :             :                                  */
    5110                 :         100 :                                 PinLocalBuffer(bufHdr, false);
    5111                 :             : 
    5112                 :             : 
    5113                 :         100 :                                 FlushLocalBuffer(bufHdr, srel);
    5114                 :             : 
    5115                 :         100 :                                 UnpinLocalBuffer(BufferDescriptorGetBuffer(bufHdr));
    5116                 :             : 
    5117                 :             :                                 /* Pop the error context stack */
    5118                 :         100 :                                 error_context_stack = errcallback.previous;
    5119                 :         100 :                         }
    5120                 :         300 :                 }
    5121                 :             : 
    5122                 :           3 :                 return;
    5123                 :             :         }
    5124                 :             : 
    5125         [ +  + ]:      589860 :         for (i = 0; i < NBuffers; i++)
    5126                 :             :         {
    5127                 :      589824 :                 uint64          buf_state;
    5128                 :             : 
    5129                 :      589824 :                 bufHdr = GetBufferDescriptor(i);
    5130                 :             : 
    5131                 :             :                 /*
    5132                 :             :                  * As in DropRelationBuffers, an unlocked precheck should be safe and
    5133                 :             :                  * saves some cycles.
    5134                 :             :                  */
    5135         [ +  + ]:      589824 :                 if (!BufTagMatchesRelFileLocator(&bufHdr->tag, &rel->rd_locator))
    5136                 :      589769 :                         continue;
    5137                 :             : 
    5138                 :             :                 /* Make sure we can handle the pin */
    5139                 :          55 :                 ReservePrivateRefCountEntry();
    5140                 :          55 :                 ResourceOwnerEnlarge(CurrentResourceOwner);
    5141                 :             : 
    5142                 :          55 :                 buf_state = LockBufHdr(bufHdr);
    5143   [ +  -  +  + ]:          55 :                 if (BufTagMatchesRelFileLocator(&bufHdr->tag, &rel->rd_locator) &&
    5144                 :          55 :                         (buf_state & (BM_VALID | BM_DIRTY)) == (BM_VALID | BM_DIRTY))
    5145                 :             :                 {
    5146                 :          42 :                         PinBuffer_Locked(bufHdr);
    5147                 :          42 :                         FlushUnlockedBuffer(bufHdr, srel, IOOBJECT_RELATION, IOCONTEXT_NORMAL);
    5148                 :          42 :                         UnpinBuffer(bufHdr);
    5149                 :          42 :                 }
    5150                 :             :                 else
    5151                 :          13 :                         UnlockBufHdr(bufHdr);
    5152         [ +  + ]:      589824 :         }
    5153                 :          39 : }
    5154                 :             : 
    5155                 :             : /* ---------------------------------------------------------------------
    5156                 :             :  *              FlushRelationsAllBuffers
    5157                 :             :  *
    5158                 :             :  *              This function flushes out of the buffer pool all the pages of all
    5159                 :             :  *              forks of the specified smgr relations.  It's equivalent to calling
    5160                 :             :  *              FlushRelationBuffers once per relation.  The relations are assumed not
    5161                 :             :  *              to use local buffers.
    5162                 :             :  * --------------------------------------------------------------------
    5163                 :             :  */
    5164                 :             : void
    5165                 :           8 : FlushRelationsAllBuffers(SMgrRelation *smgrs, int nrels)
    5166                 :             : {
    5167                 :           8 :         int                     i;
    5168                 :           8 :         SMgrSortArray *srels;
    5169                 :           8 :         bool            use_bsearch;
    5170                 :             : 
    5171         [ +  - ]:           8 :         if (nrels == 0)
    5172                 :           0 :                 return;
    5173                 :             : 
    5174                 :             :         /* fill-in array for qsort */
    5175                 :           8 :         srels = palloc_array(SMgrSortArray, nrels);
    5176                 :             : 
    5177         [ +  + ]:          16 :         for (i = 0; i < nrels; i++)
    5178                 :             :         {
    5179         [ +  - ]:           8 :                 Assert(!RelFileLocatorBackendIsTemp(smgrs[i]->smgr_rlocator));
    5180                 :             : 
    5181                 :           8 :                 srels[i].rlocator = smgrs[i]->smgr_rlocator.locator;
    5182                 :           8 :                 srels[i].srel = smgrs[i];
    5183                 :           8 :         }
    5184                 :             : 
    5185                 :             :         /*
    5186                 :             :          * Save the bsearch overhead for low number of relations to sync. See
    5187                 :             :          * DropRelationsAllBuffers for details.
    5188                 :             :          */
    5189                 :           8 :         use_bsearch = nrels > RELS_BSEARCH_THRESHOLD;
    5190                 :             : 
    5191                 :             :         /* sort the list of SMgrRelations if necessary */
    5192         [ +  - ]:           8 :         if (use_bsearch)
    5193                 :           0 :                 qsort(srels, nrels, sizeof(SMgrSortArray), rlocator_comparator);
    5194                 :             : 
    5195         [ +  + ]:      131080 :         for (i = 0; i < NBuffers; i++)
    5196                 :             :         {
    5197                 :      131072 :                 SMgrSortArray *srelent = NULL;
    5198                 :      131072 :                 BufferDesc *bufHdr = GetBufferDescriptor(i);
    5199                 :      131072 :                 uint64          buf_state;
    5200                 :             : 
    5201                 :             :                 /*
    5202                 :             :                  * As in DropRelationBuffers, an unlocked precheck should be safe and
    5203                 :             :                  * saves some cycles.
    5204                 :             :                  */
    5205                 :             : 
    5206         [ -  + ]:      131072 :                 if (!use_bsearch)
    5207                 :             :                 {
    5208                 :      131072 :                         int                     j;
    5209                 :             : 
    5210         [ +  + ]:      259995 :                         for (j = 0; j < nrels; j++)
    5211                 :             :                         {
    5212         [ +  + ]:      131072 :                                 if (BufTagMatchesRelFileLocator(&bufHdr->tag, &srels[j].rlocator))
    5213                 :             :                                 {
    5214                 :        2149 :                                         srelent = &srels[j];
    5215                 :        2149 :                                         break;
    5216                 :             :                                 }
    5217                 :      128923 :                         }
    5218                 :      131072 :                 }
    5219                 :             :                 else
    5220                 :             :                 {
    5221                 :           0 :                         RelFileLocator rlocator;
    5222                 :             : 
    5223                 :           0 :                         rlocator = BufTagGetRelFileLocator(&bufHdr->tag);
    5224                 :           0 :                         srelent = bsearch(&rlocator,
    5225                 :           0 :                                                           srels, nrels, sizeof(SMgrSortArray),
    5226                 :             :                                                           rlocator_comparator);
    5227                 :           0 :                 }
    5228                 :             : 
    5229                 :             :                 /* buffer doesn't belong to any of the given relfilelocators; skip it */
    5230         [ +  + ]:      131072 :                 if (srelent == NULL)
    5231                 :      128923 :                         continue;
    5232                 :             : 
    5233                 :             :                 /* Make sure we can handle the pin */
    5234                 :        2149 :                 ReservePrivateRefCountEntry();
    5235                 :        2149 :                 ResourceOwnerEnlarge(CurrentResourceOwner);
    5236                 :             : 
    5237                 :        2149 :                 buf_state = LockBufHdr(bufHdr);
    5238   [ +  -  +  + ]:        2149 :                 if (BufTagMatchesRelFileLocator(&bufHdr->tag, &srelent->rlocator) &&
    5239                 :        2149 :                         (buf_state & (BM_VALID | BM_DIRTY)) == (BM_VALID | BM_DIRTY))
    5240                 :             :                 {
    5241                 :        1726 :                         PinBuffer_Locked(bufHdr);
    5242                 :        1726 :                         FlushUnlockedBuffer(bufHdr, srelent->srel, IOOBJECT_RELATION, IOCONTEXT_NORMAL);
    5243                 :        1726 :                         UnpinBuffer(bufHdr);
    5244                 :        1726 :                 }
    5245                 :             :                 else
    5246                 :         423 :                         UnlockBufHdr(bufHdr);
    5247         [ +  + ]:      131072 :         }
    5248                 :             : 
    5249                 :           8 :         pfree(srels);
    5250                 :           8 : }
    5251                 :             : 
    5252                 :             : /* ---------------------------------------------------------------------
    5253                 :             :  *              RelationCopyStorageUsingBuffer
    5254                 :             :  *
    5255                 :             :  *              Copy fork's data using bufmgr.  Same as RelationCopyStorage but instead
    5256                 :             :  *              of using smgrread and smgrextend this will copy using bufmgr APIs.
    5257                 :             :  *
    5258                 :             :  *              Refer comments atop CreateAndCopyRelationData() for details about
    5259                 :             :  *              'permanent' parameter.
    5260                 :             :  * --------------------------------------------------------------------
    5261                 :             :  */
    5262                 :             : static void
    5263                 :         894 : RelationCopyStorageUsingBuffer(RelFileLocator srclocator,
    5264                 :             :                                                            RelFileLocator dstlocator,
    5265                 :             :                                                            ForkNumber forkNum, bool permanent)
    5266                 :             : {
    5267                 :         894 :         Buffer          srcBuf;
    5268                 :         894 :         Buffer          dstBuf;
    5269                 :         894 :         Page            srcPage;
    5270                 :         894 :         Page            dstPage;
    5271                 :         894 :         bool            use_wal;
    5272                 :         894 :         BlockNumber nblocks;
    5273                 :         894 :         BlockNumber blkno;
    5274                 :         894 :         PGIOAlignedBlock buf;
    5275                 :         894 :         BufferAccessStrategy bstrategy_src;
    5276                 :         894 :         BufferAccessStrategy bstrategy_dst;
    5277                 :         894 :         BlockRangeReadStreamPrivate p;
    5278                 :         894 :         ReadStream *src_stream;
    5279                 :         894 :         SMgrRelation src_smgr;
    5280                 :             : 
    5281                 :             :         /*
    5282                 :             :          * In general, we want to write WAL whenever wal_level > 'minimal', but we
    5283                 :             :          * can skip it when copying any fork of an unlogged relation other than
    5284                 :             :          * the init fork.
    5285                 :             :          */
    5286   [ +  +  +  - ]:        1192 :         use_wal = XLogIsNeeded() && (permanent || forkNum == INIT_FORKNUM);
    5287                 :             : 
    5288                 :             :         /* Get number of blocks in the source relation. */
    5289                 :        1788 :         nblocks = smgrnblocks(smgropen(srclocator, INVALID_PROC_NUMBER),
    5290                 :         894 :                                                   forkNum);
    5291                 :             : 
    5292                 :             :         /* Nothing to copy; just return. */
    5293         [ +  + ]:         894 :         if (nblocks == 0)
    5294                 :         156 :                 return;
    5295                 :             : 
    5296                 :             :         /*
    5297                 :             :          * Bulk extend the destination relation of the same size as the source
    5298                 :             :          * relation before starting to copy block by block.
    5299                 :             :          */
    5300                 :         738 :         memset(buf.data, 0, BLCKSZ);
    5301                 :        1476 :         smgrextend(smgropen(dstlocator, INVALID_PROC_NUMBER), forkNum, nblocks - 1,
    5302                 :         738 :                            buf.data, true);
    5303                 :             : 
    5304                 :             :         /* This is a bulk operation, so use buffer access strategies. */
    5305                 :         738 :         bstrategy_src = GetAccessStrategy(BAS_BULKREAD);
    5306                 :         738 :         bstrategy_dst = GetAccessStrategy(BAS_BULKWRITE);
    5307                 :             : 
    5308                 :             :         /* Initialize streaming read */
    5309                 :         738 :         p.current_blocknum = 0;
    5310                 :         738 :         p.last_exclusive = nblocks;
    5311                 :         738 :         src_smgr = smgropen(srclocator, INVALID_PROC_NUMBER);
    5312                 :             : 
    5313                 :             :         /*
    5314                 :             :          * It is safe to use batchmode as block_range_read_stream_cb takes no
    5315                 :             :          * locks.
    5316                 :             :          */
    5317                 :         738 :         src_stream = read_stream_begin_smgr_relation(READ_STREAM_FULL |
    5318                 :             :                                                                                                  READ_STREAM_USE_BATCHING,
    5319                 :         738 :                                                                                                  bstrategy_src,
    5320                 :         738 :                                                                                                  src_smgr,
    5321                 :         738 :                                                                                                  permanent ? RELPERSISTENCE_PERMANENT : RELPERSISTENCE_UNLOGGED,
    5322                 :         738 :                                                                                                  forkNum,
    5323                 :             :                                                                                                  block_range_read_stream_cb,
    5324                 :             :                                                                                                  &p,
    5325                 :             :                                                                                                  0);
    5326                 :             : 
    5327                 :             :         /* Iterate over each block of the source relation file. */
    5328         [ +  + ]:        3600 :         for (blkno = 0; blkno < nblocks; blkno++)
    5329                 :             :         {
    5330         [ +  - ]:        2862 :                 CHECK_FOR_INTERRUPTS();
    5331                 :             : 
    5332                 :             :                 /* Read block from source relation. */
    5333                 :        2862 :                 srcBuf = read_stream_next_buffer(src_stream, NULL);
    5334                 :        2862 :                 LockBuffer(srcBuf, BUFFER_LOCK_SHARE);
    5335                 :        2862 :                 srcPage = BufferGetPage(srcBuf);
    5336                 :             : 
    5337                 :        5724 :                 dstBuf = ReadBufferWithoutRelcache(dstlocator, forkNum,
    5338                 :        2862 :                                                                                    BufferGetBlockNumber(srcBuf),
    5339                 :        2862 :                                                                                    RBM_ZERO_AND_LOCK, bstrategy_dst,
    5340                 :        2862 :                                                                                    permanent);
    5341                 :        2862 :                 dstPage = BufferGetPage(dstBuf);
    5342                 :             : 
    5343                 :        2862 :                 START_CRIT_SECTION();
    5344                 :             : 
    5345                 :             :                 /* Copy page data from the source to the destination. */
    5346                 :        2862 :                 memcpy(dstPage, srcPage, BLCKSZ);
    5347                 :        2862 :                 MarkBufferDirty(dstBuf);
    5348                 :             : 
    5349                 :             :                 /* WAL-log the copied page. */
    5350         [ +  + ]:        2862 :                 if (use_wal)
    5351                 :         954 :                         log_newpage_buffer(dstBuf, true);
    5352                 :             : 
    5353         [ +  - ]:        2862 :                 END_CRIT_SECTION();
    5354                 :             : 
    5355                 :        2862 :                 UnlockReleaseBuffer(dstBuf);
    5356                 :        2862 :                 UnlockReleaseBuffer(srcBuf);
    5357                 :        2862 :         }
    5358         [ +  - ]:         738 :         Assert(read_stream_next_buffer(src_stream, NULL) == InvalidBuffer);
    5359                 :         738 :         read_stream_end(src_stream);
    5360                 :             : 
    5361                 :         738 :         FreeAccessStrategy(bstrategy_src);
    5362                 :         738 :         FreeAccessStrategy(bstrategy_dst);
    5363         [ -  + ]:         894 : }
    5364                 :             : 
    5365                 :             : /* ---------------------------------------------------------------------
    5366                 :             :  *              CreateAndCopyRelationData
    5367                 :             :  *
    5368                 :             :  *              Create destination relation storage and copy all forks from the
    5369                 :             :  *              source relation to the destination.
    5370                 :             :  *
    5371                 :             :  *              Pass permanent as true for permanent relations and false for
    5372                 :             :  *              unlogged relations.  Currently this API is not supported for
    5373                 :             :  *              temporary relations.
    5374                 :             :  * --------------------------------------------------------------------
    5375                 :             :  */
    5376                 :             : void
    5377                 :         672 : CreateAndCopyRelationData(RelFileLocator src_rlocator,
    5378                 :             :                                                   RelFileLocator dst_rlocator, bool permanent)
    5379                 :             : {
    5380                 :         672 :         char            relpersistence;
    5381                 :         672 :         SMgrRelation src_rel;
    5382                 :         672 :         SMgrRelation dst_rel;
    5383                 :             : 
    5384                 :             :         /* Set the relpersistence. */
    5385                 :         672 :         relpersistence = permanent ?
    5386                 :             :                 RELPERSISTENCE_PERMANENT : RELPERSISTENCE_UNLOGGED;
    5387                 :             : 
    5388                 :         672 :         src_rel = smgropen(src_rlocator, INVALID_PROC_NUMBER);
    5389                 :         672 :         dst_rel = smgropen(dst_rlocator, INVALID_PROC_NUMBER);
    5390                 :             : 
    5391                 :             :         /*
    5392                 :             :          * Create and copy all forks of the relation.  During create database we
    5393                 :             :          * have a separate cleanup mechanism which deletes complete database
    5394                 :             :          * directory.  Therefore, each individual relation doesn't need to be
    5395                 :             :          * registered for cleanup.
    5396                 :             :          */
    5397                 :         672 :         RelationCreateStorage(dst_rlocator, relpersistence, false);
    5398                 :             : 
    5399                 :             :         /* copy main fork. */
    5400                 :         672 :         RelationCopyStorageUsingBuffer(src_rlocator, dst_rlocator, MAIN_FORKNUM,
    5401                 :         672 :                                                                    permanent);
    5402                 :             : 
    5403                 :             :         /* copy those extra forks that exist */
    5404         [ +  + ]:        2688 :         for (ForkNumber forkNum = MAIN_FORKNUM + 1;
    5405                 :        2688 :                  forkNum <= MAX_FORKNUM; forkNum++)
    5406                 :             :         {
    5407         [ +  + ]:        2016 :                 if (smgrexists(src_rel, forkNum))
    5408                 :             :                 {
    5409                 :         222 :                         smgrcreate(dst_rel, forkNum, false);
    5410                 :             : 
    5411                 :             :                         /*
    5412                 :             :                          * WAL log creation if the relation is persistent, or this is the
    5413                 :             :                          * init fork of an unlogged relation.
    5414                 :             :                          */
    5415   [ -  +  #  # ]:         222 :                         if (permanent || forkNum == INIT_FORKNUM)
    5416                 :         222 :                                 log_smgrcreate(&dst_rlocator, forkNum);
    5417                 :             : 
    5418                 :             :                         /* Copy a fork's data, block by block. */
    5419                 :         444 :                         RelationCopyStorageUsingBuffer(src_rlocator, dst_rlocator, forkNum,
    5420                 :         222 :                                                                                    permanent);
    5421                 :         222 :                 }
    5422                 :        2016 :         }
    5423                 :         672 : }
    5424                 :             : 
    5425                 :             : /* ---------------------------------------------------------------------
    5426                 :             :  *              FlushDatabaseBuffers
    5427                 :             :  *
    5428                 :             :  *              This function writes all dirty pages of a database out to disk
    5429                 :             :  *              (or more accurately, out to kernel disk buffers), ensuring that the
    5430                 :             :  *              kernel has an up-to-date view of the database.
    5431                 :             :  *
    5432                 :             :  *              Generally, the caller should be holding an appropriate lock to ensure
    5433                 :             :  *              no other backend is active in the target database; otherwise more
    5434                 :             :  *              pages could get dirtied.
    5435                 :             :  *
    5436                 :             :  *              Note we don't worry about flushing any pages of temporary relations.
    5437                 :             :  *              It's assumed these wouldn't be interesting.
    5438                 :             :  * --------------------------------------------------------------------
    5439                 :             :  */
    5440                 :             : void
    5441                 :           0 : FlushDatabaseBuffers(Oid dbid)
    5442                 :             : {
    5443                 :           0 :         int                     i;
    5444                 :           0 :         BufferDesc *bufHdr;
    5445                 :             : 
    5446         [ #  # ]:           0 :         for (i = 0; i < NBuffers; i++)
    5447                 :             :         {
    5448                 :           0 :                 uint64          buf_state;
    5449                 :             : 
    5450                 :           0 :                 bufHdr = GetBufferDescriptor(i);
    5451                 :             : 
    5452                 :             :                 /*
    5453                 :             :                  * As in DropRelationBuffers, an unlocked precheck should be safe and
    5454                 :             :                  * saves some cycles.
    5455                 :             :                  */
    5456         [ #  # ]:           0 :                 if (bufHdr->tag.dbOid != dbid)
    5457                 :           0 :                         continue;
    5458                 :             : 
    5459                 :             :                 /* Make sure we can handle the pin */
    5460                 :           0 :                 ReservePrivateRefCountEntry();
    5461                 :           0 :                 ResourceOwnerEnlarge(CurrentResourceOwner);
    5462                 :             : 
    5463                 :           0 :                 buf_state = LockBufHdr(bufHdr);
    5464   [ #  #  #  # ]:           0 :                 if (bufHdr->tag.dbOid == dbid &&
    5465                 :           0 :                         (buf_state & (BM_VALID | BM_DIRTY)) == (BM_VALID | BM_DIRTY))
    5466                 :             :                 {
    5467                 :           0 :                         PinBuffer_Locked(bufHdr);
    5468                 :           0 :                         FlushUnlockedBuffer(bufHdr, NULL, IOOBJECT_RELATION, IOCONTEXT_NORMAL);
    5469                 :           0 :                         UnpinBuffer(bufHdr);
    5470                 :           0 :                 }
    5471                 :             :                 else
    5472                 :           0 :                         UnlockBufHdr(bufHdr);
    5473      [ #  #  # ]:           0 :         }
    5474                 :           0 : }
    5475                 :             : 
    5476                 :             : /*
    5477                 :             :  * Flush a previously, shared or exclusively, locked and pinned buffer to the
    5478                 :             :  * OS.
    5479                 :             :  */
    5480                 :             : void
    5481                 :           0 : FlushOneBuffer(Buffer buffer)
    5482                 :             : {
    5483                 :           0 :         BufferDesc *bufHdr;
    5484                 :             : 
    5485                 :             :         /* currently not needed, but no fundamental reason not to support */
    5486         [ #  # ]:           0 :         Assert(!BufferIsLocal(buffer));
    5487                 :             : 
    5488   [ #  #  #  #  :           0 :         Assert(BufferIsPinned(buffer));
                   #  # ]
    5489                 :             : 
    5490                 :           0 :         bufHdr = GetBufferDescriptor(buffer - 1);
    5491                 :             : 
    5492         [ #  # ]:           0 :         Assert(BufferIsLockedByMe(buffer));
    5493                 :             : 
    5494                 :           0 :         FlushBuffer(bufHdr, NULL, IOOBJECT_RELATION, IOCONTEXT_NORMAL);
    5495                 :           0 : }
    5496                 :             : 
    5497                 :             : /*
    5498                 :             :  * ReleaseBuffer -- release the pin on a buffer
    5499                 :             :  */
    5500                 :             : void
    5501                 :    12195514 : ReleaseBuffer(Buffer buffer)
    5502                 :             : {
    5503         [ +  - ]:    12195514 :         if (!BufferIsValid(buffer))
    5504   [ #  #  #  # ]:           0 :                 elog(ERROR, "bad buffer ID: %d", buffer);
    5505                 :             : 
    5506         [ +  + ]:    12195514 :         if (BufferIsLocal(buffer))
    5507                 :      460412 :                 UnpinLocalBuffer(buffer);
    5508                 :             :         else
    5509                 :    11735102 :                 UnpinBuffer(GetBufferDescriptor(buffer - 1));
    5510                 :    12195514 : }
    5511                 :             : 
    5512                 :             : /*
    5513                 :             :  * UnlockReleaseBuffer -- release the content lock and pin on a buffer
    5514                 :             :  *
    5515                 :             :  * This is just a shorthand for a common combination.
    5516                 :             :  */
    5517                 :             : void
    5518                 :     2621155 : UnlockReleaseBuffer(Buffer buffer)
    5519                 :             : {
    5520                 :     2621155 :         LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
    5521                 :     2621155 :         ReleaseBuffer(buffer);
    5522                 :     2621155 : }
    5523                 :             : 
    5524                 :             : /*
    5525                 :             :  * IncrBufferRefCount
    5526                 :             :  *              Increment the pin count on a buffer that we have *already* pinned
    5527                 :             :  *              at least once.
    5528                 :             :  *
    5529                 :             :  *              This function cannot be used on a buffer we do not have pinned,
    5530                 :             :  *              because it doesn't change the shared buffer state.
    5531                 :             :  */
    5532                 :             : void
    5533                 :     2413916 : IncrBufferRefCount(Buffer buffer)
    5534                 :             : {
    5535   [ -  +  #  #  :     2413916 :         Assert(BufferIsPinned(buffer));
                   +  + ]
    5536                 :     2413916 :         ResourceOwnerEnlarge(CurrentResourceOwner);
    5537         [ +  + ]:     2413916 :         if (BufferIsLocal(buffer))
    5538                 :      114637 :                 LocalRefCount[-buffer - 1]++;
    5539                 :             :         else
    5540                 :             :         {
    5541                 :     2299279 :                 PrivateRefCountEntry *ref;
    5542                 :             : 
    5543                 :     2299279 :                 ref = GetPrivateRefCountEntry(buffer, true);
    5544         [ +  - ]:     2299279 :                 Assert(ref != NULL);
    5545                 :     2299279 :                 ref->data.refcount++;
    5546                 :     2299279 :         }
    5547                 :     2413916 :         ResourceOwnerRememberBuffer(CurrentResourceOwner, buffer);
    5548                 :     2413916 : }
    5549                 :             : 
    5550                 :             : /*
    5551                 :             :  * MarkBufferDirtyHint
    5552                 :             :  *
    5553                 :             :  *      Mark a buffer dirty for non-critical changes.
    5554                 :             :  *
    5555                 :             :  * This is essentially the same as MarkBufferDirty, except:
    5556                 :             :  *
    5557                 :             :  * 1. The caller does not write WAL; so if checksums are enabled, we may need
    5558                 :             :  *        to write an XLOG_FPI_FOR_HINT WAL record to protect against torn pages.
    5559                 :             :  * 2. The caller might have only share-lock instead of exclusive-lock on the
    5560                 :             :  *        buffer's content lock.
    5561                 :             :  * 3. This function does not guarantee that the buffer is always marked dirty
    5562                 :             :  *        (due to a race condition), so it cannot be used for important changes.
    5563                 :             :  */
    5564                 :             : void
    5565                 :     2059388 : MarkBufferDirtyHint(Buffer buffer, bool buffer_std)
    5566                 :             : {
    5567                 :     2059388 :         BufferDesc *bufHdr;
    5568                 :     2059388 :         Page            page = BufferGetPage(buffer);
    5569                 :             : 
    5570         [ +  - ]:     2059388 :         if (!BufferIsValid(buffer))
    5571   [ #  #  #  # ]:           0 :                 elog(ERROR, "bad buffer ID: %d", buffer);
    5572                 :             : 
    5573         [ +  + ]:     2059388 :         if (BufferIsLocal(buffer))
    5574                 :             :         {
    5575                 :      197225 :                 MarkLocalBufferDirty(buffer);
    5576                 :      197225 :                 return;
    5577                 :             :         }
    5578                 :             : 
    5579                 :     1862163 :         bufHdr = GetBufferDescriptor(buffer - 1);
    5580                 :             : 
    5581         [ +  - ]:     1862163 :         Assert(GetPrivateRefCount(buffer) > 0);
    5582                 :             :         /* here, either share or exclusive lock is OK */
    5583         [ +  - ]:     1862163 :         Assert(BufferIsLockedByMe(buffer));
    5584                 :             : 
    5585                 :             :         /*
    5586                 :             :          * This routine might get called many times on the same page, if we are
    5587                 :             :          * making the first scan after commit of an xact that added/deleted many
    5588                 :             :          * tuples. So, be as quick as we can if the buffer is already dirty.  We
    5589                 :             :          * do this by not acquiring spinlock if it looks like the status bits are
    5590                 :             :          * already set.  Since we make this test unlocked, there's a chance we
    5591                 :             :          * might fail to notice that the flags have just been cleared, and failed
    5592                 :             :          * to reset them, due to memory-ordering issues.  But since this function
    5593                 :             :          * is only intended to be used in cases where failing to write out the
    5594                 :             :          * data would be harmless anyway, it doesn't really matter.
    5595                 :             :          */
    5596         [ +  + ]:     1862163 :         if ((pg_atomic_read_u64(&bufHdr->state) & (BM_DIRTY | BM_JUST_DIRTIED)) !=
    5597                 :             :                 (BM_DIRTY | BM_JUST_DIRTIED))
    5598                 :             :         {
    5599                 :        4953 :                 XLogRecPtr      lsn = InvalidXLogRecPtr;
    5600                 :        4953 :                 bool            dirtied = false;
    5601                 :        4953 :                 bool            delayChkptFlags = false;
    5602                 :        4953 :                 uint64          buf_state;
    5603                 :             : 
    5604                 :             :                 /*
    5605                 :             :                  * If we need to protect hint bit updates from torn writes, WAL-log a
    5606                 :             :                  * full page image of the page. This full page image is only necessary
    5607                 :             :                  * if the hint bit update is the first change to the page since the
    5608                 :             :                  * last checkpoint.
    5609                 :             :                  *
    5610                 :             :                  * We don't check full_page_writes here because that logic is included
    5611                 :             :                  * when we call XLogInsert() since the value changes dynamically.
    5612                 :             :                  */
    5613   [ -  +  +  + ]:        4953 :                 if (XLogHintBitIsNeeded() &&
    5614                 :        4953 :                         (pg_atomic_read_u64(&bufHdr->state) & BM_PERMANENT))
    5615                 :             :                 {
    5616                 :             :                         /*
    5617                 :             :                          * If we must not write WAL, due to a relfilelocator-specific
    5618                 :             :                          * condition or being in recovery, don't dirty the page.  We can
    5619                 :             :                          * set the hint, just not dirty the page as a result so the hint
    5620                 :             :                          * is lost when we evict the page or shutdown.
    5621                 :             :                          *
    5622                 :             :                          * See src/backend/storage/page/README for longer discussion.
    5623                 :             :                          */
    5624   [ +  -  +  + ]:        4950 :                         if (RecoveryInProgress() ||
    5625                 :        4950 :                                 RelFileLocatorSkippingWAL(BufTagGetRelFileLocator(&bufHdr->tag)))
    5626                 :        1637 :                                 return;
    5627                 :             : 
    5628                 :             :                         /*
    5629                 :             :                          * If the block is already dirty because we either made a change
    5630                 :             :                          * or set a hint already, then we don't need to write a full page
    5631                 :             :                          * image.  Note that aggressive cleaning of blocks dirtied by hint
    5632                 :             :                          * bit setting would increase the call rate. Bulk setting of hint
    5633                 :             :                          * bits would reduce the call rate...
    5634                 :             :                          *
    5635                 :             :                          * We must issue the WAL record before we mark the buffer dirty.
    5636                 :             :                          * Otherwise we might write the page before we write the WAL. That
    5637                 :             :                          * causes a race condition, since a checkpoint might occur between
    5638                 :             :                          * writing the WAL record and marking the buffer dirty. We solve
    5639                 :             :                          * that with a kluge, but one that is already in use during
    5640                 :             :                          * transaction commit to prevent race conditions. Basically, we
    5641                 :             :                          * simply prevent the checkpoint WAL record from being written
    5642                 :             :                          * until we have marked the buffer dirty. We don't start the
    5643                 :             :                          * checkpoint flush until we have marked dirty, so our checkpoint
    5644                 :             :                          * must flush the change to disk successfully or the checkpoint
    5645                 :             :                          * never gets written, so crash recovery will fix.
    5646                 :             :                          *
    5647                 :             :                          * It's possible we may enter here without an xid, so it is
    5648                 :             :                          * essential that CreateCheckPoint waits for virtual transactions
    5649                 :             :                          * rather than full transactionids.
    5650                 :             :                          */
    5651         [ +  - ]:        3313 :                         Assert((MyProc->delayChkptFlags & DELAY_CHKPT_START) == 0);
    5652                 :        3313 :                         MyProc->delayChkptFlags |= DELAY_CHKPT_START;
    5653                 :        3313 :                         delayChkptFlags = true;
    5654                 :        3313 :                         lsn = XLogSaveBufferForHint(buffer, buffer_std);
    5655                 :        3313 :                 }
    5656                 :             : 
    5657                 :        3316 :                 buf_state = LockBufHdr(bufHdr);
    5658                 :             : 
    5659         [ +  - ]:        3316 :                 Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
    5660                 :             : 
    5661         [ -  + ]:        3316 :                 if (!(buf_state & BM_DIRTY))
    5662                 :             :                 {
    5663                 :        3316 :                         dirtied = true;         /* Means "will be dirtied by this action" */
    5664                 :             : 
    5665                 :             :                         /*
    5666                 :             :                          * Set the page LSN if we wrote a backup block. We aren't supposed
    5667                 :             :                          * to set this when only holding a share lock but as long as we
    5668                 :             :                          * serialise it somehow we're OK. We choose to set LSN while
    5669                 :             :                          * holding the buffer header lock, which causes any reader of an
    5670                 :             :                          * LSN who holds only a share lock to also obtain a buffer header
    5671                 :             :                          * lock before using PageGetLSN(), which is enforced in
    5672                 :             :                          * BufferGetLSNAtomic().
    5673                 :             :                          *
    5674                 :             :                          * If checksums are enabled, you might think we should reset the
    5675                 :             :                          * checksum here. That will happen when the page is written
    5676                 :             :                          * sometime later in this checkpoint cycle.
    5677                 :             :                          */
    5678         [ +  + ]:        3316 :                         if (XLogRecPtrIsValid(lsn))
    5679                 :        3282 :                                 PageSetLSN(page, lsn);
    5680                 :        3316 :                 }
    5681                 :             : 
    5682                 :        3316 :                 UnlockBufHdrExt(bufHdr, buf_state,
    5683                 :             :                                                 BM_DIRTY | BM_JUST_DIRTIED,
    5684                 :             :                                                 0, 0);
    5685                 :             : 
    5686         [ +  + ]:        3316 :                 if (delayChkptFlags)
    5687                 :        3313 :                         MyProc->delayChkptFlags &= ~DELAY_CHKPT_START;
    5688                 :             : 
    5689         [ -  + ]:        3316 :                 if (dirtied)
    5690                 :             :                 {
    5691                 :        3316 :                         pgBufferUsage.shared_blks_dirtied++;
    5692         [ +  - ]:        3316 :                         if (VacuumCostActive)
    5693                 :           0 :                                 VacuumCostBalance += VacuumCostPageDirty;
    5694                 :        3316 :                 }
    5695         [ +  + ]:        4953 :         }
    5696         [ -  + ]:     2059388 : }
    5697                 :             : 
    5698                 :             : /*
    5699                 :             :  * Release buffer content locks for shared buffers.
    5700                 :             :  *
    5701                 :             :  * Used to clean up after errors.
    5702                 :             :  *
    5703                 :             :  * Currently, we can expect that resource owner cleanup, via
    5704                 :             :  * ResOwnerReleaseBufferPin(), took care of releasing buffer content locks per
    5705                 :             :  * se; the only thing we need to deal with here is clearing any PIN_COUNT
    5706                 :             :  * request that was in progress.
    5707                 :             :  */
    5708                 :             : void
    5709                 :        9005 : UnlockBuffers(void)
    5710                 :             : {
    5711                 :        9005 :         BufferDesc *buf = PinCountWaitBuf;
    5712                 :             : 
    5713         [ +  - ]:        9005 :         if (buf)
    5714                 :             :         {
    5715                 :           0 :                 uint64          buf_state;
    5716                 :           0 :                 uint64          unset_bits = 0;
    5717                 :             : 
    5718                 :           0 :                 buf_state = LockBufHdr(buf);
    5719                 :             : 
    5720                 :             :                 /*
    5721                 :             :                  * Don't complain if flag bit not set; it could have been reset but we
    5722                 :             :                  * got a cancel/die interrupt before getting the signal.
    5723                 :             :                  */
    5724   [ #  #  #  # ]:           0 :                 if ((buf_state & BM_PIN_COUNT_WAITER) != 0 &&
    5725                 :           0 :                         buf->wait_backend_pgprocno == MyProcNumber)
    5726                 :           0 :                         unset_bits = BM_PIN_COUNT_WAITER;
    5727                 :             : 
    5728                 :           0 :                 UnlockBufHdrExt(buf, buf_state,
    5729                 :           0 :                                                 0, unset_bits,
    5730                 :             :                                                 0);
    5731                 :             : 
    5732                 :           0 :                 PinCountWaitBuf = NULL;
    5733                 :           0 :         }
    5734                 :        9005 : }
    5735                 :             : 
    5736                 :             : /*
    5737                 :             :  * Acquire the buffer content lock in the specified mode
    5738                 :             :  *
    5739                 :             :  * If the lock is not available, sleep until it is.
    5740                 :             :  *
    5741                 :             :  * Side effect: cancel/die interrupts are held off until lock release.
    5742                 :             :  *
    5743                 :             :  * This uses almost the same locking approach as lwlock.c's
    5744                 :             :  * LWLockAcquire(). See documentation at the top of lwlock.c for a more
    5745                 :             :  * detailed discussion.
    5746                 :             :  *
    5747                 :             :  * The reason that this, and most of the other BufferLock* functions, get both
    5748                 :             :  * the Buffer and BufferDesc* as parameters, is that looking up one from the
    5749                 :             :  * other repeatedly shows up noticeably in profiles.
    5750                 :             :  *
    5751                 :             :  * Callers should provide a constant for mode, for more efficient code
    5752                 :             :  * generation.
    5753                 :             :  */
    5754                 :             : static inline void
    5755                 :    15028952 : BufferLockAcquire(Buffer buffer, BufferDesc *buf_hdr, BufferLockMode mode)
    5756                 :             : {
    5757                 :    15028952 :         PrivateRefCountEntry *entry;
    5758                 :    15028952 :         int                     extraWaits = 0;
    5759                 :             : 
    5760                 :             :         /*
    5761                 :             :          * Get reference to the refcount entry before we hold the lock, it seems
    5762                 :             :          * better to do before holding the lock.
    5763                 :             :          */
    5764                 :    15028952 :         entry = GetPrivateRefCountEntry(buffer, true);
    5765                 :             : 
    5766                 :             :         /*
    5767                 :             :          * We better not already hold a lock on the buffer.
    5768                 :             :          */
    5769         [ +  - ]:    15028952 :         Assert(entry->data.lockmode == BUFFER_LOCK_UNLOCK);
    5770                 :             : 
    5771                 :             :         /*
    5772                 :             :          * Lock out cancel/die interrupts until we exit the code section protected
    5773                 :             :          * by the content lock.  This ensures that interrupts will not interfere
    5774                 :             :          * with manipulations of data structures in shared memory.
    5775                 :             :          */
    5776                 :    15028952 :         HOLD_INTERRUPTS();
    5777                 :             : 
    5778                 :    15030750 :         for (;;)
    5779                 :             :         {
    5780                 :    15030750 :                 uint32          wait_event = 0; /* initialized to avoid compiler warning */
    5781                 :    15030750 :                 bool            mustwait;
    5782                 :             : 
    5783                 :             :                 /*
    5784                 :             :                  * Try to grab the lock the first time, we're not in the waitqueue
    5785                 :             :                  * yet/anymore.
    5786                 :             :                  */
    5787                 :    15030750 :                 mustwait = BufferLockAttempt(buf_hdr, mode);
    5788                 :             : 
    5789         [ +  + ]:    15030750 :                 if (likely(!mustwait))
    5790                 :             :                 {
    5791                 :    15028808 :                         break;
    5792                 :             :                 }
    5793                 :             : 
    5794                 :             :                 /*
    5795                 :             :                  * Ok, at this point we couldn't grab the lock on the first try. We
    5796                 :             :                  * cannot simply queue ourselves to the end of the list and wait to be
    5797                 :             :                  * woken up because by now the lock could long have been released.
    5798                 :             :                  * Instead add us to the queue and try to grab the lock again. If we
    5799                 :             :                  * succeed we need to revert the queuing and be happy, otherwise we
    5800                 :             :                  * recheck the lock. If we still couldn't grab it, we know that the
    5801                 :             :                  * other locker will see our queue entries when releasing since they
    5802                 :             :                  * existed before we checked for the lock.
    5803                 :             :                  */
    5804                 :             : 
    5805                 :             :                 /* add to the queue */
    5806                 :        1942 :                 BufferLockQueueSelf(buf_hdr, mode);
    5807                 :             : 
    5808                 :             :                 /* we're now guaranteed to be woken up if necessary */
    5809                 :        1942 :                 mustwait = BufferLockAttempt(buf_hdr, mode);
    5810                 :             : 
    5811                 :             :                 /* ok, grabbed the lock the second time round, need to undo queueing */
    5812         [ +  + ]:        1942 :                 if (!mustwait)
    5813                 :             :                 {
    5814                 :         144 :                         BufferLockDequeueSelf(buf_hdr);
    5815                 :         144 :                         break;
    5816                 :             :                 }
    5817                 :             : 
    5818   [ -  +  -  -  :        1798 :                 switch (mode)
                      + ]
    5819                 :             :                 {
    5820                 :             :                         case BUFFER_LOCK_EXCLUSIVE:
    5821                 :        1093 :                                 wait_event = WAIT_EVENT_BUFFER_EXCLUSIVE;
    5822                 :        1093 :                                 break;
    5823                 :             :                         case BUFFER_LOCK_SHARE_EXCLUSIVE:
    5824                 :           0 :                                 wait_event = WAIT_EVENT_BUFFER_SHARE_EXCLUSIVE;
    5825                 :           0 :                                 break;
    5826                 :             :                         case BUFFER_LOCK_SHARE:
    5827                 :         705 :                                 wait_event = WAIT_EVENT_BUFFER_SHARED;
    5828                 :         705 :                                 break;
    5829                 :             :                         case BUFFER_LOCK_UNLOCK:
    5830                 :           0 :                                 pg_unreachable();
    5831                 :             : 
    5832                 :             :                 }
    5833                 :        1798 :                 pgstat_report_wait_start(wait_event);
    5834                 :             : 
    5835                 :             :                 /*
    5836                 :             :                  * Wait until awakened.
    5837                 :             :                  *
    5838                 :             :                  * It is possible that we get awakened for a reason other than being
    5839                 :             :                  * signaled by BufferLockWakeup().  If so, loop back and wait again.
    5840                 :             :                  * Once we've gotten the lock, re-increment the sema by the number of
    5841                 :             :                  * additional signals received.
    5842                 :             :                  */
    5843                 :        1798 :                 for (;;)
    5844                 :             :                 {
    5845                 :        1798 :                         PGSemaphoreLock(MyProc->sem);
    5846         [ -  + ]:        1798 :                         if (MyProc->lwWaiting == LW_WS_NOT_WAITING)
    5847                 :        1798 :                                 break;
    5848                 :           0 :                         extraWaits++;
    5849                 :             :                 }
    5850                 :             : 
    5851                 :        1798 :                 pgstat_report_wait_end();
    5852                 :             : 
    5853                 :             :                 /* Retrying, allow BufferLockRelease to release waiters again. */
    5854                 :        1798 :                 pg_atomic_fetch_and_u64(&buf_hdr->state, ~BM_LOCK_WAKE_IN_PROGRESS);
    5855      [ +  -  + ]:    15030750 :         }
    5856                 :             : 
    5857                 :             :         /* Remember that we now hold this lock */
    5858                 :    15028952 :         entry->data.lockmode = mode;
    5859                 :             : 
    5860                 :             :         /*
    5861                 :             :          * Fix the process wait semaphore's count for any absorbed wakeups.
    5862                 :             :          */
    5863         [ -  + ]:    15028952 :         while (unlikely(extraWaits-- > 0))
    5864                 :           0 :                 PGSemaphoreUnlock(MyProc->sem);
    5865                 :    15028952 : }
    5866                 :             : 
    5867                 :             : /*
    5868                 :             :  * Release a previously acquired buffer content lock.
    5869                 :             :  */
    5870                 :             : static void
    5871                 :    15321271 : BufferLockUnlock(Buffer buffer, BufferDesc *buf_hdr)
    5872                 :             : {
    5873                 :    15321271 :         BufferLockMode mode;
    5874                 :    15321271 :         uint64          oldstate;
    5875                 :    15321271 :         uint64          sub;
    5876                 :             : 
    5877                 :    15321271 :         mode = BufferLockDisownInternal(buffer, buf_hdr);
    5878                 :             : 
    5879                 :             :         /*
    5880                 :             :          * Release my hold on lock, after that it can immediately be acquired by
    5881                 :             :          * others, even if we still have to wakeup other waiters.
    5882                 :             :          */
    5883                 :    15321271 :         sub = BufferLockReleaseSub(mode);
    5884                 :             : 
    5885                 :    15321271 :         oldstate = pg_atomic_sub_fetch_u64(&buf_hdr->state, sub);
    5886                 :             : 
    5887                 :    15321271 :         BufferLockProcessRelease(buf_hdr, mode, oldstate);
    5888                 :             : 
    5889                 :             :         /*
    5890                 :             :          * Now okay to allow cancel/die interrupts.
    5891                 :             :          */
    5892         [ +  - ]:    15321271 :         RESUME_INTERRUPTS();
    5893                 :    15321271 : }
    5894                 :             : 
    5895                 :             : 
    5896                 :             : /*
    5897                 :             :  * Acquire the content lock for the buffer, but only if we don't have to wait.
    5898                 :             :  */
    5899                 :             : static bool
    5900                 :      292325 : BufferLockConditional(Buffer buffer, BufferDesc *buf_hdr, BufferLockMode mode)
    5901                 :             : {
    5902                 :      292325 :         PrivateRefCountEntry *entry = GetPrivateRefCountEntry(buffer, true);
    5903                 :      292325 :         bool            mustwait;
    5904                 :             : 
    5905                 :             :         /*
    5906                 :             :          * We better not already hold a lock on the buffer.
    5907                 :             :          */
    5908         [ +  - ]:      292325 :         Assert(entry->data.lockmode == BUFFER_LOCK_UNLOCK);
    5909                 :             : 
    5910                 :             :         /*
    5911                 :             :          * Lock out cancel/die interrupts until we exit the code section protected
    5912                 :             :          * by the content lock.  This ensures that interrupts will not interfere
    5913                 :             :          * with manipulations of data structures in shared memory.
    5914                 :             :          */
    5915                 :      292325 :         HOLD_INTERRUPTS();
    5916                 :             : 
    5917                 :             :         /* Check for the lock */
    5918                 :      292325 :         mustwait = BufferLockAttempt(buf_hdr, mode);
    5919                 :             : 
    5920         [ +  + ]:      292325 :         if (mustwait)
    5921                 :             :         {
    5922                 :             :                 /* Failed to get lock, so release interrupt holdoff */
    5923         [ +  - ]:           6 :                 RESUME_INTERRUPTS();
    5924                 :           6 :         }
    5925                 :             :         else
    5926                 :             :         {
    5927                 :      292319 :                 entry->data.lockmode = mode;
    5928                 :             :         }
    5929                 :             : 
    5930                 :      584650 :         return !mustwait;
    5931                 :      292325 : }
    5932                 :             : 
    5933                 :             : /*
    5934                 :             :  * Internal function that tries to atomically acquire the content lock in the
    5935                 :             :  * passed in mode.
    5936                 :             :  *
    5937                 :             :  * This function will not block waiting for a lock to become free - that's the
    5938                 :             :  * caller's job.
    5939                 :             :  *
    5940                 :             :  * Similar to LWLockAttemptLock().
    5941                 :             :  */
    5942                 :             : static inline bool
    5943                 :    15325017 : BufferLockAttempt(BufferDesc *buf_hdr, BufferLockMode mode)
    5944                 :             : {
    5945                 :    15325017 :         uint64          old_state;
    5946                 :             : 
    5947                 :             :         /*
    5948                 :             :          * Read once outside the loop, later iterations will get the newer value
    5949                 :             :          * via compare & exchange.
    5950                 :             :          */
    5951                 :    15325017 :         old_state = pg_atomic_read_u64(&buf_hdr->state);
    5952                 :             : 
    5953                 :             :         /* loop until we've determined whether we could acquire the lock or not */
    5954                 :    15333366 :         while (true)
    5955                 :             :         {
    5956                 :    15333366 :                 uint64          desired_state;
    5957                 :    15333366 :                 bool            lock_free;
    5958                 :             : 
    5959                 :    15333366 :                 desired_state = old_state;
    5960                 :             : 
    5961         [ +  + ]:    15333366 :                 if (mode == BUFFER_LOCK_EXCLUSIVE)
    5962                 :             :                 {
    5963                 :     4238184 :                         lock_free = (old_state & BM_LOCK_MASK) == 0;
    5964         [ +  + ]:     4238184 :                         if (lock_free)
    5965                 :     4235819 :                                 desired_state += BM_LOCK_VAL_EXCLUSIVE;
    5966                 :     4238184 :                 }
    5967         [ -  + ]:    11095182 :                 else if (mode == BUFFER_LOCK_SHARE_EXCLUSIVE)
    5968                 :             :                 {
    5969                 :           0 :                         lock_free = (old_state & (BM_LOCK_VAL_EXCLUSIVE | BM_LOCK_VAL_SHARE_EXCLUSIVE)) == 0;
    5970         [ #  # ]:           0 :                         if (lock_free)
    5971                 :           0 :                                 desired_state += BM_LOCK_VAL_SHARE_EXCLUSIVE;
    5972                 :           0 :                 }
    5973                 :             :                 else
    5974                 :             :                 {
    5975                 :    11095182 :                         lock_free = (old_state & BM_LOCK_VAL_EXCLUSIVE) == 0;
    5976         [ +  + ]:    11095182 :                         if (lock_free)
    5977                 :    11093690 :                                 desired_state += BM_LOCK_VAL_SHARED;
    5978                 :             :                 }
    5979                 :             : 
    5980                 :             :                 /*
    5981                 :             :                  * Attempt to swap in the state we are expecting. If we didn't see
    5982                 :             :                  * lock to be free, that's just the old value. If we saw it as free,
    5983                 :             :                  * we'll attempt to mark it acquired. The reason that we always swap
    5984                 :             :                  * in the value is that this doubles as a memory barrier. We could try
    5985                 :             :                  * to be smarter and only swap in values if we saw the lock as free,
    5986                 :             :                  * but benchmark haven't shown it as beneficial so far.
    5987                 :             :                  *
    5988                 :             :                  * Retry if the value changed since we last looked at it.
    5989                 :             :                  */
    5990         [ +  + ]:    15333366 :                 if (likely(pg_atomic_compare_exchange_u64(&buf_hdr->state,
    5991                 :             :                                                                                                   &old_state, desired_state)))
    5992                 :             :                 {
    5993         [ +  + ]:    15325017 :                         if (lock_free)
    5994                 :             :                         {
    5995                 :             :                                 /* Great! Got the lock. */
    5996                 :    15321271 :                                 return false;
    5997                 :             :                         }
    5998                 :             :                         else
    5999                 :        3746 :                                 return true;    /* somebody else has the lock */
    6000                 :             :                 }
    6001         [ +  + ]:    15333366 :         }
    6002                 :             : 
    6003                 :             :         pg_unreachable();
    6004                 :    15325017 : }
    6005                 :             : 
    6006                 :             : /*
    6007                 :             :  * Add ourselves to the end of the content lock's wait queue.
    6008                 :             :  */
    6009                 :             : static void
    6010                 :        1942 : BufferLockQueueSelf(BufferDesc *buf_hdr, BufferLockMode mode)
    6011                 :             : {
    6012                 :             :         /*
    6013                 :             :          * If we don't have a PGPROC structure, there's no way to wait. This
    6014                 :             :          * should never occur, since MyProc should only be null during shared
    6015                 :             :          * memory initialization.
    6016                 :             :          */
    6017         [ +  - ]:        1942 :         if (MyProc == NULL)
    6018   [ #  #  #  # ]:           0 :                 elog(PANIC, "cannot wait without a PGPROC structure");
    6019                 :             : 
    6020         [ +  - ]:        1942 :         if (MyProc->lwWaiting != LW_WS_NOT_WAITING)
    6021   [ #  #  #  # ]:           0 :                 elog(PANIC, "queueing for lock while waiting on another one");
    6022                 :             : 
    6023                 :        1942 :         LockBufHdr(buf_hdr);
    6024                 :             : 
    6025                 :             :         /* setting the flag is protected by the spinlock */
    6026                 :        1942 :         pg_atomic_fetch_or_u64(&buf_hdr->state, BM_LOCK_HAS_WAITERS);
    6027                 :             : 
    6028                 :             :         /*
    6029                 :             :          * These are currently used both for lwlocks and buffer content locks,
    6030                 :             :          * which is acceptable, although not pretty, because a backend can't wait
    6031                 :             :          * for both types of locks at the same time.
    6032                 :             :          */
    6033                 :        1942 :         MyProc->lwWaiting = LW_WS_WAITING;
    6034                 :        1942 :         MyProc->lwWaitMode = mode;
    6035                 :             : 
    6036                 :        1942 :         proclist_push_tail(&buf_hdr->lock_waiters, MyProcNumber, lwWaitLink);
    6037                 :             : 
    6038                 :             :         /* Can release the mutex now */
    6039                 :        1942 :         UnlockBufHdr(buf_hdr);
    6040                 :        1942 : }
    6041                 :             : 
    6042                 :             : /*
    6043                 :             :  * Remove ourselves from the waitlist.
    6044                 :             :  *
    6045                 :             :  * This is used if we queued ourselves because we thought we needed to sleep
    6046                 :             :  * but, after further checking, we discovered that we don't actually need to
    6047                 :             :  * do so.
    6048                 :             :  */
    6049                 :             : static void
    6050                 :         144 : BufferLockDequeueSelf(BufferDesc *buf_hdr)
    6051                 :             : {
    6052                 :         144 :         bool            on_waitlist;
    6053                 :             : 
    6054                 :         144 :         LockBufHdr(buf_hdr);
    6055                 :             : 
    6056                 :         144 :         on_waitlist = MyProc->lwWaiting == LW_WS_WAITING;
    6057         [ +  + ]:         144 :         if (on_waitlist)
    6058                 :         121 :                 proclist_delete(&buf_hdr->lock_waiters, MyProcNumber, lwWaitLink);
    6059                 :             : 
    6060   [ +  +  +  + ]:         144 :         if (proclist_is_empty(&buf_hdr->lock_waiters) &&
    6061                 :         142 :                 (pg_atomic_read_u64(&buf_hdr->state) & BM_LOCK_HAS_WAITERS) != 0)
    6062                 :             :         {
    6063                 :         119 :                 pg_atomic_fetch_and_u64(&buf_hdr->state, ~BM_LOCK_HAS_WAITERS);
    6064                 :         119 :         }
    6065                 :             : 
    6066                 :             :         /* XXX: combine with fetch_and above? */
    6067                 :         144 :         UnlockBufHdr(buf_hdr);
    6068                 :             : 
    6069                 :             :         /* clear waiting state again, nice for debugging */
    6070         [ +  + ]:         144 :         if (on_waitlist)
    6071                 :         121 :                 MyProc->lwWaiting = LW_WS_NOT_WAITING;
    6072                 :             :         else
    6073                 :             :         {
    6074                 :          23 :                 int                     extraWaits = 0;
    6075                 :             : 
    6076                 :             : 
    6077                 :             :                 /*
    6078                 :             :                  * Somebody else dequeued us and has or will wake us up. Deal with the
    6079                 :             :                  * superfluous absorption of a wakeup.
    6080                 :             :                  */
    6081                 :             : 
    6082                 :             :                 /*
    6083                 :             :                  * Clear BM_LOCK_WAKE_IN_PROGRESS if somebody woke us before we
    6084                 :             :                  * removed ourselves - they'll have set it.
    6085                 :             :                  */
    6086                 :          23 :                 pg_atomic_fetch_and_u64(&buf_hdr->state, ~BM_LOCK_WAKE_IN_PROGRESS);
    6087                 :             : 
    6088                 :             :                 /*
    6089                 :             :                  * Now wait for the scheduled wakeup, otherwise our ->lwWaiting would
    6090                 :             :                  * get reset at some inconvenient point later. Most of the time this
    6091                 :             :                  * will immediately return.
    6092                 :             :                  */
    6093                 :          23 :                 for (;;)
    6094                 :             :                 {
    6095                 :          23 :                         PGSemaphoreLock(MyProc->sem);
    6096         [ +  - ]:          23 :                         if (MyProc->lwWaiting == LW_WS_NOT_WAITING)
    6097                 :          23 :                                 break;
    6098                 :           0 :                         extraWaits++;
    6099                 :             :                 }
    6100                 :             : 
    6101                 :             :                 /*
    6102                 :             :                  * Fix the process wait semaphore's count for any absorbed wakeups.
    6103                 :             :                  */
    6104         [ -  + ]:          23 :                 while (extraWaits-- > 0)
    6105                 :           0 :                         PGSemaphoreUnlock(MyProc->sem);
    6106                 :          23 :         }
    6107                 :         144 : }
    6108                 :             : 
    6109                 :             : /*
    6110                 :             :  * Stop treating lock as held by current backend.
    6111                 :             :  *
    6112                 :             :  * After calling this function it's the callers responsibility to ensure that
    6113                 :             :  * the lock gets released, even in case of an error. This only is desirable if
    6114                 :             :  * the lock is going to be released in a different process than the process
    6115                 :             :  * that acquired it.
    6116                 :             :  */
    6117                 :             : static inline void
    6118                 :           0 : BufferLockDisown(Buffer buffer, BufferDesc *buf_hdr)
    6119                 :             : {
    6120                 :           0 :         BufferLockDisownInternal(buffer, buf_hdr);
    6121         [ #  # ]:           0 :         RESUME_INTERRUPTS();
    6122                 :           0 : }
    6123                 :             : 
    6124                 :             : /*
    6125                 :             :  * Stop treating lock as held by current backend.
    6126                 :             :  *
    6127                 :             :  * This is the code that can be shared between actually releasing a lock
    6128                 :             :  * (BufferLockUnlock()) and just not tracking ownership of the lock anymore
    6129                 :             :  * without releasing the lock (BufferLockDisown()).
    6130                 :             :  */
    6131                 :             : static inline int
    6132                 :    15321271 : BufferLockDisownInternal(Buffer buffer, BufferDesc *buf_hdr)
    6133                 :             : {
    6134                 :    15321271 :         BufferLockMode mode;
    6135                 :    15321271 :         PrivateRefCountEntry *ref;
    6136                 :             : 
    6137                 :    15321271 :         ref = GetPrivateRefCountEntry(buffer, false);
    6138         [ +  - ]:    15321271 :         if (ref == NULL)
    6139   [ #  #  #  # ]:           0 :                 elog(ERROR, "lock %d is not held", buffer);
    6140                 :    15321271 :         mode = ref->data.lockmode;
    6141                 :    15321271 :         ref->data.lockmode = BUFFER_LOCK_UNLOCK;
    6142                 :             : 
    6143                 :    30642542 :         return mode;
    6144                 :    15321271 : }
    6145                 :             : 
    6146                 :             : /*
    6147                 :             :  * Wakeup all the lockers that currently have a chance to acquire the lock.
    6148                 :             :  *
    6149                 :             :  * wake_exclusive indicates whether exclusive lock waiters should be woken up.
    6150                 :             :  */
    6151                 :             : static void
    6152                 :        1814 : BufferLockWakeup(BufferDesc *buf_hdr, bool wake_exclusive)
    6153                 :             : {
    6154                 :        1814 :         bool            new_wake_in_progress = false;
    6155                 :        1814 :         bool            wake_share_exclusive = true;
    6156                 :        1814 :         proclist_head wakeup;
    6157                 :        1814 :         proclist_mutable_iter iter;
    6158                 :             : 
    6159                 :        1814 :         proclist_init(&wakeup);
    6160                 :             : 
    6161                 :             :         /* lock wait list while collecting backends to wake up */
    6162                 :        1814 :         LockBufHdr(buf_hdr);
    6163                 :             : 
    6164   [ +  +  +  +  :        2557 :         proclist_foreach_modify(iter, &buf_hdr->lock_waiters, lwWaitLink)
                   +  + ]
    6165                 :             :         {
    6166                 :        1846 :                 PGPROC     *waiter = GetPGProcByNumber(iter.cur);
    6167                 :             : 
    6168                 :             :                 /*
    6169                 :             :                  * Already woke up a conflicting lock, so skip over this wait list
    6170                 :             :                  * entry.
    6171                 :             :                  */
    6172   [ +  +  +  + ]:        1846 :                 if (!wake_exclusive && waiter->lwWaitMode == BUFFER_LOCK_EXCLUSIVE)
    6173                 :          26 :                         continue;
    6174   [ -  +  #  # ]:        1820 :                 if (!wake_share_exclusive && waiter->lwWaitMode == BUFFER_LOCK_SHARE_EXCLUSIVE)
    6175                 :           0 :                         continue;
    6176                 :             : 
    6177                 :        1820 :                 proclist_delete(&buf_hdr->lock_waiters, iter.cur, lwWaitLink);
    6178                 :        1820 :                 proclist_push_tail(&wakeup, iter.cur, lwWaitLink);
    6179                 :             : 
    6180                 :             :                 /*
    6181                 :             :                  * Prevent additional wakeups until retryer gets to run. Backends that
    6182                 :             :                  * are just waiting for the lock to become free don't retry
    6183                 :             :                  * automatically.
    6184                 :             :                  */
    6185                 :        1820 :                 new_wake_in_progress = true;
    6186                 :             : 
    6187                 :             :                 /*
    6188                 :             :                  * Signal that the process isn't on the wait list anymore. This allows
    6189                 :             :                  * BufferLockDequeueSelf() to remove itself from the waitlist with a
    6190                 :             :                  * proclist_delete(), rather than having to check if it has been
    6191                 :             :                  * removed from the list.
    6192                 :             :                  */
    6193         [ +  - ]:        1820 :                 Assert(waiter->lwWaiting == LW_WS_WAITING);
    6194                 :        1820 :                 waiter->lwWaiting = LW_WS_PENDING_WAKEUP;
    6195                 :             : 
    6196                 :             :                 /*
    6197                 :             :                  * Don't wakeup further waiters after waking a conflicting waiter.
    6198                 :             :                  */
    6199         [ +  + ]:        1820 :                 if (waiter->lwWaitMode == BUFFER_LOCK_SHARE)
    6200                 :             :                 {
    6201                 :             :                         /*
    6202                 :             :                          * Share locks conflict with exclusive locks.
    6203                 :             :                          */
    6204                 :         717 :                         wake_exclusive = false;
    6205                 :         717 :                 }
    6206         [ -  + ]:        1103 :                 else if (waiter->lwWaitMode == BUFFER_LOCK_SHARE_EXCLUSIVE)
    6207                 :             :                 {
    6208                 :             :                         /*
    6209                 :             :                          * Share-exclusive locks conflict with share-exclusive and
    6210                 :             :                          * exclusive locks.
    6211                 :             :                          */
    6212                 :           0 :                         wake_exclusive = false;
    6213                 :           0 :                         wake_share_exclusive = false;
    6214                 :           0 :                 }
    6215         [ +  - ]:        1103 :                 else if (waiter->lwWaitMode == BUFFER_LOCK_EXCLUSIVE)
    6216                 :             :                 {
    6217                 :             :                         /*
    6218                 :             :                          * Exclusive locks conflict with all other locks, there's no point
    6219                 :             :                          * in waking up anybody else.
    6220                 :             :                          */
    6221                 :        1103 :                         break;
    6222                 :             :                 }
    6223   [ -  +  +  + ]:        1846 :         }
    6224                 :             : 
    6225   [ +  +  +  - ]:        1814 :         Assert(proclist_is_empty(&wakeup) || pg_atomic_read_u64(&buf_hdr->state) & BM_LOCK_HAS_WAITERS);
    6226                 :             : 
    6227                 :             :         /* unset required flags, and release lock, in one fell swoop */
    6228                 :             :         {
    6229                 :        1814 :                 uint64          old_state;
    6230                 :        1814 :                 uint64          desired_state;
    6231                 :             : 
    6232                 :        1814 :                 old_state = pg_atomic_read_u64(&buf_hdr->state);
    6233                 :        1817 :                 while (true)
    6234                 :             :                 {
    6235                 :        1817 :                         desired_state = old_state;
    6236                 :             : 
    6237                 :             :                         /* compute desired flags */
    6238                 :             : 
    6239         [ +  + ]:        1817 :                         if (new_wake_in_progress)
    6240                 :        1766 :                                 desired_state |= BM_LOCK_WAKE_IN_PROGRESS;
    6241                 :             :                         else
    6242                 :          51 :                                 desired_state &= ~BM_LOCK_WAKE_IN_PROGRESS;
    6243                 :             : 
    6244         [ +  + ]:        1817 :                         if (proclist_is_empty(&buf_hdr->lock_waiters))
    6245                 :        1723 :                                 desired_state &= ~BM_LOCK_HAS_WAITERS;
    6246                 :             : 
    6247                 :        1817 :                         desired_state &= ~BM_LOCKED;        /* release lock */
    6248                 :             : 
    6249   [ +  +  +  + ]:        3634 :                         if (pg_atomic_compare_exchange_u64(&buf_hdr->state, &old_state,
    6250                 :        1817 :                                                                                            desired_state))
    6251                 :        1814 :                                 break;
    6252                 :             :                 }
    6253                 :        1814 :         }
    6254                 :             : 
    6255                 :             :         /* Awaken any waiters I removed from the queue. */
    6256   [ +  +  +  +  :        3634 :         proclist_foreach_modify(iter, &wakeup, lwWaitLink)
                   +  + ]
    6257                 :             :         {
    6258                 :        1820 :                 PGPROC     *waiter = GetPGProcByNumber(iter.cur);
    6259                 :             : 
    6260                 :        1820 :                 proclist_delete(&wakeup, iter.cur, lwWaitLink);
    6261                 :             : 
    6262                 :             :                 /*
    6263                 :             :                  * Guarantee that lwWaiting being unset only becomes visible once the
    6264                 :             :                  * unlink from the link has completed. Otherwise the target backend
    6265                 :             :                  * could be woken up for other reason and enqueue for a new lock - if
    6266                 :             :                  * that happens before the list unlink happens, the list would end up
    6267                 :             :                  * being corrupted.
    6268                 :             :                  *
    6269                 :             :                  * The barrier pairs with the LockBufHdr() when enqueuing for another
    6270                 :             :                  * lock.
    6271                 :             :                  */
    6272                 :        1820 :                 pg_write_barrier();
    6273                 :        1820 :                 waiter->lwWaiting = LW_WS_NOT_WAITING;
    6274                 :        1820 :                 PGSemaphoreUnlock(waiter->sem);
    6275                 :        1820 :         }
    6276                 :        1814 : }
    6277                 :             : 
    6278                 :             : /*
    6279                 :             :  * Compute subtraction from buffer state for a release of a held lock in
    6280                 :             :  * `mode`.
    6281                 :             :  *
    6282                 :             :  * This is separated from BufferLockUnlock() as we want to combine the lock
    6283                 :             :  * release with other atomic operations when possible, leading to the lock
    6284                 :             :  * release being done in multiple places, each needing to compute what to
    6285                 :             :  * subtract from the lock state.
    6286                 :             :  */
    6287                 :             : static inline uint64
    6288                 :    15321271 : BufferLockReleaseSub(BufferLockMode mode)
    6289                 :             : {
    6290                 :             :         /*
    6291                 :             :          * Turns out that a switch() leads gcc to generate sufficiently worse code
    6292                 :             :          * for this to show up in profiles...
    6293                 :             :          */
    6294         [ +  + ]:    15321271 :         if (mode == BUFFER_LOCK_EXCLUSIVE)
    6295                 :     4235715 :                 return BM_LOCK_VAL_EXCLUSIVE;
    6296         [ -  + ]:    11085556 :         else if (mode == BUFFER_LOCK_SHARE_EXCLUSIVE)
    6297                 :           0 :                 return BM_LOCK_VAL_SHARE_EXCLUSIVE;
    6298                 :             :         else
    6299                 :             :         {
    6300         [ +  - ]:    11085556 :                 Assert(mode == BUFFER_LOCK_SHARE);
    6301                 :    11085556 :                 return BM_LOCK_VAL_SHARED;
    6302                 :             :         }
    6303                 :             : 
    6304                 :             :         return 0;                                       /* keep compiler quiet */
    6305                 :    15321271 : }
    6306                 :             : 
    6307                 :             : /*
    6308                 :             :  * Handle work that needs to be done after releasing a lock that was held in
    6309                 :             :  * `mode`, where `lockstate` is the result of the atomic operation modifying
    6310                 :             :  * the state variable.
    6311                 :             :  *
    6312                 :             :  * This is separated from BufferLockUnlock() as we want to combine the lock
    6313                 :             :  * release with other atomic operations when possible, leading to the lock
    6314                 :             :  * release being done in multiple places.
    6315                 :             :  */
    6316                 :             : static void
    6317                 :    15321271 : BufferLockProcessRelease(BufferDesc *buf_hdr, BufferLockMode mode, uint64 lockstate)
    6318                 :             : {
    6319                 :    15321271 :         bool            check_waiters = false;
    6320                 :    15321271 :         bool            wake_exclusive = false;
    6321                 :             : 
    6322                 :             :         /* nobody else can have that kind of lock */
    6323         [ +  - ]:    15321271 :         Assert(!(lockstate & BM_LOCK_VAL_EXCLUSIVE));
    6324                 :             : 
    6325                 :             :         /*
    6326                 :             :          * If we're still waiting for backends to get scheduled, don't wake them
    6327                 :             :          * up again. Otherwise check if we need to look through the waitqueue to
    6328                 :             :          * wake other backends.
    6329                 :             :          */
    6330   [ +  +  +  + ]:    15321271 :         if ((lockstate & BM_LOCK_HAS_WAITERS) &&
    6331                 :        2529 :                 !(lockstate & BM_LOCK_WAKE_IN_PROGRESS))
    6332                 :             :         {
    6333         [ +  + ]:        1825 :                 if ((lockstate & BM_LOCK_MASK) == 0)
    6334                 :             :                 {
    6335                 :             :                         /*
    6336                 :             :                          * We released a lock and the lock was, in that moment, free. We
    6337                 :             :                          * therefore can wake waiters for any kind of lock.
    6338                 :             :                          */
    6339                 :        1814 :                         check_waiters = true;
    6340                 :        1814 :                         wake_exclusive = true;
    6341                 :        1814 :                 }
    6342         [ +  - ]:          11 :                 else if (mode == BUFFER_LOCK_SHARE_EXCLUSIVE)
    6343                 :             :                 {
    6344                 :             :                         /*
    6345                 :             :                          * We released the lock, but another backend still holds a lock.
    6346                 :             :                          * We can't have released an exclusive lock, as there couldn't
    6347                 :             :                          * have been other lock holders. If we released a share lock, no
    6348                 :             :                          * waiters need to be woken up, as there must be other share
    6349                 :             :                          * lockers. However, if we held a share-exclusive lock, another
    6350                 :             :                          * backend now could acquire a share-exclusive lock.
    6351                 :             :                          */
    6352                 :           0 :                         check_waiters = true;
    6353                 :           0 :                         wake_exclusive = false;
    6354                 :           0 :                 }
    6355                 :        1825 :         }
    6356                 :             : 
    6357                 :             :         /*
    6358                 :             :          * As waking up waiters requires the spinlock to be acquired, only do so
    6359                 :             :          * if necessary.
    6360                 :             :          */
    6361         [ +  + ]:    15321271 :         if (check_waiters)
    6362                 :        1814 :                 BufferLockWakeup(buf_hdr, wake_exclusive);
    6363                 :    15321271 : }
    6364                 :             : 
    6365                 :             : /*
    6366                 :             :  * BufferLockHeldByMeInMode - test whether my process holds the content lock
    6367                 :             :  * in the specified mode
    6368                 :             :  *
    6369                 :             :  * This is meant as debug support only.
    6370                 :             :  */
    6371                 :             : static bool
    6372                 :     8621085 : BufferLockHeldByMeInMode(BufferDesc *buf_hdr, BufferLockMode mode)
    6373                 :             : {
    6374                 :    17242170 :         PrivateRefCountEntry *entry =
    6375                 :     8621085 :                 GetPrivateRefCountEntry(BufferDescriptorGetBuffer(buf_hdr), false);
    6376                 :             : 
    6377         [ +  - ]:     8621085 :         if (!entry)
    6378                 :           0 :                 return false;
    6379                 :             :         else
    6380                 :     8621085 :                 return entry->data.lockmode == mode;
    6381                 :     8621085 : }
    6382                 :             : 
    6383                 :             : /*
    6384                 :             :  * BufferLockHeldByMe - test whether my process holds the content lock in any
    6385                 :             :  * mode
    6386                 :             :  *
    6387                 :             :  * This is meant as debug support only.
    6388                 :             :  */
    6389                 :             : static bool
    6390                 :    12503938 : BufferLockHeldByMe(BufferDesc *buf_hdr)
    6391                 :             : {
    6392                 :    25007876 :         PrivateRefCountEntry *entry =
    6393                 :    12503938 :                 GetPrivateRefCountEntry(BufferDescriptorGetBuffer(buf_hdr), false);
    6394                 :             : 
    6395         [ +  - ]:    12503938 :         if (!entry)
    6396                 :           0 :                 return false;
    6397                 :             :         else
    6398                 :    12503938 :                 return entry->data.lockmode != BUFFER_LOCK_UNLOCK;
    6399                 :    12503938 : }
    6400                 :             : 
    6401                 :             : /*
    6402                 :             :  * Release the content lock for the buffer.
    6403                 :             :  */
    6404                 :             : void
    6405                 :    16875395 : UnlockBuffer(Buffer buffer)
    6406                 :             : {
    6407                 :    16875395 :         BufferDesc *buf_hdr;
    6408                 :             : 
    6409   [ -  +  #  #  :    16875395 :         Assert(BufferIsPinned(buffer));
                   +  + ]
    6410         [ +  + ]:    16875395 :         if (BufferIsLocal(buffer))
    6411                 :     1561126 :                 return;                                 /* local buffers need no lock */
    6412                 :             : 
    6413                 :    15314269 :         buf_hdr = GetBufferDescriptor(buffer - 1);
    6414                 :    15314269 :         BufferLockUnlock(buffer, buf_hdr);
    6415         [ -  + ]:    16875395 : }
    6416                 :             : 
    6417                 :             : /*
    6418                 :             :  * Acquire the content_lock for the buffer.
    6419                 :             :  */
    6420                 :             : void
    6421                 :    16558148 : LockBufferInternal(Buffer buffer, BufferLockMode mode)
    6422                 :             : {
    6423                 :    16558148 :         BufferDesc *buf_hdr;
    6424                 :             : 
    6425                 :             :         /*
    6426                 :             :          * We can't wait if we haven't got a PGPROC.  This should only occur
    6427                 :             :          * during bootstrap or shared memory initialization.  Put an Assert here
    6428                 :             :          * to catch unsafe coding practices.
    6429                 :             :          */
    6430   [ -  +  #  # ]:    16558148 :         Assert(!(MyProc == NULL && IsUnderPostmaster));
    6431                 :             : 
    6432                 :             :         /* handled in LockBuffer() wrapper */
    6433         [ +  - ]:    16558148 :         Assert(mode != BUFFER_LOCK_UNLOCK);
    6434                 :             : 
    6435   [ +  -  #  #  :    16558148 :         Assert(BufferIsPinned(buffer));
                   +  + ]
    6436         [ +  + ]:    16558148 :         if (BufferIsLocal(buffer))
    6437                 :     1536194 :                 return;                                 /* local buffers need no lock */
    6438                 :             : 
    6439                 :    15021954 :         buf_hdr = GetBufferDescriptor(buffer - 1);
    6440                 :             : 
    6441                 :             :         /*
    6442                 :             :          * Test the most frequent lock modes first. While a switch (mode) would be
    6443                 :             :          * nice, at least gcc generates considerably worse code for it.
    6444                 :             :          *
    6445                 :             :          * Call BufferLockAcquire() with a constant argument for mode, to generate
    6446                 :             :          * more efficient code for the different lock modes.
    6447                 :             :          */
    6448         [ +  + ]:    15021954 :         if (mode == BUFFER_LOCK_SHARE)
    6449                 :    11078558 :                 BufferLockAcquire(buffer, buf_hdr, BUFFER_LOCK_SHARE);
    6450         [ +  - ]:     3943396 :         else if (mode == BUFFER_LOCK_EXCLUSIVE)
    6451                 :     3943396 :                 BufferLockAcquire(buffer, buf_hdr, BUFFER_LOCK_EXCLUSIVE);
    6452         [ #  # ]:           0 :         else if (mode == BUFFER_LOCK_SHARE_EXCLUSIVE)
    6453                 :           0 :                 BufferLockAcquire(buffer, buf_hdr, BUFFER_LOCK_SHARE_EXCLUSIVE);
    6454                 :             :         else
    6455   [ #  #  #  # ]:           0 :                 elog(ERROR, "unrecognized buffer lock mode: %d", mode);
    6456         [ -  + ]:    16558148 : }
    6457                 :             : 
    6458                 :             : /*
    6459                 :             :  * Acquire the content_lock for the buffer, but only if we don't have to wait.
    6460                 :             :  *
    6461                 :             :  * This assumes the caller wants BUFFER_LOCK_EXCLUSIVE mode.
    6462                 :             :  */
    6463                 :             : bool
    6464                 :      313908 : ConditionalLockBuffer(Buffer buffer)
    6465                 :             : {
    6466                 :      313908 :         BufferDesc *buf;
    6467                 :             : 
    6468   [ -  +  #  #  :      313908 :         Assert(BufferIsPinned(buffer));
                   +  + ]
    6469         [ +  + ]:      313908 :         if (BufferIsLocal(buffer))
    6470                 :       21583 :                 return true;                    /* act as though we got it */
    6471                 :             : 
    6472                 :      292325 :         buf = GetBufferDescriptor(buffer - 1);
    6473                 :             : 
    6474                 :      292325 :         return BufferLockConditional(buffer, buf, BUFFER_LOCK_EXCLUSIVE);
    6475                 :      313908 : }
    6476                 :             : 
    6477                 :             : /*
    6478                 :             :  * Verify that this backend is pinning the buffer exactly once.
    6479                 :             :  *
    6480                 :             :  * NOTE: Like in BufferIsPinned(), what we check here is that *this* backend
    6481                 :             :  * holds a pin on the buffer.  We do not care whether some other backend does.
    6482                 :             :  */
    6483                 :             : void
    6484                 :       91527 : CheckBufferIsPinnedOnce(Buffer buffer)
    6485                 :             : {
    6486         [ +  + ]:       91527 :         if (BufferIsLocal(buffer))
    6487                 :             :         {
    6488         [ +  - ]:         258 :                 if (LocalRefCount[-buffer - 1] != 1)
    6489   [ #  #  #  # ]:           0 :                         elog(ERROR, "incorrect local pin count: %d",
    6490                 :             :                                  LocalRefCount[-buffer - 1]);
    6491                 :         258 :         }
    6492                 :             :         else
    6493                 :             :         {
    6494         [ +  - ]:       91269 :                 if (GetPrivateRefCount(buffer) != 1)
    6495   [ #  #  #  # ]:           0 :                         elog(ERROR, "incorrect local pin count: %d",
    6496                 :             :                                  GetPrivateRefCount(buffer));
    6497                 :             :         }
    6498                 :       91527 : }
    6499                 :             : 
    6500                 :             : /*
    6501                 :             :  * LockBufferForCleanup - lock a buffer in preparation for deleting items
    6502                 :             :  *
    6503                 :             :  * Items may be deleted from a disk page only when the caller (a) holds an
    6504                 :             :  * exclusive lock on the buffer and (b) has observed that no other backend
    6505                 :             :  * holds a pin on the buffer.  If there is a pin, then the other backend
    6506                 :             :  * might have a pointer into the buffer (for example, a heapscan reference
    6507                 :             :  * to an item --- see README for more details).  It's OK if a pin is added
    6508                 :             :  * after the cleanup starts, however; the newly-arrived backend will be
    6509                 :             :  * unable to look at the page until we release the exclusive lock.
    6510                 :             :  *
    6511                 :             :  * To implement this protocol, a would-be deleter must pin the buffer and
    6512                 :             :  * then call LockBufferForCleanup().  LockBufferForCleanup() is similar to
    6513                 :             :  * LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE), except that it loops until
    6514                 :             :  * it has successfully observed pin count = 1.
    6515                 :             :  */
    6516                 :             : void
    6517                 :         766 : LockBufferForCleanup(Buffer buffer)
    6518                 :             : {
    6519                 :         766 :         BufferDesc *bufHdr;
    6520                 :         766 :         TimestampTz waitStart = 0;
    6521                 :         766 :         bool            waiting = false;
    6522                 :         766 :         bool            logged_recovery_conflict = false;
    6523                 :             : 
    6524   [ -  +  #  #  :         766 :         Assert(BufferIsPinned(buffer));
                   +  + ]
    6525         [ +  - ]:         766 :         Assert(PinCountWaitBuf == NULL);
    6526                 :             : 
    6527                 :         766 :         CheckBufferIsPinnedOnce(buffer);
    6528                 :             : 
    6529                 :             :         /*
    6530                 :             :          * We do not yet need to be worried about in-progress AIOs holding a pin,
    6531                 :             :          * as we, so far, only support doing reads via AIO and this function can
    6532                 :             :          * only be called once the buffer is valid (i.e. no read can be in
    6533                 :             :          * flight).
    6534                 :             :          */
    6535                 :             : 
    6536                 :             :         /* Nobody else to wait for */
    6537         [ +  + ]:         766 :         if (BufferIsLocal(buffer))
    6538                 :           2 :                 return;
    6539                 :             : 
    6540                 :         764 :         bufHdr = GetBufferDescriptor(buffer - 1);
    6541                 :             : 
    6542                 :         764 :         for (;;)
    6543                 :             :         {
    6544                 :         764 :                 uint64          buf_state;
    6545                 :         764 :                 uint64          unset_bits = 0;
    6546                 :             : 
    6547                 :             :                 /* Try to acquire lock */
    6548                 :         764 :                 LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
    6549                 :         764 :                 buf_state = LockBufHdr(bufHdr);
    6550                 :             : 
    6551         [ +  - ]:         764 :                 Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
    6552         [ +  - ]:         764 :                 if (BUF_STATE_GET_REFCOUNT(buf_state) == 1)
    6553                 :             :                 {
    6554                 :             :                         /* Successfully acquired exclusive lock with pincount 1 */
    6555                 :         764 :                         UnlockBufHdr(bufHdr);
    6556                 :             : 
    6557                 :             :                         /*
    6558                 :             :                          * Emit the log message if recovery conflict on buffer pin was
    6559                 :             :                          * resolved but the startup process waited longer than
    6560                 :             :                          * deadlock_timeout for it.
    6561                 :             :                          */
    6562         [ +  - ]:         764 :                         if (logged_recovery_conflict)
    6563                 :           0 :                                 LogRecoveryConflict(PROCSIG_RECOVERY_CONFLICT_BUFFERPIN,
    6564                 :           0 :                                                                         waitStart, GetCurrentTimestamp(),
    6565                 :             :                                                                         NULL, false);
    6566                 :             : 
    6567         [ +  - ]:         764 :                         if (waiting)
    6568                 :             :                         {
    6569                 :             :                                 /* reset ps display to remove the suffix if we added one */
    6570                 :           0 :                                 set_ps_display_remove_suffix();
    6571                 :           0 :                                 waiting = false;
    6572                 :           0 :                         }
    6573                 :         764 :                         return;
    6574                 :             :                 }
    6575                 :             :                 /* Failed, so mark myself as waiting for pincount 1 */
    6576         [ #  # ]:           0 :                 if (buf_state & BM_PIN_COUNT_WAITER)
    6577                 :             :                 {
    6578                 :           0 :                         UnlockBufHdr(bufHdr);
    6579                 :           0 :                         LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
    6580   [ #  #  #  # ]:           0 :                         elog(ERROR, "multiple backends attempting to wait for pincount 1");
    6581                 :           0 :                 }
    6582                 :           0 :                 bufHdr->wait_backend_pgprocno = MyProcNumber;
    6583                 :           0 :                 PinCountWaitBuf = bufHdr;
    6584                 :           0 :                 UnlockBufHdrExt(bufHdr, buf_state,
    6585                 :             :                                                 BM_PIN_COUNT_WAITER, 0,
    6586                 :             :                                                 0);
    6587                 :           0 :                 LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
    6588                 :             : 
    6589                 :             :                 /* Wait to be signaled by UnpinBuffer() */
    6590         [ #  # ]:           0 :                 if (InHotStandby)
    6591                 :             :                 {
    6592         [ #  # ]:           0 :                         if (!waiting)
    6593                 :             :                         {
    6594                 :             :                                 /* adjust the process title to indicate that it's waiting */
    6595                 :           0 :                                 set_ps_display_suffix("waiting");
    6596                 :           0 :                                 waiting = true;
    6597                 :           0 :                         }
    6598                 :             : 
    6599                 :             :                         /*
    6600                 :             :                          * Emit the log message if the startup process is waiting longer
    6601                 :             :                          * than deadlock_timeout for recovery conflict on buffer pin.
    6602                 :             :                          *
    6603                 :             :                          * Skip this if first time through because the startup process has
    6604                 :             :                          * not started waiting yet in this case. So, the wait start
    6605                 :             :                          * timestamp is set after this logic.
    6606                 :             :                          */
    6607   [ #  #  #  # ]:           0 :                         if (waitStart != 0 && !logged_recovery_conflict)
    6608                 :             :                         {
    6609                 :           0 :                                 TimestampTz now = GetCurrentTimestamp();
    6610                 :             : 
    6611   [ #  #  #  # ]:           0 :                                 if (TimestampDifferenceExceeds(waitStart, now,
    6612                 :           0 :                                                                                            DeadlockTimeout))
    6613                 :             :                                 {
    6614                 :           0 :                                         LogRecoveryConflict(PROCSIG_RECOVERY_CONFLICT_BUFFERPIN,
    6615                 :           0 :                                                                                 waitStart, now, NULL, true);
    6616                 :           0 :                                         logged_recovery_conflict = true;
    6617                 :           0 :                                 }
    6618                 :           0 :                         }
    6619                 :             : 
    6620                 :             :                         /*
    6621                 :             :                          * Set the wait start timestamp if logging is enabled and first
    6622                 :             :                          * time through.
    6623                 :             :                          */
    6624   [ #  #  #  # ]:           0 :                         if (log_recovery_conflict_waits && waitStart == 0)
    6625                 :           0 :                                 waitStart = GetCurrentTimestamp();
    6626                 :             : 
    6627                 :             :                         /* Publish the bufid that Startup process waits on */
    6628                 :           0 :                         SetStartupBufferPinWaitBufId(buffer - 1);
    6629                 :             :                         /* Set alarm and then wait to be signaled by UnpinBuffer() */
    6630                 :           0 :                         ResolveRecoveryConflictWithBufferPin();
    6631                 :             :                         /* Reset the published bufid */
    6632                 :           0 :                         SetStartupBufferPinWaitBufId(-1);
    6633                 :           0 :                 }
    6634                 :             :                 else
    6635                 :           0 :                         ProcWaitForSignal(WAIT_EVENT_BUFFER_CLEANUP);
    6636                 :             : 
    6637                 :             :                 /*
    6638                 :             :                  * Remove flag marking us as waiter. Normally this will not be set
    6639                 :             :                  * anymore, but ProcWaitForSignal() can return for other signals as
    6640                 :             :                  * well.  We take care to only reset the flag if we're the waiter, as
    6641                 :             :                  * theoretically another backend could have started waiting. That's
    6642                 :             :                  * impossible with the current usages due to table level locking, but
    6643                 :             :                  * better be safe.
    6644                 :             :                  */
    6645                 :           0 :                 buf_state = LockBufHdr(bufHdr);
    6646   [ #  #  #  # ]:           0 :                 if ((buf_state & BM_PIN_COUNT_WAITER) != 0 &&
    6647                 :           0 :                         bufHdr->wait_backend_pgprocno == MyProcNumber)
    6648                 :           0 :                         unset_bits |= BM_PIN_COUNT_WAITER;
    6649                 :             : 
    6650                 :           0 :                 UnlockBufHdrExt(bufHdr, buf_state,
    6651                 :           0 :                                                 0, unset_bits,
    6652                 :             :                                                 0);
    6653                 :             : 
    6654                 :           0 :                 PinCountWaitBuf = NULL;
    6655                 :             :                 /* Loop back and try again */
    6656         [ +  - ]:         764 :         }
    6657                 :         766 : }
    6658                 :             : 
    6659                 :             : /*
    6660                 :             :  * Check called from ProcessRecoveryConflictInterrupts() when Startup process
    6661                 :             :  * requests cancellation of all pin holders that are blocking it.
    6662                 :             :  */
    6663                 :             : bool
    6664                 :           0 : HoldingBufferPinThatDelaysRecovery(void)
    6665                 :             : {
    6666                 :           0 :         int                     bufid = GetStartupBufferPinWaitBufId();
    6667                 :             : 
    6668                 :             :         /*
    6669                 :             :          * If we get woken slowly then it's possible that the Startup process was
    6670                 :             :          * already woken by other backends before we got here. Also possible that
    6671                 :             :          * we get here by multiple interrupts or interrupts at inappropriate
    6672                 :             :          * times, so make sure we do nothing if the bufid is not set.
    6673                 :             :          */
    6674         [ #  # ]:           0 :         if (bufid < 0)
    6675                 :           0 :                 return false;
    6676                 :             : 
    6677         [ #  # ]:           0 :         if (GetPrivateRefCount(bufid + 1) > 0)
    6678                 :           0 :                 return true;
    6679                 :             : 
    6680                 :           0 :         return false;
    6681                 :           0 : }
    6682                 :             : 
    6683                 :             : /*
    6684                 :             :  * ConditionalLockBufferForCleanup - as above, but don't wait to get the lock
    6685                 :             :  *
    6686                 :             :  * We won't loop, but just check once to see if the pin count is OK.  If
    6687                 :             :  * not, return false with no lock held.
    6688                 :             :  */
    6689                 :             : bool
    6690                 :       15033 : ConditionalLockBufferForCleanup(Buffer buffer)
    6691                 :             : {
    6692                 :       15033 :         BufferDesc *bufHdr;
    6693                 :       15033 :         uint64          buf_state,
    6694                 :             :                                 refcount;
    6695                 :             : 
    6696         [ +  - ]:       15033 :         Assert(BufferIsValid(buffer));
    6697                 :             : 
    6698                 :             :         /* see AIO related comment in LockBufferForCleanup() */
    6699                 :             : 
    6700         [ +  + ]:       15033 :         if (BufferIsLocal(buffer))
    6701                 :             :         {
    6702                 :         265 :                 refcount = LocalRefCount[-buffer - 1];
    6703                 :             :                 /* There should be exactly one pin */
    6704         [ +  - ]:         265 :                 Assert(refcount > 0);
    6705         [ +  + ]:         265 :                 if (refcount != 1)
    6706                 :           7 :                         return false;
    6707                 :             :                 /* Nobody else to wait for */
    6708                 :         258 :                 return true;
    6709                 :             :         }
    6710                 :             : 
    6711                 :             :         /* There should be exactly one local pin */
    6712                 :       14768 :         refcount = GetPrivateRefCount(buffer);
    6713         [ +  - ]:       14768 :         Assert(refcount);
    6714         [ +  + ]:       14768 :         if (refcount != 1)
    6715                 :          32 :                 return false;
    6716                 :             : 
    6717                 :             :         /* Try to acquire lock */
    6718         [ +  + ]:       14736 :         if (!ConditionalLockBuffer(buffer))
    6719                 :           6 :                 return false;
    6720                 :             : 
    6721                 :       14730 :         bufHdr = GetBufferDescriptor(buffer - 1);
    6722                 :       14730 :         buf_state = LockBufHdr(bufHdr);
    6723                 :       14730 :         refcount = BUF_STATE_GET_REFCOUNT(buf_state);
    6724                 :             : 
    6725         [ +  - ]:       14730 :         Assert(refcount > 0);
    6726         [ +  + ]:       14730 :         if (refcount == 1)
    6727                 :             :         {
    6728                 :             :                 /* Successfully acquired exclusive lock with pincount 1 */
    6729                 :       14722 :                 UnlockBufHdr(bufHdr);
    6730                 :       14722 :                 return true;
    6731                 :             :         }
    6732                 :             : 
    6733                 :             :         /* Failed, so release the lock */
    6734                 :           8 :         UnlockBufHdr(bufHdr);
    6735                 :           8 :         LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
    6736                 :           8 :         return false;
    6737                 :       15033 : }
    6738                 :             : 
    6739                 :             : /*
    6740                 :             :  * IsBufferCleanupOK - as above, but we already have the lock
    6741                 :             :  *
    6742                 :             :  * Check whether it's OK to perform cleanup on a buffer we've already
    6743                 :             :  * locked.  If we observe that the pin count is 1, our exclusive lock
    6744                 :             :  * happens to be a cleanup lock, and we can proceed with anything that
    6745                 :             :  * would have been allowable had we sought a cleanup lock originally.
    6746                 :             :  */
    6747                 :             : bool
    6748                 :         662 : IsBufferCleanupOK(Buffer buffer)
    6749                 :             : {
    6750                 :         662 :         BufferDesc *bufHdr;
    6751                 :         662 :         uint64          buf_state;
    6752                 :             : 
    6753         [ +  - ]:         662 :         Assert(BufferIsValid(buffer));
    6754                 :             : 
    6755                 :             :         /* see AIO related comment in LockBufferForCleanup() */
    6756                 :             : 
    6757         [ +  - ]:         662 :         if (BufferIsLocal(buffer))
    6758                 :             :         {
    6759                 :             :                 /* There should be exactly one pin */
    6760         [ #  # ]:           0 :                 if (LocalRefCount[-buffer - 1] != 1)
    6761                 :           0 :                         return false;
    6762                 :             :                 /* Nobody else to wait for */
    6763                 :           0 :                 return true;
    6764                 :             :         }
    6765                 :             : 
    6766                 :             :         /* There should be exactly one local pin */
    6767         [ -  + ]:         662 :         if (GetPrivateRefCount(buffer) != 1)
    6768                 :           0 :                 return false;
    6769                 :             : 
    6770                 :         662 :         bufHdr = GetBufferDescriptor(buffer - 1);
    6771                 :             : 
    6772                 :             :         /* caller must hold exclusive lock on buffer */
    6773         [ +  - ]:         662 :         Assert(BufferIsLockedByMeInMode(buffer, BUFFER_LOCK_EXCLUSIVE));
    6774                 :             : 
    6775                 :         662 :         buf_state = LockBufHdr(bufHdr);
    6776                 :             : 
    6777         [ +  - ]:         662 :         Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
    6778         [ +  - ]:         662 :         if (BUF_STATE_GET_REFCOUNT(buf_state) == 1)
    6779                 :             :         {
    6780                 :             :                 /* pincount is OK. */
    6781                 :         662 :                 UnlockBufHdr(bufHdr);
    6782                 :         662 :                 return true;
    6783                 :             :         }
    6784                 :             : 
    6785                 :           0 :         UnlockBufHdr(bufHdr);
    6786                 :           0 :         return false;
    6787                 :         662 : }
    6788                 :             : 
    6789                 :             : 
    6790                 :             : /*
    6791                 :             :  *      Functions for buffer I/O handling
    6792                 :             :  *
    6793                 :             :  *      Also note that these are used only for shared buffers, not local ones.
    6794                 :             :  */
    6795                 :             : 
    6796                 :             : /*
    6797                 :             :  * WaitIO -- Block until the IO_IN_PROGRESS flag on 'buf' is cleared.
    6798                 :             :  */
    6799                 :             : static void
    6800                 :           1 : WaitIO(BufferDesc *buf)
    6801                 :             : {
    6802                 :           1 :         ConditionVariable *cv = BufferDescriptorGetIOCV(buf);
    6803                 :             : 
    6804                 :           1 :         ConditionVariablePrepareToSleep(cv);
    6805                 :           2 :         for (;;)
    6806                 :             :         {
    6807                 :           2 :                 uint64          buf_state;
    6808                 :           2 :                 PgAioWaitRef iow;
    6809                 :             : 
    6810                 :             :                 /*
    6811                 :             :                  * It may not be necessary to acquire the spinlock to check the flag
    6812                 :             :                  * here, but since this test is essential for correctness, we'd better
    6813                 :             :                  * play it safe.
    6814                 :             :                  */
    6815                 :           2 :                 buf_state = LockBufHdr(buf);
    6816                 :             : 
    6817                 :             :                 /*
    6818                 :             :                  * Copy the wait reference while holding the spinlock. This protects
    6819                 :             :                  * against a concurrent TerminateBufferIO() in another backend from
    6820                 :             :                  * clearing the wref while it's being read.
    6821                 :             :                  */
    6822                 :           2 :                 iow = buf->io_wref;
    6823                 :           2 :                 UnlockBufHdr(buf);
    6824                 :             : 
    6825                 :             :                 /* no IO in progress, we don't need to wait */
    6826         [ +  + ]:           2 :                 if (!(buf_state & BM_IO_IN_PROGRESS))
    6827                 :           1 :                         break;
    6828                 :             : 
    6829                 :             :                 /*
    6830                 :             :                  * The buffer has asynchronous IO in progress, wait for it to
    6831                 :             :                  * complete.
    6832                 :             :                  */
    6833         [ -  + ]:           1 :                 if (pgaio_wref_valid(&iow))
    6834                 :             :                 {
    6835                 :           0 :                         pgaio_wref_wait(&iow);
    6836                 :             : 
    6837                 :             :                         /*
    6838                 :             :                          * The AIO subsystem internally uses condition variables and thus
    6839                 :             :                          * might remove this backend from the BufferDesc's CV. While that
    6840                 :             :                          * wouldn't cause a correctness issue (the first CV sleep just
    6841                 :             :                          * immediately returns if not already registered), it seems worth
    6842                 :             :                          * avoiding unnecessary loop iterations, given that we take care
    6843                 :             :                          * to do so at the start of the function.
    6844                 :             :                          */
    6845                 :           0 :                         ConditionVariablePrepareToSleep(cv);
    6846                 :           0 :                         continue;
    6847                 :             :                 }
    6848                 :             : 
    6849                 :             :                 /* wait on BufferDesc->cv, e.g. for concurrent synchronous IO */
    6850                 :           1 :                 ConditionVariableSleep(cv, WAIT_EVENT_BUFFER_IO);
    6851   [ -  -  +  + ]:           2 :         }
    6852                 :           1 :         ConditionVariableCancelSleep();
    6853                 :           1 : }
    6854                 :             : 
    6855                 :             : /*
    6856                 :             :  * StartBufferIO: begin I/O on this buffer
    6857                 :             :  *      (Assumptions)
    6858                 :             :  *      My process is executing no IO on this buffer
    6859                 :             :  *      The buffer is Pinned
    6860                 :             :  *
    6861                 :             :  * In some scenarios multiple backends could attempt the same I/O operation
    6862                 :             :  * concurrently.  If someone else has already started I/O on this buffer then
    6863                 :             :  * we will wait for completion of the IO using WaitIO().
    6864                 :             :  *
    6865                 :             :  * Input operations are only attempted on buffers that are not BM_VALID,
    6866                 :             :  * and output operations only on buffers that are BM_VALID and BM_DIRTY,
    6867                 :             :  * so we can always tell if the work is already done.
    6868                 :             :  *
    6869                 :             :  * Returns true if we successfully marked the buffer as I/O busy,
    6870                 :             :  * false if someone else already did the work.
    6871                 :             :  *
    6872                 :             :  * If nowait is true, then we don't wait for an I/O to be finished by another
    6873                 :             :  * backend.  In that case, false indicates either that the I/O was already
    6874                 :             :  * finished, or is still in progress.  This is useful for callers that want to
    6875                 :             :  * find out if they can perform the I/O as part of a larger operation, without
    6876                 :             :  * waiting for the answer or distinguishing the reasons why not.
    6877                 :             :  */
    6878                 :             : bool
    6879                 :       47536 : StartBufferIO(BufferDesc *buf, bool forInput, bool nowait)
    6880                 :             : {
    6881                 :       47536 :         uint64          buf_state;
    6882                 :             : 
    6883                 :       47536 :         ResourceOwnerEnlarge(CurrentResourceOwner);
    6884                 :             : 
    6885                 :       47537 :         for (;;)
    6886                 :             :         {
    6887                 :       47537 :                 buf_state = LockBufHdr(buf);
    6888                 :             : 
    6889         [ +  + ]:       47537 :                 if (!(buf_state & BM_IO_IN_PROGRESS))
    6890                 :       47536 :                         break;
    6891                 :           1 :                 UnlockBufHdr(buf);
    6892         [ -  + ]:           1 :                 if (nowait)
    6893                 :           0 :                         return false;
    6894                 :           1 :                 WaitIO(buf);
    6895                 :             :         }
    6896                 :             : 
    6897                 :             :         /* Once we get here, there is definitely no I/O active on this buffer */
    6898                 :             : 
    6899                 :             :         /* Check if someone else already did the I/O */
    6900   [ +  +  +  + ]:       47536 :         if (forInput ? (buf_state & BM_VALID) : !(buf_state & BM_DIRTY))
    6901                 :             :         {
    6902                 :       13995 :                 UnlockBufHdr(buf);
    6903                 :       13995 :                 return false;
    6904                 :             :         }
    6905                 :             : 
    6906                 :       47535 :         UnlockBufHdrExt(buf, buf_state,
    6907                 :             :                                         BM_IO_IN_PROGRESS, 0,
    6908                 :             :                                         0);
    6909                 :             : 
    6910                 :       95070 :         ResourceOwnerRememberBufferIO(CurrentResourceOwner,
    6911                 :       47535 :                                                                   BufferDescriptorGetBuffer(buf));
    6912                 :             : 
    6913                 :       47535 :         return true;
    6914                 :       61530 : }
    6915                 :             : 
    6916                 :             : /*
    6917                 :             :  * TerminateBufferIO: release a buffer we were doing I/O on
    6918                 :             :  *      (Assumptions)
    6919                 :             :  *      My process is executing IO for the buffer
    6920                 :             :  *      BM_IO_IN_PROGRESS bit is set for the buffer
    6921                 :             :  *      The buffer is Pinned
    6922                 :             :  *
    6923                 :             :  * If clear_dirty is true and BM_JUST_DIRTIED is not set, we clear the
    6924                 :             :  * buffer's BM_DIRTY flag.  This is appropriate when terminating a
    6925                 :             :  * successful write.  The check on BM_JUST_DIRTIED is necessary to avoid
    6926                 :             :  * marking the buffer clean if it was re-dirtied while we were writing.
    6927                 :             :  *
    6928                 :             :  * set_flag_bits gets ORed into the buffer's flags.  It must include
    6929                 :             :  * BM_IO_ERROR in a failure case.  For successful completion it could
    6930                 :             :  * be 0, or BM_VALID if we just finished reading in the page.
    6931                 :             :  *
    6932                 :             :  * If forget_owner is true, we release the buffer I/O from the current
    6933                 :             :  * resource owner. (forget_owner=false is used when the resource owner itself
    6934                 :             :  * is being released)
    6935                 :             :  */
    6936                 :             : void
    6937                 :       46058 : TerminateBufferIO(BufferDesc *buf, bool clear_dirty, uint64 set_flag_bits,
    6938                 :             :                                   bool forget_owner, bool release_aio)
    6939                 :             : {
    6940                 :       46058 :         uint64          buf_state;
    6941                 :       46058 :         uint64          unset_flag_bits = 0;
    6942                 :       46058 :         int                     refcount_change = 0;
    6943                 :             : 
    6944                 :       46058 :         buf_state = LockBufHdr(buf);
    6945                 :             : 
    6946         [ +  - ]:       46058 :         Assert(buf_state & BM_IO_IN_PROGRESS);
    6947                 :       46058 :         unset_flag_bits |= BM_IO_IN_PROGRESS;
    6948                 :             : 
    6949                 :             :         /* Clear earlier errors, if this IO failed, it'll be marked again */
    6950                 :       46058 :         unset_flag_bits |= BM_IO_ERROR;
    6951                 :             : 
    6952   [ +  +  -  + ]:       46058 :         if (clear_dirty && !(buf_state & BM_JUST_DIRTIED))
    6953                 :        6998 :                 unset_flag_bits |= BM_DIRTY | BM_CHECKPOINT_NEEDED;
    6954                 :             : 
    6955         [ +  + ]:       46058 :         if (release_aio)
    6956                 :             :         {
    6957                 :             :                 /* release ownership by the AIO subsystem */
    6958         [ +  - ]:        6904 :                 Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
    6959                 :        6904 :                 refcount_change = -1;
    6960                 :        6904 :                 pgaio_wref_clear(&buf->io_wref);
    6961                 :        6904 :         }
    6962                 :             : 
    6963                 :       92116 :         buf_state = UnlockBufHdrExt(buf, buf_state,
    6964                 :       46058 :                                                                 set_flag_bits, unset_flag_bits,
    6965                 :       46058 :                                                                 refcount_change);
    6966                 :             : 
    6967         [ +  + ]:       46058 :         if (forget_owner)
    6968                 :       78308 :                 ResourceOwnerForgetBufferIO(CurrentResourceOwner,
    6969                 :       39154 :                                                                         BufferDescriptorGetBuffer(buf));
    6970                 :             : 
    6971                 :       46058 :         ConditionVariableBroadcast(BufferDescriptorGetIOCV(buf));
    6972                 :             : 
    6973                 :             :         /*
    6974                 :             :          * Support LockBufferForCleanup()
    6975                 :             :          *
    6976                 :             :          * We may have just released the last pin other than the waiter's. In most
    6977                 :             :          * cases, this backend holds another pin on the buffer. But, if, for
    6978                 :             :          * example, this backend is completing an IO issued by another backend, it
    6979                 :             :          * may be time to wake the waiter.
    6980                 :             :          */
    6981   [ +  +  +  - ]:       46058 :         if (release_aio && (buf_state & BM_PIN_COUNT_WAITER))
    6982                 :           0 :                 WakePinCountWaiter(buf);
    6983                 :       46058 : }
    6984                 :             : 
    6985                 :             : /*
    6986                 :             :  * AbortBufferIO: Clean up active buffer I/O after an error.
    6987                 :             :  *
    6988                 :             :  *      All LWLocks & content locks we might have held have been released, but we
    6989                 :             :  *      haven't yet released buffer pins, so the buffer is still pinned.
    6990                 :             :  *
    6991                 :             :  *      If I/O was in progress, we always set BM_IO_ERROR, even though it's
    6992                 :             :  *      possible the error condition wasn't related to the I/O.
    6993                 :             :  *
    6994                 :             :  *  Note: this does not remove the buffer I/O from the resource owner.
    6995                 :             :  *  That's correct when we're releasing the whole resource owner, but
    6996                 :             :  *  beware if you use this in other contexts.
    6997                 :             :  */
    6998                 :             : static void
    6999                 :           0 : AbortBufferIO(Buffer buffer)
    7000                 :             : {
    7001                 :           0 :         BufferDesc *buf_hdr = GetBufferDescriptor(buffer - 1);
    7002                 :           0 :         uint64          buf_state;
    7003                 :             : 
    7004                 :           0 :         buf_state = LockBufHdr(buf_hdr);
    7005         [ #  # ]:           0 :         Assert(buf_state & (BM_IO_IN_PROGRESS | BM_TAG_VALID));
    7006                 :             : 
    7007         [ #  # ]:           0 :         if (!(buf_state & BM_VALID))
    7008                 :             :         {
    7009         [ #  # ]:           0 :                 Assert(!(buf_state & BM_DIRTY));
    7010                 :           0 :                 UnlockBufHdr(buf_hdr);
    7011                 :           0 :         }
    7012                 :             :         else
    7013                 :             :         {
    7014         [ #  # ]:           0 :                 Assert(buf_state & BM_DIRTY);
    7015                 :           0 :                 UnlockBufHdr(buf_hdr);
    7016                 :             : 
    7017                 :             :                 /* Issue notice if this is not the first failure... */
    7018         [ #  # ]:           0 :                 if (buf_state & BM_IO_ERROR)
    7019                 :             :                 {
    7020                 :             :                         /* Buffer is pinned, so we can read tag without spinlock */
    7021   [ #  #  #  # ]:           0 :                         ereport(WARNING,
    7022                 :             :                                         (errcode(ERRCODE_IO_ERROR),
    7023                 :             :                                          errmsg("could not write block %u of %s",
    7024                 :             :                                                         buf_hdr->tag.blockNum,
    7025                 :             :                                                         relpathperm(BufTagGetRelFileLocator(&buf_hdr->tag),
    7026                 :             :                                                                                 BufTagGetForkNum(&buf_hdr->tag)).str),
    7027                 :             :                                          errdetail("Multiple failures --- write error might be permanent.")));
    7028                 :           0 :                 }
    7029                 :             :         }
    7030                 :             : 
    7031                 :           0 :         TerminateBufferIO(buf_hdr, false, BM_IO_ERROR, false, false);
    7032                 :           0 : }
    7033                 :             : 
    7034                 :             : /*
    7035                 :             :  * Error context callback for errors occurring during shared buffer writes.
    7036                 :             :  */
    7037                 :             : static void
    7038                 :           0 : shared_buffer_write_error_callback(void *arg)
    7039                 :             : {
    7040                 :           0 :         BufferDesc *bufHdr = (BufferDesc *) arg;
    7041                 :             : 
    7042                 :             :         /* Buffer is pinned, so we can read the tag without locking the spinlock */
    7043         [ #  # ]:           0 :         if (bufHdr != NULL)
    7044                 :           0 :                 errcontext("writing block %u of relation \"%s\"",
    7045                 :           0 :                                    bufHdr->tag.blockNum,
    7046                 :           0 :                                    relpathperm(BufTagGetRelFileLocator(&bufHdr->tag),
    7047                 :           0 :                                                            BufTagGetForkNum(&bufHdr->tag)).str);
    7048                 :           0 : }
    7049                 :             : 
    7050                 :             : /*
    7051                 :             :  * Error context callback for errors occurring during local buffer writes.
    7052                 :             :  */
    7053                 :             : static void
    7054                 :           0 : local_buffer_write_error_callback(void *arg)
    7055                 :             : {
    7056                 :           0 :         BufferDesc *bufHdr = (BufferDesc *) arg;
    7057                 :             : 
    7058         [ #  # ]:           0 :         if (bufHdr != NULL)
    7059                 :           0 :                 errcontext("writing block %u of relation \"%s\"",
    7060                 :           0 :                                    bufHdr->tag.blockNum,
    7061                 :           0 :                                    relpathbackend(BufTagGetRelFileLocator(&bufHdr->tag),
    7062                 :             :                                                                   MyProcNumber,
    7063                 :           0 :                                                                   BufTagGetForkNum(&bufHdr->tag)).str);
    7064                 :           0 : }
    7065                 :             : 
    7066                 :             : /*
    7067                 :             :  * RelFileLocator qsort/bsearch comparator; see RelFileLocatorEquals.
    7068                 :             :  */
    7069                 :             : static int
    7070                 :     2573423 : rlocator_comparator(const void *p1, const void *p2)
    7071                 :             : {
    7072                 :     2573423 :         RelFileLocator n1 = *(const RelFileLocator *) p1;
    7073                 :     2573423 :         RelFileLocator n2 = *(const RelFileLocator *) p2;
    7074                 :             : 
    7075         [ +  + ]:     2573423 :         if (n1.relNumber < n2.relNumber)
    7076                 :     2559066 :                 return -1;
    7077         [ +  + ]:       14357 :         else if (n1.relNumber > n2.relNumber)
    7078                 :       13819 :                 return 1;
    7079                 :             : 
    7080         [ -  + ]:         538 :         if (n1.dbOid < n2.dbOid)
    7081                 :           0 :                 return -1;
    7082         [ -  + ]:         538 :         else if (n1.dbOid > n2.dbOid)
    7083                 :           0 :                 return 1;
    7084                 :             : 
    7085         [ -  + ]:         538 :         if (n1.spcOid < n2.spcOid)
    7086                 :           0 :                 return -1;
    7087         [ -  + ]:         538 :         else if (n1.spcOid > n2.spcOid)
    7088                 :           0 :                 return 1;
    7089                 :             :         else
    7090                 :         538 :                 return 0;
    7091                 :     2573423 : }
    7092                 :             : 
    7093                 :             : /*
    7094                 :             :  * Lock buffer header - set BM_LOCKED in buffer state.
    7095                 :             :  */
    7096                 :             : uint64
    7097                 :     1279465 : LockBufHdr(BufferDesc *desc)
    7098                 :             : {
    7099                 :     1279465 :         uint64          old_buf_state;
    7100                 :             : 
    7101         [ +  - ]:     1279465 :         Assert(!BufferIsLocal(BufferDescriptorGetBuffer(desc)));
    7102                 :             : 
    7103                 :     1279628 :         while (true)
    7104                 :             :         {
    7105                 :             :                 /*
    7106                 :             :                  * Always try once to acquire the lock directly, without setting up
    7107                 :             :                  * the spin-delay infrastructure. The work necessary for that shows up
    7108                 :             :                  * in profiles and is rarely necessary.
    7109                 :             :                  */
    7110                 :     1279628 :                 old_buf_state = pg_atomic_fetch_or_u64(&desc->state, BM_LOCKED);
    7111         [ +  + ]:     1279628 :                 if (likely(!(old_buf_state & BM_LOCKED)))
    7112                 :     1279465 :                         break;                          /* got lock */
    7113                 :             : 
    7114                 :             :                 /* and then spin without atomic operations until lock is released */
    7115                 :             :                 {
    7116                 :         163 :                         SpinDelayStatus delayStatus;
    7117                 :             : 
    7118                 :         163 :                         init_local_spin_delay(&delayStatus);
    7119                 :             : 
    7120         [ +  + ]:         576 :                         while (old_buf_state & BM_LOCKED)
    7121                 :             :                         {
    7122                 :         413 :                                 perform_spin_delay(&delayStatus);
    7123                 :         413 :                                 old_buf_state = pg_atomic_read_u64(&desc->state);
    7124                 :             :                         }
    7125                 :         163 :                         finish_spin_delay(&delayStatus);
    7126                 :         163 :                 }
    7127                 :             : 
    7128                 :             :                 /*
    7129                 :             :                  * Retry. The lock might obviously already be re-acquired by the time
    7130                 :             :                  * we're attempting to get it again.
    7131                 :             :                  */
    7132                 :             :         }
    7133                 :             : 
    7134                 :     2558930 :         return old_buf_state | BM_LOCKED;
    7135                 :     1279465 : }
    7136                 :             : 
    7137                 :             : /*
    7138                 :             :  * Wait until the BM_LOCKED flag isn't set anymore and return the buffer's
    7139                 :             :  * state at that point.
    7140                 :             :  *
    7141                 :             :  * Obviously the buffer could be locked by the time the value is returned, so
    7142                 :             :  * this is primarily useful in CAS style loops.
    7143                 :             :  */
    7144                 :             : pg_noinline uint64
    7145                 :         195 : WaitBufHdrUnlocked(BufferDesc *buf)
    7146                 :             : {
    7147                 :         195 :         SpinDelayStatus delayStatus;
    7148                 :         195 :         uint64          buf_state;
    7149                 :             : 
    7150                 :         195 :         init_local_spin_delay(&delayStatus);
    7151                 :             : 
    7152                 :         195 :         buf_state = pg_atomic_read_u64(&buf->state);
    7153                 :             : 
    7154         [ +  + ]:        1322 :         while (buf_state & BM_LOCKED)
    7155                 :             :         {
    7156                 :        1127 :                 perform_spin_delay(&delayStatus);
    7157                 :        1127 :                 buf_state = pg_atomic_read_u64(&buf->state);
    7158                 :             :         }
    7159                 :             : 
    7160                 :         195 :         finish_spin_delay(&delayStatus);
    7161                 :             : 
    7162                 :         390 :         return buf_state;
    7163                 :         195 : }
    7164                 :             : 
    7165                 :             : /*
    7166                 :             :  * BufferTag comparator.
    7167                 :             :  */
    7168                 :             : static inline int
    7169                 :           0 : buffertag_comparator(const BufferTag *ba, const BufferTag *bb)
    7170                 :             : {
    7171                 :           0 :         int                     ret;
    7172                 :           0 :         RelFileLocator rlocatora;
    7173                 :           0 :         RelFileLocator rlocatorb;
    7174                 :             : 
    7175                 :           0 :         rlocatora = BufTagGetRelFileLocator(ba);
    7176                 :           0 :         rlocatorb = BufTagGetRelFileLocator(bb);
    7177                 :             : 
    7178                 :           0 :         ret = rlocator_comparator(&rlocatora, &rlocatorb);
    7179                 :             : 
    7180         [ #  # ]:           0 :         if (ret != 0)
    7181                 :           0 :                 return ret;
    7182                 :             : 
    7183         [ #  # ]:           0 :         if (BufTagGetForkNum(ba) < BufTagGetForkNum(bb))
    7184                 :           0 :                 return -1;
    7185         [ #  # ]:           0 :         if (BufTagGetForkNum(ba) > BufTagGetForkNum(bb))
    7186                 :           0 :                 return 1;
    7187                 :             : 
    7188         [ #  # ]:           0 :         if (ba->blockNum < bb->blockNum)
    7189                 :           0 :                 return -1;
    7190         [ #  # ]:           0 :         if (ba->blockNum > bb->blockNum)
    7191                 :           0 :                 return 1;
    7192                 :             : 
    7193                 :           0 :         return 0;
    7194                 :           0 : }
    7195                 :             : 
    7196                 :             : /*
    7197                 :             :  * Comparator determining the writeout order in a checkpoint.
    7198                 :             :  *
    7199                 :             :  * It is important that tablespaces are compared first, the logic balancing
    7200                 :             :  * writes between tablespaces relies on it.
    7201                 :             :  */
    7202                 :             : static inline int
    7203                 :       64239 : ckpt_buforder_comparator(const CkptSortItem *a, const CkptSortItem *b)
    7204                 :             : {
    7205                 :             :         /* compare tablespace */
    7206         [ +  + ]:       64239 :         if (a->tsId < b->tsId)
    7207                 :          61 :                 return -1;
    7208         [ +  + ]:       64178 :         else if (a->tsId > b->tsId)
    7209                 :         221 :                 return 1;
    7210                 :             :         /* compare relation */
    7211         [ +  + ]:       63957 :         if (a->relNumber < b->relNumber)
    7212                 :       12490 :                 return -1;
    7213         [ +  + ]:       51467 :         else if (a->relNumber > b->relNumber)
    7214                 :       12338 :                 return 1;
    7215                 :             :         /* compare fork */
    7216         [ +  + ]:       39129 :         else if (a->forkNum < b->forkNum)
    7217                 :         583 :                 return -1;
    7218         [ +  + ]:       38546 :         else if (a->forkNum > b->forkNum)
    7219                 :         712 :                 return 1;
    7220                 :             :         /* compare block number */
    7221         [ +  + ]:       37834 :         else if (a->blockNum < b->blockNum)
    7222                 :       19491 :                 return -1;
    7223         [ +  - ]:       18343 :         else if (a->blockNum > b->blockNum)
    7224                 :       18343 :                 return 1;
    7225                 :             :         /* equal page IDs are unlikely, but not impossible */
    7226                 :           0 :         return 0;
    7227                 :       64239 : }
    7228                 :             : 
    7229                 :             : /*
    7230                 :             :  * Comparator for a Min-Heap over the per-tablespace checkpoint completion
    7231                 :             :  * progress.
    7232                 :             :  */
    7233                 :             : static int
    7234                 :        4450 : ts_ckpt_progress_comparator(Datum a, Datum b, void *arg)
    7235                 :             : {
    7236                 :        4450 :         CkptTsStatus *sa = (CkptTsStatus *) DatumGetPointer(a);
    7237                 :        4450 :         CkptTsStatus *sb = (CkptTsStatus *) DatumGetPointer(b);
    7238                 :             : 
    7239                 :             :         /* we want a min-heap, so return 1 for the a < b */
    7240         [ +  + ]:        4450 :         if (sa->progress < sb->progress)
    7241                 :        4356 :                 return 1;
    7242         [ +  + ]:          94 :         else if (sa->progress == sb->progress)
    7243                 :           5 :                 return 0;
    7244                 :             :         else
    7245                 :          89 :                 return -1;
    7246                 :        4450 : }
    7247                 :             : 
    7248                 :             : /*
    7249                 :             :  * Initialize a writeback context, discarding potential previous state.
    7250                 :             :  *
    7251                 :             :  * *max_pending is a pointer instead of an immediate value, so the coalesce
    7252                 :             :  * limits can easily changed by the GUC mechanism, and so calling code does
    7253                 :             :  * not have to check the current configuration. A value of 0 means that no
    7254                 :             :  * writeback control will be performed.
    7255                 :             :  */
    7256                 :             : void
    7257                 :          12 : WritebackContextInit(WritebackContext *context, int *max_pending)
    7258                 :             : {
    7259         [ +  - ]:          12 :         Assert(*max_pending <= WRITEBACK_MAX_PENDING_FLUSHES);
    7260                 :             : 
    7261                 :          12 :         context->max_pending = max_pending;
    7262                 :          12 :         context->nr_pending = 0;
    7263                 :          12 : }
    7264                 :             : 
    7265                 :             : /*
    7266                 :             :  * Add buffer to list of pending writeback requests.
    7267                 :             :  */
    7268                 :             : void
    7269                 :        5230 : ScheduleBufferTagForWriteback(WritebackContext *wb_context, IOContext io_context,
    7270                 :             :                                                           BufferTag *tag)
    7271                 :             : {
    7272                 :        5230 :         PendingWriteback *pending;
    7273                 :             : 
    7274                 :             :         /*
    7275                 :             :          * As pg_flush_data() doesn't do anything with fsync disabled, there's no
    7276                 :             :          * point in tracking in that case.
    7277                 :             :          */
    7278   [ +  -  +  - ]:        5230 :         if (io_direct_flags & IO_DIRECT_DATA ||
    7279                 :        5230 :                 !enableFsync)
    7280                 :        5230 :                 return;
    7281                 :             : 
    7282                 :             :         /*
    7283                 :             :          * Add buffer to the pending writeback array, unless writeback control is
    7284                 :             :          * disabled.
    7285                 :             :          */
    7286         [ #  # ]:           0 :         if (*wb_context->max_pending > 0)
    7287                 :             :         {
    7288         [ #  # ]:           0 :                 Assert(*wb_context->max_pending <= WRITEBACK_MAX_PENDING_FLUSHES);
    7289                 :             : 
    7290                 :           0 :                 pending = &wb_context->pending_writebacks[wb_context->nr_pending++];
    7291                 :             : 
    7292                 :           0 :                 pending->tag = *tag;
    7293                 :           0 :         }
    7294                 :             : 
    7295                 :             :         /*
    7296                 :             :          * Perform pending flushes if the writeback limit is exceeded. This
    7297                 :             :          * includes the case where previously an item has been added, but control
    7298                 :             :          * is now disabled.
    7299                 :             :          */
    7300         [ #  # ]:           0 :         if (wb_context->nr_pending >= *wb_context->max_pending)
    7301                 :           0 :                 IssuePendingWritebacks(wb_context, io_context);
    7302         [ -  + ]:        5230 : }
    7303                 :             : 
    7304                 :             : #define ST_SORT sort_pending_writebacks
    7305                 :             : #define ST_ELEMENT_TYPE PendingWriteback
    7306                 :             : #define ST_COMPARE(a, b) buffertag_comparator(&a->tag, &b->tag)
    7307                 :             : #define ST_SCOPE static
    7308                 :             : #define ST_DEFINE
    7309                 :             : #include "lib/sort_template.h"
    7310                 :             : 
    7311                 :             : /*
    7312                 :             :  * Issue all pending writeback requests, previously scheduled with
    7313                 :             :  * ScheduleBufferTagForWriteback, to the OS.
    7314                 :             :  *
    7315                 :             :  * Because this is only used to improve the OSs IO scheduling we try to never
    7316                 :             :  * error out - it's just a hint.
    7317                 :             :  */
    7318                 :             : void
    7319                 :           5 : IssuePendingWritebacks(WritebackContext *wb_context, IOContext io_context)
    7320                 :             : {
    7321                 :           5 :         instr_time      io_start;
    7322                 :           5 :         int                     i;
    7323                 :             : 
    7324         [ -  + ]:           5 :         if (wb_context->nr_pending == 0)
    7325                 :           5 :                 return;
    7326                 :             : 
    7327                 :             :         /*
    7328                 :             :          * Executing the writes in-order can make them a lot faster, and allows to
    7329                 :             :          * merge writeback requests to consecutive blocks into larger writebacks.
    7330                 :             :          */
    7331                 :           0 :         sort_pending_writebacks(wb_context->pending_writebacks,
    7332                 :           0 :                                                         wb_context->nr_pending);
    7333                 :             : 
    7334                 :           0 :         io_start = pgstat_prepare_io_time(track_io_timing);
    7335                 :             : 
    7336                 :             :         /*
    7337                 :             :          * Coalesce neighbouring writes, but nothing else. For that we iterate
    7338                 :             :          * through the, now sorted, array of pending flushes, and look forward to
    7339                 :             :          * find all neighbouring (or identical) writes.
    7340                 :             :          */
    7341         [ #  # ]:           0 :         for (i = 0; i < wb_context->nr_pending; i++)
    7342                 :             :         {
    7343                 :           0 :                 PendingWriteback *cur;
    7344                 :           0 :                 PendingWriteback *next;
    7345                 :           0 :                 SMgrRelation reln;
    7346                 :           0 :                 int                     ahead;
    7347                 :           0 :                 BufferTag       tag;
    7348                 :           0 :                 RelFileLocator currlocator;
    7349                 :           0 :                 Size            nblocks = 1;
    7350                 :             : 
    7351                 :           0 :                 cur = &wb_context->pending_writebacks[i];
    7352                 :           0 :                 tag = cur->tag;
    7353                 :           0 :                 currlocator = BufTagGetRelFileLocator(&tag);
    7354                 :             : 
    7355                 :             :                 /*
    7356                 :             :                  * Peek ahead, into following writeback requests, to see if they can
    7357                 :             :                  * be combined with the current one.
    7358                 :             :                  */
    7359         [ #  # ]:           0 :                 for (ahead = 0; i + ahead + 1 < wb_context->nr_pending; ahead++)
    7360                 :             :                 {
    7361                 :             : 
    7362                 :           0 :                         next = &wb_context->pending_writebacks[i + ahead + 1];
    7363                 :             : 
    7364                 :             :                         /* different file, stop */
    7365   [ #  #  #  #  :           0 :                         if (!RelFileLocatorEquals(currlocator,
                   #  # ]
    7366         [ #  # ]:           0 :                                                                           BufTagGetRelFileLocator(&next->tag)) ||
    7367                 :           0 :                                 BufTagGetForkNum(&cur->tag) != BufTagGetForkNum(&next->tag))
    7368                 :           0 :                                 break;
    7369                 :             : 
    7370                 :             :                         /* ok, block queued twice, skip */
    7371         [ #  # ]:           0 :                         if (cur->tag.blockNum == next->tag.blockNum)
    7372                 :           0 :                                 continue;
    7373                 :             : 
    7374                 :             :                         /* only merge consecutive writes */
    7375         [ #  # ]:           0 :                         if (cur->tag.blockNum + 1 != next->tag.blockNum)
    7376                 :           0 :                                 break;
    7377                 :             : 
    7378                 :           0 :                         nblocks++;
    7379                 :           0 :                         cur = next;
    7380                 :           0 :                 }
    7381                 :             : 
    7382                 :           0 :                 i += ahead;
    7383                 :             : 
    7384                 :             :                 /* and finally tell the kernel to write the data to storage */
    7385                 :           0 :                 reln = smgropen(currlocator, INVALID_PROC_NUMBER);
    7386                 :           0 :                 smgrwriteback(reln, BufTagGetForkNum(&tag), tag.blockNum, nblocks);
    7387                 :           0 :         }
    7388                 :             : 
    7389                 :             :         /*
    7390                 :             :          * Assume that writeback requests are only issued for buffers containing
    7391                 :             :          * blocks of permanent relations.
    7392                 :             :          */
    7393                 :           0 :         pgstat_count_io_op_time(IOOBJECT_RELATION, io_context,
    7394                 :           0 :                                                         IOOP_WRITEBACK, io_start, wb_context->nr_pending, 0);
    7395                 :             : 
    7396                 :           0 :         wb_context->nr_pending = 0;
    7397         [ -  + ]:           5 : }
    7398                 :             : 
    7399                 :             : /* ResourceOwner callbacks */
    7400                 :             : 
    7401                 :             : static void
    7402                 :           0 : ResOwnerReleaseBufferIO(Datum res)
    7403                 :             : {
    7404                 :           0 :         Buffer          buffer = DatumGetInt32(res);
    7405                 :             : 
    7406                 :           0 :         AbortBufferIO(buffer);
    7407                 :           0 : }
    7408                 :             : 
    7409                 :             : static char *
    7410                 :           0 : ResOwnerPrintBufferIO(Datum res)
    7411                 :             : {
    7412                 :           0 :         Buffer          buffer = DatumGetInt32(res);
    7413                 :             : 
    7414                 :           0 :         return psprintf("lost track of buffer IO on buffer %d", buffer);
    7415                 :           0 : }
    7416                 :             : 
    7417                 :             : /*
    7418                 :             :  * Release buffer as part of resource owner cleanup. This will only be called
    7419                 :             :  * if the buffer is pinned. If this backend held the content lock at the time
    7420                 :             :  * of the error we also need to release that (note that it is not possible to
    7421                 :             :  * hold a content lock without a pin).
    7422                 :             :  */
    7423                 :             : static void
    7424                 :        2211 : ResOwnerReleaseBuffer(Datum res)
    7425                 :             : {
    7426                 :        2211 :         Buffer          buffer = DatumGetInt32(res);
    7427                 :             : 
    7428                 :             :         /* Like ReleaseBuffer, but don't call ResourceOwnerForgetBuffer */
    7429         [ +  - ]:        2211 :         if (!BufferIsValid(buffer))
    7430   [ #  #  #  # ]:           0 :                 elog(ERROR, "bad buffer ID: %d", buffer);
    7431                 :             : 
    7432         [ +  + ]:        2211 :         if (BufferIsLocal(buffer))
    7433                 :        1006 :                 UnpinLocalBufferNoOwner(buffer);
    7434                 :             :         else
    7435                 :             :         {
    7436                 :        1205 :                 PrivateRefCountEntry *ref;
    7437                 :             : 
    7438                 :        1205 :                 ref = GetPrivateRefCountEntry(buffer, false);
    7439                 :             : 
    7440                 :             :                 /* not having a private refcount would imply resowner corruption */
    7441         [ +  - ]:        1205 :                 Assert(ref != NULL);
    7442                 :             : 
    7443                 :             :                 /*
    7444                 :             :                  * If the buffer was locked at the time of the resowner release,
    7445                 :             :                  * release the lock now. This should only happen after errors.
    7446                 :             :                  */
    7447         [ +  + ]:        1205 :                 if (ref->data.lockmode != BUFFER_LOCK_UNLOCK)
    7448                 :             :                 {
    7449                 :           4 :                         BufferDesc *buf = GetBufferDescriptor(buffer - 1);
    7450                 :             : 
    7451                 :           4 :                         HOLD_INTERRUPTS();      /* match the upcoming RESUME_INTERRUPTS */
    7452                 :           4 :                         BufferLockUnlock(buffer, buf);
    7453                 :           4 :                 }
    7454                 :             : 
    7455                 :        1205 :                 UnpinBufferNoOwner(GetBufferDescriptor(buffer - 1));
    7456                 :        1205 :         }
    7457                 :        2211 : }
    7458                 :             : 
    7459                 :             : static char *
    7460                 :           0 : ResOwnerPrintBuffer(Datum res)
    7461                 :             : {
    7462                 :           0 :         return DebugPrintBufferRefcount(DatumGetInt32(res));
    7463                 :             : }
    7464                 :             : 
    7465                 :             : /*
    7466                 :             :  * Helper function to evict unpinned buffer whose buffer header lock is
    7467                 :             :  * already acquired.
    7468                 :             :  */
    7469                 :             : static bool
    7470                 :           0 : EvictUnpinnedBufferInternal(BufferDesc *desc, bool *buffer_flushed)
    7471                 :             : {
    7472                 :           0 :         uint64          buf_state;
    7473                 :           0 :         bool            result;
    7474                 :             : 
    7475                 :           0 :         *buffer_flushed = false;
    7476                 :             : 
    7477                 :           0 :         buf_state = pg_atomic_read_u64(&(desc->state));
    7478         [ #  # ]:           0 :         Assert(buf_state & BM_LOCKED);
    7479                 :             : 
    7480         [ #  # ]:           0 :         if ((buf_state & BM_VALID) == 0)
    7481                 :             :         {
    7482                 :           0 :                 UnlockBufHdr(desc);
    7483                 :           0 :                 return false;
    7484                 :             :         }
    7485                 :             : 
    7486                 :             :         /* Check that it's not pinned already. */
    7487         [ #  # ]:           0 :         if (BUF_STATE_GET_REFCOUNT(buf_state) > 0)
    7488                 :             :         {
    7489                 :           0 :                 UnlockBufHdr(desc);
    7490                 :           0 :                 return false;
    7491                 :             :         }
    7492                 :             : 
    7493                 :           0 :         PinBuffer_Locked(desc);         /* releases spinlock */
    7494                 :             : 
    7495                 :             :         /* If it was dirty, try to clean it once. */
    7496         [ #  # ]:           0 :         if (buf_state & BM_DIRTY)
    7497                 :             :         {
    7498                 :           0 :                 FlushUnlockedBuffer(desc, NULL, IOOBJECT_RELATION, IOCONTEXT_NORMAL);
    7499                 :           0 :                 *buffer_flushed = true;
    7500                 :           0 :         }
    7501                 :             : 
    7502                 :             :         /* This will return false if it becomes dirty or someone else pins it. */
    7503                 :           0 :         result = InvalidateVictimBuffer(desc);
    7504                 :             : 
    7505                 :           0 :         UnpinBuffer(desc);
    7506                 :             : 
    7507                 :           0 :         return result;
    7508                 :           0 : }
    7509                 :             : 
    7510                 :             : /*
    7511                 :             :  * Try to evict the current block in a shared buffer.
    7512                 :             :  *
    7513                 :             :  * This function is intended for testing/development use only!
    7514                 :             :  *
    7515                 :             :  * To succeed, the buffer must not be pinned on entry, so if the caller had a
    7516                 :             :  * particular block in mind, it might already have been replaced by some other
    7517                 :             :  * block by the time this function runs.  It's also unpinned on return, so the
    7518                 :             :  * buffer might be occupied again by the time control is returned, potentially
    7519                 :             :  * even by the same block.  This inherent raciness without other interlocking
    7520                 :             :  * makes the function unsuitable for non-testing usage.
    7521                 :             :  *
    7522                 :             :  * *buffer_flushed is set to true if the buffer was dirty and has been
    7523                 :             :  * flushed, false otherwise.  However, *buffer_flushed=true does not
    7524                 :             :  * necessarily mean that we flushed the buffer, it could have been flushed by
    7525                 :             :  * someone else.
    7526                 :             :  *
    7527                 :             :  * Returns true if the buffer was valid and it has now been made invalid.
    7528                 :             :  * Returns false if it wasn't valid, if it couldn't be evicted due to a pin,
    7529                 :             :  * or if the buffer becomes dirty again while we're trying to write it out.
    7530                 :             :  */
    7531                 :             : bool
    7532                 :           0 : EvictUnpinnedBuffer(Buffer buf, bool *buffer_flushed)
    7533                 :             : {
    7534                 :           0 :         BufferDesc *desc;
    7535                 :             : 
    7536         [ #  # ]:           0 :         Assert(BufferIsValid(buf) && !BufferIsLocal(buf));
    7537                 :             : 
    7538                 :             :         /* Make sure we can pin the buffer. */
    7539                 :           0 :         ResourceOwnerEnlarge(CurrentResourceOwner);
    7540                 :           0 :         ReservePrivateRefCountEntry();
    7541                 :             : 
    7542                 :           0 :         desc = GetBufferDescriptor(buf - 1);
    7543                 :           0 :         LockBufHdr(desc);
    7544                 :             : 
    7545                 :           0 :         return EvictUnpinnedBufferInternal(desc, buffer_flushed);
    7546                 :           0 : }
    7547                 :             : 
    7548                 :             : /*
    7549                 :             :  * Try to evict all the shared buffers.
    7550                 :             :  *
    7551                 :             :  * This function is intended for testing/development use only! See
    7552                 :             :  * EvictUnpinnedBuffer().
    7553                 :             :  *
    7554                 :             :  * The buffers_* parameters are mandatory and indicate the total count of
    7555                 :             :  * buffers that:
    7556                 :             :  * - buffers_evicted - were evicted
    7557                 :             :  * - buffers_flushed - were flushed
    7558                 :             :  * - buffers_skipped - could not be evicted
    7559                 :             :  */
    7560                 :             : void
    7561                 :           0 : EvictAllUnpinnedBuffers(int32 *buffers_evicted, int32 *buffers_flushed,
    7562                 :             :                                                 int32 *buffers_skipped)
    7563                 :             : {
    7564                 :           0 :         *buffers_evicted = 0;
    7565                 :           0 :         *buffers_skipped = 0;
    7566                 :           0 :         *buffers_flushed = 0;
    7567                 :             : 
    7568         [ #  # ]:           0 :         for (int buf = 1; buf <= NBuffers; buf++)
    7569                 :             :         {
    7570                 :           0 :                 BufferDesc *desc = GetBufferDescriptor(buf - 1);
    7571                 :           0 :                 uint64          buf_state;
    7572                 :           0 :                 bool            buffer_flushed;
    7573                 :             : 
    7574         [ #  # ]:           0 :                 CHECK_FOR_INTERRUPTS();
    7575                 :             : 
    7576                 :           0 :                 buf_state = pg_atomic_read_u64(&desc->state);
    7577         [ #  # ]:           0 :                 if (!(buf_state & BM_VALID))
    7578                 :           0 :                         continue;
    7579                 :             : 
    7580                 :           0 :                 ResourceOwnerEnlarge(CurrentResourceOwner);
    7581                 :           0 :                 ReservePrivateRefCountEntry();
    7582                 :             : 
    7583                 :           0 :                 LockBufHdr(desc);
    7584                 :             : 
    7585         [ #  # ]:           0 :                 if (EvictUnpinnedBufferInternal(desc, &buffer_flushed))
    7586                 :           0 :                         (*buffers_evicted)++;
    7587                 :             :                 else
    7588                 :           0 :                         (*buffers_skipped)++;
    7589                 :             : 
    7590         [ #  # ]:           0 :                 if (buffer_flushed)
    7591                 :           0 :                         (*buffers_flushed)++;
    7592      [ #  #  # ]:           0 :         }
    7593                 :           0 : }
    7594                 :             : 
    7595                 :             : /*
    7596                 :             :  * Try to evict all the shared buffers containing provided relation's pages.
    7597                 :             :  *
    7598                 :             :  * This function is intended for testing/development use only! See
    7599                 :             :  * EvictUnpinnedBuffer().
    7600                 :             :  *
    7601                 :             :  * The caller must hold at least AccessShareLock on the relation to prevent
    7602                 :             :  * the relation from being dropped.
    7603                 :             :  *
    7604                 :             :  * The buffers_* parameters are mandatory and indicate the total count of
    7605                 :             :  * buffers that:
    7606                 :             :  * - buffers_evicted - were evicted
    7607                 :             :  * - buffers_flushed - were flushed
    7608                 :             :  * - buffers_skipped - could not be evicted
    7609                 :             :  */
    7610                 :             : void
    7611                 :           0 : EvictRelUnpinnedBuffers(Relation rel, int32 *buffers_evicted,
    7612                 :             :                                                 int32 *buffers_flushed, int32 *buffers_skipped)
    7613                 :             : {
    7614         [ #  # ]:           0 :         Assert(!RelationUsesLocalBuffers(rel));
    7615                 :             : 
    7616                 :           0 :         *buffers_skipped = 0;
    7617                 :           0 :         *buffers_evicted = 0;
    7618                 :           0 :         *buffers_flushed = 0;
    7619                 :             : 
    7620         [ #  # ]:           0 :         for (int buf = 1; buf <= NBuffers; buf++)
    7621                 :             :         {
    7622                 :           0 :                 BufferDesc *desc = GetBufferDescriptor(buf - 1);
    7623                 :           0 :                 uint64          buf_state = pg_atomic_read_u64(&(desc->state));
    7624                 :           0 :                 bool            buffer_flushed;
    7625                 :             : 
    7626         [ #  # ]:           0 :                 CHECK_FOR_INTERRUPTS();
    7627                 :             : 
    7628                 :             :                 /* An unlocked precheck should be safe and saves some cycles. */
    7629   [ #  #  #  # ]:           0 :                 if ((buf_state & BM_VALID) == 0 ||
    7630                 :           0 :                         !BufTagMatchesRelFileLocator(&desc->tag, &rel->rd_locator))
    7631                 :           0 :                         continue;
    7632                 :             : 
    7633                 :             :                 /* Make sure we can pin the buffer. */
    7634                 :           0 :                 ResourceOwnerEnlarge(CurrentResourceOwner);
    7635                 :           0 :                 ReservePrivateRefCountEntry();
    7636                 :             : 
    7637                 :           0 :                 buf_state = LockBufHdr(desc);
    7638                 :             : 
    7639                 :             :                 /* recheck, could have changed without the lock */
    7640   [ #  #  #  # ]:           0 :                 if ((buf_state & BM_VALID) == 0 ||
    7641                 :           0 :                         !BufTagMatchesRelFileLocator(&desc->tag, &rel->rd_locator))
    7642                 :             :                 {
    7643                 :           0 :                         UnlockBufHdr(desc);
    7644                 :           0 :                         continue;
    7645                 :             :                 }
    7646                 :             : 
    7647         [ #  # ]:           0 :                 if (EvictUnpinnedBufferInternal(desc, &buffer_flushed))
    7648                 :           0 :                         (*buffers_evicted)++;
    7649                 :             :                 else
    7650                 :           0 :                         (*buffers_skipped)++;
    7651                 :             : 
    7652         [ #  # ]:           0 :                 if (buffer_flushed)
    7653                 :           0 :                         (*buffers_flushed)++;
    7654      [ #  #  # ]:           0 :         }
    7655                 :           0 : }
    7656                 :             : 
    7657                 :             : /*
    7658                 :             :  * Helper function to mark unpinned buffer dirty whose buffer header lock is
    7659                 :             :  * already acquired.
    7660                 :             :  */
    7661                 :             : static bool
    7662                 :           0 : MarkDirtyUnpinnedBufferInternal(Buffer buf, BufferDesc *desc,
    7663                 :             :                                                                 bool *buffer_already_dirty)
    7664                 :             : {
    7665                 :           0 :         uint64          buf_state;
    7666                 :           0 :         bool            result = false;
    7667                 :             : 
    7668                 :           0 :         *buffer_already_dirty = false;
    7669                 :             : 
    7670                 :           0 :         buf_state = pg_atomic_read_u64(&(desc->state));
    7671         [ #  # ]:           0 :         Assert(buf_state & BM_LOCKED);
    7672                 :             : 
    7673         [ #  # ]:           0 :         if ((buf_state & BM_VALID) == 0)
    7674                 :             :         {
    7675                 :           0 :                 UnlockBufHdr(desc);
    7676                 :           0 :                 return false;
    7677                 :             :         }
    7678                 :             : 
    7679                 :             :         /* Check that it's not pinned already. */
    7680         [ #  # ]:           0 :         if (BUF_STATE_GET_REFCOUNT(buf_state) > 0)
    7681                 :             :         {
    7682                 :           0 :                 UnlockBufHdr(desc);
    7683                 :           0 :                 return false;
    7684                 :             :         }
    7685                 :             : 
    7686                 :             :         /* Pin the buffer and then release the buffer spinlock */
    7687                 :           0 :         PinBuffer_Locked(desc);
    7688                 :             : 
    7689                 :             :         /* If it was not already dirty, mark it as dirty. */
    7690         [ #  # ]:           0 :         if (!(buf_state & BM_DIRTY))
    7691                 :             :         {
    7692                 :           0 :                 BufferLockAcquire(buf, desc, BUFFER_LOCK_EXCLUSIVE);
    7693                 :           0 :                 MarkBufferDirty(buf);
    7694                 :           0 :                 result = true;
    7695                 :           0 :                 BufferLockUnlock(buf, desc);
    7696                 :           0 :         }
    7697                 :             :         else
    7698                 :           0 :                 *buffer_already_dirty = true;
    7699                 :             : 
    7700                 :           0 :         UnpinBuffer(desc);
    7701                 :             : 
    7702                 :           0 :         return result;
    7703                 :           0 : }
    7704                 :             : 
    7705                 :             : /*
    7706                 :             :  * Try to mark the provided shared buffer as dirty.
    7707                 :             :  *
    7708                 :             :  * This function is intended for testing/development use only!
    7709                 :             :  *
    7710                 :             :  * Same as EvictUnpinnedBuffer() but with MarkBufferDirty() call inside.
    7711                 :             :  *
    7712                 :             :  * The buffer_already_dirty parameter is mandatory and indicate if the buffer
    7713                 :             :  * could not be dirtied because it is already dirty.
    7714                 :             :  *
    7715                 :             :  * Returns true if the buffer has successfully been marked as dirty.
    7716                 :             :  */
    7717                 :             : bool
    7718                 :           0 : MarkDirtyUnpinnedBuffer(Buffer buf, bool *buffer_already_dirty)
    7719                 :             : {
    7720                 :           0 :         BufferDesc *desc;
    7721                 :           0 :         bool            buffer_dirtied = false;
    7722                 :             : 
    7723         [ #  # ]:           0 :         Assert(!BufferIsLocal(buf));
    7724                 :             : 
    7725                 :             :         /* Make sure we can pin the buffer. */
    7726                 :           0 :         ResourceOwnerEnlarge(CurrentResourceOwner);
    7727                 :           0 :         ReservePrivateRefCountEntry();
    7728                 :             : 
    7729                 :           0 :         desc = GetBufferDescriptor(buf - 1);
    7730                 :           0 :         LockBufHdr(desc);
    7731                 :             : 
    7732                 :           0 :         buffer_dirtied = MarkDirtyUnpinnedBufferInternal(buf, desc, buffer_already_dirty);
    7733                 :             :         /* Both can not be true at the same time */
    7734   [ #  #  #  # ]:           0 :         Assert(!(buffer_dirtied && *buffer_already_dirty));
    7735                 :             : 
    7736                 :           0 :         return buffer_dirtied;
    7737                 :           0 : }
    7738                 :             : 
    7739                 :             : /*
    7740                 :             :  * Try to mark all the shared buffers containing provided relation's pages as
    7741                 :             :  * dirty.
    7742                 :             :  *
    7743                 :             :  * This function is intended for testing/development use only! See
    7744                 :             :  * MarkDirtyUnpinnedBuffer().
    7745                 :             :  *
    7746                 :             :  * The buffers_* parameters are mandatory and indicate the total count of
    7747                 :             :  * buffers that:
    7748                 :             :  * - buffers_dirtied - were dirtied
    7749                 :             :  * - buffers_already_dirty - were already dirty
    7750                 :             :  * - buffers_skipped - could not be dirtied because of a reason different
    7751                 :             :  * than a buffer being already dirty.
    7752                 :             :  */
    7753                 :             : void
    7754                 :           0 : MarkDirtyRelUnpinnedBuffers(Relation rel,
    7755                 :             :                                                         int32 *buffers_dirtied,
    7756                 :             :                                                         int32 *buffers_already_dirty,
    7757                 :             :                                                         int32 *buffers_skipped)
    7758                 :             : {
    7759         [ #  # ]:           0 :         Assert(!RelationUsesLocalBuffers(rel));
    7760                 :             : 
    7761                 :           0 :         *buffers_dirtied = 0;
    7762                 :           0 :         *buffers_already_dirty = 0;
    7763                 :           0 :         *buffers_skipped = 0;
    7764                 :             : 
    7765         [ #  # ]:           0 :         for (int buf = 1; buf <= NBuffers; buf++)
    7766                 :             :         {
    7767                 :           0 :                 BufferDesc *desc = GetBufferDescriptor(buf - 1);
    7768                 :           0 :                 uint64          buf_state = pg_atomic_read_u64(&(desc->state));
    7769                 :           0 :                 bool            buffer_already_dirty;
    7770                 :             : 
    7771         [ #  # ]:           0 :                 CHECK_FOR_INTERRUPTS();
    7772                 :             : 
    7773                 :             :                 /* An unlocked precheck should be safe and saves some cycles. */
    7774   [ #  #  #  # ]:           0 :                 if ((buf_state & BM_VALID) == 0 ||
    7775                 :           0 :                         !BufTagMatchesRelFileLocator(&desc->tag, &rel->rd_locator))
    7776                 :           0 :                         continue;
    7777                 :             : 
    7778                 :             :                 /* Make sure we can pin the buffer. */
    7779                 :           0 :                 ResourceOwnerEnlarge(CurrentResourceOwner);
    7780                 :           0 :                 ReservePrivateRefCountEntry();
    7781                 :             : 
    7782                 :           0 :                 buf_state = LockBufHdr(desc);
    7783                 :             : 
    7784                 :             :                 /* recheck, could have changed without the lock */
    7785   [ #  #  #  # ]:           0 :                 if ((buf_state & BM_VALID) == 0 ||
    7786                 :           0 :                         !BufTagMatchesRelFileLocator(&desc->tag, &rel->rd_locator))
    7787                 :             :                 {
    7788                 :           0 :                         UnlockBufHdr(desc);
    7789                 :           0 :                         continue;
    7790                 :             :                 }
    7791                 :             : 
    7792         [ #  # ]:           0 :                 if (MarkDirtyUnpinnedBufferInternal(buf, desc, &buffer_already_dirty))
    7793                 :           0 :                         (*buffers_dirtied)++;
    7794         [ #  # ]:           0 :                 else if (buffer_already_dirty)
    7795                 :           0 :                         (*buffers_already_dirty)++;
    7796                 :             :                 else
    7797                 :           0 :                         (*buffers_skipped)++;
    7798      [ #  #  # ]:           0 :         }
    7799                 :           0 : }
    7800                 :             : 
    7801                 :             : /*
    7802                 :             :  * Try to mark all the shared buffers as dirty.
    7803                 :             :  *
    7804                 :             :  * This function is intended for testing/development use only! See
    7805                 :             :  * MarkDirtyUnpinnedBuffer().
    7806                 :             :  *
    7807                 :             :  * See MarkDirtyRelUnpinnedBuffers() above for details about the buffers_*
    7808                 :             :  * parameters.
    7809                 :             :  */
    7810                 :             : void
    7811                 :           0 : MarkDirtyAllUnpinnedBuffers(int32 *buffers_dirtied,
    7812                 :             :                                                         int32 *buffers_already_dirty,
    7813                 :             :                                                         int32 *buffers_skipped)
    7814                 :             : {
    7815                 :           0 :         *buffers_dirtied = 0;
    7816                 :           0 :         *buffers_already_dirty = 0;
    7817                 :           0 :         *buffers_skipped = 0;
    7818                 :             : 
    7819         [ #  # ]:           0 :         for (int buf = 1; buf <= NBuffers; buf++)
    7820                 :             :         {
    7821                 :           0 :                 BufferDesc *desc = GetBufferDescriptor(buf - 1);
    7822                 :           0 :                 uint64          buf_state;
    7823                 :           0 :                 bool            buffer_already_dirty;
    7824                 :             : 
    7825         [ #  # ]:           0 :                 CHECK_FOR_INTERRUPTS();
    7826                 :             : 
    7827                 :           0 :                 buf_state = pg_atomic_read_u64(&desc->state);
    7828         [ #  # ]:           0 :                 if (!(buf_state & BM_VALID))
    7829                 :           0 :                         continue;
    7830                 :             : 
    7831                 :           0 :                 ResourceOwnerEnlarge(CurrentResourceOwner);
    7832                 :           0 :                 ReservePrivateRefCountEntry();
    7833                 :             : 
    7834                 :           0 :                 LockBufHdr(desc);
    7835                 :             : 
    7836         [ #  # ]:           0 :                 if (MarkDirtyUnpinnedBufferInternal(buf, desc, &buffer_already_dirty))
    7837                 :           0 :                         (*buffers_dirtied)++;
    7838         [ #  # ]:           0 :                 else if (buffer_already_dirty)
    7839                 :           0 :                         (*buffers_already_dirty)++;
    7840                 :             :                 else
    7841                 :           0 :                         (*buffers_skipped)++;
    7842      [ #  #  # ]:           0 :         }
    7843                 :           0 : }
    7844                 :             : 
    7845                 :             : /*
    7846                 :             :  * Generic implementation of the AIO handle staging callback for readv/writev
    7847                 :             :  * on local/shared buffers.
    7848                 :             :  *
    7849                 :             :  * Each readv/writev can target multiple buffers. The buffers have already
    7850                 :             :  * been registered with the IO handle.
    7851                 :             :  *
    7852                 :             :  * To make the IO ready for execution ("staging"), we need to ensure that the
    7853                 :             :  * targeted buffers are in an appropriate state while the IO is ongoing. For
    7854                 :             :  * that the AIO subsystem needs to have its own buffer pin, otherwise an error
    7855                 :             :  * in this backend could lead to this backend's buffer pin being released as
    7856                 :             :  * part of error handling, which in turn could lead to the buffer being
    7857                 :             :  * replaced while IO is ongoing.
    7858                 :             :  */
    7859                 :             : static pg_attribute_always_inline void
    7860                 :        7052 : buffer_stage_common(PgAioHandle *ioh, bool is_write, bool is_temp)
    7861                 :             : {
    7862                 :        7052 :         uint64     *io_data;
    7863                 :        7052 :         uint8           handle_data_len;
    7864                 :        7052 :         PgAioWaitRef io_ref;
    7865                 :        7052 :         BufferTag       first PG_USED_FOR_ASSERTS_ONLY = {0};
    7866                 :             : 
    7867                 :        7052 :         io_data = pgaio_io_get_handle_data(ioh, &handle_data_len);
    7868                 :             : 
    7869                 :        7052 :         pgaio_io_get_wref(ioh, &io_ref);
    7870                 :             : 
    7871                 :             :         /* iterate over all buffers affected by the vectored readv/writev */
    7872         [ +  + ]:       18202 :         for (int i = 0; i < handle_data_len; i++)
    7873                 :             :         {
    7874                 :       11150 :                 Buffer          buffer = (Buffer) io_data[i];
    7875         [ +  + ]:       11150 :                 BufferDesc *buf_hdr = is_temp ?
    7876                 :        2769 :                         GetLocalBufferDescriptor(-buffer - 1)
    7877                 :        8381 :                         : GetBufferDescriptor(buffer - 1);
    7878                 :       11150 :                 uint64          buf_state;
    7879                 :             : 
    7880                 :             :                 /*
    7881                 :             :                  * Check that all the buffers are actually ones that could conceivably
    7882                 :             :                  * be done in one IO, i.e. are sequential. This is the last
    7883                 :             :                  * buffer-aware code before IO is actually executed and confusion
    7884                 :             :                  * about which buffers are targeted by IO can be hard to debug, making
    7885                 :             :                  * it worth doing extra-paranoid checks.
    7886                 :             :                  */
    7887         [ +  + ]:       11150 :                 if (i == 0)
    7888                 :        7052 :                         first = buf_hdr->tag;
    7889                 :             :                 else
    7890                 :             :                 {
    7891         [ +  - ]:        4098 :                         Assert(buf_hdr->tag.relNumber == first.relNumber);
    7892         [ +  - ]:        4098 :                         Assert(buf_hdr->tag.blockNum == first.blockNum + i);
    7893                 :             :                 }
    7894                 :             : 
    7895         [ +  + ]:       11150 :                 if (is_temp)
    7896                 :        2769 :                         buf_state = pg_atomic_read_u64(&buf_hdr->state);
    7897                 :             :                 else
    7898                 :        8381 :                         buf_state = LockBufHdr(buf_hdr);
    7899                 :             : 
    7900                 :             :                 /* verify the buffer is in the expected state */
    7901         [ -  + ]:       11150 :                 Assert(buf_state & BM_TAG_VALID);
    7902         [ -  + ]:       11150 :                 if (is_write)
    7903                 :             :                 {
    7904         [ #  # ]:           0 :                         Assert(buf_state & BM_VALID);
    7905         [ #  # ]:           0 :                         Assert(buf_state & BM_DIRTY);
    7906                 :           0 :                 }
    7907                 :             :                 else
    7908                 :             :                 {
    7909         [ -  + ]:       11150 :                         Assert(!(buf_state & BM_VALID));
    7910         [ -  + ]:       11150 :                         Assert(!(buf_state & BM_DIRTY));
    7911                 :             :                 }
    7912                 :             : 
    7913                 :             :                 /* temp buffers don't use BM_IO_IN_PROGRESS */
    7914         [ +  + ]:       11150 :                 if (!is_temp)
    7915         [ -  + ]:        8381 :                         Assert(buf_state & BM_IO_IN_PROGRESS);
    7916                 :             : 
    7917         [ -  + ]:       11150 :                 Assert(BUF_STATE_GET_REFCOUNT(buf_state) >= 1);
    7918                 :             : 
    7919                 :             :                 /*
    7920                 :             :                  * Reflect that the buffer is now owned by the AIO subsystem.
    7921                 :             :                  *
    7922                 :             :                  * For local buffers: This can't be done just via LocalRefCount, as
    7923                 :             :                  * one might initially think, as this backend could error out while
    7924                 :             :                  * AIO is still in progress, releasing all the pins by the backend
    7925                 :             :                  * itself.
    7926                 :             :                  *
    7927                 :             :                  * This pin is released again in TerminateBufferIO().
    7928                 :             :                  */
    7929                 :       11150 :                 buf_hdr->io_wref = io_ref;
    7930                 :             : 
    7931         [ +  + ]:       11150 :                 if (is_temp)
    7932                 :             :                 {
    7933                 :        2769 :                         buf_state += BUF_REFCOUNT_ONE;
    7934                 :        2769 :                         pg_atomic_unlocked_write_u64(&buf_hdr->state, buf_state);
    7935                 :        2769 :                 }
    7936                 :             :                 else
    7937                 :        8381 :                         UnlockBufHdrExt(buf_hdr, buf_state, 0, 0, 1);
    7938                 :             : 
    7939                 :             :                 /*
    7940                 :             :                  * Ensure the content lock that prevents buffer modifications while
    7941                 :             :                  * the buffer is being written out is not released early due to an
    7942                 :             :                  * error.
    7943                 :             :                  */
    7944   [ -  +  #  # ]:       11150 :                 if (is_write && !is_temp)
    7945                 :             :                 {
    7946         [ #  # ]:           0 :                         Assert(BufferLockHeldByMe(buf_hdr));
    7947                 :             : 
    7948                 :             :                         /*
    7949                 :             :                          * Lock is now owned by AIO subsystem.
    7950                 :             :                          */
    7951                 :           0 :                         BufferLockDisown(buffer, buf_hdr);
    7952                 :           0 :                 }
    7953                 :             : 
    7954                 :             :                 /*
    7955                 :             :                  * Stop tracking this buffer via the resowner - the AIO system now
    7956                 :             :                  * keeps track.
    7957                 :             :                  */
    7958         [ +  + ]:       11150 :                 if (!is_temp)
    7959                 :        8381 :                         ResourceOwnerForgetBufferIO(CurrentResourceOwner, buffer);
    7960                 :       11150 :         }
    7961                 :        7052 : }
    7962                 :             : 
    7963                 :             : /*
    7964                 :             :  * Decode readv errors as encoded by buffer_readv_encode_error().
    7965                 :             :  */
    7966                 :             : static inline void
    7967                 :           0 : buffer_readv_decode_error(PgAioResult result,
    7968                 :             :                                                   bool *zeroed_any,
    7969                 :             :                                                   bool *ignored_any,
    7970                 :             :                                                   uint8 *zeroed_or_error_count,
    7971                 :             :                                                   uint8 *checkfail_count,
    7972                 :             :                                                   uint8 *first_off)
    7973                 :             : {
    7974                 :           0 :         uint32          rem_error = result.error_data;
    7975                 :             : 
    7976                 :             :         /* see static asserts in buffer_readv_encode_error */
    7977                 :             : #define READV_COUNT_BITS        7
    7978                 :             : #define READV_COUNT_MASK        ((1 << READV_COUNT_BITS) - 1)
    7979                 :             : 
    7980                 :           0 :         *zeroed_any = rem_error & 1;
    7981                 :           0 :         rem_error >>= 1;
    7982                 :             : 
    7983                 :           0 :         *ignored_any = rem_error & 1;
    7984                 :           0 :         rem_error >>= 1;
    7985                 :             : 
    7986                 :           0 :         *zeroed_or_error_count = rem_error & READV_COUNT_MASK;
    7987                 :           0 :         rem_error >>= READV_COUNT_BITS;
    7988                 :             : 
    7989                 :           0 :         *checkfail_count = rem_error & READV_COUNT_MASK;
    7990                 :           0 :         rem_error >>= READV_COUNT_BITS;
    7991                 :             : 
    7992                 :           0 :         *first_off = rem_error & READV_COUNT_MASK;
    7993                 :           0 :         rem_error >>= READV_COUNT_BITS;
    7994                 :           0 : }
    7995                 :             : 
    7996                 :             : /*
    7997                 :             :  * Helper to encode errors for buffer_readv_complete()
    7998                 :             :  *
    7999                 :             :  * Errors are encoded as follows:
    8000                 :             :  * - bit 0 indicates whether any page was zeroed (1) or not (0)
    8001                 :             :  * - bit 1 indicates whether any checksum failure was ignored (1) or not (0)
    8002                 :             :  * - next READV_COUNT_BITS bits indicate the number of errored or zeroed pages
    8003                 :             :  * - next READV_COUNT_BITS bits indicate the number of checksum failures
    8004                 :             :  * - next READV_COUNT_BITS bits indicate the first offset of the first page
    8005                 :             :  *   that was errored or zeroed or, if no errors/zeroes, the first ignored
    8006                 :             :  *   checksum
    8007                 :             :  */
    8008                 :             : static inline void
    8009                 :           0 : buffer_readv_encode_error(PgAioResult *result,
    8010                 :             :                                                   bool is_temp,
    8011                 :             :                                                   bool zeroed_any,
    8012                 :             :                                                   bool ignored_any,
    8013                 :             :                                                   uint8 error_count,
    8014                 :             :                                                   uint8 zeroed_count,
    8015                 :             :                                                   uint8 checkfail_count,
    8016                 :             :                                                   uint8 first_error_off,
    8017                 :             :                                                   uint8 first_zeroed_off,
    8018                 :             :                                                   uint8 first_ignored_off)
    8019                 :             : {
    8020                 :             : 
    8021                 :           0 :         uint8           shift = 0;
    8022                 :           0 :         uint8           zeroed_or_error_count =
    8023         [ #  # ]:           0 :                 error_count > 0 ? error_count : zeroed_count;
    8024                 :           0 :         uint8           first_off;
    8025                 :             : 
    8026                 :             :         StaticAssertDecl(PG_IOV_MAX <= 1 << READV_COUNT_BITS,
    8027                 :             :                                          "PG_IOV_MAX is bigger than reserved space for error data");
    8028                 :             :         StaticAssertDecl((1 + 1 + 3 * READV_COUNT_BITS) <= PGAIO_RESULT_ERROR_BITS,
    8029                 :             :                                          "PGAIO_RESULT_ERROR_BITS is insufficient for buffer_readv");
    8030                 :             : 
    8031                 :             :         /*
    8032                 :             :          * We only have space to encode one offset - but luckily that's good
    8033                 :             :          * enough. If there is an error, the error is the interesting offset, same
    8034                 :             :          * with a zeroed buffer vs an ignored buffer.
    8035                 :             :          */
    8036         [ #  # ]:           0 :         if (error_count > 0)
    8037                 :           0 :                 first_off = first_error_off;
    8038         [ #  # ]:           0 :         else if (zeroed_count > 0)
    8039                 :           0 :                 first_off = first_zeroed_off;
    8040                 :             :         else
    8041                 :           0 :                 first_off = first_ignored_off;
    8042                 :             : 
    8043   [ #  #  #  # ]:           0 :         Assert(!zeroed_any || error_count == 0);
    8044                 :             : 
    8045                 :           0 :         result->error_data = 0;
    8046                 :             : 
    8047                 :           0 :         result->error_data |= zeroed_any << shift;
    8048                 :           0 :         shift += 1;
    8049                 :             : 
    8050                 :           0 :         result->error_data |= ignored_any << shift;
    8051                 :           0 :         shift += 1;
    8052                 :             : 
    8053                 :           0 :         result->error_data |= ((uint32) zeroed_or_error_count) << shift;
    8054                 :           0 :         shift += READV_COUNT_BITS;
    8055                 :             : 
    8056                 :           0 :         result->error_data |= ((uint32) checkfail_count) << shift;
    8057                 :           0 :         shift += READV_COUNT_BITS;
    8058                 :             : 
    8059                 :           0 :         result->error_data |= ((uint32) first_off) << shift;
    8060                 :           0 :         shift += READV_COUNT_BITS;
    8061                 :             : 
    8062                 :           0 :         result->id = is_temp ? PGAIO_HCB_LOCAL_BUFFER_READV :
    8063                 :             :                 PGAIO_HCB_SHARED_BUFFER_READV;
    8064                 :             : 
    8065         [ #  # ]:           0 :         if (error_count > 0)
    8066                 :           0 :                 result->status = PGAIO_RS_ERROR;
    8067                 :             :         else
    8068                 :           0 :                 result->status = PGAIO_RS_WARNING;
    8069                 :             : 
    8070                 :             :         /*
    8071                 :             :          * The encoding is complicated enough to warrant cross-checking it against
    8072                 :             :          * the decode function.
    8073                 :             :          */
    8074                 :             : #ifdef USE_ASSERT_CHECKING
    8075                 :             :         {
    8076                 :           0 :                 bool            zeroed_any_2,
    8077                 :             :                                         ignored_any_2;
    8078                 :           0 :                 uint8           zeroed_or_error_count_2,
    8079                 :             :                                         checkfail_count_2,
    8080                 :             :                                         first_off_2;
    8081                 :             : 
    8082                 :           0 :                 buffer_readv_decode_error(*result,
    8083                 :             :                                                                   &zeroed_any_2, &ignored_any_2,
    8084                 :             :                                                                   &zeroed_or_error_count_2,
    8085                 :             :                                                                   &checkfail_count_2,
    8086                 :             :                                                                   &first_off_2);
    8087         [ #  # ]:           0 :                 Assert(zeroed_any == zeroed_any_2);
    8088         [ #  # ]:           0 :                 Assert(ignored_any == ignored_any_2);
    8089         [ #  # ]:           0 :                 Assert(zeroed_or_error_count == zeroed_or_error_count_2);
    8090         [ #  # ]:           0 :                 Assert(checkfail_count == checkfail_count_2);
    8091         [ #  # ]:           0 :                 Assert(first_off == first_off_2);
    8092                 :           0 :         }
    8093                 :             : #endif
    8094                 :             : 
    8095                 :             : #undef READV_COUNT_BITS
    8096                 :             : #undef READV_COUNT_MASK
    8097                 :           0 : }
    8098                 :             : 
    8099                 :             : /*
    8100                 :             :  * Helper for AIO readv completion callbacks, supporting both shared and temp
    8101                 :             :  * buffers. Gets called once for each buffer in a multi-page read.
    8102                 :             :  */
    8103                 :             : static pg_attribute_always_inline void
    8104                 :        9673 : buffer_readv_complete_one(PgAioTargetData *td, uint8 buf_off, Buffer buffer,
    8105                 :             :                                                   uint8 flags, bool failed, bool is_temp,
    8106                 :             :                                                   bool *buffer_invalid,
    8107                 :             :                                                   bool *failed_checksum,
    8108                 :             :                                                   bool *ignored_checksum,
    8109                 :             :                                                   bool *zeroed_buffer)
    8110                 :             : {
    8111         [ +  + ]:        9673 :         BufferDesc *buf_hdr = is_temp ?
    8112                 :        2769 :                 GetLocalBufferDescriptor(-buffer - 1)
    8113                 :        6904 :                 : GetBufferDescriptor(buffer - 1);
    8114                 :        9673 :         BufferTag       tag = buf_hdr->tag;
    8115                 :        9673 :         char       *bufdata = BufferGetBlock(buffer);
    8116                 :        9673 :         uint64          set_flag_bits;
    8117                 :        9673 :         int                     piv_flags;
    8118                 :             : 
    8119                 :             :         /* check that the buffer is in the expected state for a read */
    8120                 :             : #ifdef USE_ASSERT_CHECKING
    8121                 :             :         {
    8122                 :        9673 :                 uint64          buf_state = pg_atomic_read_u64(&buf_hdr->state);
    8123                 :             : 
    8124         [ +  - ]:        9673 :                 Assert(buf_state & BM_TAG_VALID);
    8125         [ +  - ]:        9673 :                 Assert(!(buf_state & BM_VALID));
    8126                 :             :                 /* temp buffers don't use BM_IO_IN_PROGRESS */
    8127         [ +  + ]:        9673 :                 if (!is_temp)
    8128         [ +  - ]:        6904 :                         Assert(buf_state & BM_IO_IN_PROGRESS);
    8129         [ +  - ]:        9673 :                 Assert(!(buf_state & BM_DIRTY));
    8130                 :        9673 :         }
    8131                 :             : #endif
    8132                 :             : 
    8133                 :        9673 :         *buffer_invalid = false;
    8134                 :        9673 :         *failed_checksum = false;
    8135                 :        9673 :         *ignored_checksum = false;
    8136                 :        9673 :         *zeroed_buffer = false;
    8137                 :             : 
    8138                 :             :         /*
    8139                 :             :          * We ask PageIsVerified() to only log the message about checksum errors,
    8140                 :             :          * as the completion might be run in any backend (or IO workers). We will
    8141                 :             :          * report checksum errors in buffer_readv_report().
    8142                 :             :          */
    8143                 :        9673 :         piv_flags = PIV_LOG_LOG;
    8144                 :             : 
    8145                 :             :         /* the local zero_damaged_pages may differ from the definer's */
    8146         [ +  - ]:        9673 :         if (flags & READ_BUFFERS_IGNORE_CHECKSUM_FAILURES)
    8147                 :           0 :                 piv_flags |= PIV_IGNORE_CHECKSUM_FAILURE;
    8148                 :             : 
    8149                 :             :         /* Check for garbage data. */
    8150         [ -  + ]:        9673 :         if (!failed)
    8151                 :             :         {
    8152                 :             :                 /*
    8153                 :             :                  * If the buffer is not currently pinned by this backend, e.g. because
    8154                 :             :                  * we're completing this IO after an error, the buffer data will have
    8155                 :             :                  * been marked as inaccessible when the buffer was unpinned. The AIO
    8156                 :             :                  * subsystem holds a pin, but that doesn't prevent the buffer from
    8157                 :             :                  * having been marked as inaccessible. The completion might also be
    8158                 :             :                  * executed in a different process.
    8159                 :             :                  */
    8160                 :             : #ifdef USE_VALGRIND
    8161                 :             :                 if (!BufferIsPinned(buffer))
    8162                 :             :                         VALGRIND_MAKE_MEM_DEFINED(bufdata, BLCKSZ);
    8163                 :             : #endif
    8164                 :             : 
    8165   [ +  -  +  - ]:       19346 :                 if (!PageIsVerified((Page) bufdata, tag.blockNum, piv_flags,
    8166                 :        9673 :                                                         failed_checksum))
    8167                 :             :                 {
    8168         [ #  # ]:           0 :                         if (flags & READ_BUFFERS_ZERO_ON_ERROR)
    8169                 :             :                         {
    8170                 :           0 :                                 memset(bufdata, 0, BLCKSZ);
    8171                 :           0 :                                 *zeroed_buffer = true;
    8172                 :           0 :                         }
    8173                 :             :                         else
    8174                 :             :                         {
    8175                 :           0 :                                 *buffer_invalid = true;
    8176                 :             :                                 /* mark buffer as having failed */
    8177                 :           0 :                                 failed = true;
    8178                 :             :                         }
    8179                 :           0 :                 }
    8180         [ +  - ]:        9673 :                 else if (*failed_checksum)
    8181                 :           0 :                         *ignored_checksum = true;
    8182                 :             : 
    8183                 :             :                 /* undo what we did above */
    8184                 :             : #ifdef USE_VALGRIND
    8185                 :             :                 if (!BufferIsPinned(buffer))
    8186                 :             :                         VALGRIND_MAKE_MEM_NOACCESS(bufdata, BLCKSZ);
    8187                 :             : #endif
    8188                 :             : 
    8189                 :             :                 /*
    8190                 :             :                  * Immediately log a message about the invalid page, but only to the
    8191                 :             :                  * server log. The reason to do so immediately is that this may be
    8192                 :             :                  * executed in a different backend than the one that originated the
    8193                 :             :                  * request. The reason to do so immediately is that the originator
    8194                 :             :                  * might not process the query result immediately (because it is busy
    8195                 :             :                  * doing another part of query processing) or at all (e.g. if it was
    8196                 :             :                  * cancelled or errored out due to another IO also failing). The
    8197                 :             :                  * definer of the IO will emit an ERROR or WARNING when processing the
    8198                 :             :                  * IO's results
    8199                 :             :                  *
    8200                 :             :                  * To avoid duplicating the code to emit these log messages, we reuse
    8201                 :             :                  * buffer_readv_report().
    8202                 :             :                  */
    8203   [ +  -  +  -  :        9673 :                 if (*buffer_invalid || *failed_checksum || *zeroed_buffer)
                   -  + ]
    8204                 :             :                 {
    8205                 :           0 :                         PgAioResult result_one = {0};
    8206                 :             : 
    8207                 :           0 :                         buffer_readv_encode_error(&result_one, is_temp,
    8208                 :           0 :                                                                           *zeroed_buffer,
    8209                 :           0 :                                                                           *ignored_checksum,
    8210                 :           0 :                                                                           *buffer_invalid,
    8211                 :           0 :                                                                           *zeroed_buffer ? 1 : 0,
    8212                 :           0 :                                                                           *failed_checksum ? 1 : 0,
    8213                 :           0 :                                                                           buf_off, buf_off, buf_off);
    8214                 :           0 :                         pgaio_result_report(result_one, td, LOG_SERVER_ONLY);
    8215                 :           0 :                 }
    8216                 :        9673 :         }
    8217                 :             : 
    8218                 :             :         /* Terminate I/O and set BM_VALID. */
    8219                 :        9673 :         set_flag_bits = failed ? BM_IO_ERROR : BM_VALID;
    8220         [ +  + ]:        9673 :         if (is_temp)
    8221                 :        2769 :                 TerminateLocalBufferIO(buf_hdr, false, set_flag_bits, true);
    8222                 :             :         else
    8223                 :        6904 :                 TerminateBufferIO(buf_hdr, false, set_flag_bits, false, true);
    8224                 :             : 
    8225                 :             :         /*
    8226                 :             :          * Call the BUFFER_READ_DONE tracepoint in the callback, even though the
    8227                 :             :          * callback may not be executed in the same backend that called
    8228                 :             :          * BUFFER_READ_START. The alternative would be to defer calling the
    8229                 :             :          * tracepoint to a later point (e.g. the local completion callback for
    8230                 :             :          * shared buffer reads), which seems even less helpful.
    8231                 :             :          */
    8232                 :        9673 :         TRACE_POSTGRESQL_BUFFER_READ_DONE(tag.forkNum,
    8233                 :             :                                                                           tag.blockNum,
    8234                 :             :                                                                           tag.spcOid,
    8235                 :             :                                                                           tag.dbOid,
    8236                 :             :                                                                           tag.relNumber,
    8237                 :             :                                                                           is_temp ? MyProcNumber : INVALID_PROC_NUMBER,
    8238                 :             :                                                                           false);
    8239                 :        9673 : }
    8240                 :             : 
    8241                 :             : /*
    8242                 :             :  * Perform completion handling of a single AIO read. This read may cover
    8243                 :             :  * multiple blocks / buffers.
    8244                 :             :  *
    8245                 :             :  * Shared between shared and local buffers, to reduce code duplication.
    8246                 :             :  */
    8247                 :             : static pg_attribute_always_inline PgAioResult
    8248                 :        6681 : buffer_readv_complete(PgAioHandle *ioh, PgAioResult prior_result,
    8249                 :             :                                           uint8 cb_data, bool is_temp)
    8250                 :             : {
    8251                 :        6681 :         PgAioResult result = prior_result;
    8252                 :        6681 :         PgAioTargetData *td = pgaio_io_get_target_data(ioh);
    8253                 :        6681 :         uint8           first_error_off = 0;
    8254                 :        6681 :         uint8           first_zeroed_off = 0;
    8255                 :        6681 :         uint8           first_ignored_off = 0;
    8256                 :        6681 :         uint8           error_count = 0;
    8257                 :        6681 :         uint8           zeroed_count = 0;
    8258                 :        6681 :         uint8           ignored_count = 0;
    8259                 :        6681 :         uint8           checkfail_count = 0;
    8260                 :        6681 :         uint64     *io_data;
    8261                 :        6681 :         uint8           handle_data_len;
    8262                 :             : 
    8263         [ +  + ]:        6681 :         if (is_temp)
    8264                 :             :         {
    8265         [ +  - ]:         582 :                 Assert(td->smgr.is_temp);
    8266         [ +  - ]:         582 :                 Assert(pgaio_io_get_owner(ioh) == MyProcNumber);
    8267                 :         582 :         }
    8268                 :             :         else
    8269         [ +  - ]:        6099 :                 Assert(!td->smgr.is_temp);
    8270                 :             : 
    8271                 :             :         /*
    8272                 :             :          * Iterate over all the buffers affected by this IO and call the
    8273                 :             :          * per-buffer completion function for each buffer.
    8274                 :             :          */
    8275                 :        6681 :         io_data = pgaio_io_get_handle_data(ioh, &handle_data_len);
    8276         [ +  + ]:       16354 :         for (uint8 buf_off = 0; buf_off < handle_data_len; buf_off++)
    8277                 :             :         {
    8278                 :        9673 :                 Buffer          buf = io_data[buf_off];
    8279                 :        9673 :                 bool            failed;
    8280                 :        9673 :                 bool            failed_verification = false;
    8281                 :        9673 :                 bool            failed_checksum = false;
    8282                 :        9673 :                 bool            zeroed_buffer = false;
    8283                 :        9673 :                 bool            ignored_checksum = false;
    8284                 :             : 
    8285         [ +  - ]:        9673 :                 Assert(BufferIsValid(buf));
    8286                 :             : 
    8287                 :             :                 /*
    8288                 :             :                  * If the entire I/O failed on a lower-level, each buffer needs to be
    8289                 :             :                  * marked as failed. In case of a partial read, the first few buffers
    8290                 :             :                  * may be ok.
    8291                 :             :                  */
    8292                 :        9673 :                 failed =
    8293                 :        9673 :                         prior_result.status == PGAIO_RS_ERROR
    8294         [ -  + ]:        9673 :                         || prior_result.result <= buf_off;
    8295                 :             : 
    8296                 :        9673 :                 buffer_readv_complete_one(td, buf_off, buf, cb_data, failed, is_temp,
    8297                 :             :                                                                   &failed_verification,
    8298                 :             :                                                                   &failed_checksum,
    8299                 :             :                                                                   &ignored_checksum,
    8300                 :             :                                                                   &zeroed_buffer);
    8301                 :             : 
    8302                 :             :                 /*
    8303                 :             :                  * Track information about the number of different kinds of error
    8304                 :             :                  * conditions across all pages, as there can be multiple pages failing
    8305                 :             :                  * verification as part of one IO.
    8306                 :             :                  */
    8307   [ -  +  #  #  :        9673 :                 if (failed_verification && !zeroed_buffer && error_count++ == 0)
                   #  # ]
    8308                 :           0 :                         first_error_off = buf_off;
    8309   [ -  +  #  # ]:        9673 :                 if (zeroed_buffer && zeroed_count++ == 0)
    8310                 :           0 :                         first_zeroed_off = buf_off;
    8311   [ -  +  #  # ]:        9673 :                 if (ignored_checksum && ignored_count++ == 0)
    8312                 :           0 :                         first_ignored_off = buf_off;
    8313         [ +  - ]:        9673 :                 if (failed_checksum)
    8314                 :           0 :                         checkfail_count++;
    8315                 :        9673 :         }
    8316                 :             : 
    8317                 :             :         /*
    8318                 :             :          * If the smgr read succeeded [partially] and page verification failed for
    8319                 :             :          * some of the pages, adjust the IO's result state appropriately.
    8320                 :             :          */
    8321   [ +  -  -  + ]:       13362 :         if (prior_result.status != PGAIO_RS_ERROR &&
    8322   [ +  -  +  - ]:        6681 :                 (error_count > 0 || ignored_count > 0 || zeroed_count > 0))
    8323                 :             :         {
    8324                 :           0 :                 buffer_readv_encode_error(&result, is_temp,
    8325                 :           0 :                                                                   zeroed_count > 0, ignored_count > 0,
    8326                 :           0 :                                                                   error_count, zeroed_count, checkfail_count,
    8327                 :           0 :                                                                   first_error_off, first_zeroed_off,
    8328                 :           0 :                                                                   first_ignored_off);
    8329                 :           0 :                 pgaio_result_report(result, td, DEBUG1);
    8330                 :           0 :         }
    8331                 :             : 
    8332                 :             :         /*
    8333                 :             :          * For shared relations this reporting is done in
    8334                 :             :          * shared_buffer_readv_complete_local().
    8335                 :             :          */
    8336   [ +  +  +  - ]:        6681 :         if (is_temp && checkfail_count > 0)
    8337                 :           0 :                 pgstat_report_checksum_failures_in_db(td->smgr.rlocator.dbOid,
    8338                 :           0 :                                                                                           checkfail_count);
    8339                 :             : 
    8340                 :             :         return result;
    8341                 :        6681 : }
    8342                 :             : 
    8343                 :             : /*
    8344                 :             :  * AIO error reporting callback for aio_shared_buffer_readv_cb and
    8345                 :             :  * aio_local_buffer_readv_cb.
    8346                 :             :  *
    8347                 :             :  * The error is encoded / decoded in buffer_readv_encode_error() /
    8348                 :             :  * buffer_readv_decode_error().
    8349                 :             :  */
    8350                 :             : static void
    8351                 :           0 : buffer_readv_report(PgAioResult result, const PgAioTargetData *td,
    8352                 :             :                                         int elevel)
    8353                 :             : {
    8354                 :           0 :         int                     nblocks = td->smgr.nblocks;
    8355                 :           0 :         BlockNumber first = td->smgr.blockNum;
    8356                 :           0 :         BlockNumber last = first + nblocks - 1;
    8357                 :           0 :         ProcNumber      errProc =
    8358         [ #  # ]:           0 :                 td->smgr.is_temp ? MyProcNumber : INVALID_PROC_NUMBER;
    8359                 :           0 :         RelPathStr      rpath =
    8360                 :           0 :                 relpathbackend(td->smgr.rlocator, errProc, td->smgr.forkNum);
    8361                 :           0 :         bool            zeroed_any,
    8362                 :             :                                 ignored_any;
    8363                 :           0 :         uint8           zeroed_or_error_count,
    8364                 :             :                                 checkfail_count,
    8365                 :             :                                 first_off;
    8366                 :           0 :         uint8           affected_count;
    8367                 :           0 :         const char *msg_one,
    8368                 :             :                            *msg_mult,
    8369                 :             :                            *det_mult,
    8370                 :             :                            *hint_mult;
    8371                 :             : 
    8372                 :           0 :         buffer_readv_decode_error(result, &zeroed_any, &ignored_any,
    8373                 :             :                                                           &zeroed_or_error_count,
    8374                 :             :                                                           &checkfail_count,
    8375                 :             :                                                           &first_off);
    8376                 :             : 
    8377                 :             :         /*
    8378                 :             :          * Treat a read that had both zeroed buffers *and* ignored checksums as a
    8379                 :             :          * special case, it's too irregular to be emitted the same way as the
    8380                 :             :          * other cases.
    8381                 :             :          */
    8382   [ #  #  #  # ]:           0 :         if (zeroed_any && ignored_any)
    8383                 :             :         {
    8384         [ #  # ]:           0 :                 Assert(zeroed_any && ignored_any);
    8385         [ #  # ]:           0 :                 Assert(nblocks > 1); /* same block can't be both zeroed and ignored */
    8386         [ #  # ]:           0 :                 Assert(result.status != PGAIO_RS_ERROR);
    8387                 :           0 :                 affected_count = zeroed_or_error_count;
    8388                 :             : 
    8389   [ #  #  #  #  :           0 :                 ereport(elevel,
          #  #  #  #  #  
                #  #  # ]
    8390                 :             :                                 errcode(ERRCODE_DATA_CORRUPTED),
    8391                 :             :                                 errmsg("zeroing %u page(s) and ignoring %u checksum failure(s) among blocks %u..%u of relation \"%s\"",
    8392                 :             :                                            affected_count, checkfail_count, first, last, rpath.str),
    8393                 :             :                                 affected_count > 1 ?
    8394                 :             :                                 errdetail("Block %u held the first zeroed page.",
    8395                 :             :                                                   first + first_off) : 0,
    8396                 :             :                                 errhint_plural("See server log for details about the other %d invalid block.",
    8397                 :             :                                                            "See server log for details about the other %d invalid blocks.",
    8398                 :             :                                                            affected_count + checkfail_count - 1,
    8399                 :             :                                                            affected_count + checkfail_count - 1));
    8400                 :           0 :                 return;
    8401                 :             :         }
    8402                 :             : 
    8403                 :             :         /*
    8404                 :             :          * The other messages are highly repetitive. To avoid duplicating a long
    8405                 :             :          * and complicated ereport(), gather the translated format strings
    8406                 :             :          * separately and then do one common ereport.
    8407                 :             :          */
    8408         [ #  # ]:           0 :         if (result.status == PGAIO_RS_ERROR)
    8409                 :             :         {
    8410         [ #  # ]:           0 :                 Assert(!zeroed_any);    /* can't have invalid pages when zeroing them */
    8411                 :           0 :                 affected_count = zeroed_or_error_count;
    8412                 :           0 :                 msg_one = _("invalid page in block %u of relation \"%s\"");
    8413                 :           0 :                 msg_mult = _("%u invalid pages among blocks %u..%u of relation \"%s\"");
    8414                 :           0 :                 det_mult = _("Block %u held the first invalid page.");
    8415                 :           0 :                 hint_mult = _("See server log for the other %u invalid block(s).");
    8416                 :           0 :         }
    8417   [ #  #  #  # ]:           0 :         else if (zeroed_any && !ignored_any)
    8418                 :             :         {
    8419                 :           0 :                 affected_count = zeroed_or_error_count;
    8420                 :           0 :                 msg_one = _("invalid page in block %u of relation \"%s\"; zeroing out page");
    8421                 :           0 :                 msg_mult = _("zeroing out %u invalid pages among blocks %u..%u of relation \"%s\"");
    8422                 :           0 :                 det_mult = _("Block %u held the first zeroed page.");
    8423                 :           0 :                 hint_mult = _("See server log for the other %u zeroed block(s).");
    8424                 :           0 :         }
    8425         [ #  # ]:           0 :         else if (!zeroed_any && ignored_any)
    8426                 :             :         {
    8427                 :           0 :                 affected_count = checkfail_count;
    8428                 :           0 :                 msg_one = _("ignoring checksum failure in block %u of relation \"%s\"");
    8429                 :           0 :                 msg_mult = _("ignoring %u checksum failures among blocks %u..%u of relation \"%s\"");
    8430                 :           0 :                 det_mult = _("Block %u held the first ignored page.");
    8431                 :           0 :                 hint_mult = _("See server log for the other %u ignored block(s).");
    8432                 :           0 :         }
    8433                 :             :         else
    8434                 :           0 :                 pg_unreachable();
    8435                 :             : 
    8436   [ #  #  #  #  :           0 :         ereport(elevel,
          #  #  #  #  #  
          #  #  #  #  #  
                   #  # ]
    8437                 :             :                         errcode(ERRCODE_DATA_CORRUPTED),
    8438                 :             :                         affected_count == 1 ?
    8439                 :             :                         errmsg_internal(msg_one, first + first_off, rpath.str) :
    8440                 :             :                         errmsg_internal(msg_mult, affected_count, first, last, rpath.str),
    8441                 :             :                         affected_count > 1 ? errdetail_internal(det_mult, first + first_off) : 0,
    8442                 :             :                         affected_count > 1 ? errhint_internal(hint_mult, affected_count - 1) : 0);
    8443         [ #  # ]:           0 : }
    8444                 :             : 
    8445                 :             : static void
    8446                 :        6470 : shared_buffer_readv_stage(PgAioHandle *ioh, uint8 cb_data)
    8447                 :             : {
    8448                 :        6470 :         buffer_stage_common(ioh, false, false);
    8449                 :        6470 : }
    8450                 :             : 
    8451                 :             : static PgAioResult
    8452                 :        6099 : shared_buffer_readv_complete(PgAioHandle *ioh, PgAioResult prior_result,
    8453                 :             :                                                          uint8 cb_data)
    8454                 :             : {
    8455                 :        6099 :         return buffer_readv_complete(ioh, prior_result, cb_data, false);
    8456                 :             : }
    8457                 :             : 
    8458                 :             : /*
    8459                 :             :  * We need a backend-local completion callback for shared buffers, to be able
    8460                 :             :  * to report checksum errors correctly. Unfortunately that can only safely
    8461                 :             :  * happen if the reporting backend has previously called
    8462                 :             :  * pgstat_prepare_report_checksum_failure(), which we can only guarantee in
    8463                 :             :  * the backend that started the IO. Hence this callback.
    8464                 :             :  */
    8465                 :             : static PgAioResult
    8466                 :        6470 : shared_buffer_readv_complete_local(PgAioHandle *ioh, PgAioResult prior_result,
    8467                 :             :                                                                    uint8 cb_data)
    8468                 :             : {
    8469                 :        6470 :         bool            zeroed_any,
    8470                 :             :                                 ignored_any;
    8471                 :        6470 :         uint8           zeroed_or_error_count,
    8472                 :             :                                 checkfail_count,
    8473                 :             :                                 first_off;
    8474                 :             : 
    8475         [ +  - ]:        6470 :         if (prior_result.status == PGAIO_RS_OK)
    8476                 :        6470 :                 return prior_result;
    8477                 :             : 
    8478                 :           0 :         buffer_readv_decode_error(prior_result,
    8479                 :             :                                                           &zeroed_any,
    8480                 :             :                                                           &ignored_any,
    8481                 :             :                                                           &zeroed_or_error_count,
    8482                 :             :                                                           &checkfail_count,
    8483                 :             :                                                           &first_off);
    8484                 :             : 
    8485         [ #  # ]:           0 :         if (checkfail_count)
    8486                 :             :         {
    8487                 :           0 :                 PgAioTargetData *td = pgaio_io_get_target_data(ioh);
    8488                 :             : 
    8489                 :           0 :                 pgstat_report_checksum_failures_in_db(td->smgr.rlocator.dbOid,
    8490                 :           0 :                                                                                           checkfail_count);
    8491                 :           0 :         }
    8492                 :             : 
    8493                 :           0 :         return prior_result;
    8494                 :        6470 : }
    8495                 :             : 
    8496                 :             : static void
    8497                 :         582 : local_buffer_readv_stage(PgAioHandle *ioh, uint8 cb_data)
    8498                 :             : {
    8499                 :         582 :         buffer_stage_common(ioh, false, true);
    8500                 :         582 : }
    8501                 :             : 
    8502                 :             : static PgAioResult
    8503                 :         582 : local_buffer_readv_complete(PgAioHandle *ioh, PgAioResult prior_result,
    8504                 :             :                                                         uint8 cb_data)
    8505                 :             : {
    8506                 :         582 :         return buffer_readv_complete(ioh, prior_result, cb_data, true);
    8507                 :             : }
    8508                 :             : 
    8509                 :             : /* readv callback is passed READ_BUFFERS_* flags as callback data */
    8510                 :             : const PgAioHandleCallbacks aio_shared_buffer_readv_cb = {
    8511                 :             :         .stage = shared_buffer_readv_stage,
    8512                 :             :         .complete_shared = shared_buffer_readv_complete,
    8513                 :             :         /* need a local callback to report checksum failures */
    8514                 :             :         .complete_local = shared_buffer_readv_complete_local,
    8515                 :             :         .report = buffer_readv_report,
    8516                 :             : };
    8517                 :             : 
    8518                 :             : /* readv callback is passed READ_BUFFERS_* flags as callback data */
    8519                 :             : const PgAioHandleCallbacks aio_local_buffer_readv_cb = {
    8520                 :             :         .stage = local_buffer_readv_stage,
    8521                 :             : 
    8522                 :             :         /*
    8523                 :             :          * Note that this, in contrast to the shared_buffers case, uses
    8524                 :             :          * complete_local, as only the issuing backend has access to the required
    8525                 :             :          * datastructures. This is important in case the IO completion may be
    8526                 :             :          * consumed incidentally by another backend.
    8527                 :             :          */
    8528                 :             :         .complete_local = local_buffer_readv_complete,
    8529                 :             :         .report = buffer_readv_report,
    8530                 :             : };
        

Generated by: LCOV version 2.3.2-1