LCOV - code coverage report
Current view: top level - src/backend/storage/smgr - md.c (source / functions) Coverage Total Hit
Test: Code coverage Lines: 62.9 % 703 442
Test Date: 2026-01-26 10:56:24 Functions: 82.1 % 39 32
Legend: Lines:     hit not hit
Branches: + taken - not taken # not executed
Branches: 31.4 % 494 155

             Branch data     Line data    Source code
       1                 :             : /*-------------------------------------------------------------------------
       2                 :             :  *
       3                 :             :  * md.c
       4                 :             :  *        This code manages relations that reside on magnetic disk.
       5                 :             :  *
       6                 :             :  * Or at least, that was what the Berkeley folk had in mind when they named
       7                 :             :  * this file.  In reality, what this code provides is an interface from
       8                 :             :  * the smgr API to Unix-like filesystem APIs, so it will work with any type
       9                 :             :  * of device for which the operating system provides filesystem support.
      10                 :             :  * It doesn't matter whether the bits are on spinning rust or some other
      11                 :             :  * storage technology.
      12                 :             :  *
      13                 :             :  * Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group
      14                 :             :  * Portions Copyright (c) 1994, Regents of the University of California
      15                 :             :  *
      16                 :             :  *
      17                 :             :  * IDENTIFICATION
      18                 :             :  *        src/backend/storage/smgr/md.c
      19                 :             :  *
      20                 :             :  *-------------------------------------------------------------------------
      21                 :             :  */
      22                 :             : #include "postgres.h"
      23                 :             : 
      24                 :             : #include <limits.h>
      25                 :             : #include <unistd.h>
      26                 :             : #include <fcntl.h>
      27                 :             : #include <sys/file.h>
      28                 :             : 
      29                 :             : #include "access/xlogutils.h"
      30                 :             : #include "commands/tablespace.h"
      31                 :             : #include "common/file_utils.h"
      32                 :             : #include "miscadmin.h"
      33                 :             : #include "pg_trace.h"
      34                 :             : #include "pgstat.h"
      35                 :             : #include "storage/aio.h"
      36                 :             : #include "storage/bufmgr.h"
      37                 :             : #include "storage/fd.h"
      38                 :             : #include "storage/md.h"
      39                 :             : #include "storage/relfilelocator.h"
      40                 :             : #include "storage/smgr.h"
      41                 :             : #include "storage/sync.h"
      42                 :             : #include "utils/memutils.h"
      43                 :             : 
      44                 :             : /*
      45                 :             :  * The magnetic disk storage manager keeps track of open file
      46                 :             :  * descriptors in its own descriptor pool.  This is done to make it
      47                 :             :  * easier to support relations that are larger than the operating
      48                 :             :  * system's file size limit (often 2GBytes).  In order to do that,
      49                 :             :  * we break relations up into "segment" files that are each shorter than
      50                 :             :  * the OS file size limit.  The segment size is set by the RELSEG_SIZE
      51                 :             :  * configuration constant in pg_config.h.
      52                 :             :  *
      53                 :             :  * On disk, a relation must consist of consecutively numbered segment
      54                 :             :  * files in the pattern
      55                 :             :  *      -- Zero or more full segments of exactly RELSEG_SIZE blocks each
      56                 :             :  *      -- Exactly one partial segment of size 0 <= size < RELSEG_SIZE blocks
      57                 :             :  *      -- Optionally, any number of inactive segments of size 0 blocks.
      58                 :             :  * The full and partial segments are collectively the "active" segments.
      59                 :             :  * Inactive segments are those that once contained data but are currently
      60                 :             :  * not needed because of an mdtruncate() operation.  The reason for leaving
      61                 :             :  * them present at size zero, rather than unlinking them, is that other
      62                 :             :  * backends and/or the checkpointer might be holding open file references to
      63                 :             :  * such segments.  If the relation expands again after mdtruncate(), such
      64                 :             :  * that a deactivated segment becomes active again, it is important that
      65                 :             :  * such file references still be valid --- else data might get written
      66                 :             :  * out to an unlinked old copy of a segment file that will eventually
      67                 :             :  * disappear.
      68                 :             :  *
      69                 :             :  * RELSEG_SIZE must fit into BlockNumber; but since we expose its value
      70                 :             :  * as an integer GUC, it actually needs to fit in signed int.  It's worth
      71                 :             :  * having a cross-check for this since configure's --with-segsize options
      72                 :             :  * could let people select insane values.
      73                 :             :  */
      74                 :             : StaticAssertDecl(RELSEG_SIZE > 0 && RELSEG_SIZE <= INT_MAX,
      75                 :             :                                  "RELSEG_SIZE must fit in an integer");
      76                 :             : 
      77                 :             : /*
      78                 :             :  * File descriptors are stored in the per-fork md_seg_fds arrays inside
      79                 :             :  * SMgrRelation. The length of these arrays is stored in md_num_open_segs.
      80                 :             :  * Note that a fork's md_num_open_segs having a specific value does not
      81                 :             :  * necessarily mean the relation doesn't have additional segments; we may
      82                 :             :  * just not have opened the next segment yet.  (We could not have "all
      83                 :             :  * segments are in the array" as an invariant anyway, since another backend
      84                 :             :  * could extend the relation while we aren't looking.)  We do not have
      85                 :             :  * entries for inactive segments, however; as soon as we find a partial
      86                 :             :  * segment, we assume that any subsequent segments are inactive.
      87                 :             :  *
      88                 :             :  * The entire MdfdVec array is palloc'd in the MdCxt memory context.
      89                 :             :  */
      90                 :             : 
      91                 :             : typedef struct _MdfdVec
      92                 :             : {
      93                 :             :         File            mdfd_vfd;               /* fd number in fd.c's pool */
      94                 :             :         BlockNumber mdfd_segno;         /* segment number, from 0 */
      95                 :             : } MdfdVec;
      96                 :             : 
      97                 :             : static MemoryContext MdCxt;             /* context for all MdfdVec objects */
      98                 :             : 
      99                 :             : 
     100                 :             : /* Populate a file tag describing an md.c segment file. */
     101                 :             : #define INIT_MD_FILETAG(a,xx_rlocator,xx_forknum,xx_segno) \
     102                 :             : ( \
     103                 :             :         memset(&(a), 0, sizeof(FileTag)), \
     104                 :             :         (a).handler = SYNC_HANDLER_MD, \
     105                 :             :         (a).rlocator = (xx_rlocator), \
     106                 :             :         (a).forknum = (xx_forknum), \
     107                 :             :         (a).segno = (xx_segno) \
     108                 :             : )
     109                 :             : 
     110                 :             : 
     111                 :             : /*** behavior for mdopen & _mdfd_getseg ***/
     112                 :             : /* ereport if segment not present */
     113                 :             : #define EXTENSION_FAIL                          (1 << 0)
     114                 :             : /* return NULL if segment not present */
     115                 :             : #define EXTENSION_RETURN_NULL           (1 << 1)
     116                 :             : /* create new segments as needed */
     117                 :             : #define EXTENSION_CREATE                        (1 << 2)
     118                 :             : /* create new segments if needed during recovery */
     119                 :             : #define EXTENSION_CREATE_RECOVERY       (1 << 3)
     120                 :             : /* don't try to open a segment, if not already open */
     121                 :             : #define EXTENSION_DONT_OPEN                     (1 << 5)
     122                 :             : 
     123                 :             : 
     124                 :             : /*
     125                 :             :  * Fixed-length string to represent paths to files that need to be built by
     126                 :             :  * md.c.
     127                 :             :  *
     128                 :             :  * The maximum number of segments is MaxBlockNumber / RELSEG_SIZE, where
     129                 :             :  * RELSEG_SIZE can be set to 1 (for testing only).
     130                 :             :  */
     131                 :             : #define SEGMENT_CHARS   OIDCHARS
     132                 :             : #define MD_PATH_STR_MAXLEN \
     133                 :             :         (\
     134                 :             :                 REL_PATH_STR_MAXLEN \
     135                 :             :                 + sizeof((char)'.') \
     136                 :             :                 + SEGMENT_CHARS \
     137                 :             :         )
     138                 :             : typedef struct MdPathStr
     139                 :             : {
     140                 :             :         char            str[MD_PATH_STR_MAXLEN + 1];
     141                 :             : } MdPathStr;
     142                 :             : 
     143                 :             : 
     144                 :             : /* local routines */
     145                 :             : static void mdunlinkfork(RelFileLocatorBackend rlocator, ForkNumber forknum,
     146                 :             :                                                  bool isRedo);
     147                 :             : static MdfdVec *mdopenfork(SMgrRelation reln, ForkNumber forknum, int behavior);
     148                 :             : static void register_dirty_segment(SMgrRelation reln, ForkNumber forknum,
     149                 :             :                                                                    MdfdVec *seg);
     150                 :             : static void register_unlink_segment(RelFileLocatorBackend rlocator, ForkNumber forknum,
     151                 :             :                                                                         BlockNumber segno);
     152                 :             : static void register_forget_request(RelFileLocatorBackend rlocator, ForkNumber forknum,
     153                 :             :                                                                         BlockNumber segno);
     154                 :             : static void _fdvec_resize(SMgrRelation reln,
     155                 :             :                                                   ForkNumber forknum,
     156                 :             :                                                   int nseg);
     157                 :             : static MdPathStr _mdfd_segpath(SMgrRelation reln, ForkNumber forknum,
     158                 :             :                                                            BlockNumber segno);
     159                 :             : static MdfdVec *_mdfd_openseg(SMgrRelation reln, ForkNumber forknum,
     160                 :             :                                                           BlockNumber segno, int oflags);
     161                 :             : static MdfdVec *_mdfd_getseg(SMgrRelation reln, ForkNumber forknum,
     162                 :             :                                                          BlockNumber blkno, bool skipFsync, int behavior);
     163                 :             : static BlockNumber _mdnblocks(SMgrRelation reln, ForkNumber forknum,
     164                 :             :                                                           MdfdVec *seg);
     165                 :             : 
     166                 :             : static PgAioResult md_readv_complete(PgAioHandle *ioh, PgAioResult prior_result, uint8 cb_data);
     167                 :             : static void md_readv_report(PgAioResult result, const PgAioTargetData *td, int elevel);
     168                 :             : 
     169                 :             : const PgAioHandleCallbacks aio_md_readv_cb = {
     170                 :             :         .complete_shared = md_readv_complete,
     171                 :             :         .report = md_readv_report,
     172                 :             : };
     173                 :             : 
     174                 :             : 
     175                 :             : static inline int
     176                 :       99981 : _mdfd_open_flags(void)
     177                 :             : {
     178                 :       99981 :         int                     flags = O_RDWR | PG_BINARY;
     179                 :             : 
     180         [ +  - ]:       99981 :         if (io_direct_flags & IO_DIRECT_DATA)
     181                 :           0 :                 flags |= PG_O_DIRECT;
     182                 :             : 
     183                 :      199962 :         return flags;
     184                 :       99981 : }
     185                 :             : 
     186                 :             : /*
     187                 :             :  * mdinit() -- Initialize private state for magnetic disk storage manager.
     188                 :             :  */
     189                 :             : void
     190                 :         806 : mdinit(void)
     191                 :             : {
     192                 :         806 :         MdCxt = AllocSetContextCreate(TopMemoryContext,
     193                 :             :                                                                   "MdSmgr",
     194                 :             :                                                                   ALLOCSET_DEFAULT_SIZES);
     195                 :         806 : }
     196                 :             : 
     197                 :             : /*
     198                 :             :  * mdexists() -- Does the physical file exist?
     199                 :             :  *
     200                 :             :  * Note: this will return true for lingering files, with pending deletions
     201                 :             :  */
     202                 :             : bool
     203                 :       58572 : mdexists(SMgrRelation reln, ForkNumber forknum)
     204                 :             : {
     205                 :             :         /*
     206                 :             :          * Close it first, to ensure that we notice if the fork has been unlinked
     207                 :             :          * since we opened it.  As an optimization, we can skip that in recovery,
     208                 :             :          * which already closes relations when dropping them.
     209                 :             :          */
     210         [ -  + ]:       58572 :         if (!InRecovery)
     211                 :       58572 :                 mdclose(reln, forknum);
     212                 :             : 
     213                 :       58572 :         return (mdopenfork(reln, forknum, EXTENSION_RETURN_NULL) != NULL);
     214                 :             : }
     215                 :             : 
     216                 :             : /*
     217                 :             :  * mdcreate() -- Create a new relation on magnetic disk.
     218                 :             :  *
     219                 :             :  * If isRedo is true, it's okay for the relation to exist already.
     220                 :             :  */
     221                 :             : void
     222                 :       12496 : mdcreate(SMgrRelation reln, ForkNumber forknum, bool isRedo)
     223                 :             : {
     224                 :       12496 :         MdfdVec    *mdfd;
     225                 :       12496 :         RelPathStr      path;
     226                 :       12496 :         File            fd;
     227                 :             : 
     228   [ -  +  #  # ]:       12496 :         if (isRedo && reln->md_num_open_segs[forknum] > 0)
     229                 :           0 :                 return;                                 /* created and opened already... */
     230                 :             : 
     231         [ +  - ]:       12496 :         Assert(reln->md_num_open_segs[forknum] == 0);
     232                 :             : 
     233                 :             :         /*
     234                 :             :          * We may be using the target table space for the first time in this
     235                 :             :          * database, so create a per-database subdirectory if needed.
     236                 :             :          *
     237                 :             :          * XXX this is a fairly ugly violation of module layering, but this seems
     238                 :             :          * to be the best place to put the check.  Maybe TablespaceCreateDbspace
     239                 :             :          * should be here and not in commands/tablespace.c?  But that would imply
     240                 :             :          * importing a lot of stuff that smgr.c oughtn't know, either.
     241                 :             :          */
     242                 :       24992 :         TablespaceCreateDbspace(reln->smgr_rlocator.locator.spcOid,
     243                 :       12496 :                                                         reln->smgr_rlocator.locator.dbOid,
     244                 :       12496 :                                                         isRedo);
     245                 :             : 
     246                 :       12496 :         path = relpath(reln->smgr_rlocator, forknum);
     247                 :             : 
     248                 :       12496 :         fd = PathNameOpenFile(path.str, _mdfd_open_flags() | O_CREAT | O_EXCL);
     249                 :             : 
     250         [ +  - ]:       12496 :         if (fd < 0)
     251                 :             :         {
     252                 :           0 :                 int                     save_errno = errno;
     253                 :             : 
     254         [ #  # ]:           0 :                 if (isRedo)
     255                 :           0 :                         fd = PathNameOpenFile(path.str, _mdfd_open_flags());
     256         [ #  # ]:           0 :                 if (fd < 0)
     257                 :             :                 {
     258                 :             :                         /* be sure to report the error reported by create, not open */
     259                 :           0 :                         errno = save_errno;
     260   [ #  #  #  # ]:           0 :                         ereport(ERROR,
     261                 :             :                                         (errcode_for_file_access(),
     262                 :             :                                          errmsg("could not create file \"%s\": %m", path.str)));
     263                 :           0 :                 }
     264                 :           0 :         }
     265                 :             : 
     266                 :       12496 :         _fdvec_resize(reln, forknum, 1);
     267                 :       12496 :         mdfd = &reln->md_seg_fds[forknum][0];
     268                 :       12496 :         mdfd->mdfd_vfd = fd;
     269                 :       12496 :         mdfd->mdfd_segno = 0;
     270                 :             : 
     271         [ +  + ]:       12496 :         if (!SmgrIsTemp(reln))
     272                 :       11440 :                 register_dirty_segment(reln, forknum, mdfd);
     273         [ -  + ]:       12496 : }
     274                 :             : 
     275                 :             : /*
     276                 :             :  * mdunlink() -- Unlink a relation.
     277                 :             :  *
     278                 :             :  * Note that we're passed a RelFileLocatorBackend --- by the time this is called,
     279                 :             :  * there won't be an SMgrRelation hashtable entry anymore.
     280                 :             :  *
     281                 :             :  * forknum can be a fork number to delete a specific fork, or InvalidForkNumber
     282                 :             :  * to delete all forks.
     283                 :             :  *
     284                 :             :  * For regular relations, we don't unlink the first segment file of the rel,
     285                 :             :  * but just truncate it to zero length, and record a request to unlink it after
     286                 :             :  * the next checkpoint.  Additional segments can be unlinked immediately,
     287                 :             :  * however.  Leaving the empty file in place prevents that relfilenumber
     288                 :             :  * from being reused.  The scenario this protects us from is:
     289                 :             :  * 1. We delete a relation (and commit, and actually remove its file).
     290                 :             :  * 2. We create a new relation, which by chance gets the same relfilenumber as
     291                 :             :  *        the just-deleted one (OIDs must've wrapped around for that to happen).
     292                 :             :  * 3. We crash before another checkpoint occurs.
     293                 :             :  * During replay, we would delete the file and then recreate it, which is fine
     294                 :             :  * if the contents of the file were repopulated by subsequent WAL entries.
     295                 :             :  * But if we didn't WAL-log insertions, but instead relied on fsyncing the
     296                 :             :  * file after populating it (as we do at wal_level=minimal), the contents of
     297                 :             :  * the file would be lost forever.  By leaving the empty file until after the
     298                 :             :  * next checkpoint, we prevent reassignment of the relfilenumber until it's
     299                 :             :  * safe, because relfilenumber assignment skips over any existing file.
     300                 :             :  *
     301                 :             :  * Additional segments, if any, are truncated and then unlinked.  The reason
     302                 :             :  * for truncating is that other backends may still hold open FDs for these at
     303                 :             :  * the smgr level, so that the kernel can't remove the file yet.  We want to
     304                 :             :  * reclaim the disk space right away despite that.
     305                 :             :  *
     306                 :             :  * We do not need to go through this dance for temp relations, though, because
     307                 :             :  * we never make WAL entries for temp rels, and so a temp rel poses no threat
     308                 :             :  * to the health of a regular rel that has taken over its relfilenumber.
     309                 :             :  * The fact that temp rels and regular rels have different file naming
     310                 :             :  * patterns provides additional safety.  Other backends shouldn't have open
     311                 :             :  * FDs for them, either.
     312                 :             :  *
     313                 :             :  * We also don't do it while performing a binary upgrade.  There is no reuse
     314                 :             :  * hazard in that case, since after a crash or even a simple ERROR, the
     315                 :             :  * upgrade fails and the whole cluster must be recreated from scratch.
     316                 :             :  * Furthermore, it is important to remove the files from disk immediately,
     317                 :             :  * because we may be about to reuse the same relfilenumber.
     318                 :             :  *
     319                 :             :  * All the above applies only to the relation's main fork; other forks can
     320                 :             :  * just be removed immediately, since they are not needed to prevent the
     321                 :             :  * relfilenumber from being recycled.  Also, we do not carefully
     322                 :             :  * track whether other forks have been created or not, but just attempt to
     323                 :             :  * unlink them unconditionally; so we should never complain about ENOENT.
     324                 :             :  *
     325                 :             :  * If isRedo is true, it's unsurprising for the relation to be already gone.
     326                 :             :  * Also, we should remove the file immediately instead of queuing a request
     327                 :             :  * for later, since during redo there's no possibility of creating a
     328                 :             :  * conflicting relation.
     329                 :             :  *
     330                 :             :  * Note: we currently just never warn about ENOENT at all.  We could warn in
     331                 :             :  * the main-fork, non-isRedo case, but it doesn't seem worth the trouble.
     332                 :             :  *
     333                 :             :  * Note: any failure should be reported as WARNING not ERROR, because
     334                 :             :  * we are usually not in a transaction anymore when this is called.
     335                 :             :  */
     336                 :             : void
     337                 :       35696 : mdunlink(RelFileLocatorBackend rlocator, ForkNumber forknum, bool isRedo)
     338                 :             : {
     339                 :             :         /* Now do the per-fork work */
     340         [ -  + ]:       35696 :         if (forknum == InvalidForkNumber)
     341                 :             :         {
     342         [ #  # ]:           0 :                 for (forknum = 0; forknum <= MAX_FORKNUM; forknum++)
     343                 :           0 :                         mdunlinkfork(rlocator, forknum, isRedo);
     344                 :           0 :         }
     345                 :             :         else
     346                 :       35696 :                 mdunlinkfork(rlocator, forknum, isRedo);
     347                 :       35696 : }
     348                 :             : 
     349                 :             : /*
     350                 :             :  * Truncate a file to release disk space.
     351                 :             :  */
     352                 :             : static int
     353                 :       39870 : do_truncate(const char *path)
     354                 :             : {
     355                 :       39870 :         int                     save_errno;
     356                 :       39870 :         int                     ret;
     357                 :             : 
     358                 :       39870 :         ret = pg_truncate(path, 0);
     359                 :             : 
     360                 :             :         /* Log a warning here to avoid repetition in callers. */
     361   [ +  +  +  - ]:       39870 :         if (ret < 0 && errno != ENOENT)
     362                 :             :         {
     363                 :           0 :                 save_errno = errno;
     364   [ #  #  #  # ]:           0 :                 ereport(WARNING,
     365                 :             :                                 (errcode_for_file_access(),
     366                 :             :                                  errmsg("could not truncate file \"%s\": %m", path)));
     367                 :           0 :                 errno = save_errno;
     368                 :           0 :         }
     369                 :             : 
     370                 :       79740 :         return ret;
     371                 :       39870 : }
     372                 :             : 
     373                 :             : static void
     374                 :       35696 : mdunlinkfork(RelFileLocatorBackend rlocator, ForkNumber forknum, bool isRedo)
     375                 :             : {
     376                 :       35696 :         RelPathStr      path;
     377                 :       35696 :         int                     ret;
     378                 :       35696 :         int                     save_errno;
     379                 :             : 
     380                 :       35696 :         path = relpath(rlocator, forknum);
     381                 :             : 
     382                 :             :         /*
     383                 :             :          * Truncate and then unlink the first segment, or just register a request
     384                 :             :          * to unlink it later, as described in the comments for mdunlink().
     385                 :             :          */
     386   [ +  -  +  -  :       35696 :         if (isRedo || IsBinaryUpgrade || forknum != MAIN_FORKNUM ||
             +  +  +  + ]
     387                 :        8924 :                 RelFileLocatorBackendIsTemp(rlocator))
     388                 :             :         {
     389         [ +  + ]:       27784 :                 if (!RelFileLocatorBackendIsTemp(rlocator))
     390                 :             :                 {
     391                 :             :                         /* Prevent other backends' fds from holding on to the disk space */
     392                 :       23736 :                         ret = do_truncate(path.str);
     393                 :             : 
     394                 :             :                         /* Forget any pending sync requests for the first segment */
     395                 :       23736 :                         save_errno = errno;
     396                 :       23736 :                         register_forget_request(rlocator, forknum, 0 /* first seg */ );
     397                 :       23736 :                         errno = save_errno;
     398                 :       23736 :                 }
     399                 :             :                 else
     400                 :        4048 :                         ret = 0;
     401                 :             : 
     402                 :             :                 /* Next unlink the file, unless it was already found to be missing */
     403   [ +  +  -  + ]:       27784 :                 if (ret >= 0 || errno != ENOENT)
     404                 :             :                 {
     405                 :        4359 :                         ret = unlink(path.str);
     406   [ +  +  +  - ]:        4359 :                         if (ret < 0 && errno != ENOENT)
     407                 :             :                         {
     408                 :           0 :                                 save_errno = errno;
     409   [ #  #  #  # ]:           0 :                                 ereport(WARNING,
     410                 :             :                                                 (errcode_for_file_access(),
     411                 :             :                                                  errmsg("could not remove file \"%s\": %m", path.str)));
     412                 :           0 :                                 errno = save_errno;
     413                 :           0 :                         }
     414                 :        4359 :                 }
     415                 :       27784 :         }
     416                 :             :         else
     417                 :             :         {
     418                 :             :                 /* Prevent other backends' fds from holding on to the disk space */
     419                 :        7912 :                 ret = do_truncate(path.str);
     420                 :             : 
     421                 :             :                 /* Register request to unlink first segment later */
     422                 :        7912 :                 save_errno = errno;
     423                 :        7912 :                 register_unlink_segment(rlocator, forknum, 0 /* first seg */ );
     424                 :        7912 :                 errno = save_errno;
     425                 :             :         }
     426                 :             : 
     427                 :             :         /*
     428                 :             :          * Delete any additional segments.
     429                 :             :          *
     430                 :             :          * Note that because we loop until getting ENOENT, we will correctly
     431                 :             :          * remove all inactive segments as well as active ones.  Ideally we'd
     432                 :             :          * continue the loop until getting exactly that errno, but that risks an
     433                 :             :          * infinite loop if the problem is directory-wide (for instance, if we
     434                 :             :          * suddenly can't read the data directory itself).  We compromise by
     435                 :             :          * continuing after a non-ENOENT truncate error, but stopping after any
     436                 :             :          * unlink error.  If there is indeed a directory-wide problem, additional
     437                 :             :          * unlink attempts wouldn't work anyway.
     438                 :             :          */
     439   [ +  +  -  + ]:       35696 :         if (ret >= 0 || errno != ENOENT)
     440                 :             :         {
     441                 :        9278 :                 MdPathStr       segpath;
     442                 :        9278 :                 BlockNumber segno;
     443                 :             : 
     444                 :        9278 :                 for (segno = 1;; segno++)
     445                 :             :                 {
     446                 :        9278 :                         sprintf(segpath.str, "%s.%u", path.str, segno);
     447                 :             : 
     448         [ +  + ]:        9278 :                         if (!RelFileLocatorBackendIsTemp(rlocator))
     449                 :             :                         {
     450                 :             :                                 /*
     451                 :             :                                  * Prevent other backends' fds from holding on to the disk
     452                 :             :                                  * space.  We're done if we see ENOENT, though.
     453                 :             :                                  */
     454   [ +  -  +  - ]:        8222 :                                 if (do_truncate(segpath.str) < 0 && errno == ENOENT)
     455                 :        8222 :                                         break;
     456                 :             : 
     457                 :             :                                 /*
     458                 :             :                                  * Forget any pending sync requests for this segment before we
     459                 :             :                                  * try to unlink.
     460                 :             :                                  */
     461                 :           0 :                                 register_forget_request(rlocator, forknum, segno);
     462                 :           0 :                         }
     463                 :             : 
     464         [ -  + ]:        1056 :                         if (unlink(segpath.str) < 0)
     465                 :             :                         {
     466                 :             :                                 /* ENOENT is expected after the last segment... */
     467         [ +  - ]:        1056 :                                 if (errno != ENOENT)
     468   [ #  #  #  # ]:           0 :                                         ereport(WARNING,
     469                 :             :                                                         (errcode_for_file_access(),
     470                 :             :                                                          errmsg("could not remove file \"%s\": %m", segpath.str)));
     471                 :        1056 :                                 break;
     472                 :             :                         }
     473                 :           0 :                 }
     474                 :        9278 :         }
     475                 :       35696 : }
     476                 :             : 
     477                 :             : /*
     478                 :             :  * mdextend() -- Add a block to the specified relation.
     479                 :             :  *
     480                 :             :  * The semantics are nearly the same as mdwrite(): write at the
     481                 :             :  * specified position.  However, this is to be used for the case of
     482                 :             :  * extending a relation (i.e., blocknum is at or beyond the current
     483                 :             :  * EOF).  Note that we assume writing a block beyond current EOF
     484                 :             :  * causes intervening file space to become filled with zeroes.
     485                 :             :  */
     486                 :             : void
     487                 :       10470 : mdextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
     488                 :             :                  const void *buffer, bool skipFsync)
     489                 :             : {
     490                 :       10470 :         pgoff_t         seekpos;
     491                 :       10470 :         int                     nbytes;
     492                 :       10470 :         MdfdVec    *v;
     493                 :             : 
     494                 :             :         /* If this build supports direct I/O, the buffer must be I/O aligned. */
     495                 :             :         if (PG_O_DIRECT != 0 && PG_IO_ALIGN_SIZE <= BLCKSZ)
     496         [ +  - ]:       10470 :                 Assert((uintptr_t) buffer == TYPEALIGN(PG_IO_ALIGN_SIZE, buffer));
     497                 :             : 
     498                 :             :         /* This assert is too expensive to have on normally ... */
     499                 :             : #ifdef CHECK_WRITE_VS_EXTEND
     500                 :             :         Assert(blocknum >= mdnblocks(reln, forknum));
     501                 :             : #endif
     502                 :             : 
     503                 :             :         /*
     504                 :             :          * If a relation manages to grow to 2^32-1 blocks, refuse to extend it any
     505                 :             :          * more --- we mustn't create a block whose number actually is
     506                 :             :          * InvalidBlockNumber.  (Note that this failure should be unreachable
     507                 :             :          * because of upstream checks in bufmgr.c.)
     508                 :             :          */
     509         [ +  - ]:       10470 :         if (blocknum == InvalidBlockNumber)
     510   [ #  #  #  # ]:           0 :                 ereport(ERROR,
     511                 :             :                                 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
     512                 :             :                                  errmsg("cannot extend file \"%s\" beyond %u blocks",
     513                 :             :                                                 relpath(reln->smgr_rlocator, forknum).str,
     514                 :             :                                                 InvalidBlockNumber)));
     515                 :             : 
     516                 :       10470 :         v = _mdfd_getseg(reln, forknum, blocknum, skipFsync, EXTENSION_CREATE);
     517                 :             : 
     518                 :       10470 :         seekpos = (pgoff_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE));
     519                 :             : 
     520         [ +  - ]:       10470 :         Assert(seekpos < (pgoff_t) BLCKSZ * RELSEG_SIZE);
     521                 :             : 
     522         [ +  - ]:       10470 :         if ((nbytes = FileWrite(v->mdfd_vfd, buffer, BLCKSZ, seekpos, WAIT_EVENT_DATA_FILE_EXTEND)) != BLCKSZ)
     523                 :             :         {
     524         [ #  # ]:           0 :                 if (nbytes < 0)
     525   [ #  #  #  # ]:           0 :                         ereport(ERROR,
     526                 :             :                                         (errcode_for_file_access(),
     527                 :             :                                          errmsg("could not extend file \"%s\": %m",
     528                 :             :                                                         FilePathName(v->mdfd_vfd)),
     529                 :             :                                          errhint("Check free disk space.")));
     530                 :             :                 /* short write: complain appropriately */
     531   [ #  #  #  # ]:           0 :                 ereport(ERROR,
     532                 :             :                                 (errcode(ERRCODE_DISK_FULL),
     533                 :             :                                  errmsg("could not extend file \"%s\": wrote only %d of %d bytes at block %u",
     534                 :             :                                                 FilePathName(v->mdfd_vfd),
     535                 :             :                                                 nbytes, BLCKSZ, blocknum),
     536                 :             :                                  errhint("Check free disk space.")));
     537                 :           0 :         }
     538                 :             : 
     539   [ +  +  -  + ]:       10470 :         if (!skipFsync && !SmgrIsTemp(reln))
     540                 :           9 :                 register_dirty_segment(reln, forknum, v);
     541                 :             : 
     542         [ +  - ]:       10470 :         Assert(_mdnblocks(reln, forknum, v) <= ((BlockNumber) RELSEG_SIZE));
     543                 :       10470 : }
     544                 :             : 
     545                 :             : /*
     546                 :             :  * mdzeroextend() -- Add new zeroed out blocks to the specified relation.
     547                 :             :  *
     548                 :             :  * Similar to mdextend(), except the relation can be extended by multiple
     549                 :             :  * blocks at once and the added blocks will be filled with zeroes.
     550                 :             :  */
     551                 :             : void
     552                 :       27454 : mdzeroextend(SMgrRelation reln, ForkNumber forknum,
     553                 :             :                          BlockNumber blocknum, int nblocks, bool skipFsync)
     554                 :             : {
     555                 :       27454 :         MdfdVec    *v;
     556                 :       27454 :         BlockNumber curblocknum = blocknum;
     557                 :       27454 :         int                     remblocks = nblocks;
     558                 :             : 
     559         [ +  - ]:       27454 :         Assert(nblocks > 0);
     560                 :             : 
     561                 :             :         /* This assert is too expensive to have on normally ... */
     562                 :             : #ifdef CHECK_WRITE_VS_EXTEND
     563                 :             :         Assert(blocknum >= mdnblocks(reln, forknum));
     564                 :             : #endif
     565                 :             : 
     566                 :             :         /*
     567                 :             :          * If a relation manages to grow to 2^32-1 blocks, refuse to extend it any
     568                 :             :          * more --- we mustn't create a block whose number actually is
     569                 :             :          * InvalidBlockNumber or larger.
     570                 :             :          */
     571         [ +  - ]:       27454 :         if ((uint64) blocknum + nblocks >= (uint64) InvalidBlockNumber)
     572   [ #  #  #  # ]:           0 :                 ereport(ERROR,
     573                 :             :                                 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
     574                 :             :                                  errmsg("cannot extend file \"%s\" beyond %u blocks",
     575                 :             :                                                 relpath(reln->smgr_rlocator, forknum).str,
     576                 :             :                                                 InvalidBlockNumber)));
     577                 :             : 
     578         [ +  + ]:       54908 :         while (remblocks > 0)
     579                 :             :         {
     580                 :       27454 :                 BlockNumber segstartblock = curblocknum % ((BlockNumber) RELSEG_SIZE);
     581                 :       27454 :                 pgoff_t         seekpos = (pgoff_t) BLCKSZ * segstartblock;
     582                 :       27454 :                 int                     numblocks;
     583                 :             : 
     584         [ -  + ]:       27454 :                 if (segstartblock + remblocks > RELSEG_SIZE)
     585                 :           0 :                         numblocks = RELSEG_SIZE - segstartblock;
     586                 :             :                 else
     587                 :       27454 :                         numblocks = remblocks;
     588                 :             : 
     589                 :       27454 :                 v = _mdfd_getseg(reln, forknum, curblocknum, skipFsync, EXTENSION_CREATE);
     590                 :             : 
     591         [ -  + ]:       27454 :                 Assert(segstartblock < RELSEG_SIZE);
     592         [ -  + ]:       27454 :                 Assert(segstartblock + numblocks <= RELSEG_SIZE);
     593                 :             : 
     594                 :             :                 /*
     595                 :             :                  * If available and useful, use posix_fallocate() (via
     596                 :             :                  * FileFallocate()) to extend the relation. That's often more
     597                 :             :                  * efficient than using write(), as it commonly won't cause the kernel
     598                 :             :                  * to allocate page cache space for the extended pages.
     599                 :             :                  *
     600                 :             :                  * However, we don't use FileFallocate() for small extensions, as it
     601                 :             :                  * defeats delayed allocation on some filesystems. Not clear where
     602                 :             :                  * that decision should be made though? For now just use a cutoff of
     603                 :             :                  * 8, anything between 4 and 8 worked OK in some local testing.
     604                 :             :                  */
     605         [ +  + ]:       27454 :                 if (numblocks > 8)
     606                 :             :                 {
     607                 :         131 :                         int                     ret;
     608                 :             : 
     609                 :         262 :                         ret = FileFallocate(v->mdfd_vfd,
     610                 :         131 :                                                                 seekpos, (pgoff_t) BLCKSZ * numblocks,
     611                 :             :                                                                 WAIT_EVENT_DATA_FILE_EXTEND);
     612         [ +  - ]:         131 :                         if (ret != 0)
     613                 :             :                         {
     614   [ #  #  #  # ]:           0 :                                 ereport(ERROR,
     615                 :             :                                                 errcode_for_file_access(),
     616                 :             :                                                 errmsg("could not extend file \"%s\" with FileFallocate(): %m",
     617                 :             :                                                            FilePathName(v->mdfd_vfd)),
     618                 :             :                                                 errhint("Check free disk space."));
     619                 :           0 :                         }
     620                 :         131 :                 }
     621                 :             :                 else
     622                 :             :                 {
     623                 :       27323 :                         int                     ret;
     624                 :             : 
     625                 :             :                         /*
     626                 :             :                          * Even if we don't want to use fallocate, we can still extend a
     627                 :             :                          * bit more efficiently than writing each 8kB block individually.
     628                 :             :                          * pg_pwrite_zeros() (via FileZero()) uses pg_pwritev_with_retry()
     629                 :             :                          * to avoid multiple writes or needing a zeroed buffer for the
     630                 :             :                          * whole length of the extension.
     631                 :             :                          */
     632                 :       54646 :                         ret = FileZero(v->mdfd_vfd,
     633                 :       27323 :                                                    seekpos, (pgoff_t) BLCKSZ * numblocks,
     634                 :             :                                                    WAIT_EVENT_DATA_FILE_EXTEND);
     635         [ +  - ]:       27323 :                         if (ret < 0)
     636   [ #  #  #  # ]:           0 :                                 ereport(ERROR,
     637                 :             :                                                 errcode_for_file_access(),
     638                 :             :                                                 errmsg("could not extend file \"%s\": %m",
     639                 :             :                                                            FilePathName(v->mdfd_vfd)),
     640                 :             :                                                 errhint("Check free disk space."));
     641                 :       27323 :                 }
     642                 :             : 
     643   [ +  -  +  + ]:       27454 :                 if (!skipFsync && !SmgrIsTemp(reln))
     644                 :       24322 :                         register_dirty_segment(reln, forknum, v);
     645                 :             : 
     646         [ -  + ]:       27454 :                 Assert(_mdnblocks(reln, forknum, v) <= ((BlockNumber) RELSEG_SIZE));
     647                 :             : 
     648                 :       27454 :                 remblocks -= numblocks;
     649                 :       27454 :                 curblocknum += numblocks;
     650                 :       27454 :         }
     651                 :       27454 : }
     652                 :             : 
     653                 :             : /*
     654                 :             :  * mdopenfork() -- Open one fork of the specified relation.
     655                 :             :  *
     656                 :             :  * Note we only open the first segment, when there are multiple segments.
     657                 :             :  *
     658                 :             :  * If first segment is not present, either ereport or return NULL according
     659                 :             :  * to "behavior".  We treat EXTENSION_CREATE the same as EXTENSION_FAIL;
     660                 :             :  * EXTENSION_CREATE means it's OK to extend an existing relation, not to
     661                 :             :  * invent one out of whole cloth.
     662                 :             :  */
     663                 :             : static MdfdVec *
     664                 :      598324 : mdopenfork(SMgrRelation reln, ForkNumber forknum, int behavior)
     665                 :             : {
     666                 :      598324 :         MdfdVec    *mdfd;
     667                 :      598324 :         RelPathStr      path;
     668                 :      598324 :         File            fd;
     669                 :             : 
     670                 :             :         /* No work if already open */
     671         [ +  + ]:      598324 :         if (reln->md_num_open_segs[forknum] > 0)
     672                 :      514345 :                 return &reln->md_seg_fds[forknum][0];
     673                 :             : 
     674                 :       83979 :         path = relpath(reln->smgr_rlocator, forknum);
     675                 :             : 
     676                 :       83979 :         fd = PathNameOpenFile(path.str, _mdfd_open_flags());
     677                 :             : 
     678         [ +  + ]:       83979 :         if (fd < 0)
     679                 :             :         {
     680         [ +  - ]:       42362 :                 if ((behavior & EXTENSION_RETURN_NULL) &&
     681                 :       42362 :                         FILE_POSSIBLY_DELETED(errno))
     682                 :       42362 :                         return NULL;
     683   [ #  #  #  # ]:           0 :                 ereport(ERROR,
     684                 :             :                                 (errcode_for_file_access(),
     685                 :             :                                  errmsg("could not open file \"%s\": %m", path.str)));
     686                 :           0 :         }
     687                 :             : 
     688                 :       41617 :         _fdvec_resize(reln, forknum, 1);
     689                 :       41617 :         mdfd = &reln->md_seg_fds[forknum][0];
     690                 :       41617 :         mdfd->mdfd_vfd = fd;
     691                 :       41617 :         mdfd->mdfd_segno = 0;
     692                 :             : 
     693         [ +  - ]:       41617 :         Assert(_mdnblocks(reln, forknum, mdfd) <= ((BlockNumber) RELSEG_SIZE));
     694                 :             : 
     695                 :       41617 :         return mdfd;
     696                 :      598324 : }
     697                 :             : 
     698                 :             : /*
     699                 :             :  * mdopen() -- Initialize newly-opened relation.
     700                 :             :  */
     701                 :             : void
     702                 :       65565 : mdopen(SMgrRelation reln)
     703                 :             : {
     704                 :             :         /* mark it not open */
     705         [ +  + ]:      327825 :         for (int forknum = 0; forknum <= MAX_FORKNUM; forknum++)
     706                 :      262260 :                 reln->md_num_open_segs[forknum] = 0;
     707                 :       65565 : }
     708                 :             : 
     709                 :             : /*
     710                 :             :  * mdclose() -- Close the specified relation, if it isn't closed already.
     711                 :             :  */
     712                 :             : void
     713                 :      447416 : mdclose(SMgrRelation reln, ForkNumber forknum)
     714                 :             : {
     715                 :      447416 :         int                     nopensegs = reln->md_num_open_segs[forknum];
     716                 :             : 
     717                 :             :         /* No work if already closed */
     718         [ +  + ]:      447416 :         if (nopensegs == 0)
     719                 :      405401 :                 return;
     720                 :             : 
     721                 :             :         /* close segments starting from the end */
     722         [ +  + ]:       84030 :         while (nopensegs > 0)
     723                 :             :         {
     724                 :       42015 :                 MdfdVec    *v = &reln->md_seg_fds[forknum][nopensegs - 1];
     725                 :             : 
     726                 :       42015 :                 FileClose(v->mdfd_vfd);
     727                 :       42015 :                 _fdvec_resize(reln, forknum, nopensegs - 1);
     728                 :       42015 :                 nopensegs--;
     729                 :       42015 :         }
     730         [ -  + ]:      447416 : }
     731                 :             : 
     732                 :             : /*
     733                 :             :  * mdprefetch() -- Initiate asynchronous read of the specified blocks of a relation
     734                 :             :  */
     735                 :             : bool
     736                 :           0 : mdprefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
     737                 :             :                    int nblocks)
     738                 :             : {
     739                 :             : #ifdef USE_PREFETCH
     740                 :             : 
     741         [ #  # ]:           0 :         Assert((io_direct_flags & IO_DIRECT_DATA) == 0);
     742                 :             : 
     743         [ #  # ]:           0 :         if ((uint64) blocknum + nblocks > (uint64) MaxBlockNumber + 1)
     744                 :           0 :                 return false;
     745                 :             : 
     746         [ #  # ]:           0 :         while (nblocks > 0)
     747                 :             :         {
     748                 :           0 :                 pgoff_t         seekpos;
     749                 :           0 :                 MdfdVec    *v;
     750                 :           0 :                 int                     nblocks_this_segment;
     751                 :             : 
     752                 :           0 :                 v = _mdfd_getseg(reln, forknum, blocknum, false,
     753                 :           0 :                                                  InRecovery ? EXTENSION_RETURN_NULL : EXTENSION_FAIL);
     754         [ #  # ]:           0 :                 if (v == NULL)
     755                 :           0 :                         return false;
     756                 :             : 
     757                 :           0 :                 seekpos = (pgoff_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE));
     758                 :             : 
     759         [ #  # ]:           0 :                 Assert(seekpos < (pgoff_t) BLCKSZ * RELSEG_SIZE);
     760                 :             : 
     761                 :           0 :                 nblocks_this_segment =
     762         [ #  # ]:           0 :                         Min(nblocks,
     763                 :             :                                 RELSEG_SIZE - (blocknum % ((BlockNumber) RELSEG_SIZE)));
     764                 :             : 
     765                 :           0 :                 (void) FilePrefetch(v->mdfd_vfd, seekpos, BLCKSZ * nblocks_this_segment,
     766                 :             :                                                         WAIT_EVENT_DATA_FILE_PREFETCH);
     767                 :             : 
     768                 :           0 :                 blocknum += nblocks_this_segment;
     769                 :           0 :                 nblocks -= nblocks_this_segment;
     770      [ #  #  # ]:           0 :         }
     771                 :             : #endif                                                  /* USE_PREFETCH */
     772                 :             : 
     773                 :           0 :         return true;
     774                 :           0 : }
     775                 :             : 
     776                 :             : /*
     777                 :             :  * Convert an array of buffer address into an array of iovec objects, and
     778                 :             :  * return the number that were required.  'iov' must have enough space for up
     779                 :             :  * to 'nblocks' elements, but the number used may be less depending on
     780                 :             :  * merging.  In the case of a run of fully contiguous buffers, a single iovec
     781                 :             :  * will be populated that can be handled as a plain non-vectored I/O.
     782                 :             :  */
     783                 :             : static int
     784                 :       15230 : buffers_to_iovec(struct iovec *iov, void **buffers, int nblocks)
     785                 :             : {
     786                 :       15230 :         struct iovec *iovp;
     787                 :       15230 :         int                     iovcnt;
     788                 :             : 
     789         [ +  - ]:       15230 :         Assert(nblocks >= 1);
     790                 :             : 
     791                 :             :         /* If this build supports direct I/O, buffers must be I/O aligned. */
     792         [ +  + ]:       34558 :         for (int i = 0; i < nblocks; ++i)
     793                 :             :         {
     794                 :             :                 if (PG_O_DIRECT != 0 && PG_IO_ALIGN_SIZE <= BLCKSZ)
     795         [ +  - ]:       19328 :                         Assert((uintptr_t) buffers[i] ==
     796                 :             :                                    TYPEALIGN(PG_IO_ALIGN_SIZE, buffers[i]));
     797                 :       19328 :         }
     798                 :             : 
     799                 :             :         /* Start the first iovec off with the first buffer. */
     800                 :       15230 :         iovp = &iov[0];
     801                 :       15230 :         iovp->iov_base = buffers[0];
     802                 :       15230 :         iovp->iov_len = BLCKSZ;
     803                 :       15230 :         iovcnt = 1;
     804                 :             : 
     805                 :             :         /* Try to merge the rest. */
     806         [ +  + ]:       19328 :         for (int i = 1; i < nblocks; ++i)
     807                 :             :         {
     808                 :        4098 :                 void       *buffer = buffers[i];
     809                 :             : 
     810         [ +  + ]:        4098 :                 if (((char *) iovp->iov_base + iovp->iov_len) == buffer)
     811                 :             :                 {
     812                 :             :                         /* Contiguous with the last iovec. */
     813                 :        3890 :                         iovp->iov_len += BLCKSZ;
     814                 :        3890 :                 }
     815                 :             :                 else
     816                 :             :                 {
     817                 :             :                         /* Need a new iovec. */
     818                 :         208 :                         iovp++;
     819                 :         208 :                         iovp->iov_base = buffer;
     820                 :         208 :                         iovp->iov_len = BLCKSZ;
     821                 :         208 :                         iovcnt++;
     822                 :             :                 }
     823                 :        4098 :         }
     824                 :             : 
     825                 :       30460 :         return iovcnt;
     826                 :       15230 : }
     827                 :             : 
     828                 :             : /*
     829                 :             :  * mdmaxcombine() -- Return the maximum number of total blocks that can be
     830                 :             :  *                               combined with an IO starting at blocknum.
     831                 :             :  */
     832                 :             : uint32
     833                 :         639 : mdmaxcombine(SMgrRelation reln, ForkNumber forknum,
     834                 :             :                          BlockNumber blocknum)
     835                 :             : {
     836                 :         639 :         BlockNumber segoff;
     837                 :             : 
     838                 :         639 :         segoff = blocknum % ((BlockNumber) RELSEG_SIZE);
     839                 :             : 
     840                 :        1278 :         return RELSEG_SIZE - segoff;
     841                 :         639 : }
     842                 :             : 
/*
 * mdreadv() -- Read the specified blocks from a relation.
 *
 * Synchronously reads 'nblocks' blocks of fork 'forknum' of 'reln', starting
 * at block 'blocknum', into the BLCKSZ-sized buffers buffers[0 .. nblocks-1].
 *
 * The request must fit inside a single segment file and must not need more
 * than PG_IOV_MAX blocks; otherwise an ERROR is raised before FileReadV() is
 * called.  A short read is retried from where it stopped.  A failed read
 * raises an ERROR.  A read that returns zero bytes raises an ERROR too, unless
 * zero_damaged_pages is set or we are InRecovery, in which case the blocks
 * not yet transferred are zero-filled (see the long comment below).
 */
void
mdreadv(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
                void **buffers, BlockNumber nblocks)
{
        /*
         * Because of the boundary check below, a call that returns normally
         * passes through this loop body exactly once (or not at all when
         * nblocks is 0).
         */
        while (nblocks > 0)
        {
                struct iovec iov[PG_IOV_MAX];
                int                     iovcnt;
                pgoff_t         seekpos;
                int                     nbytes;
                MdfdVec    *v;
                BlockNumber nblocks_this_segment;
                size_t          transferred_this_segment;
                size_t          size_this_segment;

                /* Look up the segment that holds 'blocknum'. */
                v = _mdfd_getseg(reln, forknum, blocknum, false,
                                                 EXTENSION_FAIL | EXTENSION_CREATE_RECOVERY);

                /* Byte offset of 'blocknum' within its segment file. */
                seekpos = (pgoff_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE));

                Assert(seekpos < (pgoff_t) BLCKSZ * RELSEG_SIZE);

                /*
                 * Clamp the request to what is left of this segment, and then to
                 * the capacity of the local iovec array.
                 */
                nblocks_this_segment =
                        Min(nblocks,
                                RELSEG_SIZE - (blocknum % ((BlockNumber) RELSEG_SIZE)));
                nblocks_this_segment = Min(nblocks_this_segment, lengthof(iov));

                /*
                 * Any clamping at all is treated as a caller error.
                 *
                 * NOTE(review): this also fires when the request only exceeds
                 * lengthof(iov), even though the message mentions a segment
                 * boundary.
                 */
                if (nblocks_this_segment != nblocks)
                        elog(ERROR, "read crosses segment boundary");

                iovcnt = buffers_to_iovec(iov, buffers, nblocks_this_segment);
                size_this_segment = nblocks_this_segment * BLCKSZ;
                transferred_this_segment = 0;

                /*
                 * Inner loop to continue after a short read.  We'll keep going until
                 * we hit EOF rather than assuming that a short read means we hit the
                 * end.
                 */
                for (;;)
                {
                        TRACE_POSTGRESQL_SMGR_MD_READ_START(forknum, blocknum,
                                                                                                reln->smgr_rlocator.locator.spcOid,
                                                                                                reln->smgr_rlocator.locator.dbOid,
                                                                                                reln->smgr_rlocator.locator.relNumber,
                                                                                                reln->smgr_rlocator.backend);
                        nbytes = FileReadV(v->mdfd_vfd, iov, iovcnt, seekpos,
                                                           WAIT_EVENT_DATA_FILE_READ);
                        TRACE_POSTGRESQL_SMGR_MD_READ_DONE(forknum, blocknum,
                                                                                           reln->smgr_rlocator.locator.spcOid,
                                                                                           reln->smgr_rlocator.locator.dbOid,
                                                                                           reln->smgr_rlocator.locator.relNumber,
                                                                                           reln->smgr_rlocator.backend,
                                                                                           nbytes,
                                                                                           size_this_segment - transferred_this_segment);

                        /*
                         * Testing hook: when SIMULATE_SHORT_READ is defined, pretend
                         * that at most 4096 bytes arrived, presumably to exercise the
                         * short-read continuation below.
                         */
#ifdef SIMULATE_SHORT_READ
                        nbytes = Min(nbytes, 4096);
#endif

                        /* A negative result is a failed read; %m reports errno. */
                        if (nbytes < 0)
                                ereport(ERROR,
                                                (errcode_for_file_access(),
                                                 errmsg("could not read blocks %u..%u in file \"%s\": %m",
                                                                blocknum,
                                                                blocknum + nblocks_this_segment - 1,
                                                                FilePathName(v->mdfd_vfd))));

                        if (nbytes == 0)
                        {
                                /*
                                 * We are at or past EOF, or we read a partial block at EOF.
                                 * Normally this is an error; upper levels should never try to
                                 * read a nonexistent block.  However, if zero_damaged_pages
                                 * is ON or we are InRecovery, we should instead return zeroes
                                 * without complaining.  This allows, for example, the case of
                                 * trying to update a block that was later truncated away.
                                 *
                                 * NB: We think that this codepath is unreachable in recovery
                                 * and incomplete with zero_damaged_pages, as missing segments
                                 * are not created. Putting blocks into the buffer-pool that
                                 * do not exist on disk is rather problematic, as it will not
                                 * be found by scans that rely on smgrnblocks(), as they are
                                 * beyond EOF. It also can cause weird problems with relation
                                 * extension, as relation extension does not expect blocks
                                 * beyond EOF to exist.
                                 *
                                 * Therefore we do not want to copy the logic into
                                 * mdstartreadv(), where it would have to be more complicated
                                 * due to potential differences in the zero_damaged_pages
                                 * setting between the definer and completor of IO.
                                 *
                                 * For PG 18, we are putting an Assert(false) in mdreadv()
                                 * (triggering failures in assertion-enabled builds, but
                                 * continuing to work in production builds). Afterwards we
                                 * plan to remove this code entirely.
                                 */
                                if (zero_damaged_pages || InRecovery)
                                {
                                        Assert(false);  /* see comment above */

                                        /*
                                         * Zero every block from the first one that was not
                                         * completely transferred onwards; a partially read
                                         * block is wiped in full.
                                         */
                                        for (BlockNumber i = transferred_this_segment / BLCKSZ;
                                                 i < nblocks_this_segment;
                                                 ++i)
                                                memset(buffers[i], 0, BLCKSZ);
                                        break;
                                }
                                else
                                        ereport(ERROR,
                                                        (errcode(ERRCODE_DATA_CORRUPTED),
                                                         errmsg("could not read blocks %u..%u in file \"%s\": read only %zu of %zu bytes",
                                                                        blocknum,
                                                                        blocknum + nblocks_this_segment - 1,
                                                                        FilePathName(v->mdfd_vfd),
                                                                        transferred_this_segment,
                                                                        size_this_segment)));
                        }

                        /* One loop should usually be enough. */
                        transferred_this_segment += nbytes;
                        Assert(transferred_this_segment <= size_this_segment);
                        if (transferred_this_segment == size_this_segment)
                                break;

                        /*
                         * Adjust position and vectors after a short read.  The iovec
                         * array is rewritten in place (source and destination are both
                         * 'iov') to cover only what is still outstanding.
                         */
                        seekpos += nbytes;
                        iovcnt = compute_remaining_iovec(iov, iov, iovcnt, nbytes);
                }

                /* Step past the blocks handled in this iteration. */
                nblocks -= nblocks_this_segment;
                buffers += nblocks_this_segment;
                blocknum += nblocks_this_segment;
        }
}
     980                 :             : 
     981                 :             : /*
     982                 :             :  * mdstartreadv() -- Asynchronous version of mdreadv().
     983                 :             :  */
     984                 :             : void
     985                 :        7052 : mdstartreadv(PgAioHandle *ioh,
     986                 :             :                          SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
     987                 :             :                          void **buffers, BlockNumber nblocks)
     988                 :             : {
     989                 :        7052 :         pgoff_t         seekpos;
     990                 :        7052 :         MdfdVec    *v;
     991                 :        7052 :         BlockNumber nblocks_this_segment;
     992                 :        7052 :         struct iovec *iov;
     993                 :        7052 :         int                     iovcnt;
     994                 :        7052 :         int                     ret;
     995                 :             : 
     996                 :        7052 :         v = _mdfd_getseg(reln, forknum, blocknum, false,
     997                 :             :                                          EXTENSION_FAIL | EXTENSION_CREATE_RECOVERY);
     998                 :             : 
     999                 :        7052 :         seekpos = (pgoff_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE));
    1000                 :             : 
    1001         [ +  - ]:        7052 :         Assert(seekpos < (pgoff_t) BLCKSZ * RELSEG_SIZE);
    1002                 :             : 
    1003                 :        7052 :         nblocks_this_segment =
    1004         [ +  - ]:        7052 :                 Min(nblocks,
    1005                 :             :                         RELSEG_SIZE - (blocknum % ((BlockNumber) RELSEG_SIZE)));
    1006                 :             : 
    1007         [ +  - ]:        7052 :         if (nblocks_this_segment != nblocks)
    1008   [ #  #  #  # ]:           0 :                 elog(ERROR, "read crossing segment boundary");
    1009                 :             : 
    1010                 :        7052 :         iovcnt = pgaio_io_get_iovec(ioh, &iov);
    1011                 :             : 
    1012         [ +  - ]:        7052 :         Assert(nblocks <= iovcnt);
    1013                 :             : 
    1014                 :        7052 :         iovcnt = buffers_to_iovec(iov, buffers, nblocks_this_segment);
    1015                 :             : 
    1016         [ +  - ]:        7052 :         Assert(iovcnt <= nblocks_this_segment);
    1017                 :             : 
    1018         [ -  + ]:        7052 :         if (!(io_direct_flags & IO_DIRECT_DATA))
    1019                 :        7052 :                 pgaio_io_set_flag(ioh, PGAIO_HF_BUFFERED);
    1020                 :             : 
    1021                 :       14104 :         pgaio_io_set_target_smgr(ioh,
    1022                 :        7052 :                                                          reln,
    1023                 :        7052 :                                                          forknum,
    1024                 :        7052 :                                                          blocknum,
    1025                 :        7052 :                                                          nblocks,
    1026                 :             :                                                          false);
    1027                 :        7052 :         pgaio_io_register_callbacks(ioh, PGAIO_HCB_MD_READV, 0);
    1028                 :             : 
    1029                 :        7052 :         ret = FileStartReadV(ioh, v->mdfd_vfd, iovcnt, seekpos, WAIT_EVENT_DATA_FILE_READ);
    1030         [ +  - ]:        7052 :         if (ret != 0)
    1031   [ #  #  #  # ]:           0 :                 ereport(ERROR,
    1032                 :             :                                 (errcode_for_file_access(),
    1033                 :             :                                  errmsg("could not start reading blocks %u..%u in file \"%s\": %m",
    1034                 :             :                                                 blocknum,
    1035                 :             :                                                 blocknum + nblocks_this_segment - 1,
    1036                 :             :                                                 FilePathName(v->mdfd_vfd))));
    1037                 :             : 
    1038                 :             :         /*
    1039                 :             :          * The error checks corresponding to the post-read checks in mdreadv() are
    1040                 :             :          * in md_readv_complete().
    1041                 :             :          *
    1042                 :             :          * However we chose, at least for now, to not implement the
    1043                 :             :          * zero_damaged_pages logic present in mdreadv(). As outlined in mdreadv()
    1044                 :             :          * that logic is rather problematic, and we want to get rid of it. Here
    1045                 :             :          * equivalent logic would have to be more complicated due to potential
    1046                 :             :          * differences in the zero_damaged_pages setting between the definer and
    1047                 :             :          * completor of IO.
    1048                 :             :          */
    1049                 :        7052 : }
    1050                 :             : 
/*
 * mdwritev() -- Write the supplied blocks at the appropriate location.
 *
 * This is to be used only for updating already-existing blocks of a
 * relation (ie, those before the current EOF).  To extend a relation,
 * use mdextend().
 *
 * Synchronously writes 'nblocks' BLCKSZ-sized blocks from buffers[0 ..
 * nblocks-1] to fork 'forknum' of 'reln', starting at block 'blocknum'.
 *
 * The request must fit inside a single segment file and must not need more
 * than PG_IOV_MAX blocks; otherwise an ERROR is raised before FileWriteV()
 * is called.  A short write is retried from where it stopped; a failed write
 * raises an ERROR.
 *
 * 'skipFsync' is passed on to _mdfd_getseg().  When it is false and the
 * relation is not temporary, the written segment is handed to
 * register_dirty_segment() once the write has completed.
 */
void
mdwritev(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
                 const void **buffers, BlockNumber nblocks, bool skipFsync)
{
        /* This assert is too expensive to have on normally ... */
#ifdef CHECK_WRITE_VS_EXTEND
        Assert((uint64) blocknum + (uint64) nblocks <= (uint64) mdnblocks(reln, forknum));
#endif

        /*
         * Because of the boundary check below, a call that returns normally
         * passes through this loop body exactly once (or not at all when
         * nblocks is 0).
         */
        while (nblocks > 0)
        {
                struct iovec iov[PG_IOV_MAX];
                int                     iovcnt;
                pgoff_t         seekpos;
                int                     nbytes;
                MdfdVec    *v;
                BlockNumber nblocks_this_segment;
                size_t          transferred_this_segment;
                size_t          size_this_segment;

                /* Look up the segment that holds 'blocknum'. */
                v = _mdfd_getseg(reln, forknum, blocknum, skipFsync,
                                                 EXTENSION_FAIL | EXTENSION_CREATE_RECOVERY);

                /* Byte offset of 'blocknum' within its segment file. */
                seekpos = (pgoff_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE));

                Assert(seekpos < (pgoff_t) BLCKSZ * RELSEG_SIZE);

                /*
                 * Clamp the request to what is left of this segment, and then to
                 * the capacity of the local iovec array.
                 */
                nblocks_this_segment =
                        Min(nblocks,
                                RELSEG_SIZE - (blocknum % ((BlockNumber) RELSEG_SIZE)));
                nblocks_this_segment = Min(nblocks_this_segment, lengthof(iov));

                /*
                 * Any clamping at all is treated as a caller error.
                 *
                 * NOTE(review): this also fires when the request only exceeds
                 * lengthof(iov), even though the message mentions a segment
                 * boundary.
                 */
                if (nblocks_this_segment != nblocks)
                        elog(ERROR, "write crosses segment boundary");

                /* The cast only drops const so the shared helper can be used. */
                iovcnt = buffers_to_iovec(iov, (void **) buffers, nblocks_this_segment);
                size_this_segment = nblocks_this_segment * BLCKSZ;
                transferred_this_segment = 0;

                /*
                 * Inner loop to continue after a short write.  If the reason is that
                 * we're out of disk space, a future attempt should get an ENOSPC
                 * error from the kernel.
                 */
                for (;;)
                {
                        TRACE_POSTGRESQL_SMGR_MD_WRITE_START(forknum, blocknum,
                                                                                                 reln->smgr_rlocator.locator.spcOid,
                                                                                                 reln->smgr_rlocator.locator.dbOid,
                                                                                                 reln->smgr_rlocator.locator.relNumber,
                                                                                                 reln->smgr_rlocator.backend);
                        nbytes = FileWriteV(v->mdfd_vfd, iov, iovcnt, seekpos,
                                                                WAIT_EVENT_DATA_FILE_WRITE);
                        TRACE_POSTGRESQL_SMGR_MD_WRITE_DONE(forknum, blocknum,
                                                                                                reln->smgr_rlocator.locator.spcOid,
                                                                                                reln->smgr_rlocator.locator.dbOid,
                                                                                                reln->smgr_rlocator.locator.relNumber,
                                                                                                reln->smgr_rlocator.backend,
                                                                                                nbytes,
                                                                                                size_this_segment - transferred_this_segment);

                        /*
                         * Testing hook: when SIMULATE_SHORT_WRITE is defined, pretend
                         * that at most 4096 bytes were written, presumably to exercise
                         * the short-write continuation below.
                         */
#ifdef SIMULATE_SHORT_WRITE
                        nbytes = Min(nbytes, 4096);
#endif

                        if (nbytes < 0)
                        {
                                /*
                                 * Evaluate the ENOSPC test first, presumably so that the
                                 * calls made while building the report below cannot
                                 * disturb errno before it is examined.
                                 */
                                bool            enospc = errno == ENOSPC;

                                ereport(ERROR,
                                                (errcode_for_file_access(),
                                                 errmsg("could not write blocks %u..%u in file \"%s\": %m",
                                                                blocknum,
                                                                blocknum + nblocks_this_segment - 1,
                                                                FilePathName(v->mdfd_vfd)),
                                                 enospc ? errhint("Check free disk space.") : 0));
                        }

                        /* One loop should usually be enough. */
                        transferred_this_segment += nbytes;
                        Assert(transferred_this_segment <= size_this_segment);
                        if (transferred_this_segment == size_this_segment)
                                break;

                        /*
                         * Adjust position and iovecs after a short write.  The iovec
                         * array is rewritten in place (source and destination are both
                         * 'iov') to cover only what is still outstanding.
                         */
                        seekpos += nbytes;
                        iovcnt = compute_remaining_iovec(iov, iov, iovcnt, nbytes);
                }

                /*
                 * Hand the segment to register_dirty_segment() unless the caller
                 * asked to skip that or the relation is temporary.
                 */
                if (!skipFsync && !SmgrIsTemp(reln))
                        register_dirty_segment(reln, forknum, v);

                /* Step past the blocks handled in this iteration. */
                nblocks -= nblocks_this_segment;
                buffers += nblocks_this_segment;
                blocknum += nblocks_this_segment;
        }
}
    1155                 :             : 
    1156                 :             : 
    1157                 :             : /*
    1158                 :             :  * mdwriteback() -- Tell the kernel to write pages back to storage.
    1159                 :             :  *
    1160                 :             :  * This accepts a range of blocks because flushing several pages at once is
    1161                 :             :  * considerably more efficient than doing so individually.
    1162                 :             :  */
    1163                 :             : void
    1164                 :           0 : mdwriteback(SMgrRelation reln, ForkNumber forknum,
    1165                 :             :                         BlockNumber blocknum, BlockNumber nblocks)
    1166                 :             : {
    1167         [ #  # ]:           0 :         Assert((io_direct_flags & IO_DIRECT_DATA) == 0);
    1168                 :             : 
    1169                 :             :         /*
    1170                 :             :          * Issue flush requests in as few requests as possible; have to split at
    1171                 :             :          * segment boundaries though, since those are actually separate files.
    1172                 :             :          */
    1173         [ #  # ]:           0 :         while (nblocks > 0)
    1174                 :             :         {
    1175                 :           0 :                 BlockNumber nflush = nblocks;
    1176                 :           0 :                 pgoff_t         seekpos;
    1177                 :           0 :                 MdfdVec    *v;
    1178                 :           0 :                 int                     segnum_start,
    1179                 :             :                                         segnum_end;
    1180                 :             : 
    1181                 :           0 :                 v = _mdfd_getseg(reln, forknum, blocknum, true /* not used */ ,
    1182                 :             :                                                  EXTENSION_DONT_OPEN);
    1183                 :             : 
    1184                 :             :                 /*
    1185                 :             :                  * We might be flushing buffers of already removed relations, that's
    1186                 :             :                  * ok, just ignore that case.  If the segment file wasn't open already
    1187                 :             :                  * (ie from a recent mdwrite()), then we don't want to re-open it, to
    1188                 :             :                  * avoid a race with PROCSIGNAL_BARRIER_SMGRRELEASE that might leave
    1189                 :             :                  * us with a descriptor to a file that is about to be unlinked.
    1190                 :             :                  */
    1191         [ #  # ]:           0 :                 if (!v)
    1192                 :           0 :                         return;
    1193                 :             : 
    1194                 :             :                 /* compute offset inside the current segment */
    1195                 :           0 :                 segnum_start = blocknum / RELSEG_SIZE;
    1196                 :             : 
    1197                 :             :                 /* compute number of desired writes within the current segment */
    1198                 :           0 :                 segnum_end = (blocknum + nblocks - 1) / RELSEG_SIZE;
    1199         [ #  # ]:           0 :                 if (segnum_start != segnum_end)
    1200                 :           0 :                         nflush = RELSEG_SIZE - (blocknum % ((BlockNumber) RELSEG_SIZE));
    1201                 :             : 
    1202         [ #  # ]:           0 :                 Assert(nflush >= 1);
    1203         [ #  # ]:           0 :                 Assert(nflush <= nblocks);
    1204                 :             : 
    1205                 :           0 :                 seekpos = (pgoff_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE));
    1206                 :             : 
    1207                 :           0 :                 FileWriteback(v->mdfd_vfd, seekpos, (pgoff_t) BLCKSZ * nflush, WAIT_EVENT_DATA_FILE_FLUSH);
    1208                 :             : 
    1209                 :           0 :                 nblocks -= nflush;
    1210                 :           0 :                 blocknum += nflush;
    1211      [ #  #  # ]:           0 :         }
    1212                 :           0 : }
    1213                 :             : 
    1214                 :             : /*
    1215                 :             :  * mdnblocks() -- Get the number of blocks stored in a relation.
    1216                 :             :  *
    1217                 :             :  * Important side effect: all active segments of the relation are opened
    1218                 :             :  * and added to the md_seg_fds array.  If this routine has not been
    1219                 :             :  * called, then only segments up to the last one actually touched
    1220                 :             :  * are present in the array.
    1221                 :             :  */
    1222                 :             : BlockNumber
    1223                 :      538585 : mdnblocks(SMgrRelation reln, ForkNumber forknum)
    1224                 :             : {
    1225                 :      538585 :         MdfdVec    *v;
    1226                 :      538585 :         BlockNumber nblocks;
    1227                 :      538585 :         BlockNumber segno;
    1228                 :             : 
    1229                 :      538585 :         mdopenfork(reln, forknum, EXTENSION_FAIL);
    1230                 :             : 
    1231                 :             :         /* mdopen has opened the first segment */
    1232         [ +  - ]:      538585 :         Assert(reln->md_num_open_segs[forknum] > 0);
    1233                 :             : 
    1234                 :             :         /*
    1235                 :             :          * Start from the last open segments, to avoid redundant seeks.  We have
    1236                 :             :          * previously verified that these segments are exactly RELSEG_SIZE long,
    1237                 :             :          * and it's useless to recheck that each time.
    1238                 :             :          *
    1239                 :             :          * NOTE: this assumption could only be wrong if another backend has
    1240                 :             :          * truncated the relation.  We rely on higher code levels to handle that
    1241                 :             :          * scenario by closing and re-opening the md fd, which is handled via
    1242                 :             :          * relcache flush.  (Since the checkpointer doesn't participate in
    1243                 :             :          * relcache flush, it could have segment entries for inactive segments;
    1244                 :             :          * that's OK because the checkpointer never needs to compute relation
    1245                 :             :          * size.)
    1246                 :             :          */
    1247                 :      538585 :         segno = reln->md_num_open_segs[forknum] - 1;
    1248                 :      538585 :         v = &reln->md_seg_fds[forknum][segno];
    1249                 :             : 
    1250                 :      538585 :         for (;;)
    1251                 :             :         {
    1252                 :      538585 :                 nblocks = _mdnblocks(reln, forknum, v);
    1253         [ +  - ]:      538585 :                 if (nblocks > ((BlockNumber) RELSEG_SIZE))
    1254   [ #  #  #  # ]:           0 :                         elog(FATAL, "segment too big");
    1255         [ +  - ]:      538585 :                 if (nblocks < ((BlockNumber) RELSEG_SIZE))
    1256                 :      538585 :                         return (segno * ((BlockNumber) RELSEG_SIZE)) + nblocks;
    1257                 :             : 
    1258                 :             :                 /*
    1259                 :             :                  * If segment is exactly RELSEG_SIZE, advance to next one.
    1260                 :             :                  */
    1261                 :           0 :                 segno++;
    1262                 :             : 
    1263                 :             :                 /*
    1264                 :             :                  * We used to pass O_CREAT here, but that has the disadvantage that it
    1265                 :             :                  * might create a segment which has vanished through some operating
    1266                 :             :                  * system misadventure.  In such a case, creating the segment here
    1267                 :             :                  * undermines _mdfd_getseg's attempts to notice and report an error
    1268                 :             :                  * upon access to a missing segment.
    1269                 :             :                  */
    1270                 :           0 :                 v = _mdfd_openseg(reln, forknum, segno, 0);
    1271         [ #  # ]:           0 :                 if (v == NULL)
    1272                 :           0 :                         return segno * ((BlockNumber) RELSEG_SIZE);
    1273                 :             :         }
    1274                 :      538585 : }
    1275                 :             : 
    1276                 :             : /*
    1277                 :             :  * mdtruncate() -- Truncate relation to specified number of blocks.
    1278                 :             :  *
    1279                 :             :  * Guaranteed not to allocate memory, so it can be used in a critical section.
    1280                 :             :  * Caller must have called smgrnblocks() to obtain curnblk while holding a
    1281                 :             :  * sufficient lock to prevent a change in relation size, and not used any smgr
    1282                 :             :  * functions for this relation or handled interrupts in between.  This makes
    1283                 :             :  * sure we have opened all active segments, so that truncate loop will get
    1284                 :             :  * them all!
    1285                 :             :  *
    1286                 :             :  * If nblocks > curnblk, the request is ignored when we are InRecovery,
    1287                 :             :  * otherwise, an error is raised.
    1288                 :             :  */
    1289                 :             : void
    1290                 :         200 : mdtruncate(SMgrRelation reln, ForkNumber forknum,
    1291                 :             :                    BlockNumber curnblk, BlockNumber nblocks)
    1292                 :             : {
    1293                 :         200 :         BlockNumber priorblocks;
    1294                 :         200 :         int                     curopensegs;
    1295                 :             : 
    1296         [ +  - ]:         200 :         if (nblocks > curnblk)
    1297                 :             :         {
    1298                 :             :                 /* Bogus request ... but no complaint if InRecovery */
    1299         [ #  # ]:           0 :                 if (InRecovery)
    1300                 :           0 :                         return;
    1301   [ #  #  #  # ]:           0 :                 ereport(ERROR,
    1302                 :             :                                 (errmsg("could not truncate file \"%s\" to %u blocks: it's only %u blocks now",
    1303                 :             :                                                 relpath(reln->smgr_rlocator, forknum).str,
    1304                 :             :                                                 nblocks, curnblk)));
    1305                 :           0 :         }
    1306         [ +  + ]:         200 :         if (nblocks == curnblk)
    1307                 :          98 :                 return;                                 /* no work */
    1308                 :             : 
    1309                 :             :         /*
    1310                 :             :          * Truncate segments, starting at the last one. Starting at the end makes
    1311                 :             :          * managing the memory for the fd array easier, should there be errors.
    1312                 :             :          */
    1313                 :         102 :         curopensegs = reln->md_num_open_segs[forknum];
    1314         [ +  + ]:         204 :         while (curopensegs > 0)
    1315                 :             :         {
    1316                 :         102 :                 MdfdVec    *v;
    1317                 :             : 
    1318                 :         102 :                 priorblocks = (curopensegs - 1) * RELSEG_SIZE;
    1319                 :             : 
    1320                 :         102 :                 v = &reln->md_seg_fds[forknum][curopensegs - 1];
    1321                 :             : 
    1322         [ -  + ]:         102 :                 if (priorblocks > nblocks)
    1323                 :             :                 {
    1324                 :             :                         /*
    1325                 :             :                          * This segment is no longer active. We truncate the file, but do
    1326                 :             :                          * not delete it, for reasons explained in the header comments.
    1327                 :             :                          */
    1328         [ #  # ]:           0 :                         if (FileTruncate(v->mdfd_vfd, 0, WAIT_EVENT_DATA_FILE_TRUNCATE) < 0)
    1329   [ #  #  #  # ]:           0 :                                 ereport(ERROR,
    1330                 :             :                                                 (errcode_for_file_access(),
    1331                 :             :                                                  errmsg("could not truncate file \"%s\": %m",
    1332                 :             :                                                                 FilePathName(v->mdfd_vfd))));
    1333                 :             : 
    1334         [ #  # ]:           0 :                         if (!SmgrIsTemp(reln))
    1335                 :           0 :                                 register_dirty_segment(reln, forknum, v);
    1336                 :             : 
    1337                 :             :                         /* we never drop the 1st segment */
    1338         [ #  # ]:           0 :                         Assert(v != &reln->md_seg_fds[forknum][0]);
    1339                 :             : 
    1340                 :           0 :                         FileClose(v->mdfd_vfd);
    1341                 :           0 :                         _fdvec_resize(reln, forknum, curopensegs - 1);
    1342                 :           0 :                 }
    1343         [ +  - ]:         102 :                 else if (priorblocks + ((BlockNumber) RELSEG_SIZE) > nblocks)
    1344                 :             :                 {
    1345                 :             :                         /*
    1346                 :             :                          * This is the last segment we want to keep. Truncate the file to
    1347                 :             :                          * the right length. NOTE: if nblocks is exactly a multiple K of
    1348                 :             :                          * RELSEG_SIZE, we will truncate the K+1st segment to 0 length but
    1349                 :             :                          * keep it. This adheres to the invariant given in the header
    1350                 :             :                          * comments.
    1351                 :             :                          */
    1352                 :         102 :                         BlockNumber lastsegblocks = nblocks - priorblocks;
    1353                 :             : 
    1354         [ -  + ]:         102 :                         if (FileTruncate(v->mdfd_vfd, (pgoff_t) lastsegblocks * BLCKSZ, WAIT_EVENT_DATA_FILE_TRUNCATE) < 0)
    1355   [ #  #  #  # ]:           0 :                                 ereport(ERROR,
    1356                 :             :                                                 (errcode_for_file_access(),
    1357                 :             :                                                  errmsg("could not truncate file \"%s\" to %u blocks: %m",
    1358                 :             :                                                                 FilePathName(v->mdfd_vfd),
    1359                 :             :                                                                 nblocks)));
    1360         [ +  + ]:         102 :                         if (!SmgrIsTemp(reln))
    1361                 :          49 :                                 register_dirty_segment(reln, forknum, v);
    1362                 :         102 :                 }
    1363                 :             :                 else
    1364                 :             :                 {
    1365                 :             :                         /*
    1366                 :             :                          * We still need this segment, so nothing to do for this and any
    1367                 :             :                          * earlier segment.
    1368                 :             :                          */
    1369                 :           0 :                         break;
    1370                 :             :                 }
    1371                 :         102 :                 curopensegs--;
    1372         [ -  + ]:         102 :         }
    1373                 :         200 : }
    1374                 :             : 
    1375                 :             : /*
    1376                 :             :  * mdregistersync() -- Mark whole relation as needing fsync
    1377                 :             :  */
    1378                 :             : void
    1379                 :        3498 : mdregistersync(SMgrRelation reln, ForkNumber forknum)
    1380                 :             : {
    1381                 :        3498 :         int                     segno;
    1382                 :        3498 :         int                     min_inactive_seg;
    1383                 :             : 
    1384                 :             :         /*
    1385                 :             :          * NOTE: mdnblocks makes sure we have opened all active segments, so that
    1386                 :             :          * the loop below will get them all!
    1387                 :             :          */
    1388                 :        3498 :         mdnblocks(reln, forknum);
    1389                 :             : 
    1390                 :        3498 :         min_inactive_seg = segno = reln->md_num_open_segs[forknum];
    1391                 :             : 
    1392                 :             :         /*
    1393                 :             :          * Temporarily open inactive segments, then close them after sync.  There
    1394                 :             :          * may be some inactive segments left opened after error, but that is
    1395                 :             :          * harmless.  We don't bother to clean them up and take a risk of further
    1396                 :             :          * trouble.  The next mdclose() will soon close them.
    1397                 :             :          */
    1398         [ -  + ]:        3498 :         while (_mdfd_openseg(reln, forknum, segno, 0) != NULL)
    1399                 :           0 :                 segno++;
    1400                 :             : 
    1401         [ +  + ]:        6996 :         while (segno > 0)
    1402                 :             :         {
    1403                 :        3498 :                 MdfdVec    *v = &reln->md_seg_fds[forknum][segno - 1];
    1404                 :             : 
    1405                 :        3498 :                 register_dirty_segment(reln, forknum, v);
    1406                 :             : 
    1407                 :             :                 /* Close inactive segments immediately */
    1408         [ +  - ]:        3498 :                 if (segno > min_inactive_seg)
    1409                 :             :                 {
    1410                 :           0 :                         FileClose(v->mdfd_vfd);
    1411                 :           0 :                         _fdvec_resize(reln, forknum, segno - 1);
    1412                 :           0 :                 }
    1413                 :             : 
    1414                 :        3498 :                 segno--;
    1415                 :        3498 :         }
    1416                 :        3498 : }
    1417                 :             : 
    1418                 :             : /*
    1419                 :             :  * mdimmedsync() -- Immediately sync a relation to stable storage.
    1420                 :             :  *
    1421                 :             :  * Note that only writes already issued are synced; this routine knows
    1422                 :             :  * nothing of dirty buffers that may exist inside the buffer manager.  We
    1423                 :             :  * sync active and inactive segments; smgrDoPendingSyncs() relies on this.
    1424                 :             :  * Consider a relation skipping WAL.  Suppose a checkpoint syncs blocks of
    1425                 :             :  * some segment, then mdtruncate() renders that segment inactive.  If we
    1426                 :             :  * crash before the next checkpoint syncs the newly-inactive segment, that
    1427                 :             :  * segment may survive recovery, reintroducing unwanted data into the table.
    1428                 :             :  */
    1429                 :             : void
    1430                 :           8 : mdimmedsync(SMgrRelation reln, ForkNumber forknum)
    1431                 :             : {
    1432                 :           8 :         int                     segno;
    1433                 :           8 :         int                     min_inactive_seg;
    1434                 :             : 
    1435                 :             :         /*
    1436                 :             :          * NOTE: mdnblocks makes sure we have opened all active segments, so that
    1437                 :             :          * the loop below will get them all!
    1438                 :             :          */
    1439                 :           8 :         mdnblocks(reln, forknum);
    1440                 :             : 
    1441                 :           8 :         min_inactive_seg = segno = reln->md_num_open_segs[forknum];
    1442                 :             : 
    1443                 :             :         /*
    1444                 :             :          * Temporarily open inactive segments, then close them after sync.  There
    1445                 :             :          * may be some inactive segments left opened after fsync() error, but that
    1446                 :             :          * is harmless.  We don't bother to clean them up and take a risk of
    1447                 :             :          * further trouble.  The next mdclose() will soon close them.
    1448                 :             :          */
    1449         [ -  + ]:           8 :         while (_mdfd_openseg(reln, forknum, segno, 0) != NULL)
    1450                 :           0 :                 segno++;
    1451                 :             : 
    1452         [ +  + ]:          16 :         while (segno > 0)
    1453                 :             :         {
    1454                 :           8 :                 MdfdVec    *v = &reln->md_seg_fds[forknum][segno - 1];
    1455                 :             : 
    1456                 :             :                 /*
    1457                 :             :                  * fsyncs done through mdimmedsync() should be tracked in a separate
    1458                 :             :                  * IOContext than those done through mdsyncfiletag() to differentiate
    1459                 :             :                  * between unavoidable client backend fsyncs (e.g. those done during
    1460                 :             :                  * index build) and those which ideally would have been done by the
    1461                 :             :                  * checkpointer. Since other IO operations bypassing the buffer
    1462                 :             :                  * manager could also be tracked in such an IOContext, wait until
    1463                 :             :                  * these are also tracked to track immediate fsyncs.
    1464                 :             :                  */
    1465         [ +  - ]:           8 :                 if (FileSync(v->mdfd_vfd, WAIT_EVENT_DATA_FILE_IMMEDIATE_SYNC) < 0)
    1466   [ #  #  #  #  :           0 :                         ereport(data_sync_elevel(ERROR),
                   #  # ]
    1467                 :             :                                         (errcode_for_file_access(),
    1468                 :             :                                          errmsg("could not fsync file \"%s\": %m",
    1469                 :             :                                                         FilePathName(v->mdfd_vfd))));
    1470                 :             : 
    1471                 :             :                 /* Close inactive segments immediately */
    1472         [ +  - ]:           8 :                 if (segno > min_inactive_seg)
    1473                 :             :                 {
    1474                 :           0 :                         FileClose(v->mdfd_vfd);
    1475                 :           0 :                         _fdvec_resize(reln, forknum, segno - 1);
    1476                 :           0 :                 }
    1477                 :             : 
    1478                 :           8 :                 segno--;
    1479                 :           8 :         }
    1480                 :           8 : }
    1481                 :             : 
    1482                 :             : int
    1483                 :         287 : mdfd(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, uint32 *off)
    1484                 :             : {
    1485                 :         287 :         MdfdVec    *v = mdopenfork(reln, forknum, EXTENSION_FAIL);
    1486                 :             : 
    1487                 :         287 :         v = _mdfd_getseg(reln, forknum, blocknum, false,
    1488                 :             :                                          EXTENSION_FAIL);
    1489                 :             : 
    1490                 :         287 :         *off = (pgoff_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE));
    1491                 :             : 
    1492         [ +  - ]:         287 :         Assert(*off < (pgoff_t) BLCKSZ * RELSEG_SIZE);
    1493                 :             : 
    1494                 :         574 :         return FileGetRawDesc(v->mdfd_vfd);
    1495                 :         287 : }
    1496                 :             : 
    1497                 :             : /*
    1498                 :             :  * register_dirty_segment() -- Mark a relation segment as needing fsync
    1499                 :             :  *
    1500                 :             :  * If there is a local pending-ops table, just make an entry in it for
    1501                 :             :  * ProcessSyncRequests to process later.  Otherwise, try to pass off the
    1502                 :             :  * fsync request to the checkpointer process.  If that fails, just do the
    1503                 :             :  * fsync locally before returning (we hope this will not happen often
    1504                 :             :  * enough to be a performance problem).
    1505                 :             :  */
    1506                 :             : static void
    1507                 :       46316 : register_dirty_segment(SMgrRelation reln, ForkNumber forknum, MdfdVec *seg)
    1508                 :             : {
    1509                 :       46316 :         FileTag         tag;
    1510                 :             : 
    1511                 :       46316 :         INIT_MD_FILETAG(tag, reln->smgr_rlocator.locator, forknum, seg->mdfd_segno); /* tag built from relation locator, fork and segment number */
    1512                 :             : 
    1513                 :             :         /* Temp relations should never be fsync'd */
    1514         [ +  - ]:       46316 :         Assert(!SmgrIsTemp(reln));
    1515                 :             : 
    1516         [ +  - ]:       46316 :         if (!RegisterSyncRequest(&tag, SYNC_REQUEST, false /* retryOnError */ ))
    1517                 :             :         { /* the request was not accepted: fsync the segment right here instead */
    1518                 :           0 :                 instr_time      io_start;
    1519                 :             : 
    1520   [ #  #  #  # ]:           0 :                 ereport(DEBUG1,
    1521                 :             :                                 (errmsg_internal("could not forward fsync request because request queue is full")));
    1522                 :             : 
    1523                 :           0 :                 io_start = pgstat_prepare_io_time(track_io_timing); /* start time handed to pgstat_count_io_op_time() below */
    1524                 :             : 
    1525         [ #  # ]:           0 :                 if (FileSync(seg->mdfd_vfd, WAIT_EVENT_DATA_FILE_SYNC) < 0) /* a negative result is reported at the level picked by data_sync_elevel() */
    1526   [ #  #  #  #  :           0 :                         ereport(data_sync_elevel(ERROR),
                   #  # ]
    1527                 :             :                                         (errcode_for_file_access(),
    1528                 :             :                                          errmsg("could not fsync file \"%s\": %m",
    1529                 :             :                                                         FilePathName(seg->mdfd_vfd))));
    1530                 :             : 
    1531                 :             :                 /*
    1532                 :             :                  * We have no way of knowing if the current IOContext is
    1533                 :             :                  * IOCONTEXT_NORMAL or IOCONTEXT_[BULKREAD, BULKWRITE, VACUUM] at this
    1534                 :             :                  * point, so count the fsync as being in the IOCONTEXT_NORMAL
    1535                 :             :                  * IOContext. This is probably okay, because the number of backend
    1536                 :             :                  * fsyncs doesn't say anything about the efficacy of the
    1537                 :             :                  * BufferAccessStrategy. And counting both fsyncs done in
    1538                 :             :                  * IOCONTEXT_NORMAL and IOCONTEXT_[BULKREAD, BULKWRITE, VACUUM] under
    1539                 :             :                  * IOCONTEXT_NORMAL is likely clearer when investigating the number of
    1540                 :             :                  * backend fsyncs.
    1541                 :             :                  */
    1542                 :           0 :                 pgstat_count_io_op_time(IOOBJECT_RELATION, IOCONTEXT_NORMAL,
    1543                 :             :                                                                 IOOP_FSYNC, io_start, 1, 0);
    1544                 :           0 :         }
    1545                 :       46316 : }
    1546                 :             : 
    1547                 :             : /*
    1548                 :             :  * register_unlink_segment() -- Schedule a file to be deleted after next checkpoint
                     :             :  *
                     :             :  * Builds a FileTag from the relation locator, fork number and segment
                     :             :  * number, and submits it to RegisterSyncRequest() as a SYNC_UNLINK_REQUEST
                     :             :  * with retryOnError set to true.  The return value of RegisterSyncRequest()
                     :             :  * is not examined.  Asserts that the relation is not a temporary one.
    1549                 :             :  */
    1550                 :             : static void
    1551                 :        7912 : register_unlink_segment(RelFileLocatorBackend rlocator, ForkNumber forknum,
    1552                 :             :                                                 BlockNumber segno)
    1553                 :             : {
    1554                 :        7912 :         FileTag         tag;
    1555                 :             : 
    1556                 :        7912 :         INIT_MD_FILETAG(tag, rlocator.locator, forknum, segno);
    1557                 :             : 
    1558                 :             :         /* Should never be used with temp relations */
    1559         [ +  - ]:        7912 :         Assert(!RelFileLocatorBackendIsTemp(rlocator));
    1560                 :             : 
    1561                 :        7912 :         RegisterSyncRequest(&tag, SYNC_UNLINK_REQUEST, true /* retryOnError */ );
    1562                 :        7912 : }
    1563                 :             : 
    1564                 :             : /*
    1565                 :             :  * register_forget_request() -- forget any fsyncs for a relation fork's segment
                     :             :  *
                     :             :  * Builds a FileTag from the relation locator, fork number and segment
                     :             :  * number, and submits it to RegisterSyncRequest() as a SYNC_FORGET_REQUEST
                     :             :  * with retryOnError set to true.  The return value of RegisterSyncRequest()
                     :             :  * is not examined.  Unlike register_unlink_segment(), there is no assertion
                     :             :  * about temporary relations here.
    1566                 :             :  */
    1567                 :             : static void
    1568                 :       23736 : register_forget_request(RelFileLocatorBackend rlocator, ForkNumber forknum,
    1569                 :             :                                                 BlockNumber segno)
    1570                 :             : {
    1571                 :       23736 :         FileTag         tag;
    1572                 :             : 
    1573                 :       23736 :         INIT_MD_FILETAG(tag, rlocator.locator, forknum, segno);
    1574                 :             : 
    1575                 :       23736 :         RegisterSyncRequest(&tag, SYNC_FORGET_REQUEST, true /* retryOnError */ );
    1576                 :       23736 : }
    1577                 :             : 
    1578                 :             : /*
    1579                 :             :  * ForgetDatabaseSyncRequests -- forget any fsyncs and unlinks for a DB
                     :             :  *
                     :             :  * dbid is the OID of the database.  The tag sent with the
                     :             :  * SYNC_FILTER_REQUEST carries only that OID: the tablespace OID and relation
                     :             :  * number are set to zero, and the fork and segment are InvalidForkNumber and
                     :             :  * InvalidBlockNumber.  How pending requests are matched against this tag is
                     :             :  * decided elsewhere (presumably by the SYNC_FILTER_REQUEST matching callback
                     :             :  * -- confirm against the sync code).
    1580                 :             :  */
    1581                 :             : void
    1582                 :           1 : ForgetDatabaseSyncRequests(Oid dbid)
    1583                 :             : {
    1584                 :           1 :         FileTag         tag;
    1585                 :           1 :         RelFileLocator rlocator;
    1586                 :             : 
    1587                 :           1 :         rlocator.dbOid = dbid;
    1588                 :           1 :         rlocator.spcOid = 0;
    1589                 :           1 :         rlocator.relNumber = 0;
    1590                 :             : 
    1591                 :           1 :         INIT_MD_FILETAG(tag, rlocator, InvalidForkNumber, InvalidBlockNumber);
    1592                 :             : 
    1593                 :           1 :         RegisterSyncRequest(&tag, SYNC_FILTER_REQUEST, true /* retryOnError */ );
    1594                 :           1 : }
    1595                 :             : 
    1596                 :             : /*
    1597                 :             :  * DropRelationFiles -- drop files of all given relations
                     :             :  *
                     :             :  * delrels is an array of ndelrels relation locators.  Each one is opened
                     :             :  * with smgropen() (using INVALID_PROC_NUMBER) and the resulting handles are
                     :             :  * passed together to smgrdounlinkall(), along with isRedo.  When isRedo is
                     :             :  * true, XLogDropRelation() is additionally called for every fork number
                     :             :  * from 0 through MAX_FORKNUM of each relation before the unlink.  Afterwards
                     :             :  * every handle is closed with smgrclose() and the temporary array is freed.
    1598                 :             :  */
    1599                 :             : void
    1600                 :           0 : DropRelationFiles(RelFileLocator *delrels, int ndelrels, bool isRedo)
    1601                 :             : {
    1602                 :           0 :         SMgrRelation *srels;
    1603                 :           0 :         int                     i;
    1604                 :             : 
    1605                 :           0 :         srels = palloc_array(SMgrRelation, ndelrels); /* one handle slot per relation, freed at the end */
    1606         [ #  # ]:           0 :         for (i = 0; i < ndelrels; i++)
    1607                 :             :         {
    1608                 :           0 :                 SMgrRelation srel = smgropen(delrels[i], INVALID_PROC_NUMBER);
    1609                 :             : 
    1610         [ #  # ]:           0 :                 if (isRedo)
    1611                 :             :                 {
    1612                 :           0 :                         ForkNumber      fork;
    1613                 :             : 
    1614         [ #  # ]:           0 :                         for (fork = 0; fork <= MAX_FORKNUM; fork++)
    1615                 :           0 :                                 XLogDropRelation(delrels[i], fork);
    1616                 :           0 :                 }
    1617                 :           0 :                 srels[i] = srel;
    1618                 :           0 :         }
    1619                 :             : 
    1620                 :           0 :         smgrdounlinkall(srels, ndelrels, isRedo); /* all relations are handed over in a single call */
    1621                 :             : 
    1622         [ #  # ]:           0 :         for (i = 0; i < ndelrels; i++)
    1623                 :           0 :                 smgrclose(srels[i]);
    1624                 :           0 :         pfree(srels);
    1625                 :           0 : }
    1626                 :             : 
    1627                 :             : 
    1628                 :             : /*
    1629                 :             :  * _fdvec_resize() -- Resize the fork's open segments array
                     :             :  *
                     :             :  * nseg is the new number of open segments to record for fork "forknum" of
                     :             :  * "reln".  Four cases are handled:
                     :             :  *   - nseg == 0: the array is freed (if any segment was open) and the
                     :             :  *     pointer reset to NULL;
                     :             :  *   - no segment open so far: a fresh array of nseg entries is allocated
                     :             :  *     in MdCxt;
                     :             :  *   - growing: the existing array is enlarged with repalloc();
                     :             :  *   - shrinking or same size: the array is left untouched.
                     :             :  * In every case md_num_open_segs[forknum] is set to nseg on exit.  Newly
                     :             :  * obtained entries are not initialized here.
    1630                 :             :  */
    1631                 :             : static void
    1632                 :       96128 : _fdvec_resize(SMgrRelation reln,
    1633                 :             :                           ForkNumber forknum,
    1634                 :             :                           int nseg)
    1635                 :             : {
    1636         [ +  + ]:       96128 :         if (nseg == 0)
    1637                 :             :         {
    1638         [ -  + ]:       42015 :                 if (reln->md_num_open_segs[forknum] > 0)
    1639                 :             :                 {
    1640                 :       42015 :                         pfree(reln->md_seg_fds[forknum]);
    1641                 :       42015 :                         reln->md_seg_fds[forknum] = NULL;
    1642                 :       42015 :                 }
    1643                 :       42015 :         }
    1644         [ -  + ]:       54113 :         else if (reln->md_num_open_segs[forknum] == 0)
    1645                 :             :         {
    1646                 :       54113 :                 reln->md_seg_fds[forknum] =
    1647                 :       54113 :                         MemoryContextAlloc(MdCxt, sizeof(MdfdVec) * nseg);
    1648                 :       54113 :         }
    1649         [ #  # ]:           0 :         else if (nseg > reln->md_num_open_segs[forknum])
    1650                 :             :         {
    1651                 :             :                 /*
    1652                 :             :                  * It doesn't seem worthwhile complicating the code to amortize
    1653                 :             :                  * repalloc() calls.  Those are far faster than PathNameOpenFile() or
    1654                 :             :                  * FileClose(), and the memory context internally will sometimes avoid
    1655                 :             :                  * doing an actual reallocation.
    1656                 :             :                  */
    1657                 :           0 :                 reln->md_seg_fds[forknum] =
    1658                 :           0 :                         repalloc(reln->md_seg_fds[forknum],
    1659                 :           0 :                                          sizeof(MdfdVec) * nseg);
    1660                 :           0 :         }
    1661                 :             :         else
    1662                 :             :         {
    1663                 :             :                 /*
    1664                 :             :                  * We don't reallocate a smaller array, because we want mdtruncate()
    1665                 :             :                  * to be able to promise that it won't allocate memory, so that it is
    1666                 :             :                  * allowed in a critical section.  This means that a bit of space in
    1667                 :             :                  * the array is now wasted, until the next time we add a segment and
    1668                 :             :                  * reallocate.
    1669                 :             :                  */
    1670                 :             :         }
    1671                 :             : 
    1672                 :       96128 :         reln->md_num_open_segs[forknum] = nseg;
    1673                 :       96128 : }
    1674                 :             : 
    1675                 :             : /*
    1676                 :             :  * Return the filename for the specified segment of the relation. The
    1677                 :             :  * path is returned by value inside an MdPathStr, so there is nothing for
                     :             :  * the caller to free.
                     :             :  *
                     :             :  * Segment 0 uses the plain relpath() of the fork; any other segment gets
                     :             :  * ".<segno>" appended to it.
    1678                 :             :  */
    1679                 :             : static MdPathStr
    1680                 :        3506 : _mdfd_segpath(SMgrRelation reln, ForkNumber forknum, BlockNumber segno)
    1681                 :             : {
    1682                 :        3506 :         RelPathStr      path;
    1683                 :             :         MdPathStr       fullpath;
    1684                 :             : 
    1685                 :        3506 :         path = relpath(reln->smgr_rlocator, forknum);
    1686                 :             : 
    1687         [ +  - ]:        3506 :         if (segno > 0)
    1688                 :        3506 :                 sprintf(fullpath.str, "%s.%u", path.str, segno); /* NOTE(review): unbounded write; presumably MdPathStr is sized for path plus suffix -- confirm */
    1689                 :             :         else
    1690                 :           0 :                 strcpy(fullpath.str, path.str);
    1691                 :             : 
    1692                 :             :         return fullpath;
    1693                 :        3506 : }
    1694                 :             : 
    1695                 :             : /*
    1696                 :             :  * Open the specified segment of the relation,
    1697                 :             :  * and make a MdfdVec object for it.  Returns NULL on failure.
                     :             :  *
                     :             :  * oflags is OR'ed into the flags obtained from _mdfd_open_flags().  Failure
                     :             :  * means PathNameOpenFile() returned a negative value; nothing is changed in
                     :             :  * that case (callers in this file look at errno afterwards).  On success
                     :             :  * the fork's segment array is grown to segno + 1 entries and the returned
                     :             :  * pointer refers to the new last entry of that array.
    1698                 :             :  */
    1699                 :             : static MdfdVec *
    1700                 :        3506 : _mdfd_openseg(SMgrRelation reln, ForkNumber forknum, BlockNumber segno,
    1701                 :             :                           int oflags)
    1702                 :             : {
    1703                 :        3506 :         MdfdVec    *v;
    1704                 :        3506 :         File            fd;
    1705                 :        3506 :         MdPathStr       fullpath;
    1706                 :             : 
    1707                 :        3506 :         fullpath = _mdfd_segpath(reln, forknum, segno);
    1708                 :             : 
    1709                 :             :         /* open the file */
    1710                 :        3506 :         fd = PathNameOpenFile(fullpath.str, _mdfd_open_flags() | oflags);
    1711                 :             : 
    1712         [ -  + ]:        3506 :         if (fd < 0)
    1713                 :        3506 :                 return NULL;
    1714                 :             : 
    1715                 :             :         /*
    1716                 :             :          * Segments are always opened in order from lowest to highest, so we must
    1717                 :             :          * be adding a new one at the end.
    1718                 :             :          */
    1719         [ #  # ]:           0 :         Assert(segno == reln->md_num_open_segs[forknum]);
    1720                 :             : 
    1721                 :           0 :         _fdvec_resize(reln, forknum, segno + 1);
    1722                 :             : 
    1723                 :             :         /* fill the entry */
    1724                 :           0 :         v = &reln->md_seg_fds[forknum][segno];
    1725                 :           0 :         v->mdfd_vfd = fd;
    1726                 :           0 :         v->mdfd_segno = segno;
    1727                 :             : 
    1728         [ #  # ]:           0 :         Assert(_mdnblocks(reln, forknum, v) <= ((BlockNumber) RELSEG_SIZE)); /* sanity check: a segment never holds more than RELSEG_SIZE blocks */
    1729                 :             : 
    1730                 :             :         /* all done */
    1731                 :           0 :         return v;
    1732                 :        3506 : }
    1733                 :             : 
    1734                 :             : /*
    1735                 :             :  * _mdfd_getseg() -- Find the segment of the relation holding the
    1736                 :             :  *                                       specified block.
    1737                 :             :  *
    1738                 :             :  * If the segment doesn't exist, we ereport, return NULL, or create the
    1739                 :             :  * segment, according to "behavior".  Note: skipFsync is only used in the
    1740                 :             :  * EXTENSION_CREATE case.
                     :             :  *
                     :             :  * The target segment number is blkno / RELSEG_SIZE.  "behavior" is a
                     :             :  * bitmask and must contain at least one of EXTENSION_FAIL,
                     :             :  * EXTENSION_CREATE, EXTENSION_RETURN_NULL or EXTENSION_DONT_OPEN;
                     :             :  * EXTENSION_CREATE_RECOVERY is additionally honored while InRecovery.
                     :             :  *
                     :             :  * NULL is returned when the segment is not already open and
                     :             :  * EXTENSION_DONT_OPEN is set, when mdopenfork() yields NULL, or, under
                     :             :  * EXTENSION_RETURN_NULL, when a preceding segment is shorter than
                     :             :  * RELSEG_SIZE (errno is then set to ENOENT) or when opening a segment
                     :             :  * fails with an errno accepted by FILE_POSSIBLY_DELETED().  Other failures
                     :             :  * are reported with ereport(ERROR).
    1741                 :             :  */
    1742                 :             : static MdfdVec *
    1743                 :       53441 : _mdfd_getseg(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno,
    1744                 :             :                          bool skipFsync, int behavior)
    1745                 :             : {
    1746                 :       53441 :         MdfdVec    *v;
    1747                 :       53441 :         BlockNumber targetseg;
    1748                 :       53441 :         BlockNumber nextsegno;
    1749                 :             : 
    1750                 :             :         /* some way to handle non-existent segments needs to be specified */
    1751         [ +  - ]:       53441 :         Assert(behavior &
    1752                 :             :                    (EXTENSION_FAIL | EXTENSION_CREATE | EXTENSION_RETURN_NULL |
    1753                 :             :                         EXTENSION_DONT_OPEN));
    1754                 :             : 
    1755                 :       53441 :         targetseg = blkno / ((BlockNumber) RELSEG_SIZE);
    1756                 :             : 
    1757                 :             :         /* if an existing and opened segment, we're done */
    1758         [ +  + ]:       53441 :         if (targetseg < reln->md_num_open_segs[forknum])
    1759                 :             :         {
    1760                 :       52561 :                 v = &reln->md_seg_fds[forknum][targetseg];
    1761                 :       52561 :                 return v;
    1762                 :             :         }
    1763                 :             : 
    1764                 :             :         /* The caller only wants the segment if we already had it open. */
    1765         [ -  + ]:         880 :         if (behavior & EXTENSION_DONT_OPEN)
    1766                 :           0 :                 return NULL;
    1767                 :             : 
    1768                 :             :         /*
    1769                 :             :          * The target segment is not yet open. Iterate over all the segments
    1770                 :             :          * between the last opened and the target segment. This way missing
    1771                 :             :          * segments either raise an error, or get created (according to
    1772                 :             :          * 'behavior'). Start with either the last opened, or the first segment if
    1773                 :             :          * none was opened before.
    1774                 :             :          */
    1775         [ -  + ]:         880 :         if (reln->md_num_open_segs[forknum] > 0)
    1776                 :           0 :                 v = &reln->md_seg_fds[forknum][reln->md_num_open_segs[forknum] - 1];
    1777                 :             :         else
    1778                 :             :         {
    1779                 :         880 :                 v = mdopenfork(reln, forknum, behavior);
    1780         [ +  - ]:         880 :                 if (!v)
    1781                 :           0 :                         return NULL;            /* if behavior & EXTENSION_RETURN_NULL */
    1782                 :             :         }
    1783                 :             : 
    1784         [ -  + ]:         880 :         for (nextsegno = reln->md_num_open_segs[forknum];
    1785                 :         880 :                  nextsegno <= targetseg; nextsegno++)
    1786                 :             :         {
    1787                 :           0 :                 BlockNumber nblocks = _mdnblocks(reln, forknum, v); /* size of the segment preceding nextsegno */
    1788                 :           0 :                 int                     flags = 0; /* extra open flags; becomes O_CREAT when creation is allowed */
    1789                 :             : 
    1790         [ #  # ]:           0 :                 Assert(nextsegno == v->mdfd_segno + 1);
    1791                 :             : 
    1792         [ #  # ]:           0 :                 if (nblocks > ((BlockNumber) RELSEG_SIZE))
    1793   [ #  #  #  # ]:           0 :                         elog(FATAL, "segment too big");
    1794                 :             : 
    1795   [ #  #  #  # ]:           0 :                 if ((behavior & EXTENSION_CREATE) ||
    1796         [ #  # ]:           0 :                         (InRecovery && (behavior & EXTENSION_CREATE_RECOVERY)))
    1797                 :             :                 {
    1798                 :             :                         /*
    1799                 :             :                          * Normally we will create new segments only if authorized by the
    1800                 :             :                          * caller (i.e., we are doing mdextend()).  But when doing WAL
    1801                 :             :                          * recovery, create segments anyway; this allows cases such as
    1802                 :             :                          * replaying WAL data that has a write into a high-numbered
    1803                 :             :                          * segment of a relation that was later deleted. We want to go
    1804                 :             :                          * ahead and create the segments so we can finish out the replay.
    1805                 :             :                          *
    1806                 :             :                          * We have to maintain the invariant that segments before the last
    1807                 :             :                          * active segment are of size RELSEG_SIZE; therefore, if
    1808                 :             :                          * extending, pad them out with zeroes if needed.  (This only
    1809                 :             :                          * matters if in recovery, or if the caller is extending the
    1810                 :             :                          * relation discontiguously, but that can happen in hash indexes.)
    1811                 :             :                          */
    1812         [ #  # ]:           0 :                         if (nblocks < ((BlockNumber) RELSEG_SIZE))
    1813                 :             :                         {
    1814                 :           0 :                                 char       *zerobuf = palloc_aligned(BLCKSZ, PG_IO_ALIGN_SIZE,
    1815                 :             :                                                                                                          MCXT_ALLOC_ZERO);
    1816                 :             : 
    1817                 :           0 :                                 mdextend(reln, forknum,
    1818                 :           0 :                                                  nextsegno * ((BlockNumber) RELSEG_SIZE) - 1,
    1819                 :           0 :                                                  zerobuf, skipFsync); /* writes a zeroed block at the last block number of the preceding segment */
    1820                 :           0 :                                 pfree(zerobuf);
    1821                 :           0 :                         }
    1822                 :           0 :                         flags = O_CREAT;
    1823                 :           0 :                 }
    1824         [ #  # ]:           0 :                 else if (nblocks < ((BlockNumber) RELSEG_SIZE))
    1825                 :             :                 {
    1826                 :             :                         /*
    1827                 :             :                          * When not extending, only open the next segment if the current
    1828                 :             :                          * one is exactly RELSEG_SIZE.  If not (this branch), either
    1829                 :             :                          * return NULL or fail.
    1830                 :             :                          */
    1831         [ #  # ]:           0 :                         if (behavior & EXTENSION_RETURN_NULL)
    1832                 :             :                         {
    1833                 :             :                                 /*
    1834                 :             :                                  * Some callers discern between reasons for _mdfd_getseg()
    1835                 :             :                                  * returning NULL based on errno. As there's no failing
    1836                 :             :                                  * syscall involved in this case, explicitly set errno to
    1837                 :             :                                  * ENOENT, as that seems the closest interpretation.
    1838                 :             :                                  */
    1839                 :           0 :                                 errno = ENOENT;
    1840                 :           0 :                                 return NULL;
    1841                 :             :                         }
    1842                 :             : 
    1843   [ #  #  #  # ]:           0 :                         ereport(ERROR,
    1844                 :             :                                         (errcode_for_file_access(),
    1845                 :             :                                          errmsg("could not open file \"%s\" (target block %u): previous segment is only %u blocks",
    1846                 :             :                                                         _mdfd_segpath(reln, forknum, nextsegno).str,
    1847                 :             :                                                         blkno, nblocks)));
    1848                 :           0 :                 }
    1849                 :             : 
    1850                 :           0 :                 v = _mdfd_openseg(reln, forknum, nextsegno, flags);
    1851                 :             : 
    1852         [ #  # ]:           0 :                 if (v == NULL)
    1853                 :             :                 {
    1854         [ #  # ]:           0 :                         if ((behavior & EXTENSION_RETURN_NULL) &&
    1855                 :           0 :                                 FILE_POSSIBLY_DELETED(errno)) /* errno is the one left behind by the failed open */
    1856                 :           0 :                                 return NULL;
    1857   [ #  #  #  # ]:           0 :                         ereport(ERROR,
    1858                 :             :                                         (errcode_for_file_access(),
    1859                 :             :                                          errmsg("could not open file \"%s\" (target block %u): %m",
    1860                 :             :                                                         _mdfd_segpath(reln, forknum, nextsegno).str,
    1861                 :             :                                                         blkno)));
    1862                 :           0 :                 }
    1863         [ #  # ]:           0 :         }
    1864                 :             : 
    1865                 :         880 :         return v;
    1866                 :       53441 : }
    1867                 :             : 
    1868                 :             : /*
    1869                 :             :  * Get number of blocks present in a single disk file
                     :             :  *
                     :             :  * The count is the size reported by FileSize() for the segment's file,
                     :             :  * divided by BLCKSZ; a trailing partial block is not counted.  A negative
                     :             :  * size is reported with ereport(ERROR).  Only "seg" is consulted; the reln
                     :             :  * and forknum arguments are not referenced in the body.
    1870                 :             :  */
    1871                 :             : static BlockNumber
    1872                 :      618126 : _mdnblocks(SMgrRelation reln, ForkNumber forknum, MdfdVec *seg)
    1873                 :             : {
    1874                 :      618126 :         pgoff_t         len;
    1875                 :             : 
    1876                 :      618126 :         len = FileSize(seg->mdfd_vfd);
    1877         [ +  - ]:      618126 :         if (len < 0)
    1878   [ #  #  #  # ]:           0 :                 ereport(ERROR,
    1879                 :             :                                 (errcode_for_file_access(),
    1880                 :             :                                  errmsg("could not seek to end of file \"%s\": %m",
    1881                 :             :                                                 FilePathName(seg->mdfd_vfd))));
    1882                 :             :         /* note that this calculation will ignore any partial block at EOF */
    1883                 :     1236252 :         return (BlockNumber) (len / BLCKSZ);
    1884                 :      618126 : }
    1885                 :             : 
    1886                 :             : /*
    1887                 :             :  * Sync a file to disk, given a file tag.  Write the path into an output
    1888                 :             :  * buffer so the caller can use it in error messages.
    1889                 :             :  *
    1890                 :             :  * Return 0 on success, -1 on failure, with errno set.
                     :             :  *
                     :             :  * "path" is written with a limit of up to MAXPGPATH bytes, so the caller's
                     :             :  * buffer must be at least that large.  If the segment named by the tag is
                     :             :  * already open in this process its existing file handle is used; otherwise
                     :             :  * the file is opened just for the sync and closed again afterwards.  A
                     :             :  * failure to open returns -1 before any sync is attempted.  The value
                     :             :  * returned otherwise is the result of FileSync(), and errno is restored to
                     :             :  * what it was right after that call.
    1891                 :             :  */
    1892                 :             : int
    1893                 :           0 : mdsyncfiletag(const FileTag *ftag, char *path)
    1894                 :             : {
    1895                 :           0 :         SMgrRelation reln = smgropen(ftag->rlocator, INVALID_PROC_NUMBER);
    1896                 :           0 :         File            file;
    1897                 :           0 :         instr_time      io_start;
    1898                 :           0 :         bool            need_to_close;
    1899                 :           0 :         int                     result,
    1900                 :             :                                 save_errno;
    1901                 :             : 
    1902                 :             :         /* See if we already have the file open, or need to open it. */
    1903         [ #  # ]:           0 :         if (ftag->segno < reln->md_num_open_segs[ftag->forknum])
    1904                 :             :         {
    1905                 :           0 :                 file = reln->md_seg_fds[ftag->forknum][ftag->segno].mdfd_vfd;
    1906                 :           0 :                 strlcpy(path, FilePathName(file), MAXPGPATH);
    1907                 :           0 :                 need_to_close = false;
    1908                 :           0 :         }
    1909                 :             :         else
    1910                 :             :         {
    1911                 :           0 :                 MdPathStr       p;
    1912                 :             : 
    1913                 :           0 :                 p = _mdfd_segpath(reln, ftag->forknum, ftag->segno);
    1914                 :           0 :                 strlcpy(path, p.str, MD_PATH_STR_MAXLEN); /* NOTE(review): limit differs from the MAXPGPATH used above; presumably MD_PATH_STR_MAXLEN <= MAXPGPATH -- confirm */
    1915                 :             : 
    1916                 :           0 :                 file = PathNameOpenFile(path, _mdfd_open_flags());
    1917         [ #  # ]:           0 :                 if (file < 0)
    1918                 :           0 :                         return -1;
    1919                 :           0 :                 need_to_close = true;
    1920         [ #  # ]:           0 :         }
    1921                 :             : 
    1922                 :           0 :         io_start = pgstat_prepare_io_time(track_io_timing);
    1923                 :             : 
    1924                 :             :         /* Sync the file. */
    1925                 :           0 :         result = FileSync(file, WAIT_EVENT_DATA_FILE_SYNC);
    1926                 :           0 :         save_errno = errno; /* remembered now so the calls below cannot disturb what the caller sees */
    1927                 :             : 
    1928         [ #  # ]:           0 :         if (need_to_close)
    1929                 :           0 :                 FileClose(file);
    1930                 :             : 
    1931                 :           0 :         pgstat_count_io_op_time(IOOBJECT_RELATION, IOCONTEXT_NORMAL,
    1932                 :             :                                                         IOOP_FSYNC, io_start, 1, 0);
    1933                 :             : 
    1934                 :           0 :         errno = save_errno;
    1935                 :           0 :         return result;
    1936                 :           0 : }
    1937                 :             : 
    1938                 :             : /*
    1939                 :             :  * Unlink a file, given a file tag.  Write the path into an output
    1940                 :             :  * buffer so the caller can use it in error messages.
    1941                 :             :  *
    1942                 :             :  * Return 0 on success, -1 on failure, with errno set.
    1943                 :             :  */
    1944                 :             : int
    1945                 :           0 : mdunlinkfiletag(const FileTag *ftag, char *path)
    1946                 :             : {
    1947                 :           0 :         RelPathStr      p;
    1948                 :             : 
    1949                 :             :         /* Compute the path. */
    1950                 :           0 :         p = relpathperm(ftag->rlocator, MAIN_FORKNUM);
    1951                 :           0 :         strlcpy(path, p.str, MAXPGPATH);
    1952                 :             : 
    1953                 :             :         /* Try to unlink the file. */
    1954                 :           0 :         return unlink(path);
    1955                 :           0 : }
    1956                 :             : 
    1957                 :             : /*
    1958                 :             :  * Check if a given candidate request matches a given tag, when processing
    1959                 :             :  * a SYNC_FILTER_REQUEST request.  This will be called for all pending
    1960                 :             :  * requests to find out whether to forget them.
    1961                 :             :  */
    1962                 :             : bool
    1963                 :           0 : mdfiletagmatches(const FileTag *ftag, const FileTag *candidate)
    1964                 :             : {
    1965                 :             :         /*
    1966                 :             :          * For now we only use filter requests as a way to drop all scheduled
    1967                 :             :          * callbacks relating to a given database, when dropping the database.
    1968                 :             :          * We'll return true for all candidates that have the same database OID as
    1969                 :             :          * the ftag from the SYNC_FILTER_REQUEST request, so they're forgotten.
    1970                 :             :          */
    1971                 :           0 :         return ftag->rlocator.dbOid == candidate->rlocator.dbOid;
    1972                 :             : }
    1973                 :             : 
    1974                 :             : /*
    1975                 :             :  * AIO completion callback for mdstartreadv().
    1976                 :             :  */
    1977                 :             : static PgAioResult
    1978                 :        6681 : md_readv_complete(PgAioHandle *ioh, PgAioResult prior_result, uint8 cb_data)
    1979                 :             : {
    1980                 :        6681 :         PgAioTargetData *td = pgaio_io_get_target_data(ioh);
    1981                 :        6681 :         PgAioResult result = prior_result;
    1982                 :             : 
    1983         [ +  - ]:        6681 :         if (prior_result.result < 0)
    1984                 :             :         {
    1985                 :           0 :                 result.status = PGAIO_RS_ERROR;
    1986                 :           0 :                 result.id = PGAIO_HCB_MD_READV;
    1987                 :             :                 /* For "hard" errors, track the error number in error_data */
    1988                 :           0 :                 result.error_data = -prior_result.result;
    1989                 :           0 :                 result.result = 0;
    1990                 :             : 
    1991                 :             :                 /*
    1992                 :             :                  * Immediately log a message about the IO error, but only to the
    1993                 :             :                  * server log. The reason to do so immediately is that the originator
    1994                 :             :                  * might not process the query result immediately (because it is busy
    1995                 :             :                  * doing another part of query processing) or at all (e.g. if it was
    1996                 :             :                  * cancelled or errored out due to another IO also failing).  The
    1997                 :             :                  * definer of the IO will emit an ERROR when processing the IO's
    1998                 :             :                  * results
    1999                 :             :                  */
    2000                 :           0 :                 pgaio_result_report(result, td, LOG_SERVER_ONLY);
    2001                 :             : 
    2002                 :           0 :                 return result;
    2003                 :             :         }
    2004                 :             : 
    2005                 :             :         /*
    2006                 :             :          * As explained above smgrstartreadv(), the smgr API operates on the level
    2007                 :             :          * of blocks, rather than bytes. Convert.
    2008                 :             :          */
    2009                 :        6681 :         result.result /= BLCKSZ;
    2010                 :             : 
    2011         [ +  - ]:        6681 :         Assert(result.result <= td->smgr.nblocks);
    2012                 :             : 
    2013         [ +  - ]:        6681 :         if (result.result == 0)
    2014                 :             :         {
    2015                 :             :                 /* consider 0 blocks read a failure */
    2016                 :           0 :                 result.status = PGAIO_RS_ERROR;
    2017                 :           0 :                 result.id = PGAIO_HCB_MD_READV;
    2018                 :           0 :                 result.error_data = 0;
    2019                 :             : 
    2020                 :             :                 /* see comment above the "hard error" case */
    2021                 :           0 :                 pgaio_result_report(result, td, LOG_SERVER_ONLY);
    2022                 :             : 
    2023                 :           0 :                 return result;
    2024                 :             :         }
    2025                 :             : 
    2026   [ +  -  +  - ]:        6681 :         if (result.status != PGAIO_RS_ERROR &&
    2027                 :        6681 :                 result.result < td->smgr.nblocks)
    2028                 :             :         {
    2029                 :             :                 /* partial reads should be retried at upper level */
    2030                 :           0 :                 result.status = PGAIO_RS_PARTIAL;
    2031                 :           0 :                 result.id = PGAIO_HCB_MD_READV;
    2032                 :           0 :         }
    2033                 :             : 
    2034                 :        6681 :         return result;
    2035                 :        6681 : }
    2036                 :             : 
    2037                 :             : /*
    2038                 :             :  * AIO error reporting callback for mdstartreadv().
    2039                 :             :  *
    2040                 :             :  * Errors are encoded as follows:
    2041                 :             :  * - PgAioResult.error_data != 0 encodes IO that failed with that errno
    2042                 :             :  * - PgAioResult.error_data == 0 encodes IO that didn't read all data
    2043                 :             :  */
    2044                 :             : static void
    2045                 :           0 : md_readv_report(PgAioResult result, const PgAioTargetData *td, int elevel)
    2046                 :             : {
    2047                 :           0 :         RelPathStr      path;
    2048                 :             : 
    2049         [ #  # ]:           0 :         path = relpathbackend(td->smgr.rlocator,
    2050                 :             :                                                   td->smgr.is_temp ? MyProcNumber : INVALID_PROC_NUMBER,
    2051                 :             :                                                   td->smgr.forkNum);
    2052                 :             : 
    2053         [ #  # ]:           0 :         if (result.error_data != 0)
    2054                 :             :         {
    2055                 :             :                 /* for errcode_for_file_access() and %m */
    2056                 :           0 :                 errno = result.error_data;
    2057                 :             : 
    2058   [ #  #  #  #  :           0 :                 ereport(elevel,
          #  #  #  #  #  
                      # ]
    2059                 :             :                                 errcode_for_file_access(),
    2060                 :             :                                 errmsg("could not read blocks %u..%u in file \"%s\": %m",
    2061                 :             :                                            td->smgr.blockNum,
    2062                 :             :                                            td->smgr.blockNum + td->smgr.nblocks - 1,
    2063                 :             :                                            path.str));
    2064                 :           0 :         }
    2065                 :             :         else
    2066                 :             :         {
    2067                 :             :                 /*
    2068                 :             :                  * NB: This will typically only be output in debug messages, while
    2069                 :             :                  * retrying a partial IO.
    2070                 :             :                  */
    2071   [ #  #  #  #  :           0 :                 ereport(elevel,
          #  #  #  #  #  
                      # ]
    2072                 :             :                                 errcode(ERRCODE_DATA_CORRUPTED),
    2073                 :             :                                 errmsg("could not read blocks %u..%u in file \"%s\": read only %zu of %zu bytes",
    2074                 :             :                                            td->smgr.blockNum,
    2075                 :             :                                            td->smgr.blockNum + td->smgr.nblocks - 1,
    2076                 :             :                                            path.str,
    2077                 :             :                                            result.result * (size_t) BLCKSZ,
    2078                 :             :                                            td->smgr.nblocks * (size_t) BLCKSZ));
    2079                 :             :         }
    2080                 :           0 : }
        

Generated by: LCOV version 2.3.2-1