LCOV - code coverage report
Current view: top level - src/backend/storage/aio - aio.c (source / functions) Coverage Total Hit
Test: Code coverage Lines: 65.4 % 408 267
Test Date: 2026-01-26 10:56:24 Functions: 81.1 % 37 30
Legend: Lines:     hit not hit
Branches: + taken - not taken # not executed
Branches: 32.5 % 332 108

             Branch data     Line data    Source code
       1                 :             : /*-------------------------------------------------------------------------
       2                 :             :  *
       3                 :             :  * aio.c
       4                 :             :  *    AIO - Core Logic
       5                 :             :  *
       6                 :             :  * For documentation about how AIO works on a higher level, including a
       7                 :             :  * schematic example, see README.md.
       8                 :             :  *
       9                 :             :  *
      10                 :             :  * AIO is a complicated subsystem. To keep things navigable, it is split
      11                 :             :  * across a number of files:
      12                 :             :  *
      13                 :             :  * - method_*.c - different ways of executing AIO (e.g. worker process)
      14                 :             :  *
      15                 :             :  * - aio_target.c - IO on different kinds of targets
      16                 :             :  *
      17                 :             :  * - aio_io.c - method-independent code for specific IO ops (e.g. readv)
      18                 :             :  *
      19                 :             :  * - aio_callback.c - callbacks at IO operation lifecycle events
      20                 :             :  *
      21                 :             :  * - aio_init.c - per-server and per-backend initialization
      22                 :             :  *
      23                 :             :  * - aio.c - all other topics
      24                 :             :  *
      25                 :             :  * - read_stream.c - helper for reading buffered relation data
      26                 :             :  *
      27                 :             :  * - README.md - higher-level overview of AIO
      28                 :             :  *
      29                 :             :  *
      30                 :             :  * Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group
      31                 :             :  * Portions Copyright (c) 1994, Regents of the University of California
      32                 :             :  *
      33                 :             :  * IDENTIFICATION
      34                 :             :  *    src/backend/storage/aio/aio.c
      35                 :             :  *
      36                 :             :  *-------------------------------------------------------------------------
      37                 :             :  */
      38                 :             : 
      39                 :             : #include "postgres.h"
      40                 :             : 
      41                 :             : #include "lib/ilist.h"
      42                 :             : #include "miscadmin.h"
      43                 :             : #include "port/atomics.h"
      44                 :             : #include "storage/aio.h"
      45                 :             : #include "storage/aio_internal.h"
      46                 :             : #include "storage/aio_subsys.h"
      47                 :             : #include "utils/guc.h"
      48                 :             : #include "utils/guc_hooks.h"
      49                 :             : #include "utils/injection_point.h"
      50                 :             : #include "utils/resowner.h"
      51                 :             : #include "utils/wait_event_types.h"
      52                 :             : 
      53                 :             : 
      54                 :             : static inline void pgaio_io_update_state(PgAioHandle *ioh, PgAioHandleState new_state);
      55                 :             : static void pgaio_io_reclaim(PgAioHandle *ioh);
      56                 :             : static void pgaio_io_resowner_register(PgAioHandle *ioh, struct ResourceOwnerData *resowner);
      57                 :             : static void pgaio_io_wait_for_free(void);
      58                 :             : static PgAioHandle *pgaio_io_from_wref(PgAioWaitRef *iow, uint64 *ref_generation);
      59                 :             : static const char *pgaio_io_state_get_name(PgAioHandleState s);
      60                 :             : static void pgaio_io_wait(PgAioHandle *ioh, uint64 ref_generation);
      61                 :             : 
      62                 :             : 
      63                 :             : /* Options for io_method. */
      64                 :             : const struct config_enum_entry io_method_options[] = {
      65                 :             :         {"sync", IOMETHOD_SYNC, false},
      66                 :             :         {"worker", IOMETHOD_WORKER, false},
      67                 :             : #ifdef IOMETHOD_IO_URING_ENABLED
      68                 :             :         {"io_uring", IOMETHOD_IO_URING, false},
      69                 :             : #endif
      70                 :             :         {NULL, 0, false}
      71                 :             : };
      72                 :             : 
      73                 :             : /* GUCs */
      74                 :             : int                     io_method = DEFAULT_IO_METHOD;
      75                 :             : int                     io_max_concurrency = -1;
      76                 :             : 
      77                 :             : /* global control for AIO */
      78                 :             : PgAioCtl   *pgaio_ctl;
      79                 :             : 
      80                 :             : /* current backend's per-backend state */
      81                 :             : PgAioBackend *pgaio_my_backend;
      82                 :             : 
      83                 :             : 
      84                 :             : static const IoMethodOps *const pgaio_method_ops_table[] = {
      85                 :             :         [IOMETHOD_SYNC] = &pgaio_sync_ops,
      86                 :             :         [IOMETHOD_WORKER] = &pgaio_worker_ops,
      87                 :             : #ifdef IOMETHOD_IO_URING_ENABLED
      88                 :             :         [IOMETHOD_IO_URING] = &pgaio_uring_ops,
      89                 :             : #endif
      90                 :             : };
      91                 :             : 
      92                 :             : StaticAssertDecl(lengthof(io_method_options) == lengthof(pgaio_method_ops_table) + 1,
      93                 :             :                                  "io_method_options out of sync with pgaio_method_ops_table");
      94                 :             : 
      95                 :             : /* callbacks for the configured io_method, set by assign_io_method */
      96                 :             : const IoMethodOps *pgaio_method_ops;
      97                 :             : 
      98                 :             : 
      99                 :             : /* --------------------------------------------------------------------------------
     100                 :             :  * Public Functions related to PgAioHandle
     101                 :             :  * --------------------------------------------------------------------------------
     102                 :             :  */
     103                 :             : 
     104                 :             : /*
     105                 :             :  * Acquire an AioHandle, waiting for IO completion if necessary.
     106                 :             :  *
     107                 :             :  * Each backend can only have one AIO handle that has been "handed out" to
     108                 :             :  * code, but not yet submitted or released. This restriction is necessary to
     109                 :             :  * ensure that it is possible for code to wait for an unused handle by waiting
     110                 :             :  * for in-flight IO to complete. There is a limited number of handles in each
     111                 :             :  * backend; if multiple handles could be handed out without being submitted,
     112                 :             :  * waiting for all in-flight IO to complete would not guarantee that handles
     113                 :             :  * free up.
     114                 :             :  *
     115                 :             :  * It is cheap to acquire an IO handle, unless all handles are in use. In that
     116                 :             :  * case this function waits for the oldest IO to complete. If that is not
     117                 :             :  * desirable, use pgaio_io_acquire_nb().
     118                 :             :  *
     119                 :             :  * If a handle was acquired but then does not turn out to be needed,
     120                 :             :  * e.g. because pgaio_io_acquire() is called before starting an IO in a
     121                 :             :  * critical section, the handle needs to be released with pgaio_io_release().
     122                 :             :  *
     123                 :             :  *
     124                 :             :  * To react to the completion of the IO as soon as it is known to have
     125                 :             :  * completed, callbacks can be registered with pgaio_io_register_callbacks().
     126                 :             :  *
     127                 :             :  * To actually execute IO using the returned handle, the pgaio_io_start_*()
     128                 :             :  * family of functions is used. In many cases the pgaio_io_start_*() call will
     129                 :             :  * not be done directly by code that acquired the handle, but by lower level
     130                 :             :  * code that gets passed the handle. E.g. if code in bufmgr.c wants to perform
     131                 :             :  * AIO, it typically will pass the handle to smgr.c, which will pass it on to
     132                 :             :  * md.c, on to fd.c, which then finally calls pgaio_io_start_*().  This
     133                 :             :  * forwarding allows the various layers to react to the IO's completion by
     134                 :             :  * registering callbacks. These callbacks in turn can translate a lower
     135                 :             :  * layer's result into a result understandable by a higher layer.
     136                 :             :  *
     137                 :             :  * During pgaio_io_start_*() the IO is staged (i.e. prepared for execution but
     138                 :             :  * not submitted to the kernel). Unless in batchmode
     139                 :             :  * (c.f. pgaio_enter_batchmode()), the IO will also get submitted for
     140                 :             :  * execution. Note that, whether in batchmode or not, the IO might even
     141                 :             :  * complete before the functions return.
     142                 :             :  *
     143                 :             :  * After pgaio_io_start_*() the AioHandle is "consumed" and may not be
     144                 :             :  * referenced by the IO issuing code. To e.g. wait for IO, references to the
     145                 :             :  * IO can be established with pgaio_io_get_wref() *before* pgaio_io_start_*()
     146                 :             :  * is called.  pgaio_wref_wait() can be used to wait for the IO to complete.
     147                 :             :  *
     148                 :             :  *
     149                 :             :  * To know if the IO [partially] succeeded or failed, a PgAioReturn * can be
     150                 :             :  * passed to pgaio_io_acquire(). Once the issuing backend has called
     151                 :             :  * pgaio_wref_wait(), the PgAioReturn contains information about whether the
     152                 :             :  * operation succeeded and details about the first failure, if any. The error
     153                 :             :  * can be raised / logged with pgaio_result_report().
     154                 :             :  *
     155                 :             :  * The lifetime of the memory pointed to by *ret needs to be at least as long
     156                 :             :  * as the passed in resowner. If the resowner releases resources before the IO
     157                 :             :  * completes (typically due to an error), the reference to *ret will be
     158                 :             :  * cleared. In case of resowner cleanup *ret will not be updated with the
     159                 :             :  * results of the IO operation.
     160                 :             :  */
     161                 :             : PgAioHandle *
     162                 :           0 : pgaio_io_acquire(struct ResourceOwnerData *resowner, PgAioReturn *ret)
     163                 :             : {
     164                 :           0 :         PgAioHandle *h;
     165                 :             : 
     166                 :           0 :         while (true)
     167                 :             :         {
     168                 :           0 :                 h = pgaio_io_acquire_nb(resowner, ret);
     169                 :             : 
     170         [ #  # ]:           0 :                 if (h != NULL)
     171                 :           0 :                         return h;
     172                 :             : 
     173                 :             :                 /*
     174                 :             :                  * Evidently all handles by this backend are in use. Just wait for
     175                 :             :                  * some to complete.
     176                 :             :                  */
     177                 :           0 :                 pgaio_io_wait_for_free();
     178                 :             :         }
     179                 :           0 : }
     180                 :             : 
     181                 :             : /*
     182                 :             :  * Acquire an AioHandle, returning NULL if no handles are free.
     183                 :             :  *
     184                 :             :  * See pgaio_io_acquire(). The only difference is that this function will return
     185                 :             :  * NULL if there are no idle handles, instead of blocking.
     186                 :             :  */
     187                 :             : PgAioHandle *
     188                 :        7053 : pgaio_io_acquire_nb(struct ResourceOwnerData *resowner, PgAioReturn *ret)
     189                 :             : {
     190                 :        7053 :         PgAioHandle *ioh = NULL;
     191                 :             : 
     192         [ +  - ]:        7053 :         if (pgaio_my_backend->num_staged_ios >= PGAIO_SUBMIT_BATCH_SIZE)
     193                 :             :         {
     194         [ #  # ]:           0 :                 Assert(pgaio_my_backend->num_staged_ios == PGAIO_SUBMIT_BATCH_SIZE);
     195                 :           0 :                 pgaio_submit_staged();
     196                 :           0 :         }
     197                 :             : 
     198         [ +  - ]:        7053 :         if (pgaio_my_backend->handed_out_io)
     199   [ #  #  #  # ]:           0 :                 elog(ERROR, "API violation: Only one IO can be handed out");
     200                 :             : 
     201                 :             :         /*
     202                 :             :          * Probably not needed today, as interrupts should not process this IO,
     203                 :             :          * but...
     204                 :             :          */
     205                 :        7053 :         HOLD_INTERRUPTS();
     206                 :             : 
     207         [ -  + ]:        7053 :         if (!dclist_is_empty(&pgaio_my_backend->idle_ios))
     208                 :             :         {
     209                 :        7053 :                 dlist_node *ion = dclist_pop_head_node(&pgaio_my_backend->idle_ios);
     210                 :             : 
     211                 :        7053 :                 ioh = dclist_container(PgAioHandle, node, ion);
     212                 :             : 
     213         [ +  - ]:        7053 :                 Assert(ioh->state == PGAIO_HS_IDLE);
     214         [ +  - ]:        7053 :                 Assert(ioh->owner_procno == MyProcNumber);
     215                 :             : 
     216                 :        7053 :                 pgaio_io_update_state(ioh, PGAIO_HS_HANDED_OUT);
     217                 :        7053 :                 pgaio_my_backend->handed_out_io = ioh;
     218                 :             : 
     219         [ -  + ]:        7053 :                 if (resowner)
     220                 :        7053 :                         pgaio_io_resowner_register(ioh, resowner);
     221                 :             : 
     222         [ -  + ]:        7053 :                 if (ret)
     223                 :             :                 {
     224                 :        7053 :                         ioh->report_return = ret;
     225                 :        7053 :                         ret->result.status = PGAIO_RS_UNKNOWN;
     226                 :        7053 :                 }
     227                 :        7053 :         }
     228                 :             : 
     229         [ +  - ]:        7053 :         RESUME_INTERRUPTS();
     230                 :             : 
     231                 :       14106 :         return ioh;
     232                 :        7053 : }
     233                 :             : 
     234                 :             : /*
     235                 :             :  * Release IO handle that turned out to not be required.
     236                 :             :  *
     237                 :             :  * See pgaio_io_acquire() for more details.
     238                 :             :  */
     239                 :             : void
     240                 :           1 : pgaio_io_release(PgAioHandle *ioh)
     241                 :             : {
     242         [ +  - ]:           1 :         if (ioh == pgaio_my_backend->handed_out_io)
     243                 :             :         {
     244         [ +  - ]:           1 :                 Assert(ioh->state == PGAIO_HS_HANDED_OUT);
     245         [ +  - ]:           1 :                 Assert(ioh->resowner);
     246                 :             : 
     247                 :           1 :                 pgaio_my_backend->handed_out_io = NULL;
     248                 :             : 
     249                 :             :                 /*
     250                 :             :                  * Note that no interrupts are processed between the handed_out_io
     251                 :             :                  * check and the call to reclaim - that's important as otherwise an
     252                 :             :                  * interrupt could have already reclaimed the handle.
     253                 :             :                  */
     254                 :           1 :                 pgaio_io_reclaim(ioh);
     255                 :           1 :         }
     256                 :             :         else
     257                 :             :         {
     258   [ #  #  #  # ]:           0 :                 elog(ERROR, "release in unexpected state");
     259                 :             :         }
     260                 :           1 : }
     261                 :             : 
     262                 :             : /*
     263                 :             :  * Release IO handle during resource owner cleanup.
     264                 :             :  */
     265                 :             : void
     266                 :           0 : pgaio_io_release_resowner(dlist_node *ioh_node, bool on_error)
     267                 :             : {
     268                 :           0 :         PgAioHandle *ioh = dlist_container(PgAioHandle, resowner_node, ioh_node);
     269                 :             : 
     270         [ #  # ]:           0 :         Assert(ioh->resowner);
     271                 :             : 
     272                 :             :         /*
     273                 :             :          * Otherwise an interrupt, in the middle of releasing the IO, could end up
     274                 :             :          * trying to wait for the IO, leading to state confusion.
     275                 :             :          */
     276                 :           0 :         HOLD_INTERRUPTS();
     277                 :             : 
     278                 :           0 :         ResourceOwnerForgetAioHandle(ioh->resowner, &ioh->resowner_node);
     279                 :           0 :         ioh->resowner = NULL;
     280                 :             : 
     281   [ #  #  #  #  :           0 :         switch ((PgAioHandleState) ioh->state)
                      # ]
     282                 :             :         {
     283                 :             :                 case PGAIO_HS_IDLE:
     284   [ #  #  #  # ]:           0 :                         elog(ERROR, "unexpected");
     285                 :           0 :                         break;
     286                 :             :                 case PGAIO_HS_HANDED_OUT:
     287   [ #  #  #  # ]:           0 :                         Assert(ioh == pgaio_my_backend->handed_out_io || pgaio_my_backend->handed_out_io == NULL);
     288                 :             : 
     289         [ #  # ]:           0 :                         if (ioh == pgaio_my_backend->handed_out_io)
     290                 :             :                         {
     291                 :           0 :                                 pgaio_my_backend->handed_out_io = NULL;
     292         [ #  # ]:           0 :                                 if (!on_error)
     293   [ #  #  #  # ]:           0 :                                         elog(WARNING, "leaked AIO handle");
     294                 :           0 :                         }
     295                 :             : 
     296                 :           0 :                         pgaio_io_reclaim(ioh);
     297                 :           0 :                         break;
     298                 :             :                 case PGAIO_HS_DEFINED:
     299                 :             :                 case PGAIO_HS_STAGED:
     300         [ #  # ]:           0 :                         if (!on_error)
     301   [ #  #  #  # ]:           0 :                                 elog(WARNING, "AIO handle was not submitted");
     302                 :           0 :                         pgaio_submit_staged();
     303                 :           0 :                         break;
     304                 :             :                 case PGAIO_HS_SUBMITTED:
     305                 :             :                 case PGAIO_HS_COMPLETED_IO:
     306                 :             :                 case PGAIO_HS_COMPLETED_SHARED:
     307                 :             :                 case PGAIO_HS_COMPLETED_LOCAL:
     308                 :             :                         /* this is expected to happen */
     309                 :           0 :                         break;
     310                 :             :         }
     311                 :             : 
     312                 :             :         /*
     313                 :             :          * Need to unregister the reporting of the IO's result, the memory it's
     314                 :             :          * referencing likely has gone away.
     315                 :             :          */
     316         [ #  # ]:           0 :         if (ioh->report_return)
     317                 :           0 :                 ioh->report_return = NULL;
     318                 :             : 
     319         [ #  # ]:           0 :         RESUME_INTERRUPTS();
     320                 :           0 : }
     321                 :             : 
     322                 :             : /*
     323                 :             :  * Add a [set of] flags to the IO.
     324                 :             :  *
     325                 :             :  * Note that this combines the passed in flags with already set flags, rather
     326                 :             :  * than replacing the set flags with them. This is to allow multiple callsites
     327                 :             :  * to set flags.
     328                 :             :  */
     329                 :             : void
     330                 :       14104 : pgaio_io_set_flag(PgAioHandle *ioh, PgAioHandleFlags flag)
     331                 :             : {
     332         [ +  - ]:       14104 :         Assert(ioh->state == PGAIO_HS_HANDED_OUT);
     333                 :             : 
     334                 :       14104 :         ioh->flags |= flag;
     335                 :       14104 : }
     336                 :             : 
     337                 :             : /*
     338                 :             :  * Returns an ID uniquely identifying the IO handle. This is only really
     339                 :             :  * useful for logging, as handles are reused across multiple IOs.
     340                 :             :  */
     341                 :             : int
     342                 :       11154 : pgaio_io_get_id(PgAioHandle *ioh)
     343                 :             : {
     344         [ +  - ]:       11154 :         Assert(ioh >= pgaio_ctl->io_handles &&
     345                 :             :                    ioh < (pgaio_ctl->io_handles + pgaio_ctl->io_handle_count));
     346                 :       11154 :         return ioh - pgaio_ctl->io_handles;
     347                 :             : }
     348                 :             : 
     349                 :             : /*
     350                 :             :  * Return the ProcNumber for the process that can use an IO handle. The
     351                 :             :  * mapping from IO handles to PGPROCs is static, therefore this even works
     352                 :             :  * when the corresponding PGPROC is not in use.
     353                 :             :  */
     354                 :             : ProcNumber
     355                 :         582 : pgaio_io_get_owner(PgAioHandle *ioh)
     356                 :             : {
     357                 :         582 :         return ioh->owner_procno;
     358                 :             : }
     359                 :             : 
     360                 :             : /*
     361                 :             :  * Return a wait reference for the IO. Only wait references can be used to
     362                 :             :  * wait for an IO's completion, as handles themselves can be reused after
     363                 :             :  * completion.  See also the comment above pgaio_io_acquire().
     364                 :             :  */
     365                 :             : void
     366                 :       14104 : pgaio_io_get_wref(PgAioHandle *ioh, PgAioWaitRef *iow)
     367                 :             : {
     368   [ +  +  -  +  :       14104 :         Assert(ioh->state == PGAIO_HS_HANDED_OUT ||
                   #  # ]
     369                 :             :                    ioh->state == PGAIO_HS_DEFINED ||
     370                 :             :                    ioh->state == PGAIO_HS_STAGED);
     371         [ +  - ]:       14104 :         Assert(ioh->generation != 0);
     372                 :             : 
     373                 :       14104 :         iow->aio_index = ioh - pgaio_ctl->io_handles;
     374                 :       14104 :         iow->generation_upper = (uint32) (ioh->generation >> 32);
     375                 :       14104 :         iow->generation_lower = (uint32) ioh->generation;
     376                 :       14104 : }
     377                 :             : 
     378                 :             : 
     379                 :             : 
     380                 :             : /* --------------------------------------------------------------------------------
     381                 :             :  * Internal Functions related to PgAioHandle
     382                 :             :  * --------------------------------------------------------------------------------
     383                 :             :  */
     384                 :             : 
     385                 :             : static inline void
     386                 :       55676 : pgaio_io_update_state(PgAioHandle *ioh, PgAioHandleState new_state)
     387                 :             : {
     388                 :             :         /*
     389                 :             :          * All callers need to have held interrupts in some form, otherwise
     390                 :             :          * interrupt processing could wait for the IO to complete, while in an
     391                 :             :          * intermediary state.
     392                 :             :          */
     393   [ +  +  -  +  :       55676 :         Assert(!INTERRUPTS_CAN_BE_PROCESSED());
                   #  # ]
     394                 :             : 
     395   [ -  +  -  + ]:       55676 :         pgaio_debug_io(DEBUG5, ioh,
     396                 :             :                                    "updating state to %s",
     397                 :             :                                    pgaio_io_state_get_name(new_state));
     398                 :             : 
     399                 :             :         /*
     400                 :             :          * Ensure the changes signified by the new state are visible before the
     401                 :             :          * new state becomes visible.
     402                 :             :          */
     403                 :       55676 :         pg_write_barrier();
     404                 :             : 
     405                 :       55676 :         ioh->state = new_state;
     406                 :       55676 : }
     407                 :             : 
     408                 :             : static void
     409                 :        7053 : pgaio_io_resowner_register(PgAioHandle *ioh, struct ResourceOwnerData *resowner)
     410                 :             : {
     411         [ +  - ]:        7053 :         Assert(!ioh->resowner);
     412         [ +  - ]:        7053 :         Assert(resowner);
     413                 :             : 
     414                 :        7053 :         ResourceOwnerRememberAioHandle(resowner, &ioh->resowner_node);
     415                 :        7053 :         ioh->resowner = resowner;
     416                 :        7053 : }
     417                 :             : 
     418                 :             : /*
     419                 :             :  * Stage IO for execution and, if appropriate, submit it immediately.
     420                 :             :  * The handle moves HANDED_OUT -> DEFINED -> STAGED and is then either queued in this backend's staged_ios or executed synchronously.
     421                 :             :  * Should only be called from pgaio_io_start_*().
     422                 :             :  */
     423                 :             : void
     424                 :        7052 : pgaio_io_stage(PgAioHandle *ioh, PgAioOp op)
     425                 :             : {
     426                 :        7052 :         bool            needs_synchronous;
     427                 :             : 
     428         [ +  - ]:        7052 :         Assert(ioh->state == PGAIO_HS_HANDED_OUT);
     429         [ +  - ]:        7052 :         Assert(pgaio_my_backend->handed_out_io == ioh);
     430         [ +  - ]:        7052 :         Assert(pgaio_io_has_target(ioh));
     431                 :             : 
     432                 :             :         /*
     433                 :             :          * Otherwise an interrupt, in the middle of staging and possibly executing
     434                 :             :          * the IO, could end up trying to wait for the IO, leading to state
     435                 :             :          * confusion.
     436                 :             :          */
     437                 :        7052 :         HOLD_INTERRUPTS();
     438                 :             : 
     439                 :        7052 :         ioh->op = op;
     440                 :        7052 :         ioh->result = 0;
     441                 :             : 
     442                 :        7052 :         pgaio_io_update_state(ioh, PGAIO_HS_DEFINED);
     443                 :             : 
     444                 :             :         /* allow a new IO to be staged */
     445                 :        7052 :         pgaio_my_backend->handed_out_io = NULL;
     446                 :             : 
     447                 :        7052 :         pgaio_io_call_stage(ioh);
     448                 :             : 
     449                 :        7052 :         pgaio_io_update_state(ioh, PGAIO_HS_STAGED);
     450                 :             : 
     451                 :             :         /*
     452                 :             :          * Synchronous execution has to be executed, well, synchronously, so check
     453                 :             :          * that first.
     454                 :             :          */
     455                 :        7052 :         needs_synchronous = pgaio_io_needs_synchronous_execution(ioh);
     456                 :             : 
     457   [ -  +  -  + ]:        7052 :         pgaio_debug_io(DEBUG3, ioh,
     458                 :             :                                    "staged (synchronous: %d, in_batch: %d)",
     459                 :             :                                    needs_synchronous, pgaio_my_backend->in_batchmode);
     460                 :             : 
     461         [ +  + ]:        7052 :         if (!needs_synchronous)
     462                 :             :         {
     463                 :         658 :                 pgaio_my_backend->staged_ios[pgaio_my_backend->num_staged_ios++] = ioh;       /* append to this backend's pending batch */
     464         [ +  - ]:         658 :                 Assert(pgaio_my_backend->num_staged_ios <= PGAIO_SUBMIT_BATCH_SIZE);
     465                 :             : 
     466                 :             :                 /*
     467                 :             :                  * Unless code explicitly opted into batching IOs, submit the IO
     468                 :             :                  * immediately.
     469                 :             :                  */
     470         [ +  + ]:         658 :                 if (!pgaio_my_backend->in_batchmode)
     471                 :           7 :                         pgaio_submit_staged();
     472                 :         658 :         }
     473                 :             :         else
     474                 :             :         {
     475                 :        6394 :                 pgaio_io_prepare_submit(ioh);   /* state -> SUBMITTED, linked into in_flight_ios */
     476                 :        6394 :                 pgaio_io_perform_synchronously(ioh);
     477                 :             :         }
     478                 :             : 
     479         [ +  - ]:        7052 :         RESUME_INTERRUPTS();    /* pairs with HOLD_INTERRUPTS() above */
     480                 :        7052 : }
     481                 :             : 
     482                 :             : bool    /* Report whether ioh has to be executed synchronously rather than staged for submission. */
     483                 :        7052 : pgaio_io_needs_synchronous_execution(PgAioHandle *ioh)
     484                 :             : {
     485                 :             :         /*
     486                 :             :          * If the caller said to execute the IO synchronously, do so.
     487                 :             :          *
     488                 :             :          * XXX: We could optimize the logic when to execute synchronously by first
     489                 :             :          * checking if there are other IOs in flight and only synchronously
     490                 :             :          * executing if not. Unclear whether that'll be sufficiently common to be
     491                 :             :          * worth worrying about.
     492                 :             :          */
     493         [ +  + ]:        7052 :         if (ioh->flags & PGAIO_HF_SYNCHRONOUS)
     494                 :        5966 :                 return true;
     495                 :             : 
     496                 :             :         /* Check if the IO method requires synchronous execution of IO */
     497         [ +  - ]:        1086 :         if (pgaio_method_ops->needs_synchronous_execution)
     498                 :        1086 :                 return pgaio_method_ops->needs_synchronous_execution(ioh);
     499                 :             : 
     500                 :           0 :         return false;   /* the IO method provides no such callback */
     501                 :        7052 : }
     502                 :             : 
     503                 :             : /*
     504                 :             :  * Handle IO being processed by IO method.
     505                 :             :  *
     506                 :             :  * Should be called by IO methods / synchronous IO execution, just before the
     507                 :             :  * IO is performed.
     508                 :             :  */
     509                 :             : void
     510                 :        7052 : pgaio_io_prepare_submit(PgAioHandle *ioh)
     511                 :             : {
     512                 :        7052 :         pgaio_io_update_state(ioh, PGAIO_HS_SUBMITTED);
     513                 :             : 
     514                 :        7052 :         dclist_push_tail(&pgaio_my_backend->in_flight_ios, &ioh->node);      /* appended at the tail; removed again in pgaio_io_reclaim() */
     515                 :        7052 : }
     516                 :             : 
     517                 :             : /*
     518                 :             :  * Handle IO getting completed by a method.
     519                 :             :  *
     520                 :             :  * Should be called by IO methods / synchronous IO execution, just after the
     521                 :             :  * IO has been performed.
     522                 :             :  *
     523                 :             :  * Expects to be called in a critical section. We expect IOs to be usable for
     524                 :             :  * WAL etc, which requires being able to execute completion callbacks in a
     525                 :             :  * critical section.
     526                 :             :  */
     527                 :             : void
     528                 :        6681 : pgaio_io_process_completion(PgAioHandle *ioh, int result)
     529                 :             : {
     530         [ +  - ]:        6681 :         Assert(ioh->state == PGAIO_HS_SUBMITTED);
     531                 :             : 
     532         [ +  - ]:        6681 :         Assert(CritSectionCount > 0);
     533                 :             : 
     534                 :        6681 :         ioh->result = result;   /* stored before the state update below, which issues a write barrier first */
     535                 :             : 
     536                 :        6681 :         pgaio_io_update_state(ioh, PGAIO_HS_COMPLETED_IO);
     537                 :             : 
     538                 :             :         INJECTION_POINT("aio-process-completion-before-shared", ioh);
     539                 :             : 
     540                 :        6681 :         pgaio_io_call_complete_shared(ioh);
     541                 :             : 
     542                 :        6681 :         pgaio_io_update_state(ioh, PGAIO_HS_COMPLETED_SHARED);
     543                 :             : 
     544                 :             :         /* condition variable broadcast ensures state is visible before wakeup */
     545                 :        6681 :         ConditionVariableBroadcast(&ioh->cv);
     546                 :             : 
     547                 :             :         /* contains call to pgaio_io_call_complete_local() */
     548         [ +  + ]:        6681 :         if (ioh->owner_procno == MyProcNumber)       /* pgaio_io_reclaim() asserts that the caller owns the IO */
     549                 :        6394 :                 pgaio_io_reclaim(ioh);
     550                 :        6681 : }
     551                 :             : 
     552                 :             : /*
     553                 :             :  * Has the IO completed and thus the IO handle been reused?
     554                 :             :  *
     555                 :             :  * This is useful when waiting for IO completion at a low level (e.g. in an IO
     556                 :             :  * method's ->wait_one() callback).
     557                 :             :  */
     558                 :             : bool
     559                 :        9984 : pgaio_io_was_recycled(PgAioHandle *ioh, uint64 ref_generation, PgAioHandleState *state)
     560                 :             : {
     561                 :        9984 :         *state = ioh->state;    /* always hand the current state back to the caller, even if recycled */
     562                 :             : 
     563                 :             :         /*
     564                 :             :          * Ensure that we don't see an earlier state of the handle than ioh->state
     565                 :             :          * due to compiler or CPU reordering. This protects both ->generation as
     566                 :             :          * directly used here, and other fields in the handle accessed in the
     567                 :             :          * caller if the handle was not reused.
     568                 :             :          */
     569                 :        9984 :         pg_read_barrier();
     570                 :             : 
     571                 :        9984 :         return ioh->generation != ref_generation;       /* pgaio_io_reclaim() increments generation on reuse */
     572                 :             : }
     573                 :             : 
     574                 :             : /*
     575                 :             :  * Wait for IO to complete. External code should never use this, outside of
     576                 :             :  * the AIO subsystem waits are only allowed via pgaio_wref_wait().
     577                 :             :  */
     578                 :             : static void     /* Returns once the handle was recycled or reached COMPLETED_SHARED / COMPLETED_LOCAL; the owning backend then also reclaims it. */
     579                 :         593 : pgaio_io_wait(PgAioHandle *ioh, uint64 ref_generation)
     580                 :             : {
     581                 :         593 :         PgAioHandleState state;
     582                 :         593 :         bool            am_owner;
     583                 :             : 
     584                 :         593 :         am_owner = ioh->owner_procno == MyProcNumber;        /* did this backend issue the IO? */
     585                 :             : 
     586         [ -  + ]:         593 :         if (pgaio_io_was_recycled(ioh, ref_generation, &state))
     587                 :           0 :                 return; /* generation changed: the referenced IO is already finished */
     588                 :             : 
     589         [ -  + ]:         593 :         if (am_owner)
     590                 :             :         {
     591                 :         593 :                 if (state != PGAIO_HS_SUBMITTED
     592         [ +  + ]:         593 :                         && state != PGAIO_HS_COMPLETED_IO
     593         [ -  + ]:           1 :                         && state != PGAIO_HS_COMPLETED_SHARED
     594   [ #  #  #  # ]:           0 :                         && state != PGAIO_HS_COMPLETED_LOCAL)
     595                 :             :                 {
     596   [ #  #  #  # ]:           0 :                         elog(PANIC, "waiting for own IO %d in wrong state: %s",
     597                 :             :                                  pgaio_io_get_id(ioh), pgaio_io_get_state_name(ioh));
     598                 :           0 :                 }
     599                 :         593 :         }
     600                 :             : 
     601                 :        1186 :         while (true)
     602                 :             :         {
     603         [ +  - ]:        1186 :                 if (pgaio_io_was_recycled(ioh, ref_generation, &state))
     604                 :           0 :                         return;
     605                 :             : 
     606   [ +  -  +  +  :        1186 :                 switch (state)
                       - ]
     607                 :             :                 {
     608                 :             :                         case PGAIO_HS_IDLE:
     609                 :             :                         case PGAIO_HS_HANDED_OUT:
     610   [ #  #  #  # ]:           0 :                                 elog(ERROR, "IO in wrong state: %d", state);
     611                 :           0 :                                 break;
     612                 :             : 
     613                 :             :                         case PGAIO_HS_SUBMITTED:
     614                 :             : 
     615                 :             :                                 /*
     616                 :             :                                  * If we need to wait via the IO method, do so now. Don't
     617                 :             :                                  * check via the IO method if the issuing backend is executing
     618                 :             :                                  * the IO synchronously.
     619                 :             :                                  */
     620   [ -  +  #  # ]:         592 :                                 if (pgaio_method_ops->wait_one && !(ioh->flags & PGAIO_HF_SYNCHRONOUS))
     621                 :             :                                 {
     622                 :           0 :                                         pgaio_method_ops->wait_one(ioh, ref_generation);
     623                 :           0 :                                         continue;       /* back to the top of the loop to re-read the state */
     624                 :             :                                 }
     625                 :             :                                 /* fallthrough */
     626                 :             : 
     627                 :             :                                 /* waiting for owner to submit */
     628                 :             :                         case PGAIO_HS_DEFINED:
     629                 :             :                         case PGAIO_HS_STAGED:
     630                 :             :                                 /* waiting for reaper to complete */
     631                 :             :                                 /* fallthrough */
     632                 :             :                         case PGAIO_HS_COMPLETED_IO:
     633                 :             :                                 /* shouldn't be able to hit this otherwise */
     634         [ +  - ]:         593 :                                 Assert(IsUnderPostmaster);
     635                 :             :                                 /* ensure we're going to get woken up */
     636                 :         593 :                                 ConditionVariablePrepareToSleep(&ioh->cv);
     637                 :             : 
     638         [ -  + ]:        1186 :                                 while (!pgaio_io_was_recycled(ioh, ref_generation, &state))
     639                 :             :                                 {
     640   [ +  +  -  + ]:        1186 :                                         if (state == PGAIO_HS_COMPLETED_SHARED ||
     641                 :         593 :                                                 state == PGAIO_HS_COMPLETED_LOCAL)
     642                 :         593 :                                                 break;
     643                 :         593 :                                         ConditionVariableSleep(&ioh->cv, WAIT_EVENT_AIO_IO_COMPLETION);
     644                 :             :                                 }
     645                 :             : 
     646                 :         593 :                                 ConditionVariableCancelSleep();
     647                 :         593 :                                 break;  /* leaves the switch only; the outer loop re-checks the handle */
     648                 :             : 
     649                 :             :                         case PGAIO_HS_COMPLETED_SHARED:
     650                 :             :                         case PGAIO_HS_COMPLETED_LOCAL:
     651                 :             : 
     652                 :             :                                 /*
     653                 :             :                                  * Note that no interrupts are processed between
     654                 :             :                                  * pgaio_io_was_recycled() and this check - that's important
     655                 :             :                                  * as otherwise an interrupt could have already reclaimed the
     656                 :             :                                  * handle.
     657                 :             :                                  */
     658         [ -  + ]:         593 :                                 if (am_owner)
     659                 :         593 :                                         pgaio_io_reclaim(ioh);
     660                 :         593 :                                 return;
     661                 :             :                 }
     662                 :             :         }
     663                 :         593 : }
     664                 :             : 
     665                 :             : /*
     666                 :             :  * Make IO handle ready to be reused after IO has completed or after the
     667                 :             :  * handle has been released without being used.
     668                 :             :  *
     669                 :             :  * Note that callers need to be careful about only calling this in the right
     670                 :             :  * state and that no interrupts can be processed between the state check and
     671                 :             :  * the call to pgaio_io_reclaim(). Otherwise interrupt processing could
     672                 :             :  * already have reclaimed the handle.
     673                 :             :  */
     674                 :             : static void
     675                 :        7053 : pgaio_io_reclaim(PgAioHandle *ioh)
     676                 :             : {
     677                 :             :         /* This is only ok if it's our IO */
     678         [ +  - ]:        7053 :         Assert(ioh->owner_procno == MyProcNumber);
     679         [ +  - ]:        7053 :         Assert(ioh->state != PGAIO_HS_IDLE);
     680                 :             : 
     681                 :             :         /* see comment in function header */
     682                 :        7053 :         HOLD_INTERRUPTS();
     683                 :             : 
     684                 :             :         /*
     685                 :             :          * It's a bit ugly, but right now the easiest place to put the execution
     686                 :             :          * of local completion callbacks is this function, as we need to execute
     687                 :             :          * local callbacks just before reclaiming at multiple callsites.
     688                 :             :          */
     689         [ +  + ]:        7053 :         if (ioh->state == PGAIO_HS_COMPLETED_SHARED)
     690                 :             :         {
     691                 :        7052 :                 PgAioResult local_result;
     692                 :             : 
     693                 :        7052 :                 local_result = pgaio_io_call_complete_local(ioh);
     694                 :        7052 :                 pgaio_io_update_state(ioh, PGAIO_HS_COMPLETED_LOCAL);
     695                 :             : 
     696         [ -  + ]:        7052 :                 if (ioh->report_return)
     697                 :             :                 {
     698                 :        7052 :                         ioh->report_return->result = local_result;
     699                 :        7052 :                         ioh->report_return->target_data = ioh->target_data;
     700                 :        7052 :                 }
     701                 :        7052 :         }
     702                 :             : 
     703   [ -  +  -  + ]:        7053 :         pgaio_debug_io(DEBUG4, ioh,
     704                 :             :                                    "reclaiming: distilled_result: (status %s, id %u, error_data %d), raw_result: %d",
     705                 :             :                                    pgaio_result_status_string(ioh->distilled_result.status),
     706                 :             :                                    ioh->distilled_result.id,
     707                 :             :                                    ioh->distilled_result.error_data,
     708                 :             :                                    ioh->result);
     709                 :             : 
     710                 :             :         /* if the IO has been defined, it's on the in-flight list, remove */
     711         [ +  + ]:        7053 :         if (ioh->state != PGAIO_HS_HANDED_OUT)
     712                 :        7052 :                 dclist_delete_from(&pgaio_my_backend->in_flight_ios, &ioh->node);
     713                 :             : 
     714         [ -  + ]:        7053 :         if (ioh->resowner)
     715                 :             :         {
     716                 :        7053 :                 ResourceOwnerForgetAioHandle(ioh->resowner, &ioh->resowner_node);
     717                 :        7053 :                 ioh->resowner = NULL;
     718                 :        7053 :         }
     719                 :             : 
     720         [ +  - ]:        7053 :         Assert(!ioh->resowner);
     721                 :             : 
     722                 :             :         /*
     723                 :             :          * Update generation & state first, before resetting the IO's fields,
     724                 :             :          * otherwise a concurrent "viewer" could think the fields are valid, even
     725                 :             :          * though they are being reset.  Increment the generation first, so that
     726                 :             :          * we can assert elsewhere that we never wait for an IDLE IO.  While it's
     727                 :             :          * a bit weird for the state to go backwards for a generation, it's OK
     728                 :             :          * here, as there cannot be references to the "reborn" IO yet.  Can't
     729                 :             :          * update both at once, so something has to give.
     730                 :             :          */
     731                 :        7053 :         ioh->generation++;
     732                 :        7053 :         pgaio_io_update_state(ioh, PGAIO_HS_IDLE);
     733                 :             : 
     734                 :             :         /* ensure the state update is visible before we reset fields */
     735                 :        7053 :         pg_write_barrier();
     736                 :             : 
     737                 :        7053 :         ioh->op = PGAIO_OP_INVALID;
     738                 :        7053 :         ioh->target = PGAIO_TID_INVALID;
     739                 :        7053 :         ioh->flags = 0;
     740                 :        7053 :         ioh->num_callbacks = 0;
     741                 :        7053 :         ioh->handle_data_len = 0;
     742                 :        7053 :         ioh->report_return = NULL;
     743                 :        7053 :         ioh->result = 0;
     744                 :        7053 :         ioh->distilled_result.status = PGAIO_RS_UNKNOWN;
     745                 :             : 
     746                 :             :         /*
     747                 :             :          * We push the IO to the head of the idle IO list, that seems more cache
     748                 :             :          * efficient in cases where only a few IOs are used.
     749                 :             :          */
     750                 :        7053 :         dclist_push_head(&pgaio_my_backend->idle_ios, &ioh->node);
     751                 :             : 
     752         [ +  - ]:        7053 :         RESUME_INTERRUPTS();
     753                 :        7053 : }
     754                 :             : 
     755                 :             : /*
     756                 :             :  * Wait for an IO handle to become usable.
     757                 :             :  *
     758                 :             :  * This only really is useful for pgaio_io_acquire().
     759                 :             :  */
     760                 :             : static void
     761                 :           0 : pgaio_io_wait_for_free(void)
     762                 :             : {
     763                 :           0 :         int                     reclaimed = 0;
     764                 :             : 
     765   [ #  #  #  # ]:           0 :         pgaio_debug(DEBUG2, "waiting for free IO with %d pending, %u in-flight, %u idle IOs",
     766                 :             :                                 pgaio_my_backend->num_staged_ios,
     767                 :             :                                 dclist_count(&pgaio_my_backend->in_flight_ios),
     768                 :             :                                 dclist_count(&pgaio_my_backend->idle_ios));
     769                 :             : 
     770                 :             :         /*
     771                 :             :          * First check if any of our IOs actually have completed - when using
     772                 :             :          * worker, that'll often be the case. We could do so as part of the loop
     773                 :             :          * below, but that'd potentially lead us to wait for some IO submitted
     774                 :             :          * before.
     775                 :             :          */
     776         [ #  # ]:           0 :         for (int i = 0; i < io_max_concurrency; i++)
     777                 :             :         {
     778                 :           0 :                 PgAioHandle *ioh = &pgaio_ctl->io_handles[pgaio_my_backend->io_handle_off + i];
     779                 :             : 
     780         [ #  # ]:           0 :                 if (ioh->state == PGAIO_HS_COMPLETED_SHARED)
     781                 :             :                 {
     782                 :             :                         /*
     783                 :             :                          * Note that no interrupts are processed between the state check
     784                 :             :                          * and the call to reclaim - that's important as otherwise an
     785                 :             :                          * interrupt could have already reclaimed the handle.
     786                 :             :                          *
     787                 :             :                          * Need to ensure that there's no reordering, in the more common
     788                 :             :                          * paths, where we wait for IO, that's done by
     789                 :             :                          * pgaio_io_was_recycled().
     790                 :             :                          */
     791                 :           0 :                         pg_read_barrier();
     792                 :           0 :                         pgaio_io_reclaim(ioh);
     793                 :           0 :                         reclaimed++;
     794                 :           0 :                 }
     795                 :           0 :         }
     796                 :             : 
     797         [ #  # ]:           0 :         if (reclaimed > 0)
     798                 :           0 :                 return;
     799                 :             : 
     800                 :             :         /*
     801                 :             :          * If we have any unsubmitted IOs, submit them now. We'll start waiting in
     802                 :             :          * a second, so it's better they're in flight. This also addresses the
     803                 :             :          * edge-case that all IOs are unsubmitted.
     804                 :             :          */
     805         [ #  # ]:           0 :         if (pgaio_my_backend->num_staged_ios > 0)
     806                 :           0 :                 pgaio_submit_staged();
     807                 :             : 
     808                 :             :         /* possibly some IOs finished during submission */
     809         [ #  # ]:           0 :         if (!dclist_is_empty(&pgaio_my_backend->idle_ios))
     810                 :           0 :                 return;
     811                 :             : 
     812         [ #  # ]:           0 :         if (dclist_count(&pgaio_my_backend->in_flight_ios) == 0)
     813   [ #  #  #  # ]:           0 :                 ereport(ERROR,
     814                 :             :                                 errmsg_internal("no free IOs despite no in-flight IOs"),
     815                 :             :                                 errdetail_internal("%d pending, %u in-flight, %u idle IOs",
     816                 :             :                                                                    pgaio_my_backend->num_staged_ios,
     817                 :             :                                                                    dclist_count(&pgaio_my_backend->in_flight_ios),
     818                 :             :                                                                    dclist_count(&pgaio_my_backend->idle_ios)));
     819                 :             : 
     820                 :             :         /*
     821                 :             :          * Wait for the oldest in-flight IO to complete.
     822                 :             :          *
     823                 :             :          * XXX: Reusing the general IO wait is suboptimal, we don't need to wait
     824                 :             :          * for that specific IO to complete, we just need *any* IO to complete.
     825                 :             :          */
     826                 :             :         {
     827                 :           0 :                 PgAioHandle *ioh = dclist_head_element(PgAioHandle, node,
     828                 :             :                                                                                            &pgaio_my_backend->in_flight_ios);
     829                 :           0 :                 uint64          generation = ioh->generation;
     830                 :             : 
     831   [ #  #  #  # ]:           0 :                 switch ((PgAioHandleState) ioh->state)
     832                 :             :                 {
     833                 :             :                                 /* should not be in in-flight list */
     834                 :             :                         case PGAIO_HS_IDLE:
     835                 :             :                         case PGAIO_HS_DEFINED:
     836                 :             :                         case PGAIO_HS_HANDED_OUT:
     837                 :             :                         case PGAIO_HS_STAGED:
     838                 :             :                         case PGAIO_HS_COMPLETED_LOCAL:
     839   [ #  #  #  # ]:           0 :                                 elog(ERROR, "shouldn't get here with io:%d in state %d",
     840                 :             :                                          pgaio_io_get_id(ioh), ioh->state);
     841                 :           0 :                                 break;
     842                 :             : 
     843                 :             :                         case PGAIO_HS_COMPLETED_IO:
     844                 :             :                         case PGAIO_HS_SUBMITTED:
     845   [ #  #  #  # ]:           0 :                                 pgaio_debug_io(DEBUG2, ioh,
     846                 :             :                                                            "waiting for free io with %u in flight",
     847                 :             :                                                            dclist_count(&pgaio_my_backend->in_flight_ios));
     848                 :             : 
     849                 :             :                                 /*
     850                 :             :                                  * In a more general case this would be racy, because the
     851                 :             :                                  * generation could increase after we read ioh->state above.
     852                 :             :                                  * But we are only looking at IOs by the current backend and
     853                 :             :                                  * the IO can only be recycled by this backend.  Even this is
     854                 :             :                                  * only OK because we get the handle's generation before
     855                 :             :                                  * potentially processing interrupts, e.g. as part of
     856                 :             :                                  * pgaio_debug_io().
     857                 :             :                                  */
     858                 :           0 :                                 pgaio_io_wait(ioh, generation);
     859                 :           0 :                                 break;
     860                 :             : 
     861                 :             :                         case PGAIO_HS_COMPLETED_SHARED:
     862                 :             : 
     863                 :             :                                 /*
     864                 :             :                                  * It's possible that another backend just finished this IO.
     865                 :             :                                  *
     866                 :             :                                  * Note that no interrupts are processed between the state
     867                 :             :                                  * check and the call to reclaim - that's important as
     868                 :             :                                  * otherwise an interrupt could have already reclaimed the
     869                 :             :                                  * handle.
     870                 :             :                                  *
     871                 :             :                                  * Need to ensure that there's no reordering, in the more
     872                 :             :                                  * common paths, where we wait for IO, that's done by
     873                 :             :                                  * pgaio_io_was_recycled().
     874                 :             :                                  */
     875                 :           0 :                                 pg_read_barrier();
     876                 :           0 :                                 pgaio_io_reclaim(ioh);
     877                 :           0 :                                 break;
     878                 :             :                 }
     879                 :             : 
     880         [ #  # ]:           0 :                 if (dclist_count(&pgaio_my_backend->idle_ios) == 0)
     881   [ #  #  #  # ]:           0 :                         elog(PANIC, "no idle IO after waiting for IO to terminate");
     882                 :             :                 return;
     883                 :           0 :         }
     884                 :           0 : }
     885                 :             : 
     886                 :             : /*
     887                 :             :  * Internal - code outside of AIO should never need this and it'd be hard for
     888                 :             :  * such code to be safe.
     889                 :             :  */
     890                 :             : static PgAioHandle *
     891                 :        7612 : pgaio_io_from_wref(PgAioWaitRef *iow, uint64 *ref_generation)
     892                 :             : {
     893                 :        7612 :         PgAioHandle *ioh;
     894                 :             : 
     895         [ +  - ]:        7612 :         Assert(iow->aio_index < pgaio_ctl->io_handle_count);
     896                 :             : 
     897                 :        7612 :         ioh = &pgaio_ctl->io_handles[iow->aio_index];
     898                 :             : 
     899                 :       15224 :         *ref_generation = ((uint64) iow->generation_upper) << 32 |
     900                 :        7612 :                 iow->generation_lower;
     901                 :             : 
     902         [ +  - ]:        7612 :         Assert(*ref_generation != 0);
     903                 :             : 
     904                 :       15224 :         return ioh;
     905                 :        7612 : }
     906                 :             : 
     907                 :             : static const char *
     908                 :           0 : pgaio_io_state_get_name(PgAioHandleState s)
     909                 :             : {
     910                 :             : #define PGAIO_HS_TOSTR_CASE(sym) case PGAIO_HS_##sym: return #sym
     911   [ #  #  #  #  :           0 :         switch (s)
             #  #  #  #  
                      # ]
     912                 :             :         {
     913                 :           0 :                         PGAIO_HS_TOSTR_CASE(IDLE);
     914                 :           0 :                         PGAIO_HS_TOSTR_CASE(HANDED_OUT);
     915                 :           0 :                         PGAIO_HS_TOSTR_CASE(DEFINED);
     916                 :           0 :                         PGAIO_HS_TOSTR_CASE(STAGED);
     917                 :           0 :                         PGAIO_HS_TOSTR_CASE(SUBMITTED);
     918                 :           0 :                         PGAIO_HS_TOSTR_CASE(COMPLETED_IO);
     919                 :           0 :                         PGAIO_HS_TOSTR_CASE(COMPLETED_SHARED);
     920                 :           0 :                         PGAIO_HS_TOSTR_CASE(COMPLETED_LOCAL);
     921                 :             :         }
     922                 :             : #undef PGAIO_HS_TOSTR_CASE
     923                 :             : 
     924                 :           0 :         return NULL;                            /* silence compiler */
     925                 :           0 : }
     926                 :             : 
     927                 :             : const char *
     928                 :           0 : pgaio_io_get_state_name(PgAioHandle *ioh)
     929                 :             : {
     930                 :           0 :         return pgaio_io_state_get_name(ioh->state);
     931                 :             : }
     932                 :             : 
     933                 :             : const char *
     934                 :           0 : pgaio_result_status_string(PgAioResultStatus rs)
     935                 :             : {
     936   [ #  #  #  #  :           0 :         switch (rs)
                   #  # ]
     937                 :             :         {
     938                 :             :                 case PGAIO_RS_UNKNOWN:
     939                 :           0 :                         return "UNKNOWN";
     940                 :             :                 case PGAIO_RS_OK:
     941                 :           0 :                         return "OK";
     942                 :             :                 case PGAIO_RS_WARNING:
     943                 :           0 :                         return "WARNING";
     944                 :             :                 case PGAIO_RS_PARTIAL:
     945                 :           0 :                         return "PARTIAL";
     946                 :             :                 case PGAIO_RS_ERROR:
     947                 :           0 :                         return "ERROR";
     948                 :             :         }
     949                 :             : 
     950                 :           0 :         return NULL;                            /* silence compiler */
     951                 :           0 : }
     952                 :             : 
     953                 :             : 
     954                 :             : 
     955                 :             : /* --------------------------------------------------------------------------------
     956                 :             :  * Functions primarily related to IO Wait References
     957                 :             :  * --------------------------------------------------------------------------------
     958                 :             :  */
     959                 :             : 
     960                 :             : /*
     961                 :             :  * Mark a wait reference as invalid
     962                 :             :  */
     963                 :             : void
     964                 :      176647 : pgaio_wref_clear(PgAioWaitRef *iow)
     965                 :             : {
     966                 :      176647 :         iow->aio_index = PG_UINT32_MAX;
     967                 :      176647 : }
     968                 :             : 
     969                 :             : /* Is the wait reference valid? */
     970                 :             : bool
     971                 :       35727 : pgaio_wref_valid(PgAioWaitRef *iow)
     972                 :             : {
     973                 :       35727 :         return iow->aio_index != PG_UINT32_MAX;
     974                 :             : }
     975                 :             : 
     976                 :             : /*
     977                 :             :  * Similar to pgaio_io_get_id(), just for wait references.
     978                 :             :  */
     979                 :             : int
     980                 :           0 : pgaio_wref_get_id(PgAioWaitRef *iow)
     981                 :             : {
     982         [ #  # ]:           0 :         Assert(pgaio_wref_valid(iow));
     983                 :           0 :         return iow->aio_index;
     984                 :             : }
     985                 :             : 
     986                 :             : /*
     987                 :             :  * Wait for the IO to have completed. Can be called in any process, not just
     988                 :             :  * in the issuing backend.
     989                 :             :  */
     990                 :             : void
     991                 :         593 : pgaio_wref_wait(PgAioWaitRef *iow)
     992                 :             : {
     993                 :         593 :         uint64          ref_generation;
     994                 :         593 :         PgAioHandle *ioh;
     995                 :             : 
     996                 :         593 :         ioh = pgaio_io_from_wref(iow, &ref_generation);
     997                 :             : 
     998                 :         593 :         pgaio_io_wait(ioh, ref_generation);
     999                 :         593 : }
    1000                 :             : 
    1001                 :             : /*
    1002                 :             :  * Check if the referenced IO completed, without blocking.
    1003                 :             :  */
    1004                 :             : bool
    1005                 :        7019 : pgaio_wref_check_done(PgAioWaitRef *iow)
    1006                 :             : {
    1007                 :        7019 :         uint64          ref_generation;
    1008                 :        7019 :         PgAioHandleState state;
    1009                 :        7019 :         bool            am_owner;
    1010                 :        7019 :         PgAioHandle *ioh;
    1011                 :             : 
    1012                 :        7019 :         ioh = pgaio_io_from_wref(iow, &ref_generation);
    1013                 :             : 
    1014         [ +  + ]:        7019 :         if (pgaio_io_was_recycled(ioh, ref_generation, &state))
    1015                 :        6361 :                 return true;
    1016                 :             : 
    1017         [ +  - ]:         658 :         if (state == PGAIO_HS_IDLE)
    1018                 :           0 :                 return true;
    1019                 :             : 
    1020                 :         658 :         am_owner = ioh->owner_procno == MyProcNumber;
    1021                 :             : 
    1022   [ +  +  -  + ]:         658 :         if (state == PGAIO_HS_COMPLETED_SHARED ||
    1023                 :         593 :                 state == PGAIO_HS_COMPLETED_LOCAL)
    1024                 :             :         {
    1025                 :             :                 /*
    1026                 :             :                  * Note that no interrupts are processed between
    1027                 :             :                  * pgaio_io_was_recycled() and this check - that's important as
    1028                 :             :                  * otherwise an interrupt could have already reclaimed the handle.
    1029                 :             :                  */
    1030         [ -  + ]:          65 :                 if (am_owner)
    1031                 :          65 :                         pgaio_io_reclaim(ioh);
    1032                 :          65 :                 return true;
    1033                 :             :         }
    1034                 :             : 
    1035                 :             :         /*
    1036                 :             :          * XXX: It likely would be worth checking in with the io method, to give
    1037                 :             :          * the IO method a chance to check if there are completion events queued.
    1038                 :             :          */
    1039                 :             : 
    1040                 :         593 :         return false;
    1041                 :        7019 : }
    1042                 :             : 
    1043                 :             : 
    1044                 :             : 
    1045                 :             : /* --------------------------------------------------------------------------------
    1046                 :             :  * Actions on multiple IOs.
    1047                 :             :  * --------------------------------------------------------------------------------
    1048                 :             :  */
    1049                 :             : 
    1050                 :             : /*
    1051                 :             :  * Submit IOs in batches going forward.
    1052                 :             :  *
    1053                 :             :  * Submitting multiple IOs at once can be substantially faster than doing so
    1054                 :             :  * one-by-one. At the same time, submitting multiple IOs at once requires more
    1055                 :             :  * care to avoid deadlocks.
    1056                 :             :  *
    1057                 :             :  * Consider backend A staging an IO for buffer 1 and then trying to start IO
    1058                 :             :  * on buffer 2, while backend B does the inverse. If A submitted the IO before
    1059                 :             :  * moving on to buffer 2, this works just fine, B will wait for the IO to
    1060                 :             :  * complete. But if batching were used, each backend will wait for IO that has
    1061                 :             :  * not yet been submitted to complete, i.e. forever.
    1062                 :             :  *
    1063                 :             :  * End batch submission mode with pgaio_exit_batchmode().  (Throwing errors is
    1064                 :             :  * allowed; error recovery will end the batch.)
    1065                 :             :  *
    1066                 :             :  * To avoid deadlocks, code needs to ensure that it will not wait for another
    1067                 :             :  * backend while there is unsubmitted IO. E.g. by using conditional lock
    1068                 :             :  * acquisition when acquiring buffer locks. To check if there currently are
    1069                 :             :  * staged IOs, call pgaio_have_staged() and to submit all staged IOs call
    1070                 :             :  * pgaio_submit_staged().
    1071                 :             :  *
    1072                 :             :  * It is not allowed to enter batchmode while already in batchmode, it's
    1073                 :             :  * unlikely to ever be needed, as code needs to be explicitly aware of being
    1074                 :             :  * called in batchmode, to avoid the deadlock risks explained above.
    1075                 :             :  *
    1076                 :             :  * Note that IOs may get submitted before pgaio_exit_batchmode() is called,
    1077                 :             :  * e.g. because too many IOs have been staged or because pgaio_submit_staged()
    1078                 :             :  * was called.
    1079                 :             :  */
    1080                 :             : void
    1081                 :      735174 : pgaio_enter_batchmode(void)
    1082                 :             : {
    1083         [ +  - ]:      735174 :         if (pgaio_my_backend->in_batchmode)
    1084   [ #  #  #  # ]:           0 :                 elog(ERROR, "starting batch while batch already in progress");
    1085                 :      735174 :         pgaio_my_backend->in_batchmode = true;
    1086                 :      735174 : }
    1087                 :             : 
    1088                 :             : /*
    1089                 :             :  * Stop submitting IOs in batches.
    1090                 :             :  */
    1091                 :             : void
    1092                 :      735172 : pgaio_exit_batchmode(void)
    1093                 :             : {
    1094         [ +  - ]:      735172 :         Assert(pgaio_my_backend->in_batchmode);
    1095                 :             : 
    1096                 :      735172 :         pgaio_submit_staged();
    1097                 :      735172 :         pgaio_my_backend->in_batchmode = false;
    1098                 :      735172 : }
    1099                 :             : 
    1100                 :             : /*
    1101                 :             :  * Are there staged but unsubmitted IOs?
    1102                 :             :  *
    1103                 :             :  * See comment above pgaio_enter_batchmode() for why code may need to check if
    1104                 :             :  * there is IO in that state.
    1105                 :             :  */
    1106                 :             : bool
    1107                 :        7053 : pgaio_have_staged(void)
    1108                 :             : {
    1109   [ +  +  +  - ]:        7053 :         Assert(pgaio_my_backend->in_batchmode ||
    1110                 :             :                    pgaio_my_backend->num_staged_ios == 0);
    1111                 :        7053 :         return pgaio_my_backend->num_staged_ios > 0;
    1112                 :             : }
    1113                 :             : 
    1114                 :             : /*
    1115                 :             :  * Submit all staged but not yet submitted IOs.
    1116                 :             :  *
    1117                 :             :  * Unless in batch mode, this never needs to be called, as IOs get submitted
    1118                 :             :  * as soon as possible. While in batchmode pgaio_submit_staged() can be called
    1119                 :             :  * before waiting on another backend, to avoid the risk of deadlocks. See
    1120                 :             :  * pgaio_enter_batchmode().
    1121                 :             :  */
    1122                 :             : void
    1123                 :      735181 : pgaio_submit_staged(void)
    1124                 :             : {
    1125                 :      735181 :         int                     total_submitted = 0;
    1126                 :      735181 :         int                     did_submit;
    1127                 :             : 
    1128         [ +  + ]:      735181 :         if (pgaio_my_backend->num_staged_ios == 0)
    1129                 :      734527 :                 return;
    1130                 :             : 
    1131                 :             : 
    1132                 :         654 :         START_CRIT_SECTION();
    1133                 :             : 
    1134                 :        1308 :         did_submit = pgaio_method_ops->submit(pgaio_my_backend->num_staged_ios,
    1135                 :         654 :                                                                                   pgaio_my_backend->staged_ios);
    1136                 :             : 
    1137         [ +  - ]:         654 :         END_CRIT_SECTION();
    1138                 :             : 
    1139                 :         654 :         total_submitted += did_submit;
    1140                 :             : 
    1141         [ +  - ]:         654 :         Assert(total_submitted == did_submit);
    1142                 :             : 
    1143                 :         654 :         pgaio_my_backend->num_staged_ios = 0;
    1144                 :             : 
    1145   [ -  +  -  + ]:         654 :         pgaio_debug(DEBUG4,
    1146                 :             :                                 "aio: submitted %d IOs",
    1147                 :             :                                 total_submitted);
    1148         [ -  + ]:      735181 : }
    1149                 :             : 
    1150                 :             : 
    1151                 :             : 
    1152                 :             : /* --------------------------------------------------------------------------------
    1153                 :             :  * Other
    1154                 :             :  * --------------------------------------------------------------------------------
    1155                 :             :  */
    1156                 :             : 
    1157                 :             : 
    1158                 :             : /*
    1159                 :             :  * Perform AIO related cleanup after an error.
    1160                 :             :  *
    1161                 :             :  * This should be called early in the error recovery paths, as later steps may
    1162                 :             :  * need to issue AIO (e.g. to record a transaction abort WAL record).
    1163                 :             :  */
    1164                 :             : void
    1165                 :        8199 : pgaio_error_cleanup(void)
    1166                 :             : {
    1167                 :             :         /*
    1168                 :             :          * It is possible that code errored out after pgaio_enter_batchmode() but
    1169                 :             :          * before pgaio_exit_batchmode() was called. In that case we need to
    1170                 :             :          * submit the IO now.
    1171                 :             :          */
    1172         [ +  + ]:        8199 :         if (pgaio_my_backend->in_batchmode)
    1173                 :             :         {
    1174                 :           2 :                 pgaio_my_backend->in_batchmode = false;
    1175                 :             : 
    1176                 :           2 :                 pgaio_submit_staged();
    1177                 :           2 :         }
    1178                 :             : 
    1179                 :             :         /*
    1180                 :             :          * As we aren't in batchmode, there shouldn't be any unsubmitted IOs.
    1181                 :             :          */
    1182         [ +  - ]:        8199 :         Assert(pgaio_my_backend->num_staged_ios == 0);
    1183                 :        8199 : }
    1184                 :             : 
    1185                 :             : /*
    1186                 :             :  * Perform AIO related checks at (sub-)transactional boundaries.
    1187                 :             :  *
    1188                 :             :  * This should be called late during (sub-)transactional commit/abort, after
    1189                 :             :  * all steps that might need to perform AIO, so that we can verify that the
    1190                 :             :  * AIO subsystem is in a valid state at the end of a transaction.
    1191                 :             :  */
    1192                 :             : void
    1193                 :       59900 : AtEOXact_Aio(bool is_commit)
    1194                 :             : {
    1195                 :             :         /*
    1196                 :             :          * We should never be in batch mode at transactional boundaries. In case
    1197                 :             :          * an error was thrown while in batch mode, pgaio_error_cleanup() should
    1198                 :             :          * have exited batchmode.
    1199                 :             :          *
    1200                 :             :          * In case we are in batchmode somehow, make sure to submit all staged
    1201                 :             :          * IOs, other backends may need them to complete to continue.
    1202                 :             :          */
    1203         [ +  - ]:       59900 :         if (pgaio_my_backend->in_batchmode)
    1204                 :             :         {
    1205                 :           0 :                 pgaio_error_cleanup();
    1206   [ #  #  #  # ]:           0 :                 elog(WARNING, "open AIO batch at end of (sub-)transaction");
    1207                 :           0 :         }
    1208                 :             : 
    1209                 :             :         /*
    1210                 :             :          * As we aren't in batchmode, there shouldn't be any unsubmitted IOs.
    1211                 :             :          */
    1212         [ +  - ]:       59900 :         Assert(pgaio_my_backend->num_staged_ios == 0);
    1213                 :       59900 : }
    1214                 :             : 
    1215                 :             : /*
    1216                 :             :  * Need to submit staged but not yet submitted IOs using the fd, otherwise
    1217                 :             :  * the IO would end up targeting something bogus.
    1218                 :             :  */
    1219                 :             : void
    1220                 :       49005 : pgaio_closing_fd(int fd)
    1221                 :             : {
    1222                 :             :         /*
    1223                 :             :          * Might be called before AIO is initialized or in a subprocess that
    1224                 :             :          * doesn't use AIO.
    1225                 :             :          */
    1226         [ +  + ]:       49005 :         if (!pgaio_my_backend)
    1227                 :          14 :                 return;
    1228                 :             : 
    1229                 :             :         /*
    1230                 :             :          * For now just submit all staged IOs - we could be more selective, but
    1231                 :             :          * it's probably not worth it.
    1232                 :             :          */
    1233         [ +  - ]:       48991 :         if (pgaio_my_backend->num_staged_ios > 0)
    1234                 :             :         {
    1235   [ #  #  #  # ]:           0 :                 pgaio_debug(DEBUG2,
    1236                 :             :                                         "submitting %d IOs before FD %d gets closed",
    1237                 :             :                                         pgaio_my_backend->num_staged_ios, fd);
    1238                 :           0 :                 pgaio_submit_staged();
    1239                 :           0 :         }
    1240                 :             : 
    1241                 :             :         /*
    1242                 :             :          * If requested by the IO method, wait for all IOs that use the
    1243                 :             :          * to-be-closed FD.
    1244                 :             :          */
    1245         [ +  - ]:       48991 :         if (pgaio_method_ops->wait_on_fd_before_close)
    1246                 :             :         {
    1247                 :             :                 /*
    1248                 :             :                  * As waiting for one IO to complete may complete multiple IOs, we
    1249                 :             :                  * can't just use a mutable list iterator. The maximum number of
    1250                 :             :                  * in-flight IOs is fairly small, so just restart the loop after
    1251                 :             :                  * waiting for an IO.
    1252                 :             :                  */
    1253         [ #  # ]:           0 :                 while (!dclist_is_empty(&pgaio_my_backend->in_flight_ios))
    1254                 :             :                 {
    1255                 :           0 :                         dlist_iter      iter;
    1256                 :           0 :                         PgAioHandle *ioh = NULL;
    1257                 :           0 :                         uint64          generation;
    1258                 :             : 
    1259   [ #  #  #  # ]:           0 :                         dclist_foreach(iter, &pgaio_my_backend->in_flight_ios)
    1260                 :             :                         {
    1261                 :           0 :                                 ioh = dclist_container(PgAioHandle, node, iter.cur);
    1262                 :             : 
    1263                 :           0 :                                 generation = ioh->generation;
    1264                 :             : 
    1265         [ #  # ]:           0 :                                 if (pgaio_io_uses_fd(ioh, fd))
    1266                 :           0 :                                         break;
    1267                 :             :                                 else
    1268                 :           0 :                                         ioh = NULL;
    1269                 :           0 :                         }
    1270                 :             : 
    1271         [ #  # ]:           0 :                         if (!ioh)
    1272                 :           0 :                                 break;
    1273                 :             : 
    1274   [ #  #  #  # ]:           0 :                         pgaio_debug_io(DEBUG2, ioh,
    1275                 :             :                                                    "waiting for IO before FD %d gets closed, %u in-flight IOs",
    1276                 :             :                                                    fd, dclist_count(&pgaio_my_backend->in_flight_ios));
    1277                 :             : 
    1278                 :             :                         /* see comment in pgaio_io_wait_for_free() about raciness */
    1279                 :           0 :                         pgaio_io_wait(ioh, generation);
    1280      [ #  #  # ]:           0 :                 }
    1281                 :           0 :         }
    1282                 :       49005 : }
    1283                 :             : 
    1284                 :             : /*
    1285                 :             :  * Registered as before_shmem_exit() callback in pgaio_init_backend()
                         :             :  *
                         :             :  * Tears down this backend's AIO state at process exit:
                         :             :  *
                         :             :  * 1. Asserts that pgaio_my_backend is set and that no IO handle is
                         :             :  *    currently handed out.
                         :             :  * 2. Calls AtEOXact_Aio(code == 0), i.e. the argument is true only for
                         :             :  *    exit code 0 (presumably "treat a clean exit like a commit" -- confirm
                         :             :  *    against AtEOXact_Aio()).
                         :             :  * 3. Waits, one at a time, for whatever IO is at the head of
                         :             :  *    in_flight_ios until that list is empty.
                         :             :  * 4. Resets pgaio_my_backend to NULL.
                         :             :  *
                         :             :  * code: exit code handed to the callback; only compared against 0 here.
                         :             :  * arg: not referenced in this function.
                         :             :  *
                         :             :  * NOTE(review): the wait loop only terminates if pgaio_io_wait() ends up
                         :             :  * removing the waited-for IO from in_flight_ios; that is not visible here.
    1286                 :             :  */
    1287                 :             : void
    1288                 :         803 : pgaio_shutdown(int code, Datum arg)
    1289                 :             : {
    1290         [ +  - ]:         803 :         Assert(pgaio_my_backend);
    1291         [ +  - ]:         803 :         Assert(!pgaio_my_backend->handed_out_io);
    1292                 :             : 
    1293                 :             :         /* first clean up resources as we would at a transaction boundary */
    1294                 :         803 :         AtEOXact_Aio(code == 0);
    1295                 :             : 
    1296                 :             :         /*
    1297                 :             :          * Before exiting, make sure that all IOs are finished. That has two main
    1298                 :             :          * purposes:
    1299                 :             :          *
    1300                 :             :          * - Some kernel-level AIO mechanisms don't deal well with the issuer of
    1301                 :             :          * an AIO exiting before IO completed
    1302                 :             :          *
    1303                 :             :          * - It'd be confusing to see partially finished IOs in stats views etc
                         :             :          *
                         :             :          * The head of the list is re-read on every iteration rather than
                         :             :          * walking the list with an iterator.
    1304                 :             :          */
    1305         [ -  + ]:         803 :         while (!dclist_is_empty(&pgaio_my_backend->in_flight_ios))
    1306                 :             :         {
    1307                 :           0 :                 PgAioHandle *ioh = dclist_head_element(PgAioHandle, node, &pgaio_my_backend->in_flight_ios);
    1308                 :           0 :                 uint64          generation = ioh->generation;
    1309                 :             : 
    1310   [ #  #  #  # ]:           0 :                 pgaio_debug_io(DEBUG2, ioh,
    1311                 :             :                                            "waiting for IO to complete during shutdown, %u in-flight IOs",
    1312                 :             :                                            dclist_count(&pgaio_my_backend->in_flight_ios));
    1313                 :             : 
    1314                 :             :                 /* see comment in pgaio_io_wait_for_free() about raciness */
    1315                 :           0 :                 pgaio_io_wait(ioh, generation);
    1316                 :           0 :         }
    1317                 :             : 
    1318                 :         803 :         pgaio_my_backend = NULL;
    1319                 :         803 : }
    1320                 :             : /*
                         :             :  * Make the IO method identified by newval the active one.
                         :             :  *
                         :             :  * newval is used as an index into pgaio_method_ops_table and the entry
                         :             :  * found there is stored in pgaio_method_ops. Presumably this is the GUC
                         :             :  * assign hook for the io_method setting -- confirm against the GUC
                         :             :  * definition.
                         :             :  *
                         :             :  * newval: index of the method; asserted to be below the table length and
                         :             :  *         to refer to a non-NULL entry. These are Asserts only, so nothing
                         :             :  *         in this function validates newval when assertions are disabled.
                         :             :  * extra: not referenced in this function.
                         :             :  */
    1321                 :             : void
    1322                 :           6 : assign_io_method(int newval, void *extra)
    1323                 :             : {
    1324         [ +  - ]:           6 :         Assert(newval < lengthof(pgaio_method_ops_table));
    1325         [ +  - ]:           6 :         Assert(pgaio_method_ops_table[newval] != NULL);
    1326                 :             : 
    1327                 :           6 :         pgaio_method_ops = pgaio_method_ops_table[newval];
    1328                 :           6 : }
    1329                 :             : /*
                         :             :  * Validate a proposed value, presumably for the io_max_concurrency
                         :             :  * setting (the function reports through GUC_check_errdetail(), so it
                         :             :  * looks like a GUC check hook -- confirm against the GUC definition).
                         :             :  *
                         :             :  * newval: pointer to the proposed value; read but never modified here.
                         :             :  * extra, source: not referenced in this function.
                         :             :  *
                         :             :  * Returns true if *newval is -1 or any value other than 0. Returns false,
                         :             :  * after setting an error detail message, if *newval is 0.
                         :             :  *
                         :             :  * NOTE(review): values below -1 are not rejected here; presumably the
                         :             :  * setting's declared minimum already rules them out -- verify.
                         :             :  */
    1330                 :             : bool
    1331                 :          12 : check_io_max_concurrency(int *newval, void **extra, GucSource source)
    1332                 :             : {
    1333         [ +  + ]:          12 :         if (*newval == -1)
    1334                 :             :         {
    1335                 :             :                 /*
    1336                 :             :                  * Auto-tuning will be applied later during startup, as auto-tuning
    1337                 :             :                  * depends on the value of various GUCs.
                         :             :                  *
                         :             :                  * So -1 is accepted as-is here and left untouched.
    1338                 :             :                  */
    1339                 :           6 :                 return true;
    1340                 :             :         }
    1341         [ +  - ]:           6 :         else if (*newval == 0)
    1342                 :             :         {
    1343                 :           0 :                 GUC_check_errdetail("Only -1 or values bigger than 0 are valid.");
    1344                 :           0 :                 return false;
    1345                 :             :         }
    1346                 :             : 
    1347                 :           6 :         return true;
    1348                 :          12 : }
        

Generated by: LCOV version 2.3.2-1