LCOV - code coverage report
Current view: top level - contrib/pg_buffercache - pg_buffercache_pages.c (source / functions)
Test:      Code coverage
Test Date: 2026-01-26 10:56:24

             Coverage    Total    Hit
Lines:          0.0 %      440      0
Functions:      0.0 %       25      0

            Line data    Source code
       1              : /*-------------------------------------------------------------------------
       2              :  *
       3              :  * pg_buffercache_pages.c
       4              :  *        display some contents of the buffer cache
       5              :  *
       6              :  *        contrib/pg_buffercache/pg_buffercache_pages.c
       7              :  *-------------------------------------------------------------------------
       8              :  */
       9              : #include "postgres.h"
      10              : 
      11              : #include "access/htup_details.h"
      12              : #include "access/relation.h"
      13              : #include "catalog/pg_type.h"
      14              : #include "funcapi.h"
      15              : #include "port/pg_numa.h"
      16              : #include "storage/buf_internals.h"
      17              : #include "storage/bufmgr.h"
      18              : #include "utils/rel.h"
      19              : 
      20              : 
      21              : #define NUM_BUFFERCACHE_PAGES_MIN_ELEM  8
      22              : #define NUM_BUFFERCACHE_PAGES_ELEM      9
      23              : #define NUM_BUFFERCACHE_SUMMARY_ELEM 5
      24              : #define NUM_BUFFERCACHE_USAGE_COUNTS_ELEM 4
      25              : #define NUM_BUFFERCACHE_EVICT_ELEM 2
      26              : #define NUM_BUFFERCACHE_EVICT_RELATION_ELEM 3
      27              : #define NUM_BUFFERCACHE_EVICT_ALL_ELEM 3
      28              : #define NUM_BUFFERCACHE_MARK_DIRTY_ELEM 2
      29              : #define NUM_BUFFERCACHE_MARK_DIRTY_RELATION_ELEM 3
      30              : #define NUM_BUFFERCACHE_MARK_DIRTY_ALL_ELEM 3
      31              : 
      32              : #define NUM_BUFFERCACHE_OS_PAGES_ELEM   3
      33              : 
      34            0 : PG_MODULE_MAGIC_EXT(
      35              :                                         .name = "pg_buffercache",
      36              :                                         .version = PG_VERSION
      37              : );
      38              : 
      39              : /*
       40              :  * Record structure holding the buffer cache data to be exposed.
      41              :  */
      42              : typedef struct
      43              : {
      44              :         uint32          bufferid;
      45              :         RelFileNumber relfilenumber;
      46              :         Oid                     reltablespace;
      47              :         Oid                     reldatabase;
      48              :         ForkNumber      forknum;
      49              :         BlockNumber blocknum;
      50              :         bool            isvalid;
      51              :         bool            isdirty;
      52              :         uint16          usagecount;
      53              : 
      54              :         /*
      55              :          * An int32 is sufficiently large, as MAX_BACKENDS prevents a buffer from
      56              :          * being pinned by too many backends and each backend will only pin once
      57              :          * because of bufmgr.c's PrivateRefCount infrastructure.
      58              :          */
      59              :         int32           pinning_backends;
      60              : } BufferCachePagesRec;
      61              : 
      62              : 
      63              : /*
      64              :  * Function context for data persisting over repeated calls.
      65              :  */
      66              : typedef struct
      67              : {
      68              :         TupleDesc       tupdesc;
      69              :         BufferCachePagesRec *record;
      70              : } BufferCachePagesContext;
      71              : 
      72              : /*
       73              :  * Record structure holding the OS page data to be exposed.  This
      74              :  * structure is used by pg_buffercache_os_pages(), where NUMA information may
      75              :  * or may not be included.
      76              :  */
      77              : typedef struct
      78              : {
      79              :         uint32          bufferid;
      80              :         int64           page_num;
      81              :         int32           numa_node;
      82              : } BufferCacheOsPagesRec;
      83              : 
      84              : /*
      85              :  * Function context for data persisting over repeated calls.
      86              :  */
      87              : typedef struct
      88              : {
      89              :         TupleDesc       tupdesc;
      90              :         bool            include_numa;
      91              :         BufferCacheOsPagesRec *record;
      92              : } BufferCacheOsPagesContext;
      93              : 
      94              : 
      95              : /*
      96              :  * Function returning data from the shared buffer cache - buffer number,
      97              :  * relation node/tablespace/database/blocknum and dirty indicator.
      98              :  */
      99            0 : PG_FUNCTION_INFO_V1(pg_buffercache_pages);
     100            0 : PG_FUNCTION_INFO_V1(pg_buffercache_os_pages);
     101            0 : PG_FUNCTION_INFO_V1(pg_buffercache_numa_pages);
     102            0 : PG_FUNCTION_INFO_V1(pg_buffercache_summary);
     103            0 : PG_FUNCTION_INFO_V1(pg_buffercache_usage_counts);
     104            0 : PG_FUNCTION_INFO_V1(pg_buffercache_evict);
     105            0 : PG_FUNCTION_INFO_V1(pg_buffercache_evict_relation);
     106            0 : PG_FUNCTION_INFO_V1(pg_buffercache_evict_all);
     107            0 : PG_FUNCTION_INFO_V1(pg_buffercache_mark_dirty);
     108            0 : PG_FUNCTION_INFO_V1(pg_buffercache_mark_dirty_relation);
     109            0 : PG_FUNCTION_INFO_V1(pg_buffercache_mark_dirty_all);
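
/*
 * Editorial note: each PG_FUNCTION_INFO_V1() entry above is the C-side entry
 * point of a version-1 fmgr function; the matching SQL-level
 * CREATE FUNCTION ... LANGUAGE C declarations live in the extension's
 * install/upgrade SQL scripts, not in this file.
 */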
     110              : 
     111              : 
     112              : /* Only need to touch memory once per backend process lifetime */
     113              : static bool firstNumaTouch = true;
     114              : 
     115              : 
     116              : Datum
     117            0 : pg_buffercache_pages(PG_FUNCTION_ARGS)
     118              : {
     119            0 :         FuncCallContext *funcctx;
     120            0 :         Datum           result;
     121            0 :         MemoryContext oldcontext;
     122            0 :         BufferCachePagesContext *fctx;  /* User function context. */
     123            0 :         TupleDesc       tupledesc;
     124            0 :         TupleDesc       expected_tupledesc;
     125            0 :         HeapTuple       tuple;
     126              : 
     127            0 :         if (SRF_IS_FIRSTCALL())
     128              :         {
     129            0 :                 int                     i;
     130              : 
     131            0 :                 funcctx = SRF_FIRSTCALL_INIT();
     132              : 
     133              :                 /* Switch context when allocating stuff to be used in later calls */
     134            0 :                 oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx);
     135              : 
     136              :                 /* Create a user function context for cross-call persistence */
     137            0 :                 fctx = palloc_object(BufferCachePagesContext);
     138              : 
     139              :                 /*
      140              :                  * To smoothly support upgrades from version 1.0 of this extension,
      141              :                  * transparently handle the (non-)existence of the pinning_backends
      142              :                  * column.  Unfortunately we have to look up the call's result type
      143              :                  * for that: we can't simply use the result type implied by the current
      144              :                  * function definition, since that could crash if somebody is still
      145              :                  * using the old (or even a wrong) definition.
     146              :                  */
     147            0 :                 if (get_call_result_type(fcinfo, NULL, &expected_tupledesc) != TYPEFUNC_COMPOSITE)
     148            0 :                         elog(ERROR, "return type must be a row type");
     149              : 
     150            0 :                 if (expected_tupledesc->natts < NUM_BUFFERCACHE_PAGES_MIN_ELEM ||
     151            0 :                         expected_tupledesc->natts > NUM_BUFFERCACHE_PAGES_ELEM)
     152            0 :                         elog(ERROR, "incorrect number of output arguments");
     153              : 
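                /*
                 * Editorial note: version 1.0 of the extension declared eight output
                 * columns (no pinning_backends), while the current definition
                 * declares nine; accepting anything between
                 * NUM_BUFFERCACHE_PAGES_MIN_ELEM and NUM_BUFFERCACHE_PAGES_ELEM
                 * lets both SQL definitions keep working.
                 */
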
     154              :                 /* Construct a tuple descriptor for the result rows. */
     155            0 :                 tupledesc = CreateTemplateTupleDesc(expected_tupledesc->natts);
     156            0 :                 TupleDescInitEntry(tupledesc, (AttrNumber) 1, "bufferid",
     157              :                                                    INT4OID, -1, 0);
     158            0 :                 TupleDescInitEntry(tupledesc, (AttrNumber) 2, "relfilenode",
     159              :                                                    OIDOID, -1, 0);
     160            0 :                 TupleDescInitEntry(tupledesc, (AttrNumber) 3, "reltablespace",
     161              :                                                    OIDOID, -1, 0);
     162            0 :                 TupleDescInitEntry(tupledesc, (AttrNumber) 4, "reldatabase",
     163              :                                                    OIDOID, -1, 0);
     164            0 :                 TupleDescInitEntry(tupledesc, (AttrNumber) 5, "relforknumber",
     165              :                                                    INT2OID, -1, 0);
     166            0 :                 TupleDescInitEntry(tupledesc, (AttrNumber) 6, "relblocknumber",
     167              :                                                    INT8OID, -1, 0);
     168            0 :                 TupleDescInitEntry(tupledesc, (AttrNumber) 7, "isdirty",
     169              :                                                    BOOLOID, -1, 0);
     170            0 :                 TupleDescInitEntry(tupledesc, (AttrNumber) 8, "usage_count",
     171              :                                                    INT2OID, -1, 0);
     172              : 
     173            0 :                 if (expected_tupledesc->natts == NUM_BUFFERCACHE_PAGES_ELEM)
     174            0 :                         TupleDescInitEntry(tupledesc, (AttrNumber) 9, "pinning_backends",
     175              :                                                            INT4OID, -1, 0);
     176              : 
     177            0 :                 fctx->tupdesc = BlessTupleDesc(tupledesc);
     178              : 
     179              :                 /* Allocate NBuffers worth of BufferCachePagesRec records. */
     180            0 :                 fctx->record = (BufferCachePagesRec *)
     181            0 :                         MemoryContextAllocHuge(CurrentMemoryContext,
     182            0 :                                                                    sizeof(BufferCachePagesRec) * NBuffers);
     183              : 
     184              :                 /* Set max calls and remember the user function context. */
     185            0 :                 funcctx->max_calls = NBuffers;
     186            0 :                 funcctx->user_fctx = fctx;
     187              : 
     188              :                 /* Return to original context when allocating transient memory */
     189            0 :                 MemoryContextSwitchTo(oldcontext);
     190              : 
     191              :                 /*
     192              :                  * Scan through all the buffers, saving the relevant fields in the
     193              :                  * fctx->record structure.
     194              :                  *
     195              :                  * We don't hold the partition locks, so we don't get a consistent
     196              :                  * snapshot across all buffers, but we do grab the buffer header
     197              :                  * locks, so the information of each buffer is self-consistent.
     198              :                  */
     199            0 :                 for (i = 0; i < NBuffers; i++)
     200              :                 {
     201            0 :                         BufferDesc *bufHdr;
     202            0 :                         uint64          buf_state;
     203              : 
     204            0 :                         CHECK_FOR_INTERRUPTS();
     205              : 
     206            0 :                         bufHdr = GetBufferDescriptor(i);
     207              :                         /* Lock each buffer header before inspecting. */
     208            0 :                         buf_state = LockBufHdr(bufHdr);
     209              : 
     210            0 :                         fctx->record[i].bufferid = BufferDescriptorGetBuffer(bufHdr);
     211            0 :                         fctx->record[i].relfilenumber = BufTagGetRelNumber(&bufHdr->tag);
     212            0 :                         fctx->record[i].reltablespace = bufHdr->tag.spcOid;
     213            0 :                         fctx->record[i].reldatabase = bufHdr->tag.dbOid;
     214            0 :                         fctx->record[i].forknum = BufTagGetForkNum(&bufHdr->tag);
     215            0 :                         fctx->record[i].blocknum = bufHdr->tag.blockNum;
     216            0 :                         fctx->record[i].usagecount = BUF_STATE_GET_USAGECOUNT(buf_state);
     217            0 :                         fctx->record[i].pinning_backends = BUF_STATE_GET_REFCOUNT(buf_state);
     218              : 
     219            0 :                         if (buf_state & BM_DIRTY)
     220            0 :                                 fctx->record[i].isdirty = true;
     221              :                         else
     222            0 :                                 fctx->record[i].isdirty = false;
     223              : 
     224              :                         /* Note if the buffer is valid, and has storage created */
     225            0 :                         if ((buf_state & BM_VALID) && (buf_state & BM_TAG_VALID))
     226            0 :                                 fctx->record[i].isvalid = true;
     227              :                         else
     228            0 :                                 fctx->record[i].isvalid = false;
     229              : 
     230            0 :                         UnlockBufHdr(bufHdr);
     231            0 :                 }
     232            0 :         }
     233              : 
     234            0 :         funcctx = SRF_PERCALL_SETUP();
     235              : 
     236              :         /* Get the saved state */
     237            0 :         fctx = funcctx->user_fctx;
     238              : 
     239            0 :         if (funcctx->call_cntr < funcctx->max_calls)
     240              :         {
     241            0 :                 uint32          i = funcctx->call_cntr;
     242            0 :                 Datum           values[NUM_BUFFERCACHE_PAGES_ELEM];
     243            0 :                 bool            nulls[NUM_BUFFERCACHE_PAGES_ELEM];
     244              : 
     245            0 :                 values[0] = Int32GetDatum(fctx->record[i].bufferid);
     246            0 :                 nulls[0] = false;
     247              : 
     248              :                 /*
     249              :                  * Set all fields except the bufferid to null if the buffer is unused
     250              :                  * or not valid.
     251              :                  */
     252            0 :                 if (fctx->record[i].blocknum == InvalidBlockNumber ||
     253            0 :                         fctx->record[i].isvalid == false)
     254              :                 {
     255            0 :                         nulls[1] = true;
     256            0 :                         nulls[2] = true;
     257            0 :                         nulls[3] = true;
     258            0 :                         nulls[4] = true;
     259            0 :                         nulls[5] = true;
     260            0 :                         nulls[6] = true;
     261            0 :                         nulls[7] = true;
     262              :                         /* unused for v1.0 callers, but the array is always long enough */
     263            0 :                         nulls[8] = true;
     264            0 :                 }
     265              :                 else
     266              :                 {
     267            0 :                         values[1] = ObjectIdGetDatum(fctx->record[i].relfilenumber);
     268            0 :                         nulls[1] = false;
     269            0 :                         values[2] = ObjectIdGetDatum(fctx->record[i].reltablespace);
     270            0 :                         nulls[2] = false;
     271            0 :                         values[3] = ObjectIdGetDatum(fctx->record[i].reldatabase);
     272            0 :                         nulls[3] = false;
     273            0 :                         values[4] = Int16GetDatum(fctx->record[i].forknum);
     274            0 :                         nulls[4] = false;
     275            0 :                         values[5] = Int64GetDatum((int64) fctx->record[i].blocknum);
     276            0 :                         nulls[5] = false;
     277            0 :                         values[6] = BoolGetDatum(fctx->record[i].isdirty);
     278            0 :                         nulls[6] = false;
     279            0 :                         values[7] = UInt16GetDatum(fctx->record[i].usagecount);
     280            0 :                         nulls[7] = false;
     281              :                         /* unused for v1.0 callers, but the array is always long enough */
     282            0 :                         values[8] = Int32GetDatum(fctx->record[i].pinning_backends);
     283            0 :                         nulls[8] = false;
     284              :                 }
     285              : 
     286              :                 /* Build and return the tuple. */
     287            0 :                 tuple = heap_form_tuple(fctx->tupdesc, values, nulls);
     288            0 :                 result = HeapTupleGetDatum(tuple);
     289              : 
     290            0 :                 SRF_RETURN_NEXT(funcctx, result);
     291            0 :         }
     292              :         else
     293            0 :                 SRF_RETURN_DONE(funcctx);
     294            0 : }
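
/*
 * Editorial sketch (not part of the original file): the funcapi.h multi-call
 * set-returning-function pattern that pg_buffercache_pages() above follows,
 * reduced to its skeleton.  The function name my_counter_srf and its fixed
 * row count are hypothetical; the SRF_* macros and FuncCallContext fields
 * are the real funcapi.h API.  Guarded so it is not built with the module.
 */
#ifdef PG_BUFFERCACHE_EDITORIAL_SKETCH
PG_FUNCTION_INFO_V1(my_counter_srf);

Datum
my_counter_srf(PG_FUNCTION_ARGS)
{
    FuncCallContext *funcctx;

    if (SRF_IS_FIRSTCALL())
    {
        MemoryContext oldcontext;

        funcctx = SRF_FIRSTCALL_INIT();

        /* Allocations that must survive across calls go in this context. */
        oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx);

        /* Gather state once here; it is returned row by row below. */
        funcctx->max_calls = 3;
        funcctx->user_fctx = NULL;  /* no extra per-call state needed */

        MemoryContextSwitchTo(oldcontext);
    }

    funcctx = SRF_PERCALL_SETUP();

    if (funcctx->call_cntr < funcctx->max_calls)
        SRF_RETURN_NEXT(funcctx, Int64GetDatum((int64) funcctx->call_cntr));
    else
        SRF_RETURN_DONE(funcctx);
}
#endif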
     295              : 
     296              : /*
      297              :  * Inquire about OS page mappings for shared buffers, optionally with NUMA
      298              :  * information.
      299              :  *
      300              :  * When "include_numa" is false, this routine ignores everything related
      301              :  * to NUMA (those values are returned as NULL), returning only the mapping
      302              :  * between shared buffers and OS pages.
     303              :  *
     304              :  * When "include_numa" is true, NUMA is initialized and numa_node values
     305              :  * are generated.  In order to get reliable results we also need to touch
     306              :  * memory pages, so that the inquiry about NUMA memory node does not return
     307              :  * -2, indicating unmapped/unallocated pages.
     308              :  *
     309              :  * Buffers may be smaller or larger than OS memory pages. For each buffer we
     310              :  * return one entry for each memory page used by the buffer (if the buffer is
     311              :  * smaller, it only uses a part of one memory page).
     312              :  *
      313              :  * We expect both sizes (for buffers and memory pages) to be powers of 2, so
     314              :  * one is always a multiple of the other.
     315              :  *
     316              :  */
     317              : static Datum
     318            0 : pg_buffercache_os_pages_internal(FunctionCallInfo fcinfo, bool include_numa)
     319              : {
     320            0 :         FuncCallContext *funcctx;
     321            0 :         MemoryContext oldcontext;
     322            0 :         BufferCacheOsPagesContext *fctx;        /* User function context. */
     323            0 :         TupleDesc       tupledesc;
     324            0 :         TupleDesc       expected_tupledesc;
     325            0 :         HeapTuple       tuple;
     326            0 :         Datum           result;
     327              : 
     328            0 :         if (SRF_IS_FIRSTCALL())
     329              :         {
     330            0 :                 int                     i,
     331              :                                         idx;
     332            0 :                 Size            os_page_size;
     333            0 :                 int                     pages_per_buffer;
     334            0 :                 int                *os_page_status = NULL;
     335            0 :                 uint64          os_page_count = 0;
     336            0 :                 int                     max_entries;
     337            0 :                 char       *startptr,
     338              :                                    *endptr;
     339              : 
     340              :                 /* If NUMA information is requested, initialize NUMA support. */
     341            0 :                 if (include_numa && pg_numa_init() == -1)
     342            0 :                         elog(ERROR, "libnuma initialization failed or NUMA is not supported on this platform");
     343              : 
     344              :                 /*
     345              :                  * The database block size and OS memory page size are unlikely to be
     346              :                  * the same. The block size is 1-32KB, the memory page size depends on
     347              :                  * platform. On x86 it's usually 4KB, on ARM it's 4KB or 64KB, but
     348              :                  * there are also features like THP etc. Moreover, we don't quite know
     349              :                  * how the pages and buffers "align" in memory - the buffers may be
     350              :                  * shifted in some way, using more memory pages than necessary.
     351              :                  *
     352              :                  * So we need to be careful about mapping buffers to memory pages. We
     353              :                  * calculate the maximum number of pages a buffer might use, so that
     354              :                  * we allocate enough space for the entries. And then we count the
     355              :                  * actual number of entries as we scan the buffers.
     356              :                  *
     357              :                  * This information is needed before calling move_pages() for NUMA
     358              :                  * node id inquiry.
     359              :                  */
     360            0 :                 os_page_size = pg_get_shmem_pagesize();
     361              : 
     362              :                 /*
      363              :                  * The page and block sizes are expected to be 2^k, so one divides the
     364              :                  * other (we don't know in which direction). This does not say
     365              :                  * anything about relative alignment of pages/buffers.
     366              :                  */
     367            0 :                 Assert((os_page_size % BLCKSZ == 0) || (BLCKSZ % os_page_size == 0));
     368              : 
     369            0 :                 if (include_numa)
     370              :                 {
     371            0 :                         void      **os_page_ptrs = NULL;
     372              : 
     373              :                         /*
      374              :                          * How many addresses are we going to query?  Simply take the page
      375              :                          * containing the first buffer and the first page after the last
      376              :                          * buffer, and count the pages in between.
     377              :                          */
     378            0 :                         startptr = (char *) TYPEALIGN_DOWN(os_page_size,
     379              :                                                                                            BufferGetBlock(1));
     380            0 :                         endptr = (char *) TYPEALIGN(os_page_size,
     381              :                                                                                 (char *) BufferGetBlock(NBuffers) + BLCKSZ);
     382            0 :                         os_page_count = (endptr - startptr) / os_page_size;
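
                        /*
                         * Editorial worked example: with BLCKSZ = 8192, a 4 kB OS page
                         * size, and 16384 buffers, the block region spans 128 MB, i.e.
                         * 32768 OS pages; the TYPEALIGN_DOWN/TYPEALIGN rounding above
                         * adds at most one extra page when the region does not start
                         * on a page boundary.  (Numbers are illustrative, not from
                         * this build.)
                         */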
     383              : 
     384              :                         /* Used to determine the NUMA node for all OS pages at once */
     385            0 :                         os_page_ptrs = palloc0_array(void *, os_page_count);
     386            0 :                         os_page_status = palloc_array(int, os_page_count);
     387              : 
     388              :                         /*
     389              :                          * Fill pointers for all the memory pages.  This loop stores and
     390              :                          * touches (if needed) addresses into os_page_ptrs[] as input to
     391              :                          * one big move_pages(2) inquiry system call, as done in
     392              :                          * pg_numa_query_pages().
     393              :                          */
     394            0 :                         idx = 0;
     395            0 :                         for (char *ptr = startptr; ptr < endptr; ptr += os_page_size)
     396              :                         {
     397            0 :                                 os_page_ptrs[idx++] = ptr;
     398              : 
     399              :                                 /* Only need to touch memory once per backend process lifetime */
     400            0 :                                 if (firstNumaTouch)
     401            0 :                                         pg_numa_touch_mem_if_required(ptr);
     402            0 :                         }
     403              : 
     404            0 :                         Assert(idx == os_page_count);
     405              : 
     406            0 :                         elog(DEBUG1, "NUMA: NBuffers=%d os_page_count=" UINT64_FORMAT " "
     407              :                                  "os_page_size=%zu", NBuffers, os_page_count, os_page_size);
     408              : 
     409              :                         /*
      410              :                          * If we ever get 0xff back from the kernel inquiry, then we probably
      411              :                          * have a bug in our buffer-to-OS-page mapping code here.
     412              :                          */
     413            0 :                         memset(os_page_status, 0xff, sizeof(int) * os_page_count);
     414              : 
     415              :                         /* Query NUMA status for all the pointers */
     416            0 :                         if (pg_numa_query_pages(0, os_page_count, os_page_ptrs, os_page_status) == -1)
     417            0 :                                 elog(ERROR, "failed NUMA pages inquiry: %m");
     418            0 :                 }
     419              : 
     420              :                 /* Initialize the multi-call context, load entries about buffers */
     421              : 
     422            0 :                 funcctx = SRF_FIRSTCALL_INIT();
     423              : 
     424              :                 /* Switch context when allocating stuff to be used in later calls */
     425            0 :                 oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx);
     426              : 
     427              :                 /* Create a user function context for cross-call persistence */
     428            0 :                 fctx = palloc_object(BufferCacheOsPagesContext);
     429              : 
     430            0 :                 if (get_call_result_type(fcinfo, NULL, &expected_tupledesc) != TYPEFUNC_COMPOSITE)
     431            0 :                         elog(ERROR, "return type must be a row type");
     432              : 
     433            0 :                 if (expected_tupledesc->natts != NUM_BUFFERCACHE_OS_PAGES_ELEM)
     434            0 :                         elog(ERROR, "incorrect number of output arguments");
     435              : 
     436              :                 /* Construct a tuple descriptor for the result rows. */
     437            0 :                 tupledesc = CreateTemplateTupleDesc(expected_tupledesc->natts);
     438            0 :                 TupleDescInitEntry(tupledesc, (AttrNumber) 1, "bufferid",
     439              :                                                    INT4OID, -1, 0);
     440            0 :                 TupleDescInitEntry(tupledesc, (AttrNumber) 2, "os_page_num",
     441              :                                                    INT8OID, -1, 0);
     442            0 :                 TupleDescInitEntry(tupledesc, (AttrNumber) 3, "numa_node",
     443              :                                                    INT4OID, -1, 0);
     444              : 
     445            0 :                 fctx->tupdesc = BlessTupleDesc(tupledesc);
     446            0 :                 fctx->include_numa = include_numa;
     447              : 
     448              :                 /*
     449              :                  * Each buffer needs at least one entry, but it might be offset in
     450              :                  * some way, and use one extra entry. So we allocate space for the
     451              :                  * maximum number of entries we might need, and then count the exact
     452              :                  * number as we're walking buffers. That way we can do it in one pass,
     453              :                  * without reallocating memory.
     454              :                  */
     455            0 :                 pages_per_buffer = Max(1, BLCKSZ / os_page_size) + 1;
     456            0 :                 max_entries = NBuffers * pages_per_buffer;
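
                /*
                 * Editorial worked example: with BLCKSZ = 8192 and 4 kB OS pages,
                 * pages_per_buffer is 8192/4096 + 1 = 3; with 2 MB huge pages the
                 * integer division yields 0, Max() clamps it to 1, and the +1 covers
                 * a buffer straddling two huge pages, giving at most 2 entries per
                 * buffer.
                 */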
     457              : 
     458              :                 /* Allocate entries for BufferCacheOsPagesRec records. */
     459            0 :                 fctx->record = (BufferCacheOsPagesRec *)
     460            0 :                         MemoryContextAllocHuge(CurrentMemoryContext,
     461            0 :                                                                    sizeof(BufferCacheOsPagesRec) * max_entries);
     462              : 
     463              :                 /* Return to original context when allocating transient memory */
     464            0 :                 MemoryContextSwitchTo(oldcontext);
     465              : 
     466            0 :                 if (include_numa && firstNumaTouch)
     467            0 :                         elog(DEBUG1, "NUMA: page-faulting the buffercache for proper NUMA readouts");
     468              : 
     469              :                 /*
     470              :                  * Scan through all the buffers, saving the relevant fields in the
     471              :                  * fctx->record structure.
     472              :                  *
     473              :                  * We don't hold the partition locks, so we don't get a consistent
     474              :                  * snapshot across all buffers, but we do grab the buffer header
     475              :                  * locks, so the information of each buffer is self-consistent.
     476              :                  */
     477            0 :                 startptr = (char *) TYPEALIGN_DOWN(os_page_size, (char *) BufferGetBlock(1));
     478            0 :                 idx = 0;
     479            0 :                 for (i = 0; i < NBuffers; i++)
     480              :                 {
     481            0 :                         char       *buffptr = (char *) BufferGetBlock(i + 1);
     482            0 :                         BufferDesc *bufHdr;
     483            0 :                         uint32          bufferid;
     484            0 :                         int32           page_num;
     485            0 :                         char       *startptr_buff,
     486              :                                            *endptr_buff;
     487              : 
     488            0 :                         CHECK_FOR_INTERRUPTS();
     489              : 
     490            0 :                         bufHdr = GetBufferDescriptor(i);
     491              : 
     492              :                         /* Lock each buffer header before inspecting. */
     493            0 :                         LockBufHdr(bufHdr);
     494            0 :                         bufferid = BufferDescriptorGetBuffer(bufHdr);
     495            0 :                         UnlockBufHdr(bufHdr);
     496              : 
     497              :                         /* start of the first page of this buffer */
     498            0 :                         startptr_buff = (char *) TYPEALIGN_DOWN(os_page_size, buffptr);
     499              : 
     500              :                         /* end of the buffer (no need to align to memory page) */
     501            0 :                         endptr_buff = buffptr + BLCKSZ;
     502              : 
     503            0 :                         Assert(startptr_buff < endptr_buff);
     504              : 
     505              :                         /* calculate ID of the first page for this buffer */
     506            0 :                         page_num = (startptr_buff - startptr) / os_page_size;
     507              : 
     508              :                         /* Add an entry for each OS page overlapping with this buffer. */
     509            0 :                         for (char *ptr = startptr_buff; ptr < endptr_buff; ptr += os_page_size)
     510              :                         {
     511            0 :                                 fctx->record[idx].bufferid = bufferid;
     512            0 :                                 fctx->record[idx].page_num = page_num;
     513            0 :                                 fctx->record[idx].numa_node = include_numa ? os_page_status[page_num] : -1;
     514              : 
     515              :                                 /* advance to the next entry/page */
     516            0 :                                 ++idx;
     517            0 :                                 ++page_num;
     518            0 :                         }
     519            0 :                 }
     520              : 
     521            0 :                 Assert(idx <= max_entries);
     522              : 
     523            0 :                 if (include_numa)
     524            0 :                         Assert(idx >= os_page_count);
     525              : 
     526              :                 /* Set max calls and remember the user function context. */
     527            0 :                 funcctx->max_calls = idx;
     528            0 :                 funcctx->user_fctx = fctx;
     529              : 
     530              :                 /* Remember this backend touched the pages (only relevant for NUMA) */
     531            0 :                 if (include_numa)
     532            0 :                         firstNumaTouch = false;
     533            0 :         }
     534              : 
     535            0 :         funcctx = SRF_PERCALL_SETUP();
     536              : 
     537              :         /* Get the saved state */
     538            0 :         fctx = funcctx->user_fctx;
     539              : 
     540            0 :         if (funcctx->call_cntr < funcctx->max_calls)
     541              :         {
     542            0 :                 uint32          i = funcctx->call_cntr;
     543            0 :                 Datum           values[NUM_BUFFERCACHE_OS_PAGES_ELEM];
     544            0 :                 bool            nulls[NUM_BUFFERCACHE_OS_PAGES_ELEM];
     545              : 
     546            0 :                 values[0] = Int32GetDatum(fctx->record[i].bufferid);
     547            0 :                 nulls[0] = false;
     548              : 
     549            0 :                 values[1] = Int64GetDatum(fctx->record[i].page_num);
     550            0 :                 nulls[1] = false;
     551              : 
     552            0 :                 if (fctx->include_numa)
     553              :                 {
     554            0 :                         values[2] = Int32GetDatum(fctx->record[i].numa_node);
     555            0 :                         nulls[2] = false;
     556            0 :                 }
     557              :                 else
     558              :                 {
     559            0 :                         values[2] = (Datum) 0;
     560            0 :                         nulls[2] = true;
     561              :                 }
     562              : 
     563              :                 /* Build and return the tuple. */
     564            0 :                 tuple = heap_form_tuple(fctx->tupdesc, values, nulls);
     565            0 :                 result = HeapTupleGetDatum(tuple);
     566              : 
     567            0 :                 SRF_RETURN_NEXT(funcctx, result);
     568            0 :         }
     569              :         else
     570            0 :                 SRF_RETURN_DONE(funcctx);
     571            0 : }
     572              : 
     573              : /*
     574              :  * pg_buffercache_os_pages
     575              :  *
     576              :  * Retrieve information about OS pages, with or without NUMA information.
     577              :  */
     578              : Datum
     579            0 : pg_buffercache_os_pages(PG_FUNCTION_ARGS)
     580              : {
     581            0 :         bool            include_numa;
     582              : 
     583              :         /* Get the boolean parameter that controls the NUMA behavior. */
     584            0 :         include_numa = PG_GETARG_BOOL(0);
     585              : 
     586            0 :         return pg_buffercache_os_pages_internal(fcinfo, include_numa);
     587            0 : }
     588              : 
     589              : /* Backward-compatible wrapper for v1.6. */
     590              : Datum
     591            0 : pg_buffercache_numa_pages(PG_FUNCTION_ARGS)
     592              : {
     593              :         /* Call internal function with include_numa=true */
     594            0 :         return pg_buffercache_os_pages_internal(fcinfo, true);
     595              : }
     596              : 
     597              : Datum
     598            0 : pg_buffercache_summary(PG_FUNCTION_ARGS)
     599              : {
     600            0 :         Datum           result;
     601            0 :         TupleDesc       tupledesc;
     602            0 :         HeapTuple       tuple;
     603            0 :         Datum           values[NUM_BUFFERCACHE_SUMMARY_ELEM];
     604            0 :         bool            nulls[NUM_BUFFERCACHE_SUMMARY_ELEM];
     605              : 
     606            0 :         int32           buffers_used = 0;
     607            0 :         int32           buffers_unused = 0;
     608            0 :         int32           buffers_dirty = 0;
     609            0 :         int32           buffers_pinned = 0;
     610            0 :         int64           usagecount_total = 0;
     611              : 
     612            0 :         if (get_call_result_type(fcinfo, NULL, &tupledesc) != TYPEFUNC_COMPOSITE)
     613            0 :                 elog(ERROR, "return type must be a row type");
     614              : 
     615            0 :         for (int i = 0; i < NBuffers; i++)
     616              :         {
     617            0 :                 BufferDesc *bufHdr;
     618            0 :                 uint64          buf_state;
     619              : 
     620            0 :                 CHECK_FOR_INTERRUPTS();
     621              : 
     622              :                 /*
      623              :                  * This function summarizes the state of all buffer headers. Locking
      624              :                  * them wouldn't give a better result, since the state of a buffer can
      625              :                  * change as soon as we release the lock, and it would noticeably
      626              :                  * increase the cost of the function.
     627              :                  */
     628            0 :                 bufHdr = GetBufferDescriptor(i);
     629            0 :                 buf_state = pg_atomic_read_u64(&bufHdr->state);
     630              : 
     631            0 :                 if (buf_state & BM_VALID)
     632              :                 {
     633            0 :                         buffers_used++;
     634            0 :                         usagecount_total += BUF_STATE_GET_USAGECOUNT(buf_state);
     635              : 
     636            0 :                         if (buf_state & BM_DIRTY)
     637            0 :                                 buffers_dirty++;
     638            0 :                 }
     639              :                 else
     640            0 :                         buffers_unused++;
     641              : 
     642            0 :                 if (BUF_STATE_GET_REFCOUNT(buf_state) > 0)
     643            0 :                         buffers_pinned++;
     644            0 :         }
     645              : 
     646            0 :         memset(nulls, 0, sizeof(nulls));
     647            0 :         values[0] = Int32GetDatum(buffers_used);
     648            0 :         values[1] = Int32GetDatum(buffers_unused);
     649            0 :         values[2] = Int32GetDatum(buffers_dirty);
     650            0 :         values[3] = Int32GetDatum(buffers_pinned);
     651              : 
     652            0 :         if (buffers_used != 0)
     653            0 :                 values[4] = Float8GetDatum((double) usagecount_total / buffers_used);
     654              :         else
     655            0 :                 nulls[4] = true;
     656              : 
     657              :         /* Build and return the tuple. */
     658            0 :         tuple = heap_form_tuple(tupledesc, values, nulls);
     659            0 :         result = HeapTupleGetDatum(tuple);
     660              : 
     661            0 :         PG_RETURN_DATUM(result);
     662            0 : }
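
/*
 * Editorial usage note: at the SQL level this is typically invoked as
 *
 *     SELECT * FROM pg_buffercache_summary();
 *
 * which returns a single row with the used/unused/dirty/pinned counters and
 * the average usage count computed above.  The exact SQL-level signature is
 * defined in the extension's SQL script, not in this file.
 */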
     663              : 
     664              : Datum
     665            0 : pg_buffercache_usage_counts(PG_FUNCTION_ARGS)
     666              : {
     667            0 :         ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo;
     668            0 :         int                     usage_counts[BM_MAX_USAGE_COUNT + 1] = {0};
     669            0 :         int                     dirty[BM_MAX_USAGE_COUNT + 1] = {0};
     670            0 :         int                     pinned[BM_MAX_USAGE_COUNT + 1] = {0};
     671            0 :         Datum           values[NUM_BUFFERCACHE_USAGE_COUNTS_ELEM];
     672            0 :         bool            nulls[NUM_BUFFERCACHE_USAGE_COUNTS_ELEM] = {0};
     673              : 
     674            0 :         InitMaterializedSRF(fcinfo, 0);
     675              : 
     676            0 :         for (int i = 0; i < NBuffers; i++)
     677              :         {
     678            0 :                 BufferDesc *bufHdr = GetBufferDescriptor(i);
     679            0 :                 uint64          buf_state = pg_atomic_read_u64(&bufHdr->state);
     680            0 :                 int                     usage_count;
     681              : 
     682            0 :                 CHECK_FOR_INTERRUPTS();
     683              : 
     684            0 :                 usage_count = BUF_STATE_GET_USAGECOUNT(buf_state);
     685            0 :                 usage_counts[usage_count]++;
     686              : 
     687            0 :                 if (buf_state & BM_DIRTY)
     688            0 :                         dirty[usage_count]++;
     689              : 
     690            0 :                 if (BUF_STATE_GET_REFCOUNT(buf_state) > 0)
     691            0 :                         pinned[usage_count]++;
     692            0 :         }
     693              : 
     694            0 :         for (int i = 0; i < BM_MAX_USAGE_COUNT + 1; i++)
     695              :         {
     696            0 :                 values[0] = Int32GetDatum(i);
     697            0 :                 values[1] = Int32GetDatum(usage_counts[i]);
     698            0 :                 values[2] = Int32GetDatum(dirty[i]);
     699            0 :                 values[3] = Int32GetDatum(pinned[i]);
     700              : 
     701            0 :                 tuplestore_putvalues(rsinfo->setResult, rsinfo->setDesc, values, nulls);
     702            0 :         }
     703              : 
     704            0 :         return (Datum) 0;
     705            0 : }
     706              : 
     707              : /*
     708              :  * Helper function to check if the user has superuser privileges.
     709              :  */
     710              : static void
     711            0 : pg_buffercache_superuser_check(char *func_name)
     712              : {
     713            0 :         if (!superuser())
     714            0 :                 ereport(ERROR,
     715              :                                 (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
     716              :                                  errmsg("must be superuser to use %s()",
     717              :                                                 func_name)));
     718            0 : }
     719              : 
     720              : /*
     721              :  * Try to evict a shared buffer.
     722              :  */
     723              : Datum
     724            0 : pg_buffercache_evict(PG_FUNCTION_ARGS)
     725              : {
     726            0 :         Datum           result;
     727            0 :         TupleDesc       tupledesc;
     728            0 :         HeapTuple       tuple;
     729            0 :         Datum           values[NUM_BUFFERCACHE_EVICT_ELEM];
     730            0 :         bool            nulls[NUM_BUFFERCACHE_EVICT_ELEM] = {0};
     731              : 
     732            0 :         Buffer          buf = PG_GETARG_INT32(0);
     733            0 :         bool            buffer_flushed;
     734              : 
     735            0 :         if (get_call_result_type(fcinfo, NULL, &tupledesc) != TYPEFUNC_COMPOSITE)
     736            0 :                 elog(ERROR, "return type must be a row type");
     737              : 
     738            0 :         pg_buffercache_superuser_check("pg_buffercache_evict");
     739              : 
     740            0 :         if (buf < 1 || buf > NBuffers)
     741            0 :                 elog(ERROR, "bad buffer ID: %d", buf);
     742              : 
     743            0 :         values[0] = BoolGetDatum(EvictUnpinnedBuffer(buf, &buffer_flushed));
     744            0 :         values[1] = BoolGetDatum(buffer_flushed);
     745              : 
     746            0 :         tuple = heap_form_tuple(tupledesc, values, nulls);
     747            0 :         result = HeapTupleGetDatum(tuple);
     748              : 
     749            0 :         PG_RETURN_DATUM(result);
     750            0 : }
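
/*
 * Editorial usage note: a hypothetical invocation from SQL would be
 *
 *     SELECT * FROM pg_buffercache_evict(42);
 *
 * where 42 is a buffer ID between 1 and NBuffers; the two returned booleans
 * report whether the buffer could be evicted and whether it had to be
 * flushed first, matching the values[] assignments above.
 */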
     751              : 
     752              : /*
      753              :  * Try to evict all shared buffers of the specified relation.
     754              :  */
     755              : Datum
     756            0 : pg_buffercache_evict_relation(PG_FUNCTION_ARGS)
     757              : {
     758            0 :         Datum           result;
     759            0 :         TupleDesc       tupledesc;
     760            0 :         HeapTuple       tuple;
     761            0 :         Datum           values[NUM_BUFFERCACHE_EVICT_RELATION_ELEM];
     762            0 :         bool            nulls[NUM_BUFFERCACHE_EVICT_RELATION_ELEM] = {0};
     763              : 
     764            0 :         Oid                     relOid;
     765            0 :         Relation        rel;
     766              : 
     767            0 :         int32           buffers_evicted = 0;
     768            0 :         int32           buffers_flushed = 0;
     769            0 :         int32           buffers_skipped = 0;
     770              : 
     771            0 :         if (get_call_result_type(fcinfo, NULL, &tupledesc) != TYPEFUNC_COMPOSITE)
     772            0 :                 elog(ERROR, "return type must be a row type");
     773              : 
     774            0 :         pg_buffercache_superuser_check("pg_buffercache_evict_relation");
     775              : 
     776            0 :         relOid = PG_GETARG_OID(0);
     777              : 
     778            0 :         rel = relation_open(relOid, AccessShareLock);
     779              : 
     780            0 :         if (RelationUsesLocalBuffers(rel))
     781            0 :                 ereport(ERROR,
     782              :                                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
     783              :                                  errmsg("relation uses local buffers, %s() is intended to be used for shared buffers only",
     784              :                                                 "pg_buffercache_evict_relation")));
     785              : 
     786            0 :         EvictRelUnpinnedBuffers(rel, &buffers_evicted, &buffers_flushed,
     787              :                                                         &buffers_skipped);
     788              : 
     789            0 :         relation_close(rel, AccessShareLock);
     790              : 
     791            0 :         values[0] = Int32GetDatum(buffers_evicted);
     792            0 :         values[1] = Int32GetDatum(buffers_flushed);
     793            0 :         values[2] = Int32GetDatum(buffers_skipped);
     794              : 
     795            0 :         tuple = heap_form_tuple(tupledesc, values, nulls);
     796            0 :         result = HeapTupleGetDatum(tuple);
     797              : 
     798            0 :         PG_RETURN_DATUM(result);
     799            0 : }
     800              : 
     801              : 
     802              : /*
     803              :  * Try to evict all shared buffers.
     804              :  */
     805              : Datum
     806            0 : pg_buffercache_evict_all(PG_FUNCTION_ARGS)
     807              : {
     808            0 :         Datum           result;
     809            0 :         TupleDesc       tupledesc;
     810            0 :         HeapTuple       tuple;
     811            0 :         Datum           values[NUM_BUFFERCACHE_EVICT_ALL_ELEM];
     812            0 :         bool            nulls[NUM_BUFFERCACHE_EVICT_ALL_ELEM] = {0};
     813              : 
     814            0 :         int32           buffers_evicted = 0;
     815            0 :         int32           buffers_flushed = 0;
     816            0 :         int32           buffers_skipped = 0;
     817              : 
     818            0 :         if (get_call_result_type(fcinfo, NULL, &tupledesc) != TYPEFUNC_COMPOSITE)
     819            0 :                 elog(ERROR, "return type must be a row type");
     820              : 
     821            0 :         pg_buffercache_superuser_check("pg_buffercache_evict_all");
     822              : 
     823            0 :         EvictAllUnpinnedBuffers(&buffers_evicted, &buffers_flushed,
     824              :                                                         &buffers_skipped);
     825              : 
     826            0 :         values[0] = Int32GetDatum(buffers_evicted);
     827            0 :         values[1] = Int32GetDatum(buffers_flushed);
     828            0 :         values[2] = Int32GetDatum(buffers_skipped);
     829              : 
     830            0 :         tuple = heap_form_tuple(tupledesc, values, nulls);
     831            0 :         result = HeapTupleGetDatum(tuple);
     832              : 
     833            0 :         PG_RETURN_DATUM(result);
     834            0 : }
     835              : 
     836              : /*
     837              :  * Try to mark a shared buffer as dirty.
     838              :  */
     839              : Datum
     840            0 : pg_buffercache_mark_dirty(PG_FUNCTION_ARGS)
     841              : {
     842              : 
     843            0 :         Datum           result;
     844            0 :         TupleDesc       tupledesc;
     845            0 :         HeapTuple       tuple;
     846            0 :         Datum           values[NUM_BUFFERCACHE_MARK_DIRTY_ELEM];
     847            0 :         bool            nulls[NUM_BUFFERCACHE_MARK_DIRTY_ELEM] = {0};
     848              : 
     849            0 :         Buffer          buf = PG_GETARG_INT32(0);
     850            0 :         bool            buffer_already_dirty;
     851              : 
     852            0 :         if (get_call_result_type(fcinfo, NULL, &tupledesc) != TYPEFUNC_COMPOSITE)
     853            0 :                 elog(ERROR, "return type must be a row type");
     854              : 
     855            0 :         pg_buffercache_superuser_check("pg_buffercache_mark_dirty");
     856              : 
     857            0 :         if (buf < 1 || buf > NBuffers)
     858            0 :                 elog(ERROR, "bad buffer ID: %d", buf);
     859              : 
     860            0 :         values[0] = BoolGetDatum(MarkDirtyUnpinnedBuffer(buf, &buffer_already_dirty));
     861            0 :         values[1] = BoolGetDatum(buffer_already_dirty);
     862              : 
     863            0 :         tuple = heap_form_tuple(tupledesc, values, nulls);
     864            0 :         result = HeapTupleGetDatum(tuple);
     865              : 
     866            0 :         PG_RETURN_DATUM(result);
     867            0 : }
     868              : 
     869              : /*
     870              :  * Try to mark all the shared buffers of a relation as dirty.
     871              :  */
     872              : Datum
     873            0 : pg_buffercache_mark_dirty_relation(PG_FUNCTION_ARGS)
     874              : {
     875            0 :         Datum           result;
     876            0 :         TupleDesc       tupledesc;
     877            0 :         HeapTuple       tuple;
     878            0 :         Datum           values[NUM_BUFFERCACHE_MARK_DIRTY_RELATION_ELEM];
     879            0 :         bool            nulls[NUM_BUFFERCACHE_MARK_DIRTY_RELATION_ELEM] = {0};
     880              : 
     881            0 :         Oid                     relOid;
     882            0 :         Relation        rel;
     883              : 
     884            0 :         int32           buffers_already_dirty = 0;
     885            0 :         int32           buffers_dirtied = 0;
     886            0 :         int32           buffers_skipped = 0;
     887              : 
     888            0 :         if (get_call_result_type(fcinfo, NULL, &tupledesc) != TYPEFUNC_COMPOSITE)
     889            0 :                 elog(ERROR, "return type must be a row type");
     890              : 
     891            0 :         pg_buffercache_superuser_check("pg_buffercache_mark_dirty_relation");
     892              : 
     893            0 :         relOid = PG_GETARG_OID(0);
     894              : 
     895            0 :         rel = relation_open(relOid, AccessShareLock);
     896              : 
     897            0 :         if (RelationUsesLocalBuffers(rel))
     898            0 :                 ereport(ERROR,
     899              :                                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
     900              :                                  errmsg("relation uses local buffers, %s() is intended to be used for shared buffers only",
     901              :                                                 "pg_buffercache_mark_dirty_relation")));
     902              : 
     903            0 :         MarkDirtyRelUnpinnedBuffers(rel, &buffers_dirtied, &buffers_already_dirty,
     904              :                                                                 &buffers_skipped);
     905              : 
     906            0 :         relation_close(rel, AccessShareLock);
     907              : 
     908            0 :         values[0] = Int32GetDatum(buffers_dirtied);
     909            0 :         values[1] = Int32GetDatum(buffers_already_dirty);
     910            0 :         values[2] = Int32GetDatum(buffers_skipped);
     911              : 
     912            0 :         tuple = heap_form_tuple(tupledesc, values, nulls);
     913            0 :         result = HeapTupleGetDatum(tuple);
     914              : 
     915            0 :         PG_RETURN_DATUM(result);
     916            0 : }
     917              : 
     918              : /*
     919              :  * Try to mark all the shared buffers as dirty.
     920              :  */
     921              : Datum
     922            0 : pg_buffercache_mark_dirty_all(PG_FUNCTION_ARGS)
     923              : {
     924            0 :         Datum           result;
     925            0 :         TupleDesc       tupledesc;
     926            0 :         HeapTuple       tuple;
     927            0 :         Datum           values[NUM_BUFFERCACHE_MARK_DIRTY_ALL_ELEM];
     928            0 :         bool            nulls[NUM_BUFFERCACHE_MARK_DIRTY_ALL_ELEM] = {0};
     929              : 
     930            0 :         int32           buffers_already_dirty = 0;
     931            0 :         int32           buffers_dirtied = 0;
     932            0 :         int32           buffers_skipped = 0;
     933              : 
     934            0 :         if (get_call_result_type(fcinfo, NULL, &tupledesc) != TYPEFUNC_COMPOSITE)
     935            0 :                 elog(ERROR, "return type must be a row type");
     936              : 
     937            0 :         pg_buffercache_superuser_check("pg_buffercache_mark_dirty_all");
     938              : 
     939            0 :         MarkDirtyAllUnpinnedBuffers(&buffers_dirtied, &buffers_already_dirty,
     940              :                                                                 &buffers_skipped);
     941              : 
     942            0 :         values[0] = Int32GetDatum(buffers_dirtied);
     943            0 :         values[1] = Int32GetDatum(buffers_already_dirty);
     944            0 :         values[2] = Int32GetDatum(buffers_skipped);
     945              : 
     946            0 :         tuple = heap_form_tuple(tupledesc, values, nulls);
     947            0 :         result = HeapTupleGetDatum(tuple);
     948              : 
     949            0 :         PG_RETURN_DATUM(result);
     950            0 : }
        

Generated by: LCOV version 2.3.2-1