Branch data Line data Source code
1 : : /*-------------------------------------------------------------------------
2 : : *
3 : : * freelist.c
4 : : * routines for managing the buffer pool's replacement strategy.
5 : : *
6 : : *
7 : : * Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group
8 : : * Portions Copyright (c) 1994, Regents of the University of California
9 : : *
10 : : *
11 : : * IDENTIFICATION
12 : : * src/backend/storage/buffer/freelist.c
13 : : *
14 : : *-------------------------------------------------------------------------
15 : : */
16 : : #include "postgres.h"
17 : :
18 : : #include "pgstat.h"
19 : : #include "port/atomics.h"
20 : : #include "storage/buf_internals.h"
21 : : #include "storage/bufmgr.h"
22 : : #include "storage/proc.h"
23 : :
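: :
: : /*
: :  * Read an int field exactly once, through a volatile pointer, so the
: :  * compiler can neither cache an earlier value nor re-read it later; used
: :  * for the unlocked bgwprocno check in StrategyGetBuffer().
: :  */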
24 : : #define INT_ACCESS_ONCE(var) ((int)(*((volatile int *)&(var))))
25 : :
26 : :
27 : : /*
28 : : * The shared freelist control information.
29 : : */
30 : : typedef struct
31 : : {
32 : : /* Spinlock: protects the values below */
33 : : slock_t buffer_strategy_lock;
34 : :
35 : : /*
36 : : * clock-sweep hand: index of next buffer to consider grabbing. Note that
37 : : * this isn't a concrete buffer - we only ever increase the value. So, to
38 : : * get an actual buffer, it needs to be used modulo NBuffers.
39 : : */
40 : : pg_atomic_uint32 nextVictimBuffer;
41 : :
42 : : /*
43 : : * Statistics. These counters should be wide enough that they can't
44 : : * overflow during a single bgwriter cycle.
45 : : */
46 : : uint32 completePasses; /* Complete cycles of the clock-sweep */
47 : : pg_atomic_uint32 numBufferAllocs; /* Buffers allocated since last reset */
48 : :
49 : : /*
50 : : * Bgwriter process to be notified upon activity or -1 if none. See
51 : : * StrategyNotifyBgWriter.
52 : : */
53 : : int bgwprocno;
54 : : } BufferStrategyControl;
55 : :
56 : : /* Pointers to shared state */
57 : : static BufferStrategyControl *StrategyControl = NULL;
58 : :
59 : : /*
60 : : * Private (non-shared) state for managing a ring of shared buffers to re-use.
61 : : * This is currently the only kind of BufferAccessStrategy object, but someday
62 : : * we might have more kinds.
63 : : */
64 : : typedef struct BufferAccessStrategyData
65 : : {
66 : : /* Overall strategy type */
67 : : BufferAccessStrategyType btype;
68 : : /* Number of elements in buffers[] array */
69 : : int nbuffers;
70 : :
71 : : /*
72 : : * Index of the "current" slot in the ring, ie, the one most recently
73 : : * returned by GetBufferFromRing.
74 : : */
75 : : int current;
76 : :
77 : : /*
78 : : * Array of buffer numbers. InvalidBuffer (that is, zero) indicates we
79 : : * have not yet selected a buffer for this ring slot. For allocation
80 : : * simplicity this is palloc'd together with the fixed fields of the
81 : : * struct.
82 : : */
83 : : Buffer buffers[FLEXIBLE_ARRAY_MEMBER];
84 : : } BufferAccessStrategyData;
85 : :
86 : :
87 : : /* Prototypes for internal functions */
88 : : static BufferDesc *GetBufferFromRing(BufferAccessStrategy strategy,
89 : : uint64 *buf_state);
90 : : static void AddBufferToRing(BufferAccessStrategy strategy,
91 : : BufferDesc *buf);
92 : :
93 : : /*
94 : : * ClockSweepTick - Helper routine for StrategyGetBuffer()
95 : : *
96 : : * Move the clock hand one buffer ahead of its current position and return the
97 : : * id of the buffer now under the hand.
98 : : */
99 : : static inline uint32
100 : 66394 : ClockSweepTick(void)
101 : : {
102 : 66394 : uint32 victim;
103 : :
104 : : /*
105 : : * Atomically move hand ahead one buffer - if there's several processes
106 : : * Atomically move hand ahead one buffer - if there are several processes
107 : : * doing this, buffers can be returned slightly out of
108 : : */
109 : 66394 : victim =
110 : 66394 : pg_atomic_fetch_add_u32(&StrategyControl->nextVictimBuffer, 1);
111 : :
112 [ + + ]: 66394 : if (victim >= NBuffers)
113 : : {
114 : 3 : uint32 originalVictim = victim;
115 : :
116 : : /* always wrap what we look up in BufferDescriptors */
117 : 3 : victim = victim % NBuffers;
118 : :
119 : : /*
120 : : * If we're the one that just caused a wraparound, force
121 : : * completePasses to be incremented while holding the spinlock. We
122 : : * need the spinlock so StrategySyncStart() can return a consistent
123 : : * value consisting of nextVictimBuffer and completePasses.
124 : : */
125 [ + - ]: 3 : if (victim == 0)
126 : : {
127 : 3 : uint32 expected;
128 : 3 : uint32 wrapped;
129 : 3 : bool success = false;
130 : :
131 : 3 : expected = originalVictim + 1;
132 : :
133 [ + + ]: 6 : while (!success)
134 : : {
135 : : /*
136 : : * Acquire the spinlock while increasing completePasses. That
137 : : * allows other readers to read nextVictimBuffer and
138 : : * completePasses in a consistent manner which is required for
139 : : * StrategySyncStart(). In theory delaying the increment
140 : : * could lead to an overflow of nextVictimBuffer, but that's
141 : : * highly unlikely and wouldn't be particularly harmful.
142 : : */
143 [ - + ]: 3 : SpinLockAcquire(&StrategyControl->buffer_strategy_lock);
144 : :
145 : 3 : wrapped = expected % NBuffers;
146 : :
147 : 6 : success = pg_atomic_compare_exchange_u32(&StrategyControl->nextVictimBuffer,
148 : 3 : &expected, wrapped);
149 [ - + ]: 3 : if (success)
150 : 3 : StrategyControl->completePasses++;
151 : 3 : SpinLockRelease(&StrategyControl->buffer_strategy_lock);
152 : : }
153 : 3 : }
154 : 3 : }
155 : 132788 : return victim;
156 : 66394 : }
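: :
: : /*
: :  * Worked example (NBuffers = 128 chosen purely for illustration): a ticker
: :  * whose fetch-add returns 128 computes victim = 0 and is therefore the one
: :  * responsible for the wraparound. It sets expected = 129 and tries to CAS
: :  * nextVictimBuffer from 129 down to 1 (129 % 128). If other backends ticked
: :  * in the meantime the CAS fails, expected is updated to the current value,
: :  * and the swap is retried with the newly wrapped value; only once it
: :  * succeeds is completePasses incremented, under the spinlock.
: :  */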
157 : :
158 : : /*
159 : : * StrategyGetBuffer
160 : : *
161 : : * Called by the bufmgr to get the next candidate buffer to use in
162 : : * GetVictimBuffer(). The only hard requirement GetVictimBuffer() has is that
163 : : * the selected buffer must not currently be pinned by anyone.
164 : : *
165 : : * strategy is a BufferAccessStrategy object, or NULL for default strategy.
166 : : *
167 : : * It is the caller's responsibility to ensure the buffer ownership can be
168 : : * tracked via TrackNewBufferPin().
169 : : *
170 : : * The buffer is pinned and marked as owned, using TrackNewBufferPin(),
171 : : * before returning.
172 : : */
173 : : BufferDesc *
174 : 40537 : StrategyGetBuffer(BufferAccessStrategy strategy, uint64 *buf_state, bool *from_ring)
175 : : {
176 : 40537 : BufferDesc *buf;
177 : 40537 : int bgwprocno;
178 : 40537 : int trycounter;
179 : :
180 : 40537 : *from_ring = false;
181 : :
182 : : /*
183 : : * If given a strategy object, see whether it can select a buffer. We
184 : : * assume strategy objects don't need buffer_strategy_lock.
185 : : */
186 [ + + ]: 40537 : if (strategy != NULL)
187 : : {
188 : 9305 : buf = GetBufferFromRing(strategy, buf_state);
189 [ + + ]: 9305 : if (buf != NULL)
190 : : {
191 : 17 : *from_ring = true;
192 : 17 : return buf;
193 : : }
194 : 9288 : }
195 : :
196 : : /*
197 : : * If asked, we need to wake the bgwriter. Since we don't want to rely on
198 : : * a spinlock for this, we force a single read from shared memory and then
199 : : * set the latch based on that value. We need to go to this length
200 : : * because otherwise bgwprocno might be reset while/after we check, since
201 : : * the compiler might just reread it from memory.
202 : : *
203 : : * This can possibly set the latch of the wrong process if the bgwriter
204 : : * dies at the wrong moment. But since PGPROC->procLatch is never
205 : : * deallocated the worst consequence of that is that we set the latch of
206 : : * some arbitrary process.
207 : : */
208 : 40520 : bgwprocno = INT_ACCESS_ONCE(StrategyControl->bgwprocno);
209 [ + + ]: 40520 : if (bgwprocno != -1)
210 : : {
211 : : /* reset bgwprocno first, before setting the latch */
212 : 2 : StrategyControl->bgwprocno = -1;
213 : :
214 : : /*
215 : : * Not acquiring ProcArrayLock here, which is slightly icky. It's
216 : : * actually fine because procLatch isn't ever freed, so at worst we
217 : : * might set the wrong process's (or no process's) latch.
218 : : */
219 : 2 : SetLatch(&GetPGProcByNumber(bgwprocno)->procLatch);
220 : 2 : }
221 : :
222 : : /*
223 : : * We count buffer allocation requests so that the bgwriter can estimate
224 : : * the rate of buffer consumption. Note that buffers recycled by a
225 : : * strategy object are intentionally not counted here.
226 : : */
227 : 40520 : pg_atomic_fetch_add_u32(&StrategyControl->numBufferAllocs, 1);
228 : :
229 : : /* Use the "clock sweep" algorithm to find a free buffer */
230 : 40520 : trycounter = NBuffers;
231 : 66394 : for (;;)
232 : : {
233 : 66394 : uint64 old_buf_state;
234 : 66394 : uint64 local_buf_state;
235 : :
236 : 66394 : buf = GetBufferDescriptor(ClockSweepTick());
237 : :
238 : : /*
239 : : * Check whether the buffer can be used and pin it if so. Do this
240 : : * using a CAS loop, to avoid having to lock the buffer header.
241 : : */
242 : 66394 : old_buf_state = pg_atomic_read_u64(&buf->state);
243 : 66394 : for (;;)
244 : : {
245 : 66394 : local_buf_state = old_buf_state;
246 : :
247 : : /*
248 : : * If the buffer is pinned or has a nonzero usage_count, we cannot
249 : : * use it; decrement the usage_count (unless pinned) and keep
250 : : * scanning.
251 : : */
252 : :
253 [ + + ]: 66394 : if (BUF_STATE_GET_REFCOUNT(local_buf_state) != 0)
254 : : {
255 [ + - ]: 1 : if (--trycounter == 0)
256 : : {
257 : : /*
258 : : * We've scanned all the buffers without making any state
259 : : * changes, so all the buffers are pinned (or were when we
260 : : * looked at them). We could hope that someone will free
261 : : * one eventually, but it's probably better to fail than
262 : : * to risk getting stuck in an infinite loop.
263 : : */
264 [ # # # # ]: 0 : elog(ERROR, "no unpinned buffers available");
265 : 0 : }
266 : 1 : break;
267 : : }
268 : :
269 : : /* See equivalent code in PinBuffer() */
270 [ - + ]: 66393 : if (unlikely(local_buf_state & BM_LOCKED))
271 : : {
272 : 0 : old_buf_state = WaitBufHdrUnlocked(buf);
273 : 0 : continue;
274 : : }
275 : :
276 [ + + ]: 66393 : if (BUF_STATE_GET_USAGECOUNT(local_buf_state) != 0)
277 : : {
278 : 25873 : local_buf_state -= BUF_USAGECOUNT_ONE;
279 : :
280 [ - + - + ]: 51746 : if (pg_atomic_compare_exchange_u64(&buf->state, &old_buf_state,
281 : 25873 : local_buf_state))
282 : : {
283 : 25873 : trycounter = NBuffers;
284 : 25873 : break;
285 : : }
286 : 0 : }
287 : : else
288 : : {
289 : : /* pin the buffer if the CAS succeeds */
290 : 40520 : local_buf_state += BUF_REFCOUNT_ONE;
291 : :
292 [ + - + - ]: 81040 : if (pg_atomic_compare_exchange_u64(&buf->state, &old_buf_state,
293 : 40520 : local_buf_state))
294 : : {
295 : : /* Found a usable buffer */
296 [ + + ]: 40520 : if (strategy != NULL)
297 : 9288 : AddBufferToRing(strategy, buf);
298 : 40520 : *buf_state = local_buf_state;
299 : :
300 : 40520 : TrackNewBufferPin(BufferDescriptorGetBuffer(buf));
301 : :
302 : 40520 : return buf;
303 : : }
304 : : }
305 : : }
306 [ + + ]: 66394 : }
307 : 40537 : }
308 : :
309 : : /*
310 : : * StrategySyncStart -- tell BgBufferSync where to start syncing
311 : : *
312 : : * The result is the buffer index of the best buffer to sync first.
313 : : * BgBufferSync() will proceed circularly around the buffer array from there.
314 : : *
315 : : * In addition, we return the completed-pass count (which is effectively
316 : : * the higher-order bits of nextVictimBuffer) and the count of recent buffer
317 : : * allocs if non-NULL pointers are passed. The alloc count is reset after
318 : : * being read.
319 : : */
320 : : int
321 : 24 : StrategySyncStart(uint32 *complete_passes, uint32 *num_buf_alloc)
322 : : {
323 : 24 : uint32 nextVictimBuffer;
324 : 24 : int result;
325 : :
326 [ - + ]: 24 : SpinLockAcquire(&StrategyControl->buffer_strategy_lock);
327 : 24 : nextVictimBuffer = pg_atomic_read_u32(&StrategyControl->nextVictimBuffer);
328 : 24 : result = nextVictimBuffer % NBuffers;
329 : :
330 [ - + ]: 24 : if (complete_passes)
331 : : {
332 : 24 : *complete_passes = StrategyControl->completePasses;
333 : :
334 : : /*
335 : : * Additionally add the number of wraparounds that happened before
336 : : * completePasses could be incremented. C.f. ClockSweepTick().
337 : : */
338 : 24 : *complete_passes += nextVictimBuffer / NBuffers;
339 : 24 : }
340 : :
341 [ - + ]: 24 : if (num_buf_alloc)
342 : : {
343 : 24 : *num_buf_alloc = pg_atomic_exchange_u32(&StrategyControl->numBufferAllocs, 0);
344 : 24 : }
345 : 24 : SpinLockRelease(&StrategyControl->buffer_strategy_lock);
346 : 48 : return result;
347 : 24 : }
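: :
: : /*
: :  * Illustrative sketch only (not part of freelist.c): how a caller such as
: :  * BgBufferSync() might combine the two outputs into a single, monotonically
: :  * increasing clock position. The function and variable names below are
: :  * assumptions made for the example.
: :  */
: : #ifdef NOT_USED
: : static long
: : strategy_clock_position(void)
: : {
: :     uint32      passes;
: :     int         buf_id;
: :
: :     /* pass NULL for num_buf_alloc so the allocation counter is not reset */
: :     buf_id = StrategySyncStart(&passes, NULL);
: :
: :     /* completePasses gives the high-order part, buf_id the low-order part */
: :     return (long) passes * NBuffers + buf_id;
: : }
: : #endif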
348 : :
349 : : /*
350 : : * StrategyNotifyBgWriter -- set or clear allocation notification latch
351 : : *
352 : : * If bgwprocno isn't -1, the next invocation of StrategyGetBuffer will
353 : : * set that latch. Pass -1 to clear the pending notification before it
354 : : * happens. This feature is used by the bgwriter process to wake itself up
355 : : * from hibernation, and is not meant for anybody else to use.
356 : : */
357 : : void
358 : 6 : StrategyNotifyBgWriter(int bgwprocno)
359 : : {
360 : : /*
361 : : * We acquire buffer_strategy_lock just to ensure that the store appears
362 : : * atomic to StrategyGetBuffer. The bgwriter should call this rather
363 : : * infrequently, so there's no performance penalty from being safe.
364 : : */
365 [ - + ]: 6 : SpinLockAcquire(&StrategyControl->buffer_strategy_lock);
366 : 6 : StrategyControl->bgwprocno = bgwprocno;
367 : 6 : SpinLockRelease(&StrategyControl->buffer_strategy_lock);
368 : 6 : }
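: :
: : /*
: :  * Illustrative sketch only (not part of freelist.c): the bgwriter is
: :  * expected to register its own proc number before hibernating and to clear
: :  * the notification once it resumes. The exact bgwriter.c code differs, and
: :  * the latch/wait-event headers used here are not included by this file.
: :  */
: : #ifdef NOT_USED
: : static void
: : bgwriter_hibernate_sketch(void)
: : {
: :     /* ask StrategyGetBuffer() to set our latch on the next allocation */
: :     StrategyNotifyBgWriter(MyProcNumber);
: :
: :     (void) WaitLatch(MyLatch, WL_LATCH_SET | WL_TIMEOUT | WL_EXIT_ON_PM_DEATH,
: :                      10000 /* ms, illustrative */ ,
: :                      WAIT_EVENT_BGWRITER_HIBERNATE);
: :
: :     /* cancel any still-pending notification request */
: :     StrategyNotifyBgWriter(-1);
: : }
: : #endif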
369 : :
370 : :
371 : : /*
372 : : * StrategyShmemSize
373 : : *
374 : : * estimate the size of shared memory used by the freelist-related structures.
375 : : *
376 : : * Note: for somewhat historical reasons, the buffer lookup hashtable size
377 : : * is also determined here.
378 : : */
379 : : Size
380 : 9 : StrategyShmemSize(void)
381 : : {
382 : 9 : Size size = 0;
383 : :
384 : : /* size of lookup hash table ... see comment in StrategyInitialize */
385 : 9 : size = add_size(size, BufTableShmemSize(NBuffers + NUM_BUFFER_PARTITIONS));
386 : :
387 : : /* size of the shared replacement strategy control block */
388 : 9 : size = add_size(size, MAXALIGN(sizeof(BufferStrategyControl)));
389 : :
390 : 18 : return size;
391 : 9 : }
392 : :
393 : : /*
394 : : * StrategyInitialize -- initialize the buffer cache replacement
395 : : * strategy.
396 : : *
397 : : * Assumes: All of the buffers are already built into a linked list.
398 : : * Only called by postmaster and only during initialization.
399 : : */
400 : : void
401 : 6 : StrategyInitialize(bool init)
402 : : {
403 : 6 : bool found;
404 : :
405 : : /*
406 : : * Initialize the shared buffer lookup hashtable.
407 : : *
408 : : * Since we can't tolerate running out of lookup table entries, we must be
409 : : * sure to specify an adequate table size here. The maximum steady-state
410 : : * usage is of course NBuffers entries, but BufferAlloc() tries to insert
411 : : * a new entry before deleting the old. In principle this could be
412 : : * happening in each partition concurrently, so we could need as many as
413 : : * NBuffers + NUM_BUFFER_PARTITIONS entries.
414 : : */
415 : 6 : InitBufTable(NBuffers + NUM_BUFFER_PARTITIONS);
416 : :
417 : : /*
418 : : * Get or create the shared strategy control block
419 : : */
420 : 6 : StrategyControl = (BufferStrategyControl *)
421 : 6 : ShmemInitStruct("Buffer Strategy Status",
422 : : sizeof(BufferStrategyControl),
423 : : &found);
424 : :
425 [ + - ]: 6 : if (!found)
426 : : {
427 : : /*
428 : : * Only done once, usually in postmaster
429 : : */
430 [ + - ]: 6 : Assert(init);
431 : :
432 : 6 : SpinLockInit(&StrategyControl->buffer_strategy_lock);
433 : :
434 : : /* Initialize the clock-sweep pointer */
435 : 6 : pg_atomic_init_u32(&StrategyControl->nextVictimBuffer, 0);
436 : :
437 : : /* Clear statistics */
438 : 6 : StrategyControl->completePasses = 0;
439 : 6 : pg_atomic_init_u32(&StrategyControl->numBufferAllocs, 0);
440 : :
441 : : /* No pending notification */
442 : 6 : StrategyControl->bgwprocno = -1;
443 : 6 : }
444 : : else
445 [ # # ]: 0 : Assert(!init);
446 : 6 : }
447 : :
448 : :
449 : : /* ----------------------------------------------------------------
450 : : * Backend-private buffer ring management
451 : : * ----------------------------------------------------------------
452 : : */
453 : :
454 : :
455 : : /*
456 : : * GetAccessStrategy -- create a BufferAccessStrategy object
457 : : *
458 : : * The object is allocated in the current memory context.
459 : : */
460 : : BufferAccessStrategy
461 : 2143 : GetAccessStrategy(BufferAccessStrategyType btype)
462 : : {
463 : 2143 : int ring_size_kb;
464 : :
465 : : /*
466 : : * Select ring size to use. See buffer/README for rationales.
467 : : *
468 : : * Note: if you change the ring size for BAS_BULKREAD, see also
469 : : * SYNC_SCAN_REPORT_INTERVAL in access/heap/syncscan.c.
470 : : */
471 [ - + + - : 2143 : switch (btype)
- ]
472 : : {
473 : : case BAS_NORMAL:
474 : : /* if someone asks for NORMAL, just give 'em a "default" object */
475 : 0 : return NULL;
476 : :
477 : : case BAS_BULKREAD:
478 : : {
479 : 741 : int ring_max_kb;
480 : :
481 : : /*
482 : : * The ring always needs to be large enough to allow some
483 : : * separation in time between providing a buffer to the user
484 : : * of the strategy and that buffer being reused. Otherwise the
485 : : * user's pin will prevent reuse of the buffer, even without
486 : : * concurrent activity.
487 : : *
488 : : * We also need to ensure the ring always is large enough for
489 : : * SYNC_SCAN_REPORT_INTERVAL, as noted above.
490 : : *
491 : : * Thus we start out at a minimal size and increase the size
492 : : * further if appropriate.
493 : : */
494 : 741 : ring_size_kb = 256;
495 : :
496 : : /*
497 : : * There's no point in a larger ring if we won't be allowed to
498 : : * pin sufficiently many buffers. But we never limit to less
499 : : * than the minimal size above.
500 : : */
501 : 741 : ring_max_kb = GetPinLimit() * (BLCKSZ / 1024);
502 [ - + ]: 741 : ring_max_kb = Max(ring_size_kb, ring_max_kb);
503 : :
504 : : /*
505 : : * We would like the ring to additionally have space for the
506 : : * configured degree of IO concurrency. While being read in,
507 : : * buffers can obviously not yet be reused.
508 : : *
509 : : * Each IO can be up to io_combine_limit blocks large, and we
510 : : * want to start up to effective_io_concurrency IOs.
511 : : *
512 : : * Note that effective_io_concurrency may be 0, which disables
513 : : * AIO.
514 : : */
515 : 1482 : ring_size_kb += (BLCKSZ / 1024) *
516 : 1482 : io_combine_limit * effective_io_concurrency;
517 : :
518 [ - + ]: 741 : if (ring_size_kb > ring_max_kb)
519 : 741 : ring_size_kb = ring_max_kb;
520 : : break;
521 : 741 : }
522 : : case BAS_BULKWRITE:
523 : 1402 : ring_size_kb = 16 * 1024;
524 : 1402 : break;
525 : : case BAS_VACUUM:
526 : 0 : ring_size_kb = 2048;
527 : 0 : break;
528 : :
529 : : default:
530 [ # # # # ]: 0 : elog(ERROR, "unrecognized buffer access strategy: %d",
531 : : (int) btype);
532 : 0 : return NULL; /* keep compiler quiet */
533 : : }
534 : :
535 : 2143 : return GetAccessStrategyWithSize(btype, ring_size_kb);
536 : 2143 : }
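: :
: : /*
: :  * Worked example for the BAS_BULKREAD case above, assuming BLCKSZ = 8192
: :  * and illustrative settings io_combine_limit = 16 and
: :  * effective_io_concurrency = 1: the ring starts at 256kB and grows by
: :  * 8 * 16 * 1 = 128kB to 384kB, unless the pin limit caps ring_max_kb
: :  * below that.
: :  */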
537 : :
538 : : /*
539 : : * GetAccessStrategyWithSize -- create a BufferAccessStrategy object with a
540 : : * number of buffers equivalent to the passed in size.
541 : : *
542 : : * If the given ring size is 0, no BufferAccessStrategy will be created and
543 : : * the function will return NULL. ring_size_kb must not be negative.
544 : : */
545 : : BufferAccessStrategy
546 : 2664 : GetAccessStrategyWithSize(BufferAccessStrategyType btype, int ring_size_kb)
547 : : {
548 : 2664 : int ring_buffers;
549 : 2664 : BufferAccessStrategy strategy;
550 : :
551 [ + - ]: 2664 : Assert(ring_size_kb >= 0);
552 : :
553 : : /* Figure out how many buffers ring_size_kb is */
554 : 2664 : ring_buffers = ring_size_kb / (BLCKSZ / 1024);
555 : :
556 : : /* 0 means unlimited, so no BufferAccessStrategy required */
557 [ + + ]: 2664 : if (ring_buffers == 0)
558 : 2 : return NULL;
559 : :
560 : : /* Cap to 1/8th of shared_buffers */
561 [ - + ]: 2662 : ring_buffers = Min(NBuffers / 8, ring_buffers);
562 : :
563 : : /* NBuffers should never be less than 16, so this shouldn't happen */
564 [ + - ]: 2662 : Assert(ring_buffers > 0);
565 : :
566 : : /* Allocate the object and initialize all elements to zeroes */
567 : 2662 : strategy = (BufferAccessStrategy)
568 : 2662 : palloc0(offsetof(BufferAccessStrategyData, buffers) +
569 : 2662 : ring_buffers * sizeof(Buffer));
570 : :
571 : : /* Set fields that don't start out zero */
572 : 2662 : strategy->btype = btype;
573 : 2662 : strategy->nbuffers = ring_buffers;
574 : :
575 : 2662 : return strategy;
576 : 2664 : }
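: :
: : /*
: :  * Worked example, again assuming BLCKSZ = 8192: a request for 2048kB (the
: :  * BAS_VACUUM default above) becomes 2048 / 8 = 256 ring buffers, which is
: :  * then clamped to NBuffers / 8 when shared_buffers is small.
: :  */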
577 : :
578 : : /*
579 : : * GetAccessStrategyBufferCount -- an accessor for the number of buffers in
580 : : * the ring
581 : : *
582 : : * Returns 0 on NULL input to match behavior of GetAccessStrategyWithSize()
583 : : * returning NULL with 0 size.
584 : : */
585 : : int
586 : 6 : GetAccessStrategyBufferCount(BufferAccessStrategy strategy)
587 : : {
588 [ + - ]: 6 : if (strategy == NULL)
589 : 0 : return 0;
590 : :
591 : 6 : return strategy->nbuffers;
592 : 6 : }
593 : :
594 : : /*
595 : : * GetAccessStrategyPinLimit -- get cap of number of buffers that should be pinned
596 : : *
597 : : * When pinning extra buffers to look ahead, users of a ring-based strategy
598 : : * are in danger of pinning too much of the ring at once.
599 : : * For some strategies, that means "escaping" from the ring, and in others it
600 : : * means forcing dirty data to disk very frequently with associated WAL
601 : : * flushing. Since external code has no insight into any of that, allow
602 : : * individual strategy types to expose a clamp that should be applied when
603 : : * deciding on a maximum number of buffers to pin at once.
604 : : *
605 : : * Callers should combine this number with other relevant limits and take the
606 : : * minimum.
607 : : */
608 : : int
609 : 347601 : GetAccessStrategyPinLimit(BufferAccessStrategy strategy)
610 : : {
611 [ + + ]: 347601 : if (strategy == NULL)
612 : 345007 : return NBuffers;
613 : :
614 [ + + ]: 2594 : switch (strategy->btype)
615 : : {
616 : : case BAS_BULKREAD:
617 : :
618 : : /*
619 : : * Since BAS_BULKREAD uses StrategyRejectBuffer(), dirty buffers
620 : : * shouldn't be a problem and the caller is free to pin up to the
621 : : * entire ring at once.
622 : : */
623 : 738 : return strategy->nbuffers;
624 : :
625 : : default:
626 : :
627 : : /*
628 : : * Tell caller not to pin more than half the buffers in the ring.
629 : : * This is a trade-off between look ahead distance and deferring
630 : : * writeback and associated WAL traffic.
631 : : */
632 : 1856 : return strategy->nbuffers / 2;
633 : : }
634 : 347601 : }
635 : :
636 : : /*
637 : : * FreeAccessStrategy -- release a BufferAccessStrategy object
638 : : *
639 : : * A simple pfree would do at the moment, but we would prefer that callers
640 : : * don't assume that much about the representation of BufferAccessStrategy.
641 : : */
642 : : void
643 : 2110 : FreeAccessStrategy(BufferAccessStrategy strategy)
644 : : {
645 : : /* don't crash if called on a "default" strategy */
646 [ - + ]: 2110 : if (strategy != NULL)
647 : 2110 : pfree(strategy);
648 : 2110 : }
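: :
: : /*
: :  * Minimal usage sketch (not part of freelist.c): a bulk sequential read
: :  * that routes its buffer usage through a ring so it does not evict the
: :  * whole buffer cache. The function name and its "rel"/"blkno" parameters
: :  * are hypothetical, and the relation headers are not included by this file.
: :  */
: : #ifdef NOT_USED
: : static void
: : bulkread_usage_sketch(Relation rel, BlockNumber blkno)
: : {
: :     BufferAccessStrategy bstrategy = GetAccessStrategy(BAS_BULKREAD);
: :     Buffer      buf;
: :
: :     buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL, bstrategy);
: :     /* ... inspect the page ... */
: :     ReleaseBuffer(buf);
: :     FreeAccessStrategy(bstrategy);
: : }
: : #endif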
649 : :
650 : : /*
651 : : * GetBufferFromRing -- returns a buffer from the ring, or NULL if the
652 : : * ring is empty / not usable.
653 : : *
654 : : * The buffer is pinned and marked as owned, using TrackNewBufferPin(), before
655 : : * returning.
656 : : */
657 : : static BufferDesc *
658 : 9305 : GetBufferFromRing(BufferAccessStrategy strategy, uint64 *buf_state)
659 : : {
660 : 9305 : BufferDesc *buf;
661 : 9305 : Buffer bufnum;
662 : 9305 : uint64 old_buf_state;
663 : 9305 : uint64 local_buf_state; /* to avoid repeated (de-)referencing */
664 : :
665 : :
666 : : /* Advance to next ring slot */
667 [ + + ]: 9305 : if (++strategy->current >= strategy->nbuffers)
668 : 3 : strategy->current = 0;
669 : :
670 : : /*
671 : : * If the slot hasn't been filled yet, tell the caller to allocate a new
672 : : * buffer with the normal allocation strategy. The caller will then fill this
673 : : * slot by calling AddBufferToRing with the new buffer.
674 : : */
675 : 9305 : bufnum = strategy->buffers[strategy->current];
676 [ + + ]: 9305 : if (bufnum == InvalidBuffer)
677 : 9288 : return NULL;
678 : :
679 : 17 : buf = GetBufferDescriptor(bufnum - 1);
680 : :
681 : : /*
682 : : * Check whether the buffer can be used and pin it if so. Do this using a
683 : : * CAS loop, to avoid having to lock the buffer header.
684 : : */
685 : 17 : old_buf_state = pg_atomic_read_u64(&buf->state);
686 : 17 : for (;;)
687 : : {
688 : 17 : local_buf_state = old_buf_state;
689 : :
690 : : /*
691 : : * If the buffer is pinned we cannot use it under any circumstances.
692 : : *
693 : : * If usage_count is 0 or 1 then the buffer is fair game (we expect 1,
694 : : * since our own previous usage of the ring element would have left it
695 : : * there, but it might've been decremented by clock-sweep since then).
696 : : * A higher usage_count indicates someone else has touched the buffer,
697 : : * so we shouldn't re-use it.
698 : : */
699 : 17 : if (BUF_STATE_GET_REFCOUNT(local_buf_state) != 0
700 [ + - - + ]: 17 : || BUF_STATE_GET_USAGECOUNT(local_buf_state) > 1)
701 : 0 : break;
702 : :
703 : : /* See equivalent code in PinBuffer() */
704 [ - + ]: 17 : if (unlikely(local_buf_state & BM_LOCKED))
705 : : {
706 : 0 : old_buf_state = WaitBufHdrUnlocked(buf);
707 : 0 : continue;
708 : : }
709 : :
710 : : /* pin the buffer if the CAS succeeds */
711 : 17 : local_buf_state += BUF_REFCOUNT_ONE;
712 : :
713 [ + - + - ]: 34 : if (pg_atomic_compare_exchange_u64(&buf->state, &old_buf_state,
714 : 17 : local_buf_state))
715 : : {
716 : 17 : *buf_state = local_buf_state;
717 : :
718 : 17 : TrackNewBufferPin(BufferDescriptorGetBuffer(buf));
719 : 17 : return buf;
720 : : }
721 : : }
722 : :
723 : : /*
724 : : * Tell caller to allocate a new buffer with the normal allocation
725 : : * strategy. The caller will then replace this ring element via AddBufferToRing.
726 : : */
727 : 0 : return NULL;
728 : 9305 : }
729 : :
730 : : /*
731 : : * AddBufferToRing -- add a buffer to the buffer ring
732 : : *
733 : : * Caller must hold the buffer header spinlock on the buffer. Since this
734 : : * is called with the spinlock held, it had better be quite cheap.
735 : : */
736 : : static void
737 : 9288 : AddBufferToRing(BufferAccessStrategy strategy, BufferDesc *buf)
738 : : {
739 : 9288 : strategy->buffers[strategy->current] = BufferDescriptorGetBuffer(buf);
740 : 9288 : }
741 : :
742 : : /*
743 : : * Utility function returning the IOContext of a given BufferAccessStrategy's
744 : : * strategy ring.
745 : : */
746 : : IOContext
747 : 11370285 : IOContextForStrategy(BufferAccessStrategy strategy)
748 : : {
749 [ + + ]: 11370285 : if (!strategy)
750 : 11317990 : return IOCONTEXT_NORMAL;
751 : :
752 [ - + + + : 52295 : switch (strategy->btype)
- ]
753 : : {
754 : : case BAS_NORMAL:
755 : :
756 : : /*
757 : : * Currently, GetAccessStrategy() returns NULL for
758 : : * BufferAccessStrategyType BAS_NORMAL, so this case is
759 : : * unreachable.
760 : : */
761 : 0 : pg_unreachable();
762 : : return IOCONTEXT_NORMAL;
763 : : case BAS_BULKREAD:
764 : 4018 : return IOCONTEXT_BULKREAD;
765 : : case BAS_BULKWRITE:
766 : 16488 : return IOCONTEXT_BULKWRITE;
767 : : case BAS_VACUUM:
768 : 31789 : return IOCONTEXT_VACUUM;
769 : : }
770 : :
771 [ # # # # ]: 0 : elog(ERROR, "unrecognized BufferAccessStrategyType: %d", strategy->btype);
772 : 0 : pg_unreachable();
773 : 11370285 : }
774 : :
775 : : /*
776 : : * StrategyRejectBuffer -- consider rejecting a dirty buffer
777 : : *
778 : : * When a nondefault strategy is used, the buffer manager calls this function
779 : : * when it turns out that the buffer selected by StrategyGetBuffer needs to
780 : : * be written out and doing so would require flushing WAL too. This gives us
781 : : * a chance to choose a different victim.
782 : : *
783 : : * Returns true if buffer manager should ask for a new victim, and false
784 : : * if this buffer should be written and re-used.
785 : : */
786 : : bool
787 : 0 : StrategyRejectBuffer(BufferAccessStrategy strategy, BufferDesc *buf, bool from_ring)
788 : : {
789 : : /* We only do this in bulkread mode */
790 [ # # ]: 0 : if (strategy->btype != BAS_BULKREAD)
791 : 0 : return false;
792 : :
793 : : /* Don't muck with behavior of normal buffer-replacement strategy */
794 [ # # # # ]: 0 : if (!from_ring ||
795 : 0 : strategy->buffers[strategy->current] != BufferDescriptorGetBuffer(buf))
796 : 0 : return false;
797 : :
798 : : /*
799 : : * Remove the dirty buffer from the ring; necessary to prevent infinite
800 : : * loop if all ring members are dirty.
801 : : */
802 : 0 : strategy->buffers[strategy->current] = InvalidBuffer;
803 : :
804 : 0 : return true;
805 : 0 : }