Branch data Line data Source code
1 : : /*-------------------------------------------------------------------------
2 : : *
3 : : * nbtree.c
4 : : * Implementation of Lehman and Yao's btree management algorithm for
5 : : * Postgres.
6 : : *
7 : : * NOTES
8 : : * This file contains only the public interface routines.
9 : : *
10 : : *
11 : : * Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group
12 : : * Portions Copyright (c) 1994, Regents of the University of California
13 : : *
14 : : * IDENTIFICATION
15 : : * src/backend/access/nbtree/nbtree.c
16 : : *
17 : : *-------------------------------------------------------------------------
18 : : */
19 : : #include "postgres.h"
20 : :
21 : : #include "access/nbtree.h"
22 : : #include "access/relscan.h"
23 : : #include "access/stratnum.h"
24 : : #include "commands/progress.h"
25 : : #include "commands/vacuum.h"
26 : : #include "nodes/execnodes.h"
27 : : #include "pgstat.h"
28 : : #include "storage/bulk_write.h"
29 : : #include "storage/condition_variable.h"
30 : : #include "storage/indexfsm.h"
31 : : #include "storage/ipc.h"
32 : : #include "storage/lmgr.h"
33 : : #include "storage/read_stream.h"
34 : : #include "utils/datum.h"
35 : : #include "utils/fmgrprotos.h"
36 : : #include "utils/index_selfuncs.h"
37 : : #include "utils/memutils.h"
38 : :
39 : :
40 : : /*
41 : : * BTPARALLEL_NOT_INITIALIZED indicates that the scan has not started.
42 : : *
43 : : * BTPARALLEL_NEED_PRIMSCAN indicates that some process must now seize the
44 : : * scan to advance it via another call to _bt_first.
45 : : *
46 : : * BTPARALLEL_ADVANCING indicates that some process is advancing the scan to
47 : : * a new page; others must wait.
48 : : *
49 : : * BTPARALLEL_IDLE indicates that no backend is currently advancing the scan
50 : : * to a new page; some process can start doing that.
51 : : *
52 : : * BTPARALLEL_DONE indicates that the scan is complete (including error exit).
53 : : */
54 : : typedef enum
55 : : {
56 : : BTPARALLEL_NOT_INITIALIZED,
57 : : BTPARALLEL_NEED_PRIMSCAN,
58 : : BTPARALLEL_ADVANCING,
59 : : BTPARALLEL_IDLE,
60 : : BTPARALLEL_DONE,
61 : : } BTPS_State;
62 : :
63 : : /*
64 : : * BTParallelScanDescData contains btree specific shared information required
65 : : * for parallel scan.
66 : : */
67 : : typedef struct BTParallelScanDescData
68 : : {
69 : : BlockNumber btps_nextScanPage; /* next page to be scanned */
70 : : BlockNumber btps_lastCurrPage; /* page whose sibling link was copied into
71 : : * btps_nextScanPage */
72 : : BTPS_State btps_pageStatus; /* indicates whether next page is
73 : : * available for scan. see above for
74 : : * possible states of parallel scan. */
75 : : LWLock btps_lock; /* protects shared parallel state */
76 : : ConditionVariable btps_cv; /* used to synchronize parallel scan */
77 : :
78 : : /*
79 : : * btps_arrElems is used when scans need to schedule another primitive
80 : : * index scan with one or more SAOP arrays. Holds BTArrayKeyInfo.cur_elem
81 : : * offsets for each = scan key associated with a ScalarArrayOp array.
82 : : */
83 : : int btps_arrElems[FLEXIBLE_ARRAY_MEMBER];
84 : :
85 : : /*
86 : : * Additional space (at the end of the struct) is used when scans need to
87 : : * schedule another primitive index scan with one or more skip arrays.
88 : : * Holds a flattened datum representation for each = scan key associated
89 : : * with a skip array.
90 : : */
91 : : } BTParallelScanDescData;
92 : :
93 : : typedef struct BTParallelScanDescData *BTParallelScanDesc;
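An illustrative summary (not part of the original source) of how these BTPS_State values flow between the parallel-scan routines defined later in this file; the authoritative transitions are in _bt_parallel_seize, _bt_parallel_release, _bt_parallel_primscan_schedule, and _bt_parallel_done.

/*
 * Sketch of the state transitions, as implemented below:
 *
 *   BTPARALLEL_NOT_INITIALIZED --_bt_parallel_seize--> BTPARALLEL_ADVANCING
 *   BTPARALLEL_ADVANCING --_bt_parallel_release--> BTPARALLEL_IDLE
 *   BTPARALLEL_IDLE --_bt_parallel_seize--> BTPARALLEL_ADVANCING
 *   BTPARALLEL_IDLE --_bt_parallel_primscan_schedule--> BTPARALLEL_NEED_PRIMSCAN
 *   BTPARALLEL_NEED_PRIMSCAN --_bt_parallel_seize (from _bt_first)--> BTPARALLEL_ADVANCING
 *   any state except NEED_PRIMSCAN --_bt_parallel_done--> BTPARALLEL_DONE
 */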
94 : :
95 : :
96 : : static bool _bt_start_prim_scan(IndexScanDesc scan);
97 : : static void _bt_parallel_serialize_arrays(Relation rel, BTParallelScanDesc btscan,
98 : : BTScanOpaque so);
99 : : static void _bt_parallel_restore_arrays(Relation rel, BTParallelScanDesc btscan,
100 : : BTScanOpaque so);
101 : : static void btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
102 : : IndexBulkDeleteCallback callback, void *callback_state,
103 : : BTCycleId cycleid);
104 : : static BlockNumber btvacuumpage(BTVacState *vstate, Buffer buf);
105 : : static BTVacuumPosting btreevacuumposting(BTVacState *vstate,
106 : : IndexTuple posting,
107 : : OffsetNumber updatedoffset,
108 : : int *nremaining);
109 : :
110 : :
111 : : /*
112 : : * Btree handler function: return IndexAmRoutine with access method parameters
113 : : * and callbacks.
114 : : */
115 : : Datum
116 : 102304 : bthandler(PG_FUNCTION_ARGS)
117 : : {
118 : : static const IndexAmRoutine amroutine = {
119 : : .type = T_IndexAmRoutine,
120 : : .amstrategies = BTMaxStrategyNumber,
121 : : .amsupport = BTNProcs,
122 : : .amoptsprocnum = BTOPTIONS_PROC,
123 : : .amcanorder = true,
124 : : .amcanorderbyop = false,
125 : : .amcanhash = false,
126 : : .amconsistentequality = true,
127 : : .amconsistentordering = true,
128 : : .amcanbackward = true,
129 : : .amcanunique = true,
130 : : .amcanmulticol = true,
131 : : .amoptionalkey = true,
132 : : .amsearcharray = true,
133 : : .amsearchnulls = true,
134 : : .amstorage = false,
135 : : .amclusterable = true,
136 : : .ampredlocks = true,
137 : : .amcanparallel = true,
138 : : .amcanbuildparallel = true,
139 : : .amcaninclude = true,
140 : : .amusemaintenanceworkmem = false,
141 : : .amsummarizing = false,
142 : : .amparallelvacuumoptions =
143 : : VACUUM_OPTION_PARALLEL_BULKDEL | VACUUM_OPTION_PARALLEL_COND_CLEANUP,
144 : : .amkeytype = InvalidOid,
145 : :
146 : : .ambuild = btbuild,
147 : : .ambuildempty = btbuildempty,
148 : : .aminsert = btinsert,
149 : : .aminsertcleanup = NULL,
150 : : .ambulkdelete = btbulkdelete,
151 : : .amvacuumcleanup = btvacuumcleanup,
152 : : .amcanreturn = btcanreturn,
153 : : .amcostestimate = btcostestimate,
154 : : .amgettreeheight = btgettreeheight,
155 : : .amoptions = btoptions,
156 : : .amproperty = btproperty,
157 : : .ambuildphasename = btbuildphasename,
158 : : .amvalidate = btvalidate,
159 : : .amadjustmembers = btadjustmembers,
160 : : .ambeginscan = btbeginscan,
161 : : .amrescan = btrescan,
162 : : .amgettuple = btgettuple,
163 : : .amgetbitmap = btgetbitmap,
164 : : .amendscan = btendscan,
165 : : .ammarkpos = btmarkpos,
166 : : .amrestrpos = btrestrpos,
167 : : .amestimateparallelscan = btestimateparallelscan,
168 : : .aminitparallelscan = btinitparallelscan,
169 : : .amparallelrescan = btparallelrescan,
170 : : .amtranslatestrategy = bttranslatestrategy,
171 : : .amtranslatecmptype = bttranslatecmptype,
172 : : };
173 : :
174 : 102304 : PG_RETURN_POINTER(&amroutine);
175 : : }
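A hedged, illustrative sketch (not part of nbtree.c) of how core code consumes this handler: the pg_am.amhandler OID for btree points at bthandler(), and callers obtain the IndexAmRoutine through GetIndexAmRoutine() from src/backend/access/index/amapi.c. The wrapper function name below is hypothetical.

#include "access/amapi.h"

/*
 * Illustrative wrapper: GetIndexAmRoutine() calls the handler function and
 * checks that it returned an IndexAmRoutine node before handing it back.
 */
static IndexAmRoutine *
lookup_btree_routine(Oid amhandler)
{
    return GetIndexAmRoutine(amhandler);
}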
176 : :
177 : : /*
178 : : * btbuildempty() -- build an empty btree index in the initialization fork
179 : : */
180 : : void
181 : 17 : btbuildempty(Relation index)
182 : : {
183 : 17 : bool allequalimage = _bt_allequalimage(index, false);
184 : 17 : BulkWriteState *bulkstate;
185 : 17 : BulkWriteBuffer metabuf;
186 : :
187 : 17 : bulkstate = smgr_bulk_start_rel(index, INIT_FORKNUM);
188 : :
189 : : /* Construct metapage. */
190 : 17 : metabuf = smgr_bulk_get_buf(bulkstate);
191 : 17 : _bt_initmetapage((Page) metabuf, P_NONE, 0, allequalimage);
192 : 17 : smgr_bulk_write(bulkstate, BTREE_METAPAGE, metabuf, true);
193 : :
194 : 17 : smgr_bulk_finish(bulkstate);
195 : 17 : }
196 : :
197 : : /*
198 : : * btinsert() -- insert an index tuple into a btree.
199 : : *
200 : : * Descend the tree recursively, find the appropriate location for our
201 : : * new tuple, and put it there.
202 : : */
203 : : bool
204 : 790593 : btinsert(Relation rel, Datum *values, bool *isnull,
205 : : ItemPointer ht_ctid, Relation heapRel,
206 : : IndexUniqueCheck checkUnique,
207 : : bool indexUnchanged,
208 : : IndexInfo *indexInfo)
209 : : {
210 : 790593 : bool result;
211 : 790593 : IndexTuple itup;
212 : :
213 : : /* generate an index tuple */
214 : 790593 : itup = index_form_tuple(RelationGetDescr(rel), values, isnull);
215 : 790593 : itup->t_tid = *ht_ctid;
216 : :
217 : 790593 : result = _bt_doinsert(rel, itup, checkUnique, indexUnchanged, heapRel);
218 : :
219 : 790593 : pfree(itup);
220 : :
221 : 1581186 : return result;
222 : 790593 : }
223 : :
224 : : /*
225 : : * btgettuple() -- Get the next tuple in the scan.
226 : : */
227 : : bool
228 : 2751579 : btgettuple(IndexScanDesc scan, ScanDirection dir)
229 : : {
230 : 2751579 : BTScanOpaque so = (BTScanOpaque) scan->opaque;
231 : 2751579 : bool res;
232 : :
233 [ + - ]: 2751579 : Assert(scan->heapRelation != NULL);
234 : :
235 : : /* btree indexes are never lossy */
236 : 2751579 : scan->xs_recheck = false;
237 : :
238 : : /* Each loop iteration performs another primitive index scan */
239 : 2751579 : do
240 : : {
241 : : /*
242 : : * If we've already initialized this scan, we can just advance it in
243 : : * the appropriate direction. If we haven't done so yet, we call
244 : : * _bt_first() to get the first item in the scan.
245 : : */
246 [ + + - + : 2754395 : if (!BTScanPosIsValid(so->currPos))
+ + ]
247 : 1053880 : res = _bt_first(scan, dir);
248 : : else
249 : : {
250 : : /*
251 : : * Check to see if we should kill the previously-fetched tuple.
252 : : */
253 [ + + ]: 1700515 : if (scan->kill_prior_tuple)
254 : : {
255 : : /*
256 : : * Yes, remember it for later. (We'll deal with all such
257 : : * tuples at once right before leaving the index page.) The
258 : : * test for numKilled overrun is not just paranoia: if the
259 : : * caller reverses direction in the indexscan then the same
260 : : * item might get entered multiple times. It's not worth
261 : : * trying to optimize that, so we don't detect it, but instead
262 : : * just forget any excess entries.
263 : : */
264 [ + + ]: 66831 : if (so->killedItems == NULL)
265 : 8132 : so->killedItems = palloc_array(int, MaxTIDsPerBTreePage);
266 [ - + ]: 66831 : if (so->numKilled < MaxTIDsPerBTreePage)
267 : 66831 : so->killedItems[so->numKilled++] = so->currPos.itemIndex;
268 : 66831 : }
269 : :
270 : : /*
271 : : * Now continue the scan.
272 : : */
273 : 1700515 : res = _bt_next(scan, dir);
274 : : }
275 : :
276 : : /* If we have a tuple, return it ... */
277 [ + + ]: 2754395 : if (res)
278 : 2241227 : break;
279 : : /* ... otherwise see if we need another primitive index scan */
280 [ + + + + ]: 513168 : } while (so->numArrayKeys && _bt_start_prim_scan(scan));
281 : :
282 : 5503158 : return res;
283 : 2751579 : }
284 : :
285 : : /*
286 : : * btgetbitmap() -- gets all matching tuples, and adds them to a bitmap
287 : : */
288 : : int64
289 : 1089 : btgetbitmap(IndexScanDesc scan, TIDBitmap *tbm)
290 : : {
291 : 1089 : BTScanOpaque so = (BTScanOpaque) scan->opaque;
292 : 1089 : int64 ntids = 0;
293 : 1089 : ItemPointer heapTid;
294 : :
295 [ + - ]: 1089 : Assert(scan->heapRelation == NULL);
296 : :
297 : : /* Each loop iteration performs another primitive index scan */
298 : 1089 : do
299 : : {
300 : : /* Fetch the first page & tuple */
301 [ + + ]: 1183 : if (_bt_first(scan, ForwardScanDirection))
302 : : {
303 : : /* Save tuple ID, and continue scanning */
304 : 666 : heapTid = &scan->xs_heaptid;
305 : 666 : tbm_add_tuples(tbm, heapTid, 1, false);
306 : 666 : ntids++;
307 : :
308 : 230158 : for (;;)
309 : : {
310 : : /*
311 : : * Advance to next tuple within page. This is the same as the
312 : : * easy case in _bt_next().
313 : : */
314 [ + + ]: 230158 : if (++so->currPos.itemIndex > so->currPos.lastItem)
315 : : {
316 : : /* let _bt_next do the heavy lifting */
317 [ + + ]: 1091 : if (!_bt_next(scan, ForwardScanDirection))
318 : 666 : break;
319 : 425 : }
320 : :
321 : : /* Save tuple ID, and continue scanning */
322 : 229492 : heapTid = &so->currPos.items[so->currPos.itemIndex].heapTid;
323 : 229492 : tbm_add_tuples(tbm, heapTid, 1, false);
324 : 229492 : ntids++;
325 : : }
326 : 666 : }
327 : : /* Now see if we need another primitive index scan */
328 [ + + + + ]: 1183 : } while (so->numArrayKeys && _bt_start_prim_scan(scan));
329 : :
330 : 2178 : return ntids;
331 : 1089 : }
332 : :
333 : : /*
334 : : * btbeginscan() -- start a scan on a btree index
335 : : */
336 : : IndexScanDesc
337 : 993374 : btbeginscan(Relation rel, int nkeys, int norderbys)
338 : : {
339 : 993374 : IndexScanDesc scan;
340 : 993374 : BTScanOpaque so;
341 : :
342 : : /* no order by operators allowed */
343 [ + - ]: 993374 : Assert(norderbys == 0);
344 : :
345 : : /* get the scan */
346 : 993374 : scan = RelationGetIndexScan(rel, nkeys, norderbys);
347 : :
348 : : /* allocate private workspace */
349 : 993374 : so = palloc_object(BTScanOpaqueData);
350 : 993374 : BTScanPosInvalidate(so->currPos);
351 : 993374 : BTScanPosInvalidate(so->markPos);
352 [ + + ]: 993374 : if (scan->numberOfKeys > 0)
353 : 992648 : so->keyData = (ScanKey) palloc(scan->numberOfKeys * sizeof(ScanKeyData));
354 : : else
355 : 726 : so->keyData = NULL;
356 : :
357 : 993374 : so->skipScan = false;
358 : 993374 : so->needPrimScan = false;
359 : 993374 : so->scanBehind = false;
360 : 993374 : so->oppositeDirCheck = false;
361 : 993374 : so->arrayKeys = NULL;
362 : 993374 : so->orderProcs = NULL;
363 : 993374 : so->arrayContext = NULL;
364 : :
365 : 993374 : so->killedItems = NULL; /* until needed */
366 : 993374 : so->numKilled = 0;
367 : :
368 : : /*
369 : : * We don't know yet whether the scan will be index-only, so we do not
370 : : * allocate the tuple workspace arrays until btrescan. However, we set up
371 : : * scan->xs_itupdesc whether we'll need it or not, since that's so cheap.
372 : : */
373 : 993374 : so->currTuples = so->markTuples = NULL;
374 : :
375 : 993374 : scan->xs_itupdesc = RelationGetDescr(rel);
376 : :
377 : 993374 : scan->opaque = so;
378 : :
379 : 1986748 : return scan;
380 : 993374 : }
381 : :
382 : : /*
383 : : * btrescan() -- rescan an index relation
384 : : */
385 : : void
386 : 1052233 : btrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys,
387 : : ScanKey orderbys, int norderbys)
388 : : {
389 : 1052233 : BTScanOpaque so = (BTScanOpaque) scan->opaque;
390 : :
391 : : /* we aren't holding any read locks, but gotta drop the pins */
392 [ + + + - : 1052233 : if (BTScanPosIsValid(so->currPos))
+ + ]
393 : : {
394 : : /* Before leaving current page, deal with any killed items */
395 [ + + ]: 10625 : if (so->numKilled > 0)
396 : 85 : _bt_killitems(scan);
397 [ - + # # : 10625 : BTScanPosUnpinIfPinned(so->currPos);
+ + ]
398 : 10625 : BTScanPosInvalidate(so->currPos);
399 : 10625 : }
400 : :
401 : : /*
402 : : * We prefer to eagerly drop leaf page pins before btgettuple returns.
403 : : * This avoids making VACUUM wait to acquire a cleanup lock on the page.
404 : : *
405 : : * We cannot safely drop leaf page pins during index-only scans due to a
406 : : * race condition involving VACUUM setting pages all-visible in the VM.
407 : : * It's also unsafe for plain index scans that use a non-MVCC snapshot.
408 : : *
409 : : * When we drop pins eagerly, the mechanism that marks so->killedItems[]
410 : : * index tuples LP_DEAD has to deal with concurrent TID recycling races.
411 : : * The scheme used to detect unsafe TID recycling won't work when scanning
412 : : * unlogged relations (since it involves saving an affected page's LSN).
413 : : * Opt out of eager pin dropping during unlogged relation scans for now
414 : : * (this is preferable to opting out of kill_prior_tuple LP_DEAD setting).
415 : : *
416 : : * Also opt out of dropping leaf page pins eagerly during bitmap scans.
417 : : * Pins cannot be held for more than an instant during bitmap scans either
418 : : * way, so we might as well avoid wasting cycles on acquiring page LSNs.
419 : : *
420 : : * See nbtree/README section on making concurrent TID recycling safe.
421 : : *
422 : : * Note: so->dropPin should never change across rescans.
423 : : */
424 [ + + ]: 2057772 : so->dropPin = (!scan->xs_want_itup &&
425 [ + + ]: 1005539 : IsMVCCSnapshot(scan->xs_snapshot) &&
426 [ + + + + : 1005539 : RelationNeedsWAL(scan->indexRelation) &&
+ + + + ]
427 : 968066 : scan->heapRelation != NULL);
428 : :
429 : 1052233 : so->markItemIndex = -1;
430 : 1052233 : so->needPrimScan = false;
431 : 1052233 : so->scanBehind = false;
432 : 1052233 : so->oppositeDirCheck = false;
433 [ + - + - : 1052233 : BTScanPosUnpinIfPinned(so->markPos);
+ - ]
434 : 1052233 : BTScanPosInvalidate(so->markPos);
435 : :
436 : : /*
437 : : * Allocate tuple workspace arrays, if needed for an index-only scan and
438 : : * not already done in a previous rescan call. To save on palloc
439 : : * overhead, both workspaces are allocated as one palloc block; only this
440 : : * function and btendscan know that.
441 : : */
442 [ + + + + ]: 1052233 : if (scan->xs_want_itup && so->currTuples == NULL)
443 : : {
444 : 9608 : so->currTuples = (char *) palloc(BLCKSZ * 2);
445 : 9608 : so->markTuples = so->currTuples + BLCKSZ;
446 : 9608 : }
447 : :
448 : : /*
449 : : * Reset the scan keys
450 : : */
451 [ + + + + ]: 1052233 : if (scankey && scan->numberOfKeys > 0)
452 : 1051498 : memcpy(scan->keyData, scankey, scan->numberOfKeys * sizeof(ScanKeyData));
453 : 1052233 : so->numberOfKeys = 0; /* until _bt_preprocess_keys sets it */
454 : 1052233 : so->numArrayKeys = 0; /* ditto */
455 : 1052233 : }
456 : :
457 : : /*
458 : : * btendscan() -- close down a scan
459 : : */
460 : : void
461 : 993162 : btendscan(IndexScanDesc scan)
462 : : {
463 : 993162 : BTScanOpaque so = (BTScanOpaque) scan->opaque;
464 : :
465 : : /* we aren't holding any read locks, but gotta drop the pins */
466 [ + + + - : 993162 : if (BTScanPosIsValid(so->currPos))
+ + ]
467 : : {
468 : : /* Before leaving current page, deal with any killed items */
469 [ + + ]: 529943 : if (so->numKilled > 0)
470 : 2567 : _bt_killitems(scan);
471 [ - + # # : 529943 : BTScanPosUnpinIfPinned(so->currPos);
+ + ]
472 : 529943 : }
473 : :
474 : 993162 : so->markItemIndex = -1;
475 [ + + + - : 993162 : BTScanPosUnpinIfPinned(so->markPos);
+ + ]
476 : :
477 : : /* No need to invalidate positions, the RAM is about to be freed. */
478 : :
479 : : /* Release storage */
480 [ + + ]: 993162 : if (so->keyData != NULL)
481 : 992440 : pfree(so->keyData);
482 : : /* so->arrayKeys and so->orderProcs are in arrayContext */
483 [ + + ]: 993162 : if (so->arrayContext != NULL)
484 : 374 : MemoryContextDelete(so->arrayContext);
485 [ + + ]: 993162 : if (so->killedItems != NULL)
486 : 8122 : pfree(so->killedItems);
487 [ + + ]: 993162 : if (so->currTuples != NULL)
488 : 9601 : pfree(so->currTuples);
489 : : /* so->markTuples should not be pfree'd, see btrescan */
490 : 993162 : pfree(so);
491 : 993162 : }
492 : :
493 : : /*
494 : : * btmarkpos() -- save current scan position
495 : : */
496 : : void
497 : 21003 : btmarkpos(IndexScanDesc scan)
498 : : {
499 : 21003 : BTScanOpaque so = (BTScanOpaque) scan->opaque;
500 : :
501 : : /* There may be an old mark with a pin (but no lock). */
502 [ + + + - : 21003 : BTScanPosUnpinIfPinned(so->markPos);
+ + ]
503 : :
504 : : /*
505 : : * Just record the current itemIndex. If we later step to next page
506 : : * before releasing the marked position, _bt_steppage makes a full copy of
507 : : * the currPos struct in markPos. If (as often happens) the mark is moved
508 : : * before we leave the page, we don't have to do that work.
509 : : */
510 [ - + # # : 21003 : if (BTScanPosIsValid(so->currPos))
+ - ]
511 : 21003 : so->markItemIndex = so->currPos.itemIndex;
512 : : else
513 : : {
514 : 0 : BTScanPosInvalidate(so->markPos);
515 : 0 : so->markItemIndex = -1;
516 : : }
517 : 21003 : }
518 : :
519 : : /*
520 : : * btrestrpos() -- restore scan to last saved position
521 : : */
522 : : void
523 : 9003 : btrestrpos(IndexScanDesc scan)
524 : : {
525 : 9003 : BTScanOpaque so = (BTScanOpaque) scan->opaque;
526 : :
527 [ + + ]: 9003 : if (so->markItemIndex >= 0)
528 : : {
529 : : /*
530 : : * The scan has never moved to a new page since the last mark. Just
531 : : * restore the itemIndex.
532 : : *
533 : : * NB: In this case we can't count on anything in so->markPos to be
534 : : * accurate.
535 : : */
536 : 8985 : so->currPos.itemIndex = so->markItemIndex;
537 : 8985 : }
538 : : else
539 : : {
540 : : /*
541 : : * The scan moved to a new page after last mark or restore, and we are
542 : : * now restoring to the marked page. We aren't holding any read
543 : : * locks, but if we're still holding the pin for the current position,
544 : : * we must drop it.
545 : : */
546 [ - + # # : 18 : if (BTScanPosIsValid(so->currPos))
- + ]
547 : : {
548 : : /* Before leaving current page, deal with any killed items */
549 [ + - ]: 18 : if (so->numKilled > 0)
550 : 0 : _bt_killitems(scan);
551 [ - + # # : 18 : BTScanPosUnpinIfPinned(so->currPos);
+ - ]
552 : 18 : }
553 : :
554 [ - + # # : 18 : if (BTScanPosIsValid(so->markPos))
+ - ]
555 : : {
556 : : /* bump pin on mark buffer for assignment to current buffer */
557 [ - + # # : 18 : if (BTScanPosIsPinned(so->markPos))
+ - ]
558 : 0 : IncrBufferRefCount(so->markPos.buf);
559 : 18 : memcpy(&so->currPos, &so->markPos,
560 : : offsetof(BTScanPosData, items[1]) +
561 : : so->markPos.lastItem * sizeof(BTScanPosItem));
562 [ + - ]: 18 : if (so->currTuples)
563 : 0 : memcpy(so->currTuples, so->markTuples,
564 : : so->markPos.nextTupleOffset);
565 : : /* Reset the scan's array keys (see _bt_steppage for why) */
566 [ + - ]: 18 : if (so->numArrayKeys)
567 : : {
568 : 0 : _bt_start_array_keys(scan, so->currPos.dir);
569 : 0 : so->needPrimScan = false;
570 : 0 : }
571 : 18 : }
572 : : else
573 : 0 : BTScanPosInvalidate(so->currPos);
574 : : }
575 : 9003 : }
576 : :
577 : : /*
578 : : * btestimateparallelscan -- estimate storage for BTParallelScanDescData
579 : : */
580 : : Size
581 : 10 : btestimateparallelscan(Relation rel, int nkeys, int norderbys)
582 : : {
583 : 10 : int16 nkeyatts = IndexRelationGetNumberOfKeyAttributes(rel);
584 : 10 : Size estnbtreeshared,
585 : : genericattrspace;
586 : :
587 : : /*
588 : : * Pessimistically assume that every input scan key will be output with
589 : : * its own SAOP array
590 : : */
591 : 10 : estnbtreeshared = offsetof(BTParallelScanDescData, btps_arrElems) +
592 : 10 : sizeof(int) * nkeys;
593 : :
594 : : /* Single column indexes cannot possibly use a skip array */
595 [ + + ]: 10 : if (nkeyatts == 1)
596 : 7 : return estnbtreeshared;
597 : :
598 : : /*
599 : : * Pessimistically assume that all attributes prior to the least
600 : : * significant attribute require a skip array (and an associated key)
601 : : */
602 : 3 : genericattrspace = datumEstimateSpace((Datum) 0, false, true,
603 : : sizeof(Datum));
604 [ + + ]: 6 : for (int attnum = 1; attnum < nkeyatts; attnum++)
605 : : {
606 : 3 : CompactAttribute *attr;
607 : :
608 : : /*
609 : : * We make the conservative assumption that every index column will
610 : : * also require a skip array.
611 : : *
612 : : * Every skip array must have space to store its scan key's sk_flags.
613 : : */
614 : 3 : estnbtreeshared = add_size(estnbtreeshared, sizeof(int));
615 : :
616 : : /* Consider space required to store a datum of opclass input type */
617 : 3 : attr = TupleDescCompactAttr(rel->rd_att, attnum - 1);
618 [ + - ]: 3 : if (attr->attbyval)
619 : : {
620 : : /* This index attribute stores pass-by-value datums */
621 : 6 : Size estfixed = datumEstimateSpace((Datum) 0, false,
622 : 3 : true, attr->attlen);
623 : :
624 : 3 : estnbtreeshared = add_size(estnbtreeshared, estfixed);
625 : : continue;
626 : 3 : }
627 : :
628 : : /*
629 : : * This index attribute stores pass-by-reference datums.
630 : : *
631 : : * Assume that serializing this array will use just as much space as a
632 : : * pass-by-value datum, in addition to space for the largest possible
633 : : * whole index tuple (this is not just a per-datum portion of the
634 : : * largest possible tuple because that'd be almost as large anyway).
635 : : *
636 : : * This is quite conservative, but it's not clear how we could do much
637 : : * better. The executor requires an up-front storage request size
638 : : * that reliably covers the scan's high watermark memory usage. We
639 : : * can't be sure of the real high watermark until the scan is over.
640 : : */
641 : 0 : estnbtreeshared = add_size(estnbtreeshared, genericattrspace);
642 : 0 : estnbtreeshared = add_size(estnbtreeshared, BTMaxItemSize);
643 [ - + - ]: 3 : }
644 : :
645 : 3 : return estnbtreeshared;
646 : 10 : }
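For concreteness, a hedged worked example of the estimate computed above; the arithmetic is left symbolic because struct offsets and datumEstimateSpace() results are platform-dependent.

/*
 * Illustrative only: for an index with two key columns scanned with
 * nkeys = 2, where the leading column stores pass-by-value datums, the
 * loop above runs once (attnum = 1) and the estimate becomes
 *
 *   offsetof(BTParallelScanDescData, btps_arrElems)
 *     + 2 * sizeof(int)                       (btps_arrElems[] slots)
 *     + sizeof(int)                           (skip array's sk_flags)
 *     + datumEstimateSpace((Datum) 0, false, true, attr->attlen)
 *
 * Had the leading column stored pass-by-reference datums, the final term
 * would instead be genericattrspace + BTMaxItemSize.
 */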
647 : :
648 : : /*
649 : : * _bt_start_prim_scan() -- start scheduled primitive index scan?
650 : : *
651 : : * Returns true if _bt_checkkeys scheduled another primitive index scan, just
652 : : * as the last one ended. Otherwise returns false, indicating that the array
653 : : * keys are now fully exhausted.
654 : : *
655 : : * Only call here during scans with one or more equality type array scan keys,
656 : : * after _bt_first or _bt_next return false.
657 : : */
658 : : static bool
659 : 14324 : _bt_start_prim_scan(IndexScanDesc scan)
660 : : {
661 : 14324 : BTScanOpaque so = (BTScanOpaque) scan->opaque;
662 : :
663 [ + - ]: 14324 : Assert(so->numArrayKeys);
664 : :
665 : 14324 : so->scanBehind = so->oppositeDirCheck = false; /* reset */
666 : :
667 : : /*
668 : : * Array keys are advanced within _bt_checkkeys when the scan reaches the
669 : : * leaf level (more precisely, they're advanced when the scan reaches the
670 : : * end of each distinct set of array elements). This process avoids
671 : : * repeat access to leaf pages (across multiple primitive index scans) by
672 : : * advancing the scan's array keys when it allows the primitive index scan
673 : : * to find nearby matching tuples (or when it eliminates ranges of array
674 : : * key space that can't possibly be satisfied by any index tuple).
675 : : *
676 : : * _bt_checkkeys sets a simple flag variable to schedule another primitive
677 : : * index scan. The flag tells us what to do.
678 : : *
679 : : * We cannot rely on _bt_first always reaching _bt_checkkeys. There are
680 : : * various cases where that won't happen. For example, if the index is
681 : : * completely empty, then _bt_first won't call _bt_readpage/_bt_checkkeys.
682 : : * We also don't expect a call to _bt_checkkeys during searches for a
683 : : * non-existent value that happens to be lower/higher than any existing
684 : : * value in the index.
685 : : *
686 : : * We don't require special handling for these cases -- we don't need to
687 : : * be explicitly instructed to _not_ perform another primitive index scan.
688 : : * It's up to code under the control of _bt_first to always set the flag
689 : : * when another primitive index scan will be required.
690 : : *
691 : : * This works correctly, even with the tricky cases listed above, which
692 : : * all involve access to leaf pages "near the boundaries of the key space"
693 : : * (whether it's from a leftmost/rightmost page, or an imaginary empty
694 : : * leaf root page). If _bt_checkkeys cannot be reached by a primitive
695 : : * index scan for one set of array keys, then it also won't be reached for
696 : : * any later set ("later" in terms of the direction that we scan the index
697 : : * and advance the arrays). The array keys won't have advanced in these
698 : : * cases, but that's the correct behavior (even _bt_advance_array_keys
699 : : * won't always advance the arrays at the point they become "exhausted").
700 : : */
701 [ + + ]: 14324 : if (so->needPrimScan)
702 : : {
703 : : /*
704 : : * Flag was set -- must call _bt_first again, which will reset the
705 : : * scan's needPrimScan flag
706 : : */
707 : 2910 : return true;
708 : : }
709 : :
710 : : /* The top-level index scan ran out of tuples in this scan direction */
711 [ + + ]: 11414 : if (scan->parallel_scan != NULL)
712 : 5 : _bt_parallel_done(scan);
713 : :
714 : 11414 : return false;
715 : 14324 : }
716 : :
717 : : /*
718 : : * _bt_parallel_serialize_arrays() -- Serialize parallel array state.
719 : : *
720 : : * Caller must have exclusively locked btscan->btps_lock when called.
721 : : */
722 : : static void
723 : 6 : _bt_parallel_serialize_arrays(Relation rel, BTParallelScanDesc btscan,
724 : : BTScanOpaque so)
725 : : {
726 : 6 : char *datumshared;
727 : :
728 : : /* Space for serialized datums begins immediately after btps_arrElems[] */
729 : 6 : datumshared = ((char *) &btscan->btps_arrElems[so->numArrayKeys]);
730 [ + + ]: 12 : for (int i = 0; i < so->numArrayKeys; i++)
731 : : {
732 : 6 : BTArrayKeyInfo *array = &so->arrayKeys[i];
733 : 6 : ScanKey skey = &so->keyData[array->scan_key];
734 : :
735 [ + - ]: 6 : if (array->num_elems != -1)
736 : : {
737 : : /* Save SAOP array's cur_elem (no need to copy key/datum) */
738 [ + - ]: 6 : Assert(!(skey->sk_flags & SK_BT_SKIP));
739 : 6 : btscan->btps_arrElems[i] = array->cur_elem;
740 : 6 : continue;
741 : : }
742 : :
743 : : /* Save all mutable state associated with skip array's key */
744 [ # # ]: 0 : Assert(skey->sk_flags & SK_BT_SKIP);
745 : 0 : memcpy(datumshared, &skey->sk_flags, sizeof(int));
746 : 0 : datumshared += sizeof(int);
747 : :
748 [ # # ]: 0 : if (skey->sk_flags & (SK_BT_MINVAL | SK_BT_MAXVAL))
749 : : {
750 : : /* No sk_argument datum to serialize */
751 [ # # ]: 0 : Assert(skey->sk_argument == 0);
752 : 0 : continue;
753 : : }
754 : :
755 : 0 : datumSerialize(skey->sk_argument, (skey->sk_flags & SK_ISNULL) != 0,
756 : 0 : array->attbyval, array->attlen, &datumshared);
757 [ - + - ]: 6 : }
758 : 6 : }
759 : :
760 : : /*
761 : : * _bt_parallel_restore_arrays() -- Restore serialized parallel array state.
762 : : *
763 : : * Caller must have exclusively locked btscan->btps_lock when called.
764 : : */
765 : : static void
766 : 6 : _bt_parallel_restore_arrays(Relation rel, BTParallelScanDesc btscan,
767 : : BTScanOpaque so)
768 : : {
769 : 6 : char *datumshared;
770 : :
771 : : /* Space for serialized datums begins immediately after btps_arrElems[] */
772 : 6 : datumshared = ((char *) &btscan->btps_arrElems[so->numArrayKeys]);
773 [ + + ]: 12 : for (int i = 0; i < so->numArrayKeys; i++)
774 : : {
775 : 6 : BTArrayKeyInfo *array = &so->arrayKeys[i];
776 : 6 : ScanKey skey = &so->keyData[array->scan_key];
777 : 6 : bool isnull;
778 : :
779 [ + - ]: 6 : if (array->num_elems != -1)
780 : : {
781 : : /* Restore SAOP array using its saved cur_elem */
782 [ + - ]: 6 : Assert(!(skey->sk_flags & SK_BT_SKIP));
783 : 6 : array->cur_elem = btscan->btps_arrElems[i];
784 : 6 : skey->sk_argument = array->elem_values[array->cur_elem];
785 : 6 : continue;
786 : : }
787 : :
788 : : /* Restore skip array by restoring its key directly */
789 [ # # # # ]: 0 : if (!array->attbyval && skey->sk_argument)
790 : 0 : pfree(DatumGetPointer(skey->sk_argument));
791 : 0 : skey->sk_argument = (Datum) 0;
792 : 0 : memcpy(&skey->sk_flags, datumshared, sizeof(int));
793 : 0 : datumshared += sizeof(int);
794 : :
795 [ # # ]: 0 : Assert(skey->sk_flags & SK_BT_SKIP);
796 : :
797 [ # # ]: 0 : if (skey->sk_flags & (SK_BT_MINVAL | SK_BT_MAXVAL))
798 : : {
799 : : /* No sk_argument datum to restore */
800 : 0 : continue;
801 : : }
802 : :
803 : 0 : skey->sk_argument = datumRestore(&datumshared, &isnull);
804 [ # # ]: 0 : if (isnull)
805 : : {
806 [ # # ]: 0 : Assert(skey->sk_argument == 0);
807 [ # # ]: 0 : Assert(skey->sk_flags & SK_SEARCHNULL);
808 [ # # ]: 0 : Assert(skey->sk_flags & SK_ISNULL);
809 : 0 : }
810 [ - + - ]: 6 : }
811 : 6 : }
812 : :
813 : : /*
814 : : * btinitparallelscan -- initialize BTParallelScanDesc for parallel btree scan
815 : : */
816 : : void
817 : 10 : btinitparallelscan(void *target)
818 : : {
819 : 10 : BTParallelScanDesc bt_target = (BTParallelScanDesc) target;
820 : :
821 : 10 : LWLockInitialize(&bt_target->btps_lock,
822 : : LWTRANCHE_PARALLEL_BTREE_SCAN);
823 : 10 : bt_target->btps_nextScanPage = InvalidBlockNumber;
824 : 10 : bt_target->btps_lastCurrPage = InvalidBlockNumber;
825 : 10 : bt_target->btps_pageStatus = BTPARALLEL_NOT_INITIALIZED;
826 : 10 : ConditionVariableInit(&bt_target->btps_cv);
827 : 10 : }
828 : :
829 : : /*
830 : : * btparallelrescan() -- reset parallel scan
831 : : */
832 : : void
833 : 4 : btparallelrescan(IndexScanDesc scan)
834 : : {
835 : 4 : BTParallelScanDesc btscan;
836 : 4 : ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
837 : :
838 [ + - ]: 4 : Assert(parallel_scan);
839 : :
840 : 4 : btscan = (BTParallelScanDesc) OffsetToPointer(parallel_scan,
841 : : parallel_scan->ps_offset_am);
842 : :
843 : : /*
844 : : * In theory, we don't need to acquire the LWLock here, because there
845 : : * shouldn't be any other workers running at this point, but we do so for
846 : : * consistency.
847 : : */
848 : 4 : LWLockAcquire(&btscan->btps_lock, LW_EXCLUSIVE);
849 : 4 : btscan->btps_nextScanPage = InvalidBlockNumber;
850 : 4 : btscan->btps_lastCurrPage = InvalidBlockNumber;
851 : 4 : btscan->btps_pageStatus = BTPARALLEL_NOT_INITIALIZED;
852 : 4 : LWLockRelease(&btscan->btps_lock);
853 : 4 : }
854 : :
855 : : /*
856 : : * _bt_parallel_seize() -- Begin the process of advancing the scan to a new
857 : : * page. Other scans must wait until we call _bt_parallel_release()
858 : : * or _bt_parallel_done().
859 : : *
860 : : * The return value is true if we successfully seized the scan and false
861 : : * if we did not. The latter case occurs when no pages remain, or when
862 : : * another primitive index scan is scheduled that caller's backend cannot
863 : : * start just yet (only backends that call from _bt_first are capable of
864 : : * starting primitive index scans, which they indicate by passing first=true).
865 : : *
866 : : * If the return value is true, *next_scan_page returns the next page of the
867 : : * scan, and *last_curr_page returns the page that *next_scan_page came from.
868 : : * An invalid *next_scan_page means the scan hasn't yet started, or that
869 : : * caller needs to start the next primitive index scan (if it's the latter
 870 : : * case we'll set so->needPrimScan).

871 : : *
872 : : * Callers should ignore the value of *next_scan_page and *last_curr_page if
873 : : * the return value is false.
874 : : */
875 : : bool
876 : 275 : _bt_parallel_seize(IndexScanDesc scan, BlockNumber *next_scan_page,
877 : : BlockNumber *last_curr_page, bool first)
878 : : {
879 : 275 : Relation rel = scan->indexRelation;
880 : 275 : BTScanOpaque so = (BTScanOpaque) scan->opaque;
881 : 550 : bool exit_loop = false,
882 : 275 : status = true,
883 : 275 : endscan = false;
884 : 275 : ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
885 : 275 : BTParallelScanDesc btscan;
886 : :
887 : 275 : *next_scan_page = InvalidBlockNumber;
888 : 275 : *last_curr_page = InvalidBlockNumber;
889 : :
890 : : /*
891 : : * Reset so->currPos, and initialize moreLeft/moreRight such that the next
892 : : * call to _bt_readnextpage treats this backend similarly to a serial
893 : : * backend that steps from *last_curr_page to *next_scan_page (unless this
894 : : * backend's so->currPos is initialized by _bt_readfirstpage before then).
895 : : */
896 : 275 : BTScanPosInvalidate(so->currPos);
897 : 275 : so->currPos.moreLeft = so->currPos.moreRight = true;
898 : :
899 [ + + ]: 275 : if (first)
900 : : {
901 : : /*
902 : : * Initialize array related state when called from _bt_first, assuming
903 : : * that this will be the first primitive index scan for the scan
904 : : */
905 : 73 : so->needPrimScan = false;
906 : 73 : so->scanBehind = false;
907 : 73 : so->oppositeDirCheck = false;
908 : 73 : }
909 : : else
910 : : {
911 : : /*
912 : : * Don't attempt to seize the scan when it requires another primitive
913 : : * index scan, since caller's backend cannot start it right now
914 : : */
915 [ - + ]: 202 : if (so->needPrimScan)
916 : 0 : return false;
917 : : }
918 : :
919 : 275 : btscan = (BTParallelScanDesc) OffsetToPointer(parallel_scan,
920 : : parallel_scan->ps_offset_am);
921 : :
922 : 275 : while (1)
923 : : {
924 : 275 : LWLockAcquire(&btscan->btps_lock, LW_EXCLUSIVE);
925 : :
926 [ + + ]: 275 : if (btscan->btps_pageStatus == BTPARALLEL_DONE)
927 : : {
928 : : /* We're done with this parallel index scan */
929 : 52 : status = false;
930 : 52 : }
931 [ + + + + ]: 223 : else if (btscan->btps_pageStatus == BTPARALLEL_IDLE &&
932 : 203 : btscan->btps_nextScanPage == P_NONE)
933 : : {
934 : : /* End this parallel index scan */
935 : 1 : status = false;
936 : 1 : endscan = true;
937 : 1 : }
938 [ + + ]: 222 : else if (btscan->btps_pageStatus == BTPARALLEL_NEED_PRIMSCAN)
939 : : {
940 [ - + ]: 6 : Assert(so->numArrayKeys);
941 : :
942 [ + - ]: 6 : if (first)
943 : : {
944 : : /* Can start scheduled primitive scan right away, so do so */
945 : 6 : btscan->btps_pageStatus = BTPARALLEL_ADVANCING;
946 : :
947 : : /* Restore scan's array keys from serialized values */
948 : 6 : _bt_parallel_restore_arrays(rel, btscan, so);
949 : 6 : exit_loop = true;
950 : 6 : }
951 : : else
952 : : {
953 : : /*
954 : : * Don't attempt to seize the scan when it requires another
955 : : * primitive index scan, since caller's backend cannot start
956 : : * it right now
957 : : */
958 : 0 : status = false;
959 : : }
960 : :
961 : : /*
962 : : * Either way, update backend local state to indicate that a
963 : : * pending primitive scan is required
964 : : */
965 : 6 : so->needPrimScan = true;
966 : 6 : so->scanBehind = false;
967 : 6 : so->oppositeDirCheck = false;
968 : 6 : }
969 [ - + ]: 216 : else if (btscan->btps_pageStatus != BTPARALLEL_ADVANCING)
970 : : {
971 : : /*
972 : : * We have successfully seized control of the scan for the purpose
973 : : * of advancing it to a new page!
974 : : */
975 : 216 : btscan->btps_pageStatus = BTPARALLEL_ADVANCING;
976 [ - + ]: 216 : Assert(btscan->btps_nextScanPage != P_NONE);
977 : 216 : *next_scan_page = btscan->btps_nextScanPage;
978 : 216 : *last_curr_page = btscan->btps_lastCurrPage;
979 : 216 : exit_loop = true;
980 : 216 : }
981 : 275 : LWLockRelease(&btscan->btps_lock);
982 [ + + + - ]: 275 : if (exit_loop || !status)
983 : 275 : break;
984 : 0 : ConditionVariableSleep(&btscan->btps_cv, WAIT_EVENT_BTREE_PAGE);
985 : : }
986 : 275 : ConditionVariableCancelSleep();
987 : :
988 : : /* When the scan has reached the rightmost (or leftmost) page, end it */
989 [ + + ]: 275 : if (endscan)
990 : 1 : _bt_parallel_done(scan);
991 : :
992 : 275 : return status;
993 : 275 : }
994 : :
995 : : /*
996 : : * _bt_parallel_release() -- Complete the process of advancing the scan to a
997 : : * new page. We now have the new value btps_nextScanPage; another backend
998 : : * can now begin advancing the scan.
999 : : *
1000 : : * Callers whose scan uses array keys must save their curr_page argument so
1001 : : * that it can be passed to _bt_parallel_primscan_schedule, should caller
1002 : : * determine that another primitive index scan is required.
1003 : : *
1004 : : * If caller's next_scan_page is P_NONE, the scan has reached the index's
1005 : : * rightmost/leftmost page. This is treated as reaching the end of the scan
1006 : : * within _bt_parallel_seize.
1007 : : *
1008 : : * Note: unlike the serial case, parallel scans don't need to remember both
1009 : : * sibling links. next_scan_page is whichever link is next given the scan's
1010 : : * direction. That's all we'll ever need, since the direction of a parallel
1011 : : * scan can never change.
1012 : : */
1013 : : void
1014 : 222 : _bt_parallel_release(IndexScanDesc scan, BlockNumber next_scan_page,
1015 : : BlockNumber curr_page)
1016 : : {
1017 : 222 : ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
1018 : 222 : BTParallelScanDesc btscan;
1019 : :
1020 [ + - ]: 222 : Assert(BlockNumberIsValid(next_scan_page));
1021 : :
1022 : 222 : btscan = (BTParallelScanDesc) OffsetToPointer(parallel_scan,
1023 : : parallel_scan->ps_offset_am);
1024 : :
1025 : 222 : LWLockAcquire(&btscan->btps_lock, LW_EXCLUSIVE);
1026 : 222 : btscan->btps_nextScanPage = next_scan_page;
1027 : 222 : btscan->btps_lastCurrPage = curr_page;
1028 : 222 : btscan->btps_pageStatus = BTPARALLEL_IDLE;
1029 : 222 : LWLockRelease(&btscan->btps_lock);
1030 : 222 : ConditionVariableSignal(&btscan->btps_cv);
1031 : 222 : }
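A hedged, illustrative sketch (not part of nbtree.c) of the caller-side protocol for the two functions above; the real callers live in nbtsearch.c, and advance_one_page() and sibling_blkno are hypothetical names.

static bool
advance_one_page(IndexScanDesc scan)
{
    BlockNumber next_scan_page,
                last_curr_page,
                sibling_blkno = P_NONE; /* would come from the page just read */

    if (!_bt_parallel_seize(scan, &next_scan_page, &last_curr_page, false))
        return false;           /* scan done, or primitive scan pending */

    /*
     * A real caller first handles an invalid next_scan_page (scan not yet
     * started).  Otherwise it reads next_scan_page, notes the sibling link
     * in the scan direction, and immediately lets another backend advance
     * the scan.  Publishing P_NONE here makes a later _bt_parallel_seize
     * end the whole scan via _bt_parallel_done().
     */
    _bt_parallel_release(scan, sibling_blkno, next_scan_page);

    /* ... then process the tuples on the page this backend is holding ... */
    return true;
}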
1032 : :
1033 : : /*
1034 : : * _bt_parallel_done() -- Mark the parallel scan as complete.
1035 : : *
1036 : : * When there are no pages left to scan, this function should be called to
1037 : : * notify other workers. Otherwise, they might wait forever for the scan to
1038 : : * advance to the next page.
1039 : : */
1040 : : void
1041 : 514304 : _bt_parallel_done(IndexScanDesc scan)
1042 : : {
1043 : 514304 : BTScanOpaque so = (BTScanOpaque) scan->opaque;
1044 : 514304 : ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
1045 : 514304 : BTParallelScanDesc btscan;
1046 : 514304 : bool status_changed = false;
1047 : :
1048 [ + - + - : 514304 : Assert(!BTScanPosIsValid(so->currPos));
+ - ]
1049 : :
1050 : : /* Do nothing, for non-parallel scans */
1051 [ + + ]: 514304 : if (parallel_scan == NULL)
1052 : 514279 : return;
1053 : :
1054 : : /*
1055 : : * Should not mark parallel scan done when there's still a pending
1056 : : * primitive index scan
1057 : : */
1058 [ + + ]: 25 : if (so->needPrimScan)
1059 : 6 : return;
1060 : :
1061 : 19 : btscan = (BTParallelScanDesc) OffsetToPointer(parallel_scan,
1062 : : parallel_scan->ps_offset_am);
1063 : :
1064 : : /*
1065 : : * Mark the parallel scan as done, unless some other process did so
1066 : : * already
1067 : : */
1068 : 19 : LWLockAcquire(&btscan->btps_lock, LW_EXCLUSIVE);
1069 [ + - ]: 19 : Assert(btscan->btps_pageStatus != BTPARALLEL_NEED_PRIMSCAN);
1070 [ + + ]: 19 : if (btscan->btps_pageStatus != BTPARALLEL_DONE)
1071 : : {
1072 : 14 : btscan->btps_pageStatus = BTPARALLEL_DONE;
1073 : 14 : status_changed = true;
1074 : 14 : }
1075 : 19 : LWLockRelease(&btscan->btps_lock);
1076 : :
1077 : : /* wake up all the workers associated with this parallel scan */
1078 [ + + ]: 19 : if (status_changed)
1079 : 14 : ConditionVariableBroadcast(&btscan->btps_cv);
1080 [ - + ]: 514304 : }
1081 : :
1082 : : /*
1083 : : * _bt_parallel_primscan_schedule() -- Schedule another primitive index scan.
1084 : : *
1085 : : * Caller passes the curr_page most recently passed to _bt_parallel_release
1086 : : * by its backend. Caller successfully schedules the next primitive index scan
1087 : : * if the shared parallel state hasn't been seized since caller's backend last
1088 : : * advanced the scan.
1089 : : */
1090 : : void
1091 : 6 : _bt_parallel_primscan_schedule(IndexScanDesc scan, BlockNumber curr_page)
1092 : : {
1093 : 6 : Relation rel = scan->indexRelation;
1094 : 6 : BTScanOpaque so = (BTScanOpaque) scan->opaque;
1095 : 6 : ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
1096 : 6 : BTParallelScanDesc btscan;
1097 : :
1098 [ + - ]: 6 : Assert(so->numArrayKeys);
1099 : :
1100 : 6 : btscan = (BTParallelScanDesc) OffsetToPointer(parallel_scan,
1101 : : parallel_scan->ps_offset_am);
1102 : :
1103 : 6 : LWLockAcquire(&btscan->btps_lock, LW_EXCLUSIVE);
1104 [ + - - + ]: 6 : if (btscan->btps_lastCurrPage == curr_page &&
1105 : 6 : btscan->btps_pageStatus == BTPARALLEL_IDLE)
1106 : : {
1107 : 6 : btscan->btps_nextScanPage = InvalidBlockNumber;
1108 : 6 : btscan->btps_lastCurrPage = InvalidBlockNumber;
1109 : 6 : btscan->btps_pageStatus = BTPARALLEL_NEED_PRIMSCAN;
1110 : :
1111 : : /* Serialize scan's current array keys */
1112 : 6 : _bt_parallel_serialize_arrays(rel, btscan, so);
1113 : 6 : }
1114 : 6 : LWLockRelease(&btscan->btps_lock);
1115 : 6 : }
1116 : :
1117 : : /*
1118 : : * Bulk deletion of all index entries pointing to a set of heap tuples.
1119 : : * The set of target tuples is specified via a callback routine that tells
1120 : : * whether any given heap tuple (identified by ItemPointer) is being deleted.
1121 : : *
1122 : : * Result: a palloc'd struct containing statistical info for VACUUM displays.
1123 : : */
1124 : : IndexBulkDeleteResult *
1125 : 169 : btbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
1126 : : IndexBulkDeleteCallback callback, void *callback_state)
1127 : : {
1128 : 169 : Relation rel = info->index;
1129 : 169 : BTCycleId cycleid;
1130 : :
1131 : : /* allocate stats if first time through, else re-use existing struct */
1132 [ + + ]: 169 : if (stats == NULL)
1133 : 161 : stats = palloc0_object(IndexBulkDeleteResult);
1134 : :
1135 : : /* Establish the vacuum cycle ID to use for this scan */
1136 : : /* The ENSURE stuff ensures we clean up shared memory on failure */
1137 [ + - ]: 169 : PG_ENSURE_ERROR_CLEANUP(_bt_end_vacuum_callback, PointerGetDatum(rel));
1138 : : {
1139 : 169 : cycleid = _bt_start_vacuum(rel);
1140 : :
1141 : 169 : btvacuumscan(info, stats, callback, callback_state, cycleid);
1142 : : }
1143 [ + - ]: 169 : PG_END_ENSURE_ERROR_CLEANUP(_bt_end_vacuum_callback, PointerGetDatum(rel));
1144 : 169 : _bt_end_vacuum(rel);
1145 : :
1146 : 338 : return stats;
1147 : 169 : }
1148 : :
1149 : : /*
1150 : : * Post-VACUUM cleanup.
1151 : : *
1152 : : * Result: a palloc'd struct containing statistical info for VACUUM displays.
1153 : : */
1154 : : IndexBulkDeleteResult *
1155 : 869 : btvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
1156 : : {
1157 : 869 : BlockNumber num_delpages;
1158 : :
1159 : : /* No-op in ANALYZE ONLY mode */
1160 [ + + ]: 869 : if (info->analyze_only)
1161 : 294 : return stats;
1162 : :
1163 : : /*
1164 : : * If btbulkdelete was called, we need not do anything (we just maintain
1165 : : * the information used within _bt_vacuum_needs_cleanup() by calling
1166 : : * _bt_set_cleanup_info() below).
1167 : : *
1168 : : * If btbulkdelete was _not_ called, then we have a choice to make: we
1169 : : * must decide whether or not a btvacuumscan() call is needed now (i.e.
1170 : : * whether the ongoing VACUUM operation can entirely avoid a physical scan
1171 : : * of the index). A call to _bt_vacuum_needs_cleanup() decides it for us
1172 : : * now.
1173 : : */
1174 [ + + ]: 575 : if (stats == NULL)
1175 : : {
1176 : : /* Check if VACUUM operation can entirely avoid btvacuumscan() call */
1177 [ + + ]: 469 : if (!_bt_vacuum_needs_cleanup(info->index))
1178 : 468 : return NULL;
1179 : :
1180 : : /*
1181 : : * Since we aren't going to actually delete any leaf items, there's no
1182 : : * need to go through all the vacuum-cycle-ID pushups here.
1183 : : *
1184 : : * Posting list tuples are a source of inaccuracy for cleanup-only
1185 : : * scans. btvacuumscan() will assume that the number of index tuples
1186 : : * from each page can be used as num_index_tuples, even though
1187 : : * num_index_tuples is supposed to represent the number of TIDs in the
1188 : : * index. This naive approach can underestimate the number of tuples
1189 : : * in the index significantly.
1190 : : *
1191 : : * We handle the problem by making num_index_tuples an estimate in
1192 : : * cleanup-only case.
1193 : : */
1194 : 1 : stats = palloc0_object(IndexBulkDeleteResult);
1195 : 1 : btvacuumscan(info, stats, NULL, NULL, 0);
1196 : 1 : stats->estimated_count = true;
1197 : 1 : }
1198 : :
1199 : : /*
1200 : : * Maintain num_delpages value in metapage for _bt_vacuum_needs_cleanup().
1201 : : *
1202 : : * num_delpages is the number of deleted pages now in the index that were
1203 : : * not safe to place in the FSM to be recycled just yet. num_delpages is
1204 : : * greater than 0 only when _bt_pagedel() actually deleted pages during
1205 : : * our call to btvacuumscan(). Even then, _bt_pendingfsm_finalize() must
1206 : : * have failed to place any newly deleted pages in the FSM just moments
1207 : : * ago. (Actually, there are edge cases where recycling of the current
1208 : : * VACUUM's newly deleted pages does not even become safe by the time the
1209 : : * next VACUUM comes around. See nbtree/README.)
1210 : : */
1211 [ + - ]: 107 : Assert(stats->pages_deleted >= stats->pages_free);
1212 : 107 : num_delpages = stats->pages_deleted - stats->pages_free;
1213 : 107 : _bt_set_cleanup_info(info->index, num_delpages);
1214 : :
1215 : : /*
1216 : : * It's quite possible for us to be fooled by concurrent page splits into
1217 : : * double-counting some index tuples, so disbelieve any total that exceeds
1218 : : * the underlying heap's count ... if we know that accurately. Otherwise
1219 : : * this might just make matters worse.
1220 : : */
1221 [ + + ]: 107 : if (!info->estimated_count)
1222 : : {
1223 [ + - ]: 103 : if (stats->num_index_tuples > info->num_heap_tuples)
1224 : 0 : stats->num_index_tuples = info->num_heap_tuples;
1225 : 103 : }
1226 : :
1227 : 107 : return stats;
1228 : 869 : }
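A hedged arithmetic illustration of the statistic maintained above (the numbers are hypothetical):

/*
 * Illustrative only: if the scan deleted 12 pages in total
 * (stats->pages_deleted = 12) but only 9 of them could be placed in the
 * FSM right away (stats->pages_free = 9), then num_delpages = 12 - 9 = 3
 * is what _bt_set_cleanup_info() records for later
 * _bt_vacuum_needs_cleanup() calls to consider.
 */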
1229 : :
1230 : : /*
1231 : : * btvacuumscan --- scan the index for VACUUMing purposes
1232 : : *
1233 : : * This combines the functions of looking for leaf tuples that are deletable
1234 : : * according to the vacuum callback, looking for empty pages that can be
1235 : : * deleted, and looking for old deleted pages that can be recycled. Both
1236 : : * btbulkdelete and btvacuumcleanup invoke this (the latter only if no
1237 : : * btbulkdelete call occurred and _bt_vacuum_needs_cleanup returned true).
1238 : : *
1239 : : * The caller is responsible for initially allocating/zeroing a stats struct
1240 : : * and for obtaining a vacuum cycle ID if necessary.
1241 : : */
1242 : : static void
1243 : 170 : btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
1244 : : IndexBulkDeleteCallback callback, void *callback_state,
1245 : : BTCycleId cycleid)
1246 : : {
1247 : 170 : Relation rel = info->index;
1248 : 170 : BTVacState vstate;
1249 : 170 : BlockNumber num_pages;
1250 : 170 : bool needLock;
1251 : 170 : BlockRangeReadStreamPrivate p;
1252 : 170 : ReadStream *stream = NULL;
1253 : :
1254 : : /*
1255 : : * Reset fields that track information about the entire index now. This
1256 : : * avoids double-counting in the case where a single VACUUM command
1257 : : * requires multiple scans of the index.
1258 : : *
1259 : : * Avoid resetting the tuples_removed and pages_newly_deleted fields here,
1260 : : * since they track information about the VACUUM command, and so must last
1261 : : * across each call to btvacuumscan().
1262 : : *
1263 : : * (Note that pages_free is treated as state about the whole index, not
1264 : : * the current VACUUM. This is appropriate because RecordFreeIndexPage()
1265 : : * calls are idempotent, and get repeated for the same deleted pages in
1266 : : * some scenarios. The point for us is to track the number of recyclable
1267 : : * pages in the index at the end of the VACUUM command.)
1268 : : */
1269 : 170 : stats->num_pages = 0;
1270 : 170 : stats->num_index_tuples = 0;
1271 : 170 : stats->pages_deleted = 0;
1272 : 170 : stats->pages_free = 0;
1273 : :
1274 : : /* Set up info to pass down to btvacuumpage */
1275 : 170 : vstate.info = info;
1276 : 170 : vstate.stats = stats;
1277 : 170 : vstate.callback = callback;
1278 : 170 : vstate.callback_state = callback_state;
1279 : 170 : vstate.cycleid = cycleid;
1280 : :
1281 : : /* Create a temporary memory context to run _bt_pagedel in */
1282 : 170 : vstate.pagedelcontext = AllocSetContextCreate(CurrentMemoryContext,
1283 : : "_bt_pagedel",
1284 : : ALLOCSET_DEFAULT_SIZES);
1285 : :
1286 : : /* Initialize vstate fields used by _bt_pendingfsm_finalize */
1287 : 170 : vstate.bufsize = 0;
1288 : 170 : vstate.maxbufsize = 0;
1289 : 170 : vstate.pendingpages = NULL;
1290 : 170 : vstate.npendingpages = 0;
1291 : : /* Consider applying _bt_pendingfsm_finalize optimization */
1292 : 170 : _bt_pendingfsm_init(rel, &vstate, (callback == NULL));
1293 : :
1294 : : /*
1295 : : * The outer loop iterates over all index pages except the metapage, in
1296 : : * physical order (we hope the kernel will cooperate in providing
1297 : : * read-ahead for speed). It is critical that we visit all leaf pages,
1298 : : * including ones added after we start the scan, else we might fail to
1299 : : * delete some deletable tuples. Hence, we must repeatedly check the
1300 : : * relation length. We must acquire the relation-extension lock while
1301 : : * doing so to avoid a race condition: if someone else is extending the
1302 : : * relation, there is a window where bufmgr/smgr have created a new
1303 : : * all-zero page but it hasn't yet been write-locked by _bt_getbuf(). If
1304 : : * we manage to scan such a page here, we'll improperly assume it can be
1305 : : * recycled. Taking the lock synchronizes things enough to prevent a
1306 : : * problem: either num_pages won't include the new page, or _bt_getbuf
1307 : : * already has write lock on the buffer and it will be fully initialized
1308 : : * before we can examine it. Also, we need not worry if a page is added
1309 : : * immediately after we look; the page splitting code already has
1310 : : * write-lock on the left page before it adds a right page, so we must
1311 : : * already have processed any tuples due to be moved into such a page.
1312 : : *
1313 : : * XXX: Now that new pages are locked with RBM_ZERO_AND_LOCK, I don't
1314 : : * think the use of the extension lock is still required.
1315 : : *
1316 : : * We can skip locking for new or temp relations, however, since no one
1317 : : * else could be accessing them.
1318 : : */
1319 [ - + ]: 170 : needLock = !RELATION_IS_LOCAL(rel);
1320 : :
1321 : 170 : p.current_blocknum = BTREE_METAPAGE + 1;
1322 : :
1323 : : /*
1324 : : * It is safe to use batchmode as block_range_read_stream_cb takes no
1325 : : * locks.
1326 : : */
1327 : 170 : stream = read_stream_begin_relation(READ_STREAM_MAINTENANCE |
1328 : : READ_STREAM_FULL |
1329 : : READ_STREAM_USE_BATCHING,
1330 : 170 : info->strategy,
1331 : 170 : rel,
1332 : : MAIN_FORKNUM,
1333 : : block_range_read_stream_cb,
1334 : : &p,
1335 : : 0);
1336 : 308 : for (;;)
1337 : : {
1338 : : /* Get the current relation length */
1339 [ - + ]: 308 : if (needLock)
1340 : 308 : LockRelationForExtension(rel, ExclusiveLock);
1341 : 308 : num_pages = RelationGetNumberOfBlocks(rel);
1342 [ - + ]: 308 : if (needLock)
1343 : 308 : UnlockRelationForExtension(rel, ExclusiveLock);
1344 : :
1345 [ + + ]: 308 : if (info->report_progress)
1346 : 78 : pgstat_progress_update_param(PROGRESS_SCAN_BLOCKS_TOTAL,
1347 : 78 : num_pages);
1348 : :
1349 : : /* Quit if we've scanned the whole relation */
1350 [ + + ]: 308 : if (p.current_blocknum >= num_pages)
1351 : 170 : break;
1352 : :
1353 : 138 : p.last_exclusive = num_pages;
1354 : :
1355 : : /* Iterate over pages, then loop back to recheck relation length */
1356 : 1013 : while (true)
1357 : : {
1358 : 1013 : BlockNumber current_block;
1359 : 1013 : Buffer buf;
1360 : :
1361 : : /* call vacuum_delay_point while not holding any buffer lock */
1362 : 1013 : vacuum_delay_point(false);
1363 : :
1364 : 1013 : buf = read_stream_next_buffer(stream, NULL);
1365 : :
1366 [ + + ]: 1013 : if (!BufferIsValid(buf))
1367 : 138 : break;
1368 : :
1369 : 875 : current_block = btvacuumpage(&vstate, buf);
1370 : :
1371 [ + + ]: 875 : if (info->report_progress)
1372 : 38 : pgstat_progress_update_param(PROGRESS_SCAN_BLOCKS_DONE,
1373 : 38 : current_block);
1374 [ - + + ]: 1013 : }
1375 : :
1376 : : /*
1377 : : * We have to reset the read stream to use it again. After returning
1378 : : * InvalidBuffer, the read stream API won't invoke our callback again
1379 : : * until the stream has been reset.
1380 : : */
1381 : 138 : read_stream_reset(stream);
1382 : : }
1383 : :
1384 : 170 : read_stream_end(stream);
1385 : :
1386 : : /* Set statistics num_pages field to final size of index */
1387 : 170 : stats->num_pages = num_pages;
1388 : :
1389 : 170 : MemoryContextDelete(vstate.pagedelcontext);
1390 : :
1391 : : /*
1392 : : * If there were any calls to _bt_pagedel() during scan of the index then
1393 : : * see if any of the resulting pages can be placed in the FSM now. When
1394 : : * it's not safe we'll have to leave it up to a future VACUUM operation.
1395 : : *
1396 : : * Finally, if we placed any pages in the FSM (either just now or during
1397 : : * the scan), forcibly update the upper-level FSM pages to ensure that
1398 : : * searchers can find them.
1399 : : */
1400 : 170 : _bt_pendingfsm_finalize(rel, &vstate);
1401 [ + + ]: 170 : if (stats->pages_free > 0)
1402 : 4 : IndexFreeSpaceMapVacuum(rel);
1403 : 170 : }
1404 : :
1405 : : /*
1406 : : * btvacuumpage --- VACUUM one page
1407 : : *
1408 : : * This processes a single page for btvacuumscan(). In some cases we must
1409 : : * backtrack to re-examine and VACUUM pages that were on buf's page during
1410 : : * a previous call here. This is how we handle page splits (that happened
1411 : : * after our cycleid was acquired) whose right half page happened to reuse
1412 : : * a block that we might have processed at some point before it was
1413 : : * recycled (i.e. before the page split).
1414 : : *
1415 : : * Returns BlockNumber of a scanned page (not backtracked).
1416 : : */
1417 : : static BlockNumber
1418 : 875 : btvacuumpage(BTVacState *vstate, Buffer buf)
1419 : : {
1420 : 875 : IndexVacuumInfo *info = vstate->info;
1421 : 875 : IndexBulkDeleteResult *stats = vstate->stats;
1422 : 875 : IndexBulkDeleteCallback callback = vstate->callback;
1423 : 875 : void *callback_state = vstate->callback_state;
1424 : 875 : Relation rel = info->index;
1425 : 875 : Relation heaprel = info->heaprel;
1426 : 875 : bool attempt_pagedel;
1427 : 875 : BlockNumber blkno,
1428 : : backtrack_to;
1429 : 875 : BlockNumber scanblkno = BufferGetBlockNumber(buf);
1430 : 875 : Page page;
1431 : 875 : BTPageOpaque opaque;
1432 : :
1433 : 875 : blkno = scanblkno;
1434 : :
1435 : : backtrack:
1436 : :
1437 : 875 : attempt_pagedel = false;
1438 : 875 : backtrack_to = P_NONE;
1439 : :
1440 : 875 : _bt_lockbuf(rel, buf, BT_READ);
1441 : 875 : page = BufferGetPage(buf);
1442 : 875 : opaque = NULL;
1443 [ - + ]: 875 : if (!PageIsNew(page))
1444 : : {
1445 : 875 : _bt_checkpage(rel, buf);
1446 : 875 : opaque = BTPageGetOpaque(page);
1447 : 875 : }
1448 : :
1449 [ + - ]: 875 : Assert(blkno <= scanblkno);
1450 [ + - ]: 875 : if (blkno != scanblkno)
1451 : : {
1452 : : /*
1453 : : * We're backtracking.
1454 : : *
1455 : : * We followed a right link to a sibling leaf page (a page that
1456 : : * happens to be from a block located before scanblkno). The only
1457 : : * case we want to do anything with is a live leaf page having the
1458 : : * current vacuum cycle ID.
1459 : : *
1460 : : * The page had better be in a state that's consistent with what we
1461 : : * expect. Check for conditions that imply corruption in passing. It
1462 : : * can't be half-dead because only an interrupted VACUUM process can
1463 : : * leave pages in that state, so we'd definitely have dealt with it
1464 : : * back when the page was the scanblkno page (half-dead pages are
1465 : : * always marked fully deleted by _bt_pagedel(), barring corruption).
1466 : : */
1467 [ # # ]: 0 : if (!opaque || !P_ISLEAF(opaque) || P_ISHALFDEAD(opaque))
1468 : : {
1469 : 0 : Assert(false);
1470 [ # # # # ]: 0 : ereport(LOG,
1471 : : (errcode(ERRCODE_INDEX_CORRUPTED),
1472 : : errmsg_internal("right sibling %u of scanblkno %u unexpectedly in an inconsistent state in index \"%s\"",
1473 : : blkno, scanblkno, RelationGetRelationName(rel))));
1474 : 0 : _bt_relbuf(rel, buf);
1475 : 0 : return scanblkno;
1476 : : }
1477 : :
1478 : : /*
1479 : : * We may have already processed the page in an earlier call, when the
1480 : : * page was scanblkno. This happens when the leaf page split occurred
1481 : : * after the scan began, but before the right sibling page became the
1482 : : * scanblkno.
1483 : : *
1484 : : * Page may also have been deleted by current btvacuumpage() call,
1485 : : * since _bt_pagedel() sometimes deletes the right sibling page of
1486 : : * scanblkno in passing (it does so after we decided where to
1487 : : * backtrack to). We don't need to process this page as a deleted
1488 : : * page a second time now (in fact, it would be wrong to count it as a
1489 : : * deleted page in the bulk delete statistics a second time).
1490 : : */
1491 [ # # # # ]: 0 : if (opaque->btpo_cycleid != vstate->cycleid || P_ISDELETED(opaque))
1492 : : {
1493 : : /* Done with current scanblkno (and all lower split pages) */
1494 : 0 : _bt_relbuf(rel, buf);
1495 : 0 : return scanblkno;
1496 : : }
1497 : 0 : }
1498 : :
1499 [ + - + + ]: 875 : if (!opaque || BTPageIsRecyclable(page, heaprel))
1500 : : {
1501 : : /* Okay to recycle this page (which could be leaf or internal) */
1502 : 14 : RecordFreeIndexPage(rel, blkno);
1503 : 14 : stats->pages_deleted++;
1504 : 14 : stats->pages_free++;
1505 : 14 : }
1506 [ + + ]: 861 : else if (P_ISDELETED(opaque))
1507 : : {
1508 : : /*
1509 : : * Already deleted page (which could be leaf or internal). Can't
1510 : : * recycle yet.
1511 : : */
1512 : 44 : stats->pages_deleted++;
1513 : 44 : }
1514 [ - + ]: 817 : else if (P_ISHALFDEAD(opaque))
1515 : : {
1516 : : /* Half-dead leaf page (from interrupted VACUUM) -- finish deleting */
1517 : 0 : attempt_pagedel = true;
1518 : :
1519 : : /*
1520 : : * _bt_pagedel() will increment both pages_newly_deleted and
1521 : : * pages_deleted stats in all cases (barring corruption)
1522 : : */
1523 : 0 : }
1524 [ + + ]: 817 : else if (P_ISLEAF(opaque))
1525 : : {
1526 : 733 : OffsetNumber deletable[MaxIndexTuplesPerPage];
1527 : 733 : int ndeletable;
1528 : 733 : BTVacuumPosting updatable[MaxIndexTuplesPerPage];
1529 : 733 : int nupdatable;
1530 : 733 : OffsetNumber offnum,
1531 : : minoff,
1532 : : maxoff;
1533 : 733 : int nhtidsdead,
1534 : : nhtidslive;
1535 : :
1536 : : /*
1537 : : * Trade in the initial read lock for a full cleanup lock on this
1538 : : * page. We must get such a lock on every leaf page over the course
1539 : : * of the vacuum scan, whether or not it actually contains any
1540 : : * deletable tuples --- see nbtree/README.
1541 : : */
1542 : 733 : _bt_upgradelockbufcleanup(rel, buf);
1543 : :
1544 : : /*
1545 : : * Check whether we need to backtrack to earlier pages. What we are
1546 : : * concerned about is a page split that happened since we started the
1547 : : * vacuum scan. If the split moved tuples on the right half of the
1548 : : * split (i.e. the tuples that sort high) to a block that we already
1549 : : * passed over, then we might have missed the tuples. We need to
1550 : : * backtrack now. (Must do this before possibly clearing btpo_cycleid
1551 : : * or deleting scanblkno page below!)
1552 : : */
1553 [ + + ]: 733 : if (vstate->cycleid != 0 &&
1554 [ - + ]: 725 : opaque->btpo_cycleid == vstate->cycleid &&
1555 [ # # ]: 0 : !(opaque->btpo_flags & BTP_SPLIT_END) &&
1556 [ # # # # ]: 0 : !P_RIGHTMOST(opaque) &&
1557 : 0 : opaque->btpo_next < scanblkno)
1558 : 0 : backtrack_to = opaque->btpo_next;
1559 : :
1560 : 733 : ndeletable = 0;
1561 : 733 : nupdatable = 0;
1562 : 733 : minoff = P_FIRSTDATAKEY(opaque);
1563 : 733 : maxoff = PageGetMaxOffsetNumber(page);
1564 : 733 : nhtidsdead = 0;
1565 : 733 : nhtidslive = 0;
1566 [ + + ]: 733 : if (callback)
1567 : : {
1568 : : /* btbulkdelete callback tells us what to delete (or update) */
1569 [ + + ]: 141036 : for (offnum = minoff;
1570 : 141036 : offnum <= maxoff;
1571 : 140311 : offnum = OffsetNumberNext(offnum))
1572 : : {
1573 : 140311 : IndexTuple itup;
1574 : :
1575 : 280622 : itup = (IndexTuple) PageGetItem(page,
1576 : 140311 : PageGetItemId(page, offnum));
1577 : :
1578 [ - + ]: 140311 : Assert(!BTreeTupleIsPivot(itup));
1579 [ + + ]: 140311 : if (!BTreeTupleIsPosting(itup))
1580 : : {
1581 : : /* Regular tuple, standard table TID representation */
1582 [ + + ]: 135405 : if (callback(&itup->t_tid, callback_state))
1583 : : {
1584 : 69100 : deletable[ndeletable++] = offnum;
1585 : 69100 : nhtidsdead++;
1586 : 69100 : }
1587 : : else
1588 : 66305 : nhtidslive++;
1589 : 135405 : }
1590 : : else
1591 : : {
1592 : 4906 : BTVacuumPosting vacposting;
1593 : 4906 : int nremaining;
1594 : :
1595 : : /* Posting list tuple */
1596 : 4906 : vacposting = btreevacuumposting(vstate, itup, offnum,
1597 : : &nremaining);
1598 [ + + ]: 4906 : if (vacposting == NULL)
1599 : : {
1600 : : /*
1601 : : * All table TIDs from the posting tuple remain, so no
1602 : : * delete or update required
1603 : : */
1604 [ - + ]: 1131 : Assert(nremaining == BTreeTupleGetNPosting(itup));
1605 : 1131 : }
1606 [ + + ]: 3775 : else if (nremaining > 0)
1607 : : {
1608 : :
1609 : : /*
1610 : : * Store metadata about posting list tuple in
1611 : : * updatable array for entire page. Existing tuple
1612 : : * will be updated during the later call to
1613 : : * _bt_delitems_vacuum().
1614 : : */
1615 [ - + ]: 1133 : Assert(nremaining < BTreeTupleGetNPosting(itup));
1616 : 1133 : updatable[nupdatable++] = vacposting;
1617 : 1133 : nhtidsdead += BTreeTupleGetNPosting(itup) - nremaining;
1618 : 1133 : }
1619 : : else
1620 : : {
1621 : : /*
1622 : : * All table TIDs from the posting list must be
1623 : : * deleted. We'll delete the index tuple completely
1624 : : * (no update required).
1625 : : */
1626 [ - + ]: 2642 : Assert(nremaining == 0);
1627 : 2642 : deletable[ndeletable++] = offnum;
1628 : 2642 : nhtidsdead += BTreeTupleGetNPosting(itup);
1629 : 2642 : pfree(vacposting);
1630 : : }
1631 : :
1632 : 4906 : nhtidslive += nremaining;
1633 : 4906 : }
1634 : 140311 : }
1635 : 725 : }
1636 : :
1637 : : /*
1638 : : * Apply any needed deletes or updates. We issue just one
1639 : : * _bt_delitems_vacuum() call per page, so as to minimize WAL traffic.
1640 : : */
1641 [ + + + + ]: 733 : if (ndeletable > 0 || nupdatable > 0)
1642 : : {
1643 [ + - ]: 515 : Assert(nhtidsdead >= ndeletable + nupdatable);
1644 : 1030 : _bt_delitems_vacuum(rel, buf, deletable, ndeletable, updatable,
1645 : 515 : nupdatable);
1646 : :
1647 : 515 : stats->tuples_removed += nhtidsdead;
1648 : : /* must recompute maxoff */
1649 : 515 : maxoff = PageGetMaxOffsetNumber(page);
1650 : :
1651 : : /* can't leak memory here */
1652 [ + + ]: 1648 : for (int i = 0; i < nupdatable; i++)
1653 : 1133 : pfree(updatable[i]);
1654 : 515 : }
1655 : : else
1656 : : {
1657 : : /*
1658 : : * If the leaf page has been split during this vacuum cycle, it
1659 : : * seems worth expending a write to clear btpo_cycleid even if we
1660 : : * don't have any deletions to do. (If we do, _bt_delitems_vacuum
1661 : : * takes care of this.) This ensures we won't process the page
1662 : : * again.
1663 : : *
1664 : : * We treat this like a hint-bit update because there's no need to
1665 : : * WAL-log it.
1666 : : */
1667 [ + - ]: 218 : Assert(nhtidsdead == 0);
1668 [ + + + - ]: 218 : if (vstate->cycleid != 0 &&
1669 : 210 : opaque->btpo_cycleid == vstate->cycleid)
1670 : : {
1671 : 0 : opaque->btpo_cycleid = 0;
1672 : 0 : MarkBufferDirtyHint(buf, true);
1673 : 0 : }
1674 : : }
1675 : :
1676 : : /*
1677 : : * If the leaf page is now empty, try to delete it; else count the
1678 : : * live tuples (live table TIDs in posting lists are counted as
1679 : : * separate live tuples). We don't delete when backtracking, though,
1680 : : * since that would require teaching _bt_pagedel() about backtracking
1681 : : * (doesn't seem worth adding more complexity to deal with that).
1682 : : *
1683                 :            :          * We don't count the number of live TIDs during cleanup-only calls to
1684                 :            :          * btvacuumscan (i.e. when callback is not set).  We count the number
1685                 :            :          * of index tuples directly instead.  This avoids the expense of
1686                 :            :          * examining every tuple on each page.  VACUUM will treat
1687                 :            :          * num_index_tuples as an estimate in the cleanup-only case, so it
1688                 :            :          * doesn't matter that this underestimates num_index_tuples
1689                 :            :          * significantly in some cases.
1690 : : */
1691 [ + + ]: 733 : if (minoff > maxoff)
1692 : 164 : attempt_pagedel = (blkno == scanblkno);
1693 [ + + ]: 569 : else if (callback)
1694 : 561 : stats->num_index_tuples += nhtidslive;
1695 : : else
1696 : 8 : stats->num_index_tuples += maxoff - minoff + 1;
1697 : :
1698 [ + + - + ]: 733 : Assert(!attempt_pagedel || nhtidslive == 0);
1699 : 733 : }
1700 : :
1701 [ + + ]: 875 : if (attempt_pagedel)
1702 : : {
1703 : 164 : MemoryContext oldcontext;
1704 : :
1705 : : /* Run pagedel in a temp context to avoid memory leakage */
1706 : 164 : MemoryContextReset(vstate->pagedelcontext);
1707 : 164 : oldcontext = MemoryContextSwitchTo(vstate->pagedelcontext);
1708 : :
1709 : : /*
1710 : : * _bt_pagedel maintains the bulk delete stats on our behalf;
1711 : : * pages_newly_deleted and pages_deleted are likely to be incremented
1712 : : * during call
1713 : : */
1714 [ - + ]: 164 : Assert(blkno == scanblkno);
1715 : 164 : _bt_pagedel(rel, buf, vstate);
1716 : :
1717 : 164 : MemoryContextSwitchTo(oldcontext);
1718 : : /* pagedel released buffer, so we shouldn't */
1719 : 164 : }
1720 : : else
1721 : 711 : _bt_relbuf(rel, buf);
1722 : :
1723 [ - + ]: 875 : if (backtrack_to != P_NONE)
1724 : : {
1725 : 0 : blkno = backtrack_to;
1726 : :
1727 : : /* check for vacuum delay while not holding any buffer lock */
1728 : 0 : vacuum_delay_point(false);
1729 : :
1730 : : /*
1731 : : * We can't use _bt_getbuf() here because it always applies
1732 : : * _bt_checkpage(), which will barf on an all-zero page. We want to
1733 : : * recycle all-zero pages, not fail. Also, we want to use a
1734 : : * nondefault buffer access strategy.
1735 : : */
1736 : 0 : buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL,
1737 : 0 : info->strategy);
1738 : 0 : goto backtrack;
1739 : : }
1740 : :
1741 : 875 : return scanblkno;
1742 : 875 : }
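
Read in isolation, the backtrack test in the P_ISLEAF branch above amounts to: remember the right sibling when the page split during this VACUUM and its right half landed on a block the scan has already passed over. A hypothetical restatement follows, using the real BTPageOpaqueData fields but a made-up helper name; it is a sketch, not code from this file.

static BlockNumber
backtrack_target(BTPageOpaque opaque, BTCycleId cycleid, BlockNumber scanblkno)
{
	if (cycleid != 0 &&
		opaque->btpo_cycleid == cycleid &&	/* page split during this VACUUM */
		!(opaque->btpo_flags & BTP_SPLIT_END) &&	/* not at the split group boundary */
		!P_RIGHTMOST(opaque) &&
		opaque->btpo_next < scanblkno)	/* right half reused an earlier block */
		return opaque->btpo_next;

	return P_NONE;				/* nothing to revisit */
}
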
1743 : :
1744 : : /*
1745 : : * btreevacuumposting --- determine TIDs still needed in posting list
1746 : : *
1747 : : * Returns metadata describing how to build replacement tuple without the TIDs
1748 : : * that VACUUM needs to delete. Returned value is NULL in the common case
1749 : : * where no changes are needed to caller's posting list tuple (we avoid
1750 : : * allocating memory here as an optimization).
1751 : : *
1752 : : * The number of TIDs that should remain in the posting list tuple is set for
1753 : : * caller in *nremaining.
1754 : : */
1755 : : static BTVacuumPosting
1756 : 4906 : btreevacuumposting(BTVacState *vstate, IndexTuple posting,
1757 : : OffsetNumber updatedoffset, int *nremaining)
1758 : : {
1759 : 4906 : int live = 0;
1760 : 4906 : int nitem = BTreeTupleGetNPosting(posting);
1761 : 4906 : ItemPointer items = BTreeTupleGetPosting(posting);
1762 : 4906 : BTVacuumPosting vacposting = NULL;
1763 : :
1764 [ + + ]: 33792 : for (int i = 0; i < nitem; i++)
1765 : : {
1766 [ + + ]: 28886 : if (!vstate->callback(items + i, vstate->callback_state))
1767 : : {
1768 : : /* Live table TID */
1769 : 7660 : live++;
1770 : 7660 : }
1771 [ + + ]: 21226 : else if (vacposting == NULL)
1772 : : {
1773 : : /*
1774 : : * First dead table TID encountered.
1775 : : *
1776 : : * It's now clear that we need to delete one or more dead table
1777 : : * TIDs, so start maintaining metadata describing how to update
1778 : : * existing posting list tuple.
1779 : : */
1780 : 3775 : vacposting = palloc(offsetof(BTVacuumPostingData, deletetids) +
1781 : 3775 : nitem * sizeof(uint16));
1782 : :
1783 : 3775 : vacposting->itup = posting;
1784 : 3775 : vacposting->updatedoffset = updatedoffset;
1785 : 3775 : vacposting->ndeletedtids = 0;
1786 : 3775 : vacposting->deletetids[vacposting->ndeletedtids++] = i;
1787 : 3775 : }
1788 : : else
1789 : : {
1790 : : /* Second or subsequent dead table TID */
1791 : 17451 : vacposting->deletetids[vacposting->ndeletedtids++] = i;
1792 : : }
1793 : 28886 : }
1794 : :
1795 : 4906 : *nremaining = live;
1796 : 9812 : return vacposting;
1797 : 4906 : }
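
The NULL-until-needed pattern used here (no allocation at all in the common case where every TID survives) can be shown in standalone form. Everything below is a simplified, hypothetical sketch in plain C, not PostgreSQL code.

#include <stdlib.h>
#include <stdbool.h>

/* Deletion metadata, allocated only once the first dead item is seen. */
typedef struct DeadList
{
	int			ndead;
	int			deadidx[];		/* indexes of dead items, worst-case sized */
} DeadList;

static DeadList *
collect_dead(const int *items, int nitems,
			 bool (*is_dead) (int item), int *nlive)
{
	DeadList   *dead = NULL;

	*nlive = 0;
	for (int i = 0; i < nitems; i++)
	{
		if (!is_dead(items[i]))
			(*nlive)++;
		else
		{
			if (dead == NULL)
			{
				/* first dead item: allocate worst-case sized metadata */
				dead = malloc(sizeof(DeadList) + nitems * sizeof(int));
				dead->ndead = 0;
			}
			dead->deadidx[dead->ndead++] = i;
		}
	}
	return dead;				/* NULL means "no changes needed" */
}
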
1798 : :
1799 : : /*
1800 : : * btcanreturn() -- Check whether btree indexes support index-only scans.
1801 : : *
1802 : : * btrees always do, so this is trivial.
1803 : : */
1804 : : bool
1805 : 101564 : btcanreturn(Relation index, int attno)
1806 : : {
1807 : 101564 : return true;
1808 : : }
1809 : :
1810 : : /*
1811 : : * btgettreeheight() -- Compute tree height for use by btcostestimate().
1812 : : */
1813 : : int
1814 : 69128 : btgettreeheight(Relation rel)
1815 : : {
1816 : 69128 : return _bt_getrootheight(rel);
1817 : : }
1818 : :
1819 : : CompareType
1820 : 0 : bttranslatestrategy(StrategyNumber strategy, Oid opfamily)
1821 : : {
1822 [ # # # # : 0 : switch (strategy)
# # ]
1823 : : {
1824 : : case BTLessStrategyNumber:
1825 : 0 : return COMPARE_LT;
1826 : : case BTLessEqualStrategyNumber:
1827 : 0 : return COMPARE_LE;
1828 : : case BTEqualStrategyNumber:
1829 : 0 : return COMPARE_EQ;
1830 : : case BTGreaterEqualStrategyNumber:
1831 : 0 : return COMPARE_GE;
1832 : : case BTGreaterStrategyNumber:
1833 : 0 : return COMPARE_GT;
1834 : : default:
1835 : 0 : return COMPARE_INVALID;
1836 : : }
1837 : 0 : }
1838 : :
1839 : : StrategyNumber
1840 : 0 : bttranslatecmptype(CompareType cmptype, Oid opfamily)
1841 : : {
1842 [ # # # # : 0 : switch (cmptype)
# # ]
1843 : : {
1844 : : case COMPARE_LT:
1845 : 0 : return BTLessStrategyNumber;
1846 : : case COMPARE_LE:
1847 : 0 : return BTLessEqualStrategyNumber;
1848 : : case COMPARE_EQ:
1849 : 0 : return BTEqualStrategyNumber;
1850 : : case COMPARE_GE:
1851 : 0 : return BTGreaterEqualStrategyNumber;
1852 : : case COMPARE_GT:
1853 : 0 : return BTGreaterStrategyNumber;
1854 : : default:
1855 : 0 : return InvalidStrategy;
1856 : : }
1857 : 0 : }
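
Together, these two translators let strategy-agnostic callers ask for btree operators by CompareType. For instance, get_opfamily_member() from lsyscache.h can be combined with bttranslatecmptype() as in the illustrative fragment below, where opfamily and opcintype are assumed to hold a valid btree operator family and its input type.

	/* Illustrative fragment only: look up the <= operator of a btree opfamily */
	StrategyNumber strat = bttranslatecmptype(COMPARE_LE, opfamily);
	Oid			le_opr = InvalidOid;

	if (strat != InvalidStrategy)
		le_opr = get_opfamily_member(opfamily, opcintype, opcintype, strat);
	/* le_opr is now the <= operator for the type, or InvalidOid if absent */
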
|