Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * pg_visibility.c
4 : * display visibility map information and page-level visibility bits
5 : *
6 : * Copyright (c) 2016-2026, PostgreSQL Global Development Group
7 : *
8 : * contrib/pg_visibility/pg_visibility.c
9 : *-------------------------------------------------------------------------
10 : */
11 : #include "postgres.h"
12 :
13 : #include "access/heapam.h"
14 : #include "access/htup_details.h"
15 : #include "access/visibilitymap.h"
16 : #include "access/xloginsert.h"
17 : #include "catalog/pg_type.h"
18 : #include "catalog/storage_xlog.h"
19 : #include "funcapi.h"
20 : #include "miscadmin.h"
21 : #include "storage/bufmgr.h"
22 : #include "storage/proc.h"
23 : #include "storage/procarray.h"
24 : #include "storage/read_stream.h"
25 : #include "storage/smgr.h"
26 : #include "utils/rel.h"
27 :
/* Declare module identity (name/version) to the extension loader. */
PG_MODULE_MAGIC_EXT(
					.name = "pg_visibility",
					.version = PG_VERSION
);
32 :
/*
 * Per-block visibility bits collected by collect_visibility_data() and
 * drained one row at a time by the SRF callers.
 */
typedef struct vbits
{
	BlockNumber next;			/* next array index to hand back to the SRF */
	BlockNumber count;			/* number of blocks (length of bits[]) */
	uint8		bits[FLEXIBLE_ARRAY_MEMBER];	/* bit 0 = all-visible,
												 * bit 1 = all-frozen,
												 * bit 2 = pd_all_visible */
} vbits;
39 :
/*
 * Result of collect_corrupt_items(): a growable array of suspect TIDs.
 * While collecting, "next" is the next free slot and "count" the allocated
 * length; before returning, the fields are repurposed so that "count" is the
 * number of items stored and "next" the next one for the SRF to emit.
 */
typedef struct corrupt_items
{
	BlockNumber next;
	BlockNumber count;
	ItemPointer tids;			/* palloc'd array of corrupt tuple IDs */
} corrupt_items;
46 :
/* Callback state for collect_corrupt_items_read_stream_next_block */
struct collect_corrupt_items_read_stream_private
{
	bool		all_frozen;		/* consider blocks marked all-frozen? */
	bool		all_visible;	/* consider blocks marked all-visible? */
	BlockNumber current_blocknum;	/* next block number to examine */
	BlockNumber last_exclusive; /* one past the last block to examine */
	Relation	rel;			/* relation being scanned */
	Buffer		vmbuffer;		/* VM buffer kept pinned across callbacks */
};
57 :
58 0 : PG_FUNCTION_INFO_V1(pg_visibility_map);
59 0 : PG_FUNCTION_INFO_V1(pg_visibility_map_rel);
60 0 : PG_FUNCTION_INFO_V1(pg_visibility);
61 0 : PG_FUNCTION_INFO_V1(pg_visibility_rel);
62 0 : PG_FUNCTION_INFO_V1(pg_visibility_map_summary);
63 0 : PG_FUNCTION_INFO_V1(pg_check_frozen);
64 0 : PG_FUNCTION_INFO_V1(pg_check_visible);
65 0 : PG_FUNCTION_INFO_V1(pg_truncate_visibility_map);
66 :
67 : static TupleDesc pg_visibility_tupdesc(bool include_blkno, bool include_pd);
68 : static vbits *collect_visibility_data(Oid relid, bool include_pd);
69 : static corrupt_items *collect_corrupt_items(Oid relid, bool all_visible,
70 : bool all_frozen);
71 : static void record_corrupt_item(corrupt_items *items, ItemPointer tid);
72 : static bool tuple_all_visible(HeapTuple tup, TransactionId OldestXmin,
73 : Buffer buffer);
74 : static void check_relation_relkind(Relation rel);
75 :
76 : /*
77 : * Visibility map information for a single block of a relation.
78 : *
79 : * Note: the VM code will silently return zeroes for pages past the end
80 : * of the map, so we allow probes up to MaxBlockNumber regardless of the
81 : * actual relation size.
82 : */
83 : Datum
84 0 : pg_visibility_map(PG_FUNCTION_ARGS)
85 : {
86 0 : Oid relid = PG_GETARG_OID(0);
87 0 : int64 blkno = PG_GETARG_INT64(1);
88 0 : int32 mapbits;
89 0 : Relation rel;
90 0 : Buffer vmbuffer = InvalidBuffer;
91 0 : TupleDesc tupdesc;
92 0 : Datum values[2];
93 0 : bool nulls[2] = {0};
94 :
95 0 : rel = relation_open(relid, AccessShareLock);
96 :
97 : /* Only some relkinds have a visibility map */
98 0 : check_relation_relkind(rel);
99 :
100 0 : if (blkno < 0 || blkno > MaxBlockNumber)
101 0 : ereport(ERROR,
102 : (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
103 : errmsg("invalid block number")));
104 :
105 0 : tupdesc = pg_visibility_tupdesc(false, false);
106 :
107 0 : mapbits = (int32) visibilitymap_get_status(rel, blkno, &vmbuffer);
108 0 : if (vmbuffer != InvalidBuffer)
109 0 : ReleaseBuffer(vmbuffer);
110 0 : values[0] = BoolGetDatum((mapbits & VISIBILITYMAP_ALL_VISIBLE) != 0);
111 0 : values[1] = BoolGetDatum((mapbits & VISIBILITYMAP_ALL_FROZEN) != 0);
112 :
113 0 : relation_close(rel, AccessShareLock);
114 :
115 0 : PG_RETURN_DATUM(HeapTupleGetDatum(heap_form_tuple(tupdesc, values, nulls)));
116 0 : }
117 :
118 : /*
119 : * Visibility map information for a single block of a relation, plus the
120 : * page-level information for the same block.
121 : */
122 : Datum
123 0 : pg_visibility(PG_FUNCTION_ARGS)
124 : {
125 0 : Oid relid = PG_GETARG_OID(0);
126 0 : int64 blkno = PG_GETARG_INT64(1);
127 0 : int32 mapbits;
128 0 : Relation rel;
129 0 : Buffer vmbuffer = InvalidBuffer;
130 0 : Buffer buffer;
131 0 : Page page;
132 0 : TupleDesc tupdesc;
133 0 : Datum values[3];
134 0 : bool nulls[3] = {0};
135 :
136 0 : rel = relation_open(relid, AccessShareLock);
137 :
138 : /* Only some relkinds have a visibility map */
139 0 : check_relation_relkind(rel);
140 :
141 0 : if (blkno < 0 || blkno > MaxBlockNumber)
142 0 : ereport(ERROR,
143 : (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
144 : errmsg("invalid block number")));
145 :
146 0 : tupdesc = pg_visibility_tupdesc(false, true);
147 :
148 0 : mapbits = (int32) visibilitymap_get_status(rel, blkno, &vmbuffer);
149 0 : if (vmbuffer != InvalidBuffer)
150 0 : ReleaseBuffer(vmbuffer);
151 0 : values[0] = BoolGetDatum((mapbits & VISIBILITYMAP_ALL_VISIBLE) != 0);
152 0 : values[1] = BoolGetDatum((mapbits & VISIBILITYMAP_ALL_FROZEN) != 0);
153 :
154 : /* Here we have to explicitly check rel size ... */
155 0 : if (blkno < RelationGetNumberOfBlocks(rel))
156 : {
157 0 : buffer = ReadBuffer(rel, blkno);
158 0 : LockBuffer(buffer, BUFFER_LOCK_SHARE);
159 :
160 0 : page = BufferGetPage(buffer);
161 0 : values[2] = BoolGetDatum(PageIsAllVisible(page));
162 :
163 0 : UnlockReleaseBuffer(buffer);
164 0 : }
165 : else
166 : {
167 : /* As with the vismap, silently return 0 for pages past EOF */
168 0 : values[2] = BoolGetDatum(false);
169 : }
170 :
171 0 : relation_close(rel, AccessShareLock);
172 :
173 0 : PG_RETURN_DATUM(HeapTupleGetDatum(heap_form_tuple(tupdesc, values, nulls)));
174 0 : }
175 :
176 : /*
177 : * Visibility map information for every block in a relation.
178 : */
179 : Datum
180 0 : pg_visibility_map_rel(PG_FUNCTION_ARGS)
181 : {
182 0 : FuncCallContext *funcctx;
183 0 : vbits *info;
184 :
185 0 : if (SRF_IS_FIRSTCALL())
186 : {
187 0 : Oid relid = PG_GETARG_OID(0);
188 0 : MemoryContext oldcontext;
189 :
190 0 : funcctx = SRF_FIRSTCALL_INIT();
191 0 : oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx);
192 0 : funcctx->tuple_desc = pg_visibility_tupdesc(true, false);
193 : /* collect_visibility_data will verify the relkind */
194 0 : funcctx->user_fctx = collect_visibility_data(relid, false);
195 0 : MemoryContextSwitchTo(oldcontext);
196 0 : }
197 :
198 0 : funcctx = SRF_PERCALL_SETUP();
199 0 : info = (vbits *) funcctx->user_fctx;
200 :
201 0 : if (info->next < info->count)
202 : {
203 0 : Datum values[3];
204 0 : bool nulls[3] = {0};
205 0 : HeapTuple tuple;
206 :
207 0 : values[0] = Int64GetDatum(info->next);
208 0 : values[1] = BoolGetDatum((info->bits[info->next] & (1 << 0)) != 0);
209 0 : values[2] = BoolGetDatum((info->bits[info->next] & (1 << 1)) != 0);
210 0 : info->next++;
211 :
212 0 : tuple = heap_form_tuple(funcctx->tuple_desc, values, nulls);
213 0 : SRF_RETURN_NEXT(funcctx, HeapTupleGetDatum(tuple));
214 0 : }
215 :
216 0 : SRF_RETURN_DONE(funcctx);
217 0 : }
218 :
219 : /*
220 : * Visibility map information for every block in a relation, plus the page
221 : * level information for each block.
222 : */
223 : Datum
224 0 : pg_visibility_rel(PG_FUNCTION_ARGS)
225 : {
226 0 : FuncCallContext *funcctx;
227 0 : vbits *info;
228 :
229 0 : if (SRF_IS_FIRSTCALL())
230 : {
231 0 : Oid relid = PG_GETARG_OID(0);
232 0 : MemoryContext oldcontext;
233 :
234 0 : funcctx = SRF_FIRSTCALL_INIT();
235 0 : oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx);
236 0 : funcctx->tuple_desc = pg_visibility_tupdesc(true, true);
237 : /* collect_visibility_data will verify the relkind */
238 0 : funcctx->user_fctx = collect_visibility_data(relid, true);
239 0 : MemoryContextSwitchTo(oldcontext);
240 0 : }
241 :
242 0 : funcctx = SRF_PERCALL_SETUP();
243 0 : info = (vbits *) funcctx->user_fctx;
244 :
245 0 : if (info->next < info->count)
246 : {
247 0 : Datum values[4];
248 0 : bool nulls[4] = {0};
249 0 : HeapTuple tuple;
250 :
251 0 : values[0] = Int64GetDatum(info->next);
252 0 : values[1] = BoolGetDatum((info->bits[info->next] & (1 << 0)) != 0);
253 0 : values[2] = BoolGetDatum((info->bits[info->next] & (1 << 1)) != 0);
254 0 : values[3] = BoolGetDatum((info->bits[info->next] & (1 << 2)) != 0);
255 0 : info->next++;
256 :
257 0 : tuple = heap_form_tuple(funcctx->tuple_desc, values, nulls);
258 0 : SRF_RETURN_NEXT(funcctx, HeapTupleGetDatum(tuple));
259 0 : }
260 :
261 0 : SRF_RETURN_DONE(funcctx);
262 0 : }
263 :
264 : /*
265 : * Count the number of all-visible and all-frozen pages in the visibility
266 : * map for a particular relation.
267 : */
268 : Datum
269 0 : pg_visibility_map_summary(PG_FUNCTION_ARGS)
270 : {
271 0 : Oid relid = PG_GETARG_OID(0);
272 0 : Relation rel;
273 0 : BlockNumber all_visible = 0;
274 0 : BlockNumber all_frozen = 0;
275 0 : TupleDesc tupdesc;
276 0 : Datum values[2];
277 0 : bool nulls[2] = {0};
278 :
279 0 : rel = relation_open(relid, AccessShareLock);
280 :
281 : /* Only some relkinds have a visibility map */
282 0 : check_relation_relkind(rel);
283 :
284 0 : visibilitymap_count(rel, &all_visible, &all_frozen);
285 :
286 0 : relation_close(rel, AccessShareLock);
287 :
288 0 : if (get_call_result_type(fcinfo, NULL, &tupdesc) != TYPEFUNC_COMPOSITE)
289 0 : elog(ERROR, "return type must be a row type");
290 :
291 0 : values[0] = Int64GetDatum((int64) all_visible);
292 0 : values[1] = Int64GetDatum((int64) all_frozen);
293 :
294 0 : PG_RETURN_DATUM(HeapTupleGetDatum(heap_form_tuple(tupdesc, values, nulls)));
295 0 : }
296 :
297 : /*
298 : * Return the TIDs of non-frozen tuples present in pages marked all-frozen
299 : * in the visibility map. We hope no one will ever find any, but there could
300 : * be bugs, database corruption, etc.
301 : */
302 : Datum
303 0 : pg_check_frozen(PG_FUNCTION_ARGS)
304 : {
305 0 : FuncCallContext *funcctx;
306 0 : corrupt_items *items;
307 :
308 0 : if (SRF_IS_FIRSTCALL())
309 : {
310 0 : Oid relid = PG_GETARG_OID(0);
311 0 : MemoryContext oldcontext;
312 :
313 0 : funcctx = SRF_FIRSTCALL_INIT();
314 0 : oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx);
315 : /* collect_corrupt_items will verify the relkind */
316 0 : funcctx->user_fctx = collect_corrupt_items(relid, false, true);
317 0 : MemoryContextSwitchTo(oldcontext);
318 0 : }
319 :
320 0 : funcctx = SRF_PERCALL_SETUP();
321 0 : items = (corrupt_items *) funcctx->user_fctx;
322 :
323 0 : if (items->next < items->count)
324 0 : SRF_RETURN_NEXT(funcctx, PointerGetDatum(&items->tids[items->next++]));
325 :
326 0 : SRF_RETURN_DONE(funcctx);
327 0 : }
328 :
329 : /*
330 : * Return the TIDs of not-all-visible tuples in pages marked all-visible
331 : * in the visibility map. We hope no one will ever find any, but there could
332 : * be bugs, database corruption, etc.
333 : */
334 : Datum
335 0 : pg_check_visible(PG_FUNCTION_ARGS)
336 : {
337 0 : FuncCallContext *funcctx;
338 0 : corrupt_items *items;
339 :
340 0 : if (SRF_IS_FIRSTCALL())
341 : {
342 0 : Oid relid = PG_GETARG_OID(0);
343 0 : MemoryContext oldcontext;
344 :
345 0 : funcctx = SRF_FIRSTCALL_INIT();
346 0 : oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx);
347 : /* collect_corrupt_items will verify the relkind */
348 0 : funcctx->user_fctx = collect_corrupt_items(relid, true, false);
349 0 : MemoryContextSwitchTo(oldcontext);
350 0 : }
351 :
352 0 : funcctx = SRF_PERCALL_SETUP();
353 0 : items = (corrupt_items *) funcctx->user_fctx;
354 :
355 0 : if (items->next < items->count)
356 0 : SRF_RETURN_NEXT(funcctx, PointerGetDatum(&items->tids[items->next++]));
357 :
358 0 : SRF_RETURN_DONE(funcctx);
359 0 : }
360 :
361 : /*
362 : * Remove the visibility map fork for a relation. If there turn out to be
363 : * any bugs in the visibility map code that require rebuilding the VM, this
364 : * provides users with a way to do it that is cleaner than shutting down the
365 : * server and removing files by hand.
366 : *
367 : * This is a cut-down version of RelationTruncate.
368 : */
Datum
pg_truncate_visibility_map(PG_FUNCTION_ARGS)
{
	Oid			relid = PG_GETARG_OID(0);
	Relation	rel;
	ForkNumber	fork;
	BlockNumber block;
	BlockNumber old_block;

	/* AccessExclusiveLock: nobody may consult the VM while we destroy it. */
	rel = relation_open(relid, AccessExclusiveLock);

	/* Only some relkinds have a visibility map */
	check_relation_relkind(rel);

	/* Forcibly reset cached file size */
	RelationGetSmgr(rel)->smgr_cached_nblocks[VISIBILITYMAP_FORKNUM] = InvalidBlockNumber;

	/* Compute new and old size before entering critical section. */
	fork = VISIBILITYMAP_FORKNUM;
	block = visibilitymap_prepare_truncate(rel, 0);
	old_block = BlockNumberIsValid(block) ? smgrnblocks(RelationGetSmgr(rel), fork) : 0;

	/*
	 * WAL-logging, buffer dropping, file truncation must be atomic and all on
	 * one side of a checkpoint. See RelationTruncate() for discussion.
	 */
	Assert((MyProc->delayChkptFlags & (DELAY_CHKPT_START | DELAY_CHKPT_COMPLETE)) == 0);
	MyProc->delayChkptFlags |= DELAY_CHKPT_START | DELAY_CHKPT_COMPLETE;
	START_CRIT_SECTION();

	if (RelationNeedsWAL(rel))
	{
		XLogRecPtr	lsn;
		xl_smgr_truncate xlrec;

		/* Log truncation of the VM fork down to zero blocks. */
		xlrec.blkno = 0;
		xlrec.rlocator = rel->rd_locator;
		xlrec.flags = SMGR_TRUNCATE_VM;

		XLogBeginInsert();
		XLogRegisterData(&xlrec, sizeof(xlrec));

		lsn = XLogInsert(RM_SMGR_ID,
						 XLOG_SMGR_TRUNCATE | XLR_SPECIAL_REL_UPDATE);

		/* Make the record durable before the file actually shrinks. */
		XLogFlush(lsn);
	}

	/* Physically truncate only if prepare_truncate found work to do. */
	if (BlockNumberIsValid(block))
		smgrtruncate(RelationGetSmgr(rel), &fork, 1, &old_block, &block);

	END_CRIT_SECTION();
	MyProc->delayChkptFlags &= ~(DELAY_CHKPT_START | DELAY_CHKPT_COMPLETE);

	/*
	 * Release the lock right away, not at commit time.
	 *
	 * It would be a problem to release the lock prior to commit if this
	 * truncate operation sends any transactional invalidation messages. Other
	 * backends would potentially be able to lock the relation without
	 * processing them in the window of time between when we release the lock
	 * here and when we sent the messages at our eventual commit. However,
	 * we're currently only sending a non-transactional smgr invalidation,
	 * which will have been posted to shared memory immediately from within
	 * smgr_truncate. Therefore, there should be no race here.
	 *
	 * The reason why it's desirable to release the lock early here is because
	 * of the possibility that someone will need to use this to blow away many
	 * visibility map forks at once. If we can't release the lock until
	 * commit time, the transaction doing this will accumulate
	 * AccessExclusiveLocks on all of those relations at the same time, which
	 * is undesirable. However, if this turns out to be unsafe we may have no
	 * choice...
	 */
	relation_close(rel, AccessExclusiveLock);

	/* Nothing to return. */
	PG_RETURN_VOID();
}
447 :
448 : /*
449 : * Helper function to construct whichever TupleDesc we need for a particular
450 : * call.
451 : */
452 : static TupleDesc
453 0 : pg_visibility_tupdesc(bool include_blkno, bool include_pd)
454 : {
455 0 : TupleDesc tupdesc;
456 0 : AttrNumber maxattr = 2;
457 0 : AttrNumber a = 0;
458 :
459 0 : if (include_blkno)
460 0 : ++maxattr;
461 0 : if (include_pd)
462 0 : ++maxattr;
463 0 : tupdesc = CreateTemplateTupleDesc(maxattr);
464 0 : if (include_blkno)
465 0 : TupleDescInitEntry(tupdesc, ++a, "blkno", INT8OID, -1, 0);
466 0 : TupleDescInitEntry(tupdesc, ++a, "all_visible", BOOLOID, -1, 0);
467 0 : TupleDescInitEntry(tupdesc, ++a, "all_frozen", BOOLOID, -1, 0);
468 0 : if (include_pd)
469 0 : TupleDescInitEntry(tupdesc, ++a, "pd_all_visible", BOOLOID, -1, 0);
470 0 : Assert(a == maxattr);
471 :
472 0 : return BlessTupleDesc(tupdesc);
473 0 : }
474 :
475 : /*
476 : * Collect visibility data about a relation.
477 : *
478 : * Checks relkind of relid and will throw an error if the relation does not
479 : * have a VM.
480 : */
static vbits *
collect_visibility_data(Oid relid, bool include_pd)
{
	Relation	rel;
	BlockNumber nblocks;
	vbits	   *info;
	BlockNumber blkno;
	Buffer		vmbuffer = InvalidBuffer;
	BufferAccessStrategy bstrategy = GetAccessStrategy(BAS_BULKREAD);
	BlockRangeReadStreamPrivate p;
	ReadStream *stream = NULL;

	rel = relation_open(relid, AccessShareLock);

	/* Only some relkinds have a visibility map */
	check_relation_relkind(rel);

	/* One result byte per block; palloc0 leaves all bits clear. */
	nblocks = RelationGetNumberOfBlocks(rel);
	info = palloc0(offsetof(vbits, bits) + nblocks);
	info->next = 0;
	info->count = nblocks;

	/* Create a stream if reading main fork. */
	if (include_pd)
	{
		p.current_blocknum = 0;
		p.last_exclusive = nblocks;

		/*
		 * It is safe to use batchmode as block_range_read_stream_cb takes no
		 * locks.
		 */
		stream = read_stream_begin_relation(READ_STREAM_FULL |
											READ_STREAM_USE_BATCHING,
											bstrategy,
											rel,
											MAIN_FORKNUM,
											block_range_read_stream_cb,
											&p,
											0);
	}

	for (blkno = 0; blkno < nblocks; ++blkno)
	{
		int32		mapbits;

		/* Make sure we are interruptible. */
		CHECK_FOR_INTERRUPTS();

		/* Get map info. */
		mapbits = (int32) visibilitymap_get_status(rel, blkno, &vmbuffer);
		if ((mapbits & VISIBILITYMAP_ALL_VISIBLE) != 0)
			info->bits[blkno] |= (1 << 0);
		if ((mapbits & VISIBILITYMAP_ALL_FROZEN) != 0)
			info->bits[blkno] |= (1 << 1);

		/*
		 * Page-level data requires reading every block, so only get it if the
		 * caller needs it. Use a buffer access strategy, too, to prevent
		 * cache-trashing.
		 */
		if (include_pd)
		{
			Buffer		buffer;
			Page		page;

			/*
			 * The stream covers blocks 0..nblocks in order, so each call
			 * here delivers exactly the buffer for the current blkno.
			 */
			buffer = read_stream_next_buffer(stream, NULL);
			LockBuffer(buffer, BUFFER_LOCK_SHARE);

			page = BufferGetPage(buffer);
			if (PageIsAllVisible(page))
				info->bits[blkno] |= (1 << 2);

			UnlockReleaseBuffer(buffer);
		}
	}

	/* The loop must have drained the stream exactly; then shut it down. */
	if (include_pd)
	{
		Assert(read_stream_next_buffer(stream, NULL) == InvalidBuffer);
		read_stream_end(stream);
	}

	/* Clean up. */
	if (vmbuffer != InvalidBuffer)
		ReleaseBuffer(vmbuffer);
	relation_close(rel, AccessShareLock);

	return info;
}
571 :
572 : /*
573 : * The "strict" version of GetOldestNonRemovableTransactionId(). The
574 : * pg_visibility check can tolerate false positives (don't report some of the
575 : * errors), but can't tolerate false negatives (report false errors). Normally,
576 : * horizons move forwards, but there are cases when it could move backward
577 : * (see comment for ComputeXidHorizons()).
578 : *
579 : * This is why we have to implement our own function for xid horizon, which
580 : * would be guaranteed to be newer or equal to any xid horizon computed before.
581 : * We have to do the following to achieve this.
582 : *
583 : * 1. Ignore processes xmin's, because they consider connection to other
584 : * databases that were ignored before.
585 : * 2. Ignore KnownAssignedXids, as they are not database-aware. Although we
586 : * now perform minimal checking on a standby by always using nextXid, this
587 : * approach is better than nothing and will at least catch extremely broken
588 : * cases where a xid is in the future.
589 : * 3. Ignore walsender xmin, because it could go backward if some replication
590 : * connections don't use replication slots.
591 : *
592 : * While it might seem like we could use KnownAssignedXids for shared
593 : * catalogs, since shared catalogs rely on a global horizon rather than a
594 : * database-specific one - there are potential edge cases. For example, a
595 : * transaction may crash on the primary without writing a commit/abort record.
596 : * This would lead to a situation where it appears to still be running on the
597 : * standby, even though it has already ended on the primary. For this reason,
598 : * it's safer to ignore KnownAssignedXids, even for shared catalogs.
599 : *
600 : * As a result, we're using only currently running xids to compute the horizon.
601 : * Surely these would significantly sacrifice accuracy. But we have to do so
602 : * to avoid reporting false errors.
603 : */
static TransactionId
GetStrictOldestNonRemovableTransactionId(Relation rel)
{
	RunningTransactions runningTransactions;

	if (RecoveryInProgress())
	{
		TransactionId result;

		/* As we ignore KnownAssignedXids on standby, just pick nextXid */
		LWLockAcquire(XidGenLock, LW_SHARED);
		result = XidFromFullTransactionId(TransamVariables->nextXid);
		LWLockRelease(XidGenLock);
		return result;
	}
	else if (rel == NULL || rel->rd_rel->relisshared)
	{
		/* Shared relation: take into account all running xids */
		runningTransactions = GetRunningTransactionData();
		/* GetRunningTransactionData() returns with both locks still held */
		LWLockRelease(ProcArrayLock);
		LWLockRelease(XidGenLock);
		return runningTransactions->oldestRunningXid;
	}
	else if (!RELATION_IS_LOCAL(rel))
	{
		/*
		 * Normal relation: take into account xids running within the current
		 * database
		 */
		runningTransactions = GetRunningTransactionData();
		/* GetRunningTransactionData() returns with both locks still held */
		LWLockRelease(ProcArrayLock);
		LWLockRelease(XidGenLock);
		return runningTransactions->oldestDatabaseRunningXid;
	}
	else
	{
		/*
		 * For temporary relations, ComputeXidHorizons() uses only
		 * TransamVariables->latestCompletedXid and MyProc->xid. These two
		 * shouldn't go backwards. So we're fine with this horizon.
		 */
		return GetOldestNonRemovableTransactionId(rel);
	}
}
648 :
649 : /*
650 : * Callback function to get next block for read stream object used in
651 : * collect_corrupt_items() function.
652 : */
653 : static BlockNumber
654 0 : collect_corrupt_items_read_stream_next_block(ReadStream *stream,
655 : void *callback_private_data,
656 : void *per_buffer_data)
657 : {
658 0 : struct collect_corrupt_items_read_stream_private *p = callback_private_data;
659 :
660 0 : for (; p->current_blocknum < p->last_exclusive; p->current_blocknum++)
661 : {
662 0 : bool check_frozen = false;
663 0 : bool check_visible = false;
664 :
665 : /* Make sure we are interruptible. */
666 0 : CHECK_FOR_INTERRUPTS();
667 :
668 0 : if (p->all_frozen && VM_ALL_FROZEN(p->rel, p->current_blocknum, &p->vmbuffer))
669 0 : check_frozen = true;
670 0 : if (p->all_visible && VM_ALL_VISIBLE(p->rel, p->current_blocknum, &p->vmbuffer))
671 0 : check_visible = true;
672 0 : if (!check_visible && !check_frozen)
673 0 : continue;
674 :
675 0 : return p->current_blocknum++;
676 0 : }
677 :
678 0 : return InvalidBlockNumber;
679 0 : }
680 :
681 : /*
682 : * Returns a list of items whose visibility map information does not match
683 : * the status of the tuples on the page.
684 : *
685 : * If all_visible is passed as true, this will include all items which are
686 : * on pages marked as all-visible in the visibility map but which do not
687 : * seem to in fact be all-visible.
688 : *
689 : * If all_frozen is passed as true, this will include all items which are
690 : * on pages marked as all-frozen but which do not seem to in fact be frozen.
691 : *
692 : * Checks relkind of relid and will throw an error if the relation does not
693 : * have a VM.
694 : */
static corrupt_items *
collect_corrupt_items(Oid relid, bool all_visible, bool all_frozen)
{
	Relation	rel;
	corrupt_items *items;
	Buffer		vmbuffer = InvalidBuffer;
	BufferAccessStrategy bstrategy = GetAccessStrategy(BAS_BULKREAD);
	TransactionId OldestXmin = InvalidTransactionId;
	struct collect_corrupt_items_read_stream_private p;
	ReadStream *stream;
	Buffer		buffer;

	rel = relation_open(relid, AccessShareLock);

	/* Only some relkinds have a visibility map */
	check_relation_relkind(rel);

	/* Only the all-visible check needs an xmin horizon to compare against. */
	if (all_visible)
		OldestXmin = GetStrictOldestNonRemovableTransactionId(rel);

	/*
	 * Guess an initial array size. We don't expect many corrupted tuples, so
	 * start with a small array. This function uses the "next" field to track
	 * the next offset where we can store an item (which is the same thing as
	 * the number of items found so far) and the "count" field to track the
	 * number of entries allocated. We'll repurpose these fields before
	 * returning.
	 */
	items = palloc0_object(corrupt_items);
	items->next = 0;
	items->count = 64;
	items->tids = palloc(items->count * sizeof(ItemPointerData));

	/*
	 * The stream callback skips blocks whose VM bits don't match what we're
	 * checking, so only candidate blocks are ever read here.
	 */
	p.current_blocknum = 0;
	p.last_exclusive = RelationGetNumberOfBlocks(rel);
	p.rel = rel;
	p.vmbuffer = InvalidBuffer;
	p.all_frozen = all_frozen;
	p.all_visible = all_visible;
	stream = read_stream_begin_relation(READ_STREAM_FULL,
										bstrategy,
										rel,
										MAIN_FORKNUM,
										collect_corrupt_items_read_stream_next_block,
										&p,
										0);

	/* Loop over every block in the relation. */
	while ((buffer = read_stream_next_buffer(stream, NULL)) != InvalidBuffer)
	{
		bool		check_frozen = all_frozen;
		bool		check_visible = all_visible;
		Page		page;
		OffsetNumber offnum,
					maxoff;
		BlockNumber blkno;

		/* Make sure we are interruptible. */
		CHECK_FOR_INTERRUPTS();

		LockBuffer(buffer, BUFFER_LOCK_SHARE);

		page = BufferGetPage(buffer);
		maxoff = PageGetMaxOffsetNumber(page);
		blkno = BufferGetBlockNumber(buffer);

		/*
		 * The visibility map bits might have changed while we were acquiring
		 * the page lock. Recheck to avoid returning spurious results.
		 */
		if (check_frozen && !VM_ALL_FROZEN(rel, blkno, &vmbuffer))
			check_frozen = false;
		if (check_visible && !VM_ALL_VISIBLE(rel, blkno, &vmbuffer))
			check_visible = false;
		if (!check_visible && !check_frozen)
		{
			UnlockReleaseBuffer(buffer);
			continue;
		}

		/* Iterate over each tuple on the page. */
		for (offnum = FirstOffsetNumber;
			 offnum <= maxoff;
			 offnum = OffsetNumberNext(offnum))
		{
			HeapTupleData tuple;
			ItemId		itemid;

			itemid = PageGetItemId(page, offnum);

			/* Unused or redirect line pointers are of no interest. */
			if (!ItemIdIsUsed(itemid) || ItemIdIsRedirected(itemid))
				continue;

			/* Dead line pointers are neither all-visible nor frozen. */
			if (ItemIdIsDead(itemid))
			{
				ItemPointerSet(&(tuple.t_self), blkno, offnum);
				record_corrupt_item(items, &tuple.t_self);
				continue;
			}

			/* Initialize a HeapTupleData structure for checks below. */
			ItemPointerSet(&(tuple.t_self), blkno, offnum);
			tuple.t_data = (HeapTupleHeader) PageGetItem(page, itemid);
			tuple.t_len = ItemIdGetLength(itemid);
			tuple.t_tableOid = relid;

			/*
			 * If we're checking whether the page is all-visible, we expect
			 * the tuple to be all-visible.
			 */
			if (check_visible &&
				!tuple_all_visible(&tuple, OldestXmin, buffer))
			{
				TransactionId RecomputedOldestXmin;

				/*
				 * Time has passed since we computed OldestXmin, so it's
				 * possible that this tuple is all-visible in reality even
				 * though it doesn't appear so based on our
				 * previously-computed value. Let's compute a new value so we
				 * can be certain whether there is a problem.
				 *
				 * From a concurrency point of view, it sort of sucks to
				 * retake ProcArrayLock here while we're holding the buffer
				 * locked in shared mode, but it should be safe against
				 * deadlocks, because surely
				 * GetStrictOldestNonRemovableTransactionId() should never
				 * take a buffer lock. And this shouldn't happen often, so
				 * it's worth being careful so as to avoid false positives.
				 */
				RecomputedOldestXmin = GetStrictOldestNonRemovableTransactionId(rel);

				/*
				 * If the horizon didn't advance, the tuple really is not
				 * all-visible; otherwise re-test against the newer horizon
				 * before declaring corruption.
				 */
				if (!TransactionIdPrecedes(OldestXmin, RecomputedOldestXmin))
					record_corrupt_item(items, &tuple.t_self);
				else
				{
					OldestXmin = RecomputedOldestXmin;
					if (!tuple_all_visible(&tuple, OldestXmin, buffer))
						record_corrupt_item(items, &tuple.t_self);
				}
			}

			/*
			 * If we're checking whether the page is all-frozen, we expect the
			 * tuple to be in a state where it will never need freezing.
			 */
			if (check_frozen)
			{
				if (heap_tuple_needs_eventual_freeze(tuple.t_data))
					record_corrupt_item(items, &tuple.t_self);
			}
		}

		UnlockReleaseBuffer(buffer);
	}
	read_stream_end(stream);

	/* Clean up. */
	if (vmbuffer != InvalidBuffer)
		ReleaseBuffer(vmbuffer);
	if (p.vmbuffer != InvalidBuffer)
		ReleaseBuffer(p.vmbuffer);
	relation_close(rel, AccessShareLock);

	/*
	 * Before returning, repurpose the fields to match caller's expectations.
	 * next is now the next item that should be read (rather than written) and
	 * count is now the number of items we wrote (rather than the number we
	 * allocated).
	 */
	items->count = items->next;
	items->next = 0;

	return items;
}
872 :
873 : /*
874 : * Remember one corrupt item.
875 : */
876 : static void
877 0 : record_corrupt_item(corrupt_items *items, ItemPointer tid)
878 : {
879 : /* enlarge output array if needed. */
880 0 : if (items->next >= items->count)
881 : {
882 0 : items->count *= 2;
883 0 : items->tids = repalloc(items->tids,
884 0 : items->count * sizeof(ItemPointerData));
885 0 : }
886 : /* and add the new item */
887 0 : items->tids[items->next++] = *tid;
888 0 : }
889 :
890 : /*
891 : * Check whether a tuple is all-visible relative to a given OldestXmin value.
892 : * The buffer should contain the tuple and should be locked and pinned.
893 : */
894 : static bool
895 0 : tuple_all_visible(HeapTuple tup, TransactionId OldestXmin, Buffer buffer)
896 : {
897 0 : HTSV_Result state;
898 0 : TransactionId xmin;
899 :
900 0 : state = HeapTupleSatisfiesVacuum(tup, OldestXmin, buffer);
901 0 : if (state != HEAPTUPLE_LIVE)
902 0 : return false; /* all-visible implies live */
903 :
904 : /*
905 : * Neither lazy_scan_heap nor heap_page_is_all_visible will mark a page
906 : * all-visible unless every tuple is hinted committed. However, those hint
907 : * bits could be lost after a crash, so we can't be certain that they'll
908 : * be set here. So just check the xmin.
909 : */
910 :
911 0 : xmin = HeapTupleHeaderGetXmin(tup->t_data);
912 0 : if (!TransactionIdPrecedes(xmin, OldestXmin))
913 0 : return false; /* xmin not old enough for all to see */
914 :
915 0 : return true;
916 0 : }
917 :
918 : /*
919 : * check_relation_relkind - convenience routine to check that relation
920 : * is of the relkind supported by the callers
921 : */
922 : static void
923 0 : check_relation_relkind(Relation rel)
924 : {
925 0 : if (!RELKIND_HAS_TABLE_AM(rel->rd_rel->relkind))
926 0 : ereport(ERROR,
927 : (errcode(ERRCODE_WRONG_OBJECT_TYPE),
928 : errmsg("relation \"%s\" is of wrong relation kind",
929 : RelationGetRelationName(rel)),
930 : errdetail_relkind_not_supported(rel->rd_rel->relkind)));
931 0 : }
|