Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * pg_numa.c
4 : * Basic NUMA portability routines
5 : *
6 : *
7 : * Copyright (c) 2025-2026, PostgreSQL Global Development Group
8 : *
9 : *
10 : * IDENTIFICATION
11 : * src/port/pg_numa.c
12 : *
13 : *-------------------------------------------------------------------------
14 : */
15 :
16 : #include "c.h"
17 : #include <unistd.h>
18 :
19 : #include "miscadmin.h"
20 : #include "port/pg_numa.h"
21 :
22 : /*
23 : * At this point we provide support only for Linux thanks to libnuma, but in
24 : * future support for other platforms e.g. Win32 or FreeBSD might be possible
25 : * too. For Win32 NUMA APIs see
26 : * https://learn.microsoft.com/en-us/windows/win32/procthread/numa-support
27 : */
28 : #ifdef USE_LIBNUMA
29 :
30 : #include <numa.h>
31 : #include <numaif.h>
32 :
33 : /*
34 : * numa_move_pages() chunk size, has to be <= 16 to work around a kernel bug
35 : * in do_pages_stat() (chunked by DO_PAGES_STAT_CHUNK_NR). By using the same
36 : * chunk size, we make it work even on unfixed kernels.
37 : *
 * 64-bit systems are not affected by the bug, and so use much larger chunks.
39 : */
40 : #if SIZEOF_SIZE_T == 4
41 : #define NUMA_QUERY_CHUNK_SIZE 16
42 : #else
43 : #define NUMA_QUERY_CHUNK_SIZE 1024
44 : #endif
45 :
46 : /* libnuma requires initialization as per numa(3) on Linux */
47 : int
48 : pg_numa_init(void)
49 : {
50 : int r;
51 :
52 : /*
53 : * XXX libnuma versions before 2.0.19 don't handle EPERM by disabling
54 : * NUMA, which then leads to unexpected failures later. This affects
55 : * containers that disable get_mempolicy by a seccomp profile.
56 : */
57 : if (get_mempolicy(NULL, NULL, 0, 0, 0) < 0 && (errno == EPERM))
58 : r = -1;
59 : else
60 : r = numa_available();
61 :
62 : return r;
63 : }
64 :
65 : /*
66 : * We use move_pages(2) syscall here - instead of get_mempolicy(2) - as the
67 : * first one allows us to batch and query about many memory pages in one single
68 : * giant system call that is way faster.
69 : *
70 : * We call numa_move_pages() for smaller chunks of the whole array. The first
71 : * reason is to work around a kernel bug, but also to allow interrupting the
72 : * query between the calls (for many pointers processing the whole array can
73 : * take a lot of time).
74 : */
75 : int
76 : pg_numa_query_pages(int pid, unsigned long count, void **pages, int *status)
77 : {
78 : unsigned long next = 0;
79 : int ret = 0;
80 :
81 : /*
82 : * Chunk pointers passed to numa_move_pages to NUMA_QUERY_CHUNK_SIZE
83 : * items, to work around a kernel bug in do_pages_stat().
84 : */
85 : while (next < count)
86 : {
87 : unsigned long count_chunk = Min(count - next,
88 : NUMA_QUERY_CHUNK_SIZE);
89 :
90 : CHECK_FOR_INTERRUPTS();
91 :
92 : /*
93 : * Bail out if any of the chunks errors out (ret<0). We ignore (ret>0)
94 : * which is used to return number of nonmigrated pages, but we're not
95 : * migrating any pages here.
96 : */
97 : ret = numa_move_pages(pid, count_chunk, &pages[next], NULL, &status[next], 0);
98 : if (ret < 0)
99 : {
100 : /* plain error, return as is */
101 : return ret;
102 : }
103 :
104 : next += count_chunk;
105 : }
106 :
107 : /* should have consumed the input array exactly */
108 : Assert(next == count);
109 :
110 : return 0;
111 : }
112 :
/*
 * Return the greatest possible NUMA node number on this system, as reported
 * by numa_max_node(3).
 */
int
pg_numa_get_max_node(void)
{
	return numa_max_node();
}
118 :
119 : #else
120 :
121 : /* Empty wrappers */
/* Empty wrapper: without libnuma, report NUMA support as unavailable. */
int
pg_numa_init(void)
{
	return -1;
}
128 :
/*
 * Empty wrapper: NUMA is unavailable, so there is nothing to query.
 *
 * Callers should not reach this after pg_numa_init() returned -1; if they
 * do, report success without touching the output array.
 */
int
pg_numa_query_pages(int pid, unsigned long count, void **pages, int *status)
{
	/* silence -Wunused-parameter warnings for the stub */
	(void) pid;
	(void) count;
	(void) pages;
	(void) status;

	return 0;
}
134 :
/* Empty wrapper: pretend there is a single NUMA node, numbered 0. */
int
pg_numa_get_max_node(void)
{
	return 0;
}
140 :
141 : #endif
|