Branch data Line data Source code
1 : : /*-------------------------------------------------------------------------
2 : : *
3 : : * sysv_shmem.c
4 : : * Implement shared memory using SysV facilities
5 : : *
6 : : * These routines used to be a fairly thin layer on top of SysV shared
7 : : * memory functionality. With the addition of anonymous-shmem logic,
8 : : * they're a bit fatter now. We still require a SysV shmem block to
9 : : * exist, though, because mmap'd shmem provides no way to find out how
10 : : * many processes are attached, which we need for interlocking purposes.
11 : : *
12 : : * Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group
13 : : * Portions Copyright (c) 1994, Regents of the University of California
14 : : *
15 : : * IDENTIFICATION
16 : : * src/backend/port/sysv_shmem.c
17 : : *
18 : : *-------------------------------------------------------------------------
19 : : */
20 : : #include "postgres.h"
21 : :
22 : : #include <signal.h>
23 : : #include <unistd.h>
24 : : #include <sys/file.h>
25 : : #include <sys/ipc.h>
26 : : #include <sys/mman.h>
27 : : #include <sys/shm.h>
28 : : #include <sys/stat.h>
29 : :
30 : : #include "miscadmin.h"
31 : : #include "port/pg_bitutils.h"
32 : : #include "portability/mem.h"
33 : : #include "storage/dsm.h"
34 : : #include "storage/fd.h"
35 : : #include "storage/ipc.h"
36 : : #include "storage/pg_shmem.h"
37 : : #include "storage/shmem.h"
38 : : #include "utils/guc.h"
39 : : #include "utils/guc_hooks.h"
40 : : #include "utils/pidfile.h"
41 : :
42 : :
43 : : /*
44 : : * As of PostgreSQL 9.3, we normally allocate only a very small amount of
45 : : * System V shared memory, and only for the purposes of providing an
46 : : * interlock to protect the data directory. The real shared memory block
47 : : * is allocated using mmap(). This works around the problem that many
48 : : * systems have very low limits on the amount of System V shared memory
49 : : * that can be allocated. Even a limit of a few megabytes will be enough
50 : : * to run many copies of PostgreSQL without needing to adjust system settings.
51 : : *
52 : : * We assume that no one will attempt to run PostgreSQL 9.3 or later on
53 : : * systems that are ancient enough that anonymous shared memory is not
54 : : * supported, such as pre-2.4 versions of Linux. If that turns out to be
55 : : * false, we might need to add compile and/or run-time tests here and do this
56 : : * only if the running kernel supports it.
57 : : *
58 : : * However, we must always disable this logic in the EXEC_BACKEND case, and
59 : : * fall back to the old method of allocating the entire segment using System V
60 : : * shared memory, because there's no way to attach an anonymous mmap'd segment
61 : : * to a process after exec(). Since EXEC_BACKEND is intended only for
62 : : * developer use, this shouldn't be a big problem. Because of this, we do
63 : : * not worry about supporting anonymous shmem in the EXEC_BACKEND cases below.
64 : : *
65 : : * As of PostgreSQL 12, we regained the ability to use a large System V shared
66 : : * memory region even in non-EXEC_BACKEND builds, if shared_memory_type is set
67 : : * to sysv (though this is not the default).
68 : : */
69 : :
70 : :
/* Key value handed to shmget(2) to name a System V shared memory segment. */
typedef key_t IpcMemoryKey;

/* Segment identifier handed back by shmget(2). */
typedef int IpcMemoryId;
73 : :
/*
 * Classification of an IpcMemoryId relative to this PostgreSQL process.
 *
 * Unattached segments that belong to some other data directory are lumped
 * in with every other SHMSTATE_FOREIGN case rather than being recycled.
 * Recycling them would mean visiting less of the key space, which lowers
 * the odds of noticing a SHMSTATE_ATTACHED key.  It would also make the
 * concurrency analysis harder, because postmasters of different data
 * directories could then race to recycle the same key.  The price is that
 * keys are sometimes wasted for longer, which is considered an acceptable
 * trade for avoiding those problems.
 */
typedef enum
{
	/* unexpected failure to analyze the ID */
	SHMSTATE_ANALYSIS_FAILURE,

	/* pertinent to DataDir, has attached PIDs */
	SHMSTATE_ATTACHED,

	/* no segment of that ID */
	SHMSTATE_ENOENT,

	/* exists, but not pertinent to DataDir */
	SHMSTATE_FOREIGN,

	/* pertinent to DataDir, no attached PIDs */
	SHMSTATE_UNATTACHED,
} IpcMemoryState;
93 : :
94 : :
95 : : unsigned long UsedShmemSegID = 0;
96 : : void *UsedShmemSegAddr = NULL;
97 : :
98 : : static Size AnonymousShmemSize;
99 : : static void *AnonymousShmem = NULL;
100 : :
101 : : static void *InternalIpcMemoryCreate(IpcMemoryKey memKey, Size size);
102 : : static void IpcMemoryDetach(int status, Datum shmaddr);
103 : : static void IpcMemoryDelete(int status, Datum shmId);
104 : : static IpcMemoryState PGSharedMemoryAttach(IpcMemoryId shmId,
105 : : void *attachAt,
106 : : PGShmemHeader **addr);
107 : :
108 : :
109 : : /*
110 : : * InternalIpcMemoryCreate(memKey, size)
111 : : *
112 : : * Attempt to create a new shared memory segment with the specified key.
113 : : * Will fail (return NULL) if such a segment already exists. If successful,
114 : : * attach the segment to the current process and return its attached address.
115 : : * On success, callbacks are registered with on_shmem_exit to detach and
116 : : * delete the segment when on_shmem_exit is called.
117 : : *
118 : : * If we fail with a failure code other than collision-with-existing-segment,
119 : : * print out an error and abort. Other types of errors are not recoverable.
120 : : */
121 : : static void *
122 : 0 : InternalIpcMemoryCreate(IpcMemoryKey memKey, Size size)
123 : : {
124 : 0 : IpcMemoryId shmid;
125 : 0 : void *requestedAddress = NULL;
126 : 0 : void *memAddress;
127 : :
128 : : /*
129 : : * Normally we just pass requestedAddress = NULL to shmat(), allowing the
130 : : * system to choose where the segment gets mapped. But in an EXEC_BACKEND
131 : : * build, it's possible for whatever is chosen in the postmaster to not
132 : : * work for backends, due to variations in address space layout. As a
133 : : * rather klugy workaround, allow the user to specify the address to use
134 : : * via setting the environment variable PG_SHMEM_ADDR. (If this were of
135 : : * interest for anything except debugging, we'd probably create a cleaner
136 : : * and better-documented way to set it, such as a GUC.)
137 : : */
138 : : #ifdef EXEC_BACKEND
139 : : {
140 : : char *pg_shmem_addr = getenv("PG_SHMEM_ADDR");
141 : :
142 : : if (pg_shmem_addr)
143 : : requestedAddress = (void *) strtoul(pg_shmem_addr, NULL, 0);
144 : : else
145 : : {
146 : : #if defined(__darwin__) && SIZEOF_VOID_P == 8
147 : : /*
148 : : * Provide a default value that is believed to avoid problems with
149 : : * ASLR on the current macOS release.
150 : : */
151 : : requestedAddress = (void *) 0x80000000000;
152 : : #endif
153 : : }
154 : : }
155 : : #endif
156 : :
157 : 0 : shmid = shmget(memKey, size, IPC_CREAT | IPC_EXCL | IPCProtection);
158 : :
159 [ # # ]: 0 : if (shmid < 0)
160 : : {
161 : 0 : int shmget_errno = errno;
162 : :
163 : : /*
164 : : * Fail quietly if error indicates a collision with existing segment.
165 : : * One would expect EEXIST, given that we said IPC_EXCL, but perhaps
166 : : * we could get a permission violation instead? Also, EIDRM might
167 : : * occur if an old seg is slated for destruction but not gone yet.
168 : : */
169 [ # # ]: 0 : if (shmget_errno == EEXIST || shmget_errno == EACCES
170 : : #ifdef EIDRM
171 [ # # # # ]: 0 : || shmget_errno == EIDRM
172 : : #endif
173 : : )
174 : 0 : return NULL;
175 : :
176 : : /*
177 : : * Some BSD-derived kernels are known to return EINVAL, not EEXIST, if
178 : : * there is an existing segment but it's smaller than "size" (this is
179 : : * a result of poorly-thought-out ordering of error tests). To
180 : : * distinguish between collision and invalid size in such cases, we
181 : : * make a second try with size = 0. These kernels do not test size
182 : : * against SHMMIN in the preexisting-segment case, so we will not get
183 : : * EINVAL a second time if there is such a segment.
184 : : */
185 [ # # ]: 0 : if (shmget_errno == EINVAL)
186 : : {
187 : 0 : shmid = shmget(memKey, 0, IPC_CREAT | IPC_EXCL | IPCProtection);
188 : :
189 [ # # ]: 0 : if (shmid < 0)
190 : : {
191 : : /* As above, fail quietly if we verify a collision */
192 [ # # ]: 0 : if (errno == EEXIST || errno == EACCES
193 : : #ifdef EIDRM
194 [ # # # # ]: 0 : || errno == EIDRM
195 : : #endif
196 : : )
197 : 0 : return NULL;
198 : : /* Otherwise, fall through to report the original error */
199 : 0 : }
200 : : else
201 : : {
202 : : /*
203 : : * On most platforms we cannot get here because SHMMIN is
204 : : * greater than zero. However, if we do succeed in creating a
205 : : * zero-size segment, free it and then fall through to report
206 : : * the original error.
207 : : */
208 [ # # ]: 0 : if (shmctl(shmid, IPC_RMID, NULL) < 0)
209 [ # # # # ]: 0 : elog(LOG, "shmctl(%d, %d, 0) failed: %m",
210 : : shmid, IPC_RMID);
211 : : }
212 : 0 : }
213 : :
214 : : /*
215 : : * Else complain and abort.
216 : : *
217 : : * Note: at this point EINVAL should mean that either SHMMIN or SHMMAX
218 : : * is violated. SHMALL violation might be reported as either ENOMEM
219 : : * (BSDen) or ENOSPC (Linux); the Single Unix Spec fails to say which
220 : : * it should be. SHMMNI violation is ENOSPC, per spec. Just plain
221 : : * not-enough-RAM is ENOMEM.
222 : : */
223 : 0 : errno = shmget_errno;
224 [ # # # # : 0 : ereport(FATAL,
# # # # #
# ]
225 : : (errmsg("could not create shared memory segment: %m"),
226 : : errdetail("Failed system call was shmget(key=%lu, size=%zu, 0%o).",
227 : : (unsigned long) memKey, size,
228 : : IPC_CREAT | IPC_EXCL | IPCProtection),
229 : : (shmget_errno == EINVAL) ?
230 : : errhint("This error usually means that PostgreSQL's request for a shared memory "
231 : : "segment exceeded your kernel's SHMMAX parameter, or possibly that "
232 : : "it is less than "
233 : : "your kernel's SHMMIN parameter.\n"
234 : : "The PostgreSQL documentation contains more information about shared "
235 : : "memory configuration.") : 0,
236 : : (shmget_errno == ENOMEM) ?
237 : : errhint("This error usually means that PostgreSQL's request for a shared "
238 : : "memory segment exceeded your kernel's SHMALL parameter. You might need "
239 : : "to reconfigure the kernel with larger SHMALL.\n"
240 : : "The PostgreSQL documentation contains more information about shared "
241 : : "memory configuration.") : 0,
242 : : (shmget_errno == ENOSPC) ?
243 : : errhint("This error does *not* mean that you have run out of disk space. "
244 : : "It occurs either if all available shared memory IDs have been taken, "
245 : : "in which case you need to raise the SHMMNI parameter in your kernel, "
246 : : "or because the system's overall limit for shared memory has been "
247 : : "reached.\n"
248 : : "The PostgreSQL documentation contains more information about shared "
249 : : "memory configuration.") : 0));
250 [ # # ]: 0 : }
251 : :
252 : : /* Register on-exit routine to delete the new segment */
253 : 0 : on_shmem_exit(IpcMemoryDelete, Int32GetDatum(shmid));
254 : :
255 : : /* OK, should be able to attach to the segment */
256 : 0 : memAddress = shmat(shmid, requestedAddress, PG_SHMAT_FLAGS);
257 : :
258 [ # # ]: 0 : if (memAddress == (void *) -1)
259 [ # # # # ]: 0 : elog(FATAL, "shmat(id=%d, addr=%p, flags=0x%x) failed: %m",
260 : : shmid, requestedAddress, PG_SHMAT_FLAGS);
261 : :
262 : : /* Register on-exit routine to detach new segment before deleting */
263 : 0 : on_shmem_exit(IpcMemoryDetach, PointerGetDatum(memAddress));
264 : :
265 : : /*
266 : : * Store shmem key and ID in data directory lockfile. Format to try to
267 : : * keep it the same length always (trailing junk in the lockfile won't
268 : : * hurt, but might confuse humans).
269 : : */
270 : : {
271 : 0 : char line[64];
272 : :
273 : 0 : sprintf(line, "%9lu %9lu",
274 : 0 : (unsigned long) memKey, (unsigned long) shmid);
275 : 0 : AddToDataDirLockFile(LOCK_FILE_LINE_SHMEM_KEY, line);
276 : 0 : }
277 : :
278 : 0 : return memAddress;
279 : 0 : }
280 : :
281 : : /****************************************************************************/
282 : : /* IpcMemoryDetach(status, shmaddr) removes a shared memory segment */
283 : : /* from process' address space */
284 : : /* (called as an on_shmem_exit callback, hence funny argument list) */
285 : : /****************************************************************************/
286 : : static void
287 : 0 : IpcMemoryDetach(int status, Datum shmaddr)
288 : : {
289 : : /* Detach System V shared memory block. */
290 [ # # ]: 0 : if (shmdt(DatumGetPointer(shmaddr)) < 0)
291 [ # # # # ]: 0 : elog(LOG, "shmdt(%p) failed: %m", DatumGetPointer(shmaddr));
292 : 0 : }
293 : :
294 : : /****************************************************************************/
295 : : /* IpcMemoryDelete(status, shmId) deletes a shared memory segment */
296 : : /* (called as an on_shmem_exit callback, hence funny argument list) */
297 : : /****************************************************************************/
298 : : static void
299 : 0 : IpcMemoryDelete(int status, Datum shmId)
300 : : {
301 [ # # ]: 0 : if (shmctl(DatumGetInt32(shmId), IPC_RMID, NULL) < 0)
302 [ # # # # ]: 0 : elog(LOG, "shmctl(%d, %d, 0) failed: %m",
303 : : DatumGetInt32(shmId), IPC_RMID);
304 : 0 : }
305 : :
306 : : /*
307 : : * PGSharedMemoryIsInUse
308 : : *
309 : : * Is a previously-existing shmem segment still existing and in use?
310 : : *
311 : : * The point of this exercise is to detect the case where a prior postmaster
312 : : * crashed, but it left child backends that are still running. Therefore
313 : : * we only care about shmem segments that are associated with the intended
314 : : * DataDir. This is an important consideration since accidental matches of
315 : : * shmem segment IDs are reasonably common.
316 : : */
317 : : bool
318 : 0 : PGSharedMemoryIsInUse(unsigned long id1, unsigned long id2)
319 : : {
320 : 0 : PGShmemHeader *memAddress;
321 : 0 : IpcMemoryState state;
322 : :
323 : 0 : state = PGSharedMemoryAttach((IpcMemoryId) id2, NULL, &memAddress);
324 [ # # # # ]: 0 : if (memAddress && shmdt(memAddress) < 0)
325 [ # # # # ]: 0 : elog(LOG, "shmdt(%p) failed: %m", memAddress);
326 [ # # # ]: 0 : switch (state)
327 : : {
328 : : case SHMSTATE_ENOENT:
329 : : case SHMSTATE_FOREIGN:
330 : : case SHMSTATE_UNATTACHED:
331 : 0 : return false;
332 : : case SHMSTATE_ANALYSIS_FAILURE:
333 : : case SHMSTATE_ATTACHED:
334 : 0 : return true;
335 : : }
336 : 0 : return true;
337 : 0 : }
338 : :
339 : : /*
340 : : * Test for a segment with id shmId; see comment at IpcMemoryState.
341 : : *
342 : : * If the segment exists, we'll attempt to attach to it, using attachAt
343 : : * if that's not NULL (but it's best to pass NULL if possible).
344 : : *
345 : : * *addr is set to the segment memory address if we attached to it, else NULL.
346 : : */
347 : : static IpcMemoryState
348 : 0 : PGSharedMemoryAttach(IpcMemoryId shmId,
349 : : void *attachAt,
350 : : PGShmemHeader **addr)
351 : : {
352 : 0 : struct shmid_ds shmStat;
353 : 0 : struct stat statbuf;
354 : 0 : PGShmemHeader *hdr;
355 : :
356 : 0 : *addr = NULL;
357 : :
358 : : /*
359 : : * First, try to stat the shm segment ID, to see if it exists at all.
360 : : */
361 [ # # ]: 0 : if (shmctl(shmId, IPC_STAT, &shmStat) < 0)
362 : : {
363 : : /*
364 : : * EINVAL actually has multiple possible causes documented in the
365 : : * shmctl man page, but we assume it must mean the segment no longer
366 : : * exists.
367 : : */
368 [ # # ]: 0 : if (errno == EINVAL)
369 : 0 : return SHMSTATE_ENOENT;
370 : :
371 : : /*
372 : : * EACCES implies we have no read permission, which means it is not a
373 : : * Postgres shmem segment (or at least, not one that is relevant to
374 : : * our data directory).
375 : : */
376 [ # # ]: 0 : if (errno == EACCES)
377 : 0 : return SHMSTATE_FOREIGN;
378 : :
379 : : /*
380 : : * Some Linux kernel versions (in fact, all of them as of July 2007)
381 : : * sometimes return EIDRM when EINVAL is correct. The Linux kernel
382 : : * actually does not have any internal state that would justify
383 : : * returning EIDRM, so we can get away with assuming that EIDRM is
384 : : * equivalent to EINVAL on that platform.
385 : : */
386 : : #ifdef HAVE_LINUX_EIDRM_BUG
387 : : if (errno == EIDRM)
388 : : return SHMSTATE_ENOENT;
389 : : #endif
390 : :
391 : : /*
392 : : * Otherwise, we had better assume that the segment is in use. The
393 : : * only likely case is (non-Linux, assumed spec-compliant) EIDRM,
394 : : * which implies that the segment has been IPC_RMID'd but there are
395 : : * still processes attached to it.
396 : : */
397 : 0 : return SHMSTATE_ANALYSIS_FAILURE;
398 : : }
399 : :
400 : : /*
401 : : * Try to attach to the segment and see if it matches our data directory.
402 : : * This avoids any risk of duplicate-shmem-key conflicts on machines that
403 : : * are running several postmasters under the same userid.
404 : : *
405 : : * (When we're called from PGSharedMemoryCreate, this stat call is
406 : : * duplicative; but since this isn't a high-traffic case it's not worth
407 : : * trying to optimize.)
408 : : */
409 [ # # ]: 0 : if (stat(DataDir, &statbuf) < 0)
410 : 0 : return SHMSTATE_ANALYSIS_FAILURE; /* can't stat; be conservative */
411 : :
412 : 0 : hdr = (PGShmemHeader *) shmat(shmId, attachAt, PG_SHMAT_FLAGS);
413 [ # # ]: 0 : if (hdr == (PGShmemHeader *) -1)
414 : : {
415 : : /*
416 : : * Attachment failed. The cases we're interested in are the same as
417 : : * for the shmctl() call above. In particular, note that the owning
418 : : * postmaster could have terminated and removed the segment between
419 : : * shmctl() and shmat().
420 : : *
421 : : * If attachAt isn't NULL, it's possible that EINVAL reflects a
422 : : * problem with that address not a vanished segment, so it's best to
423 : : * pass NULL when probing for conflicting segments.
424 : : */
425 [ # # ]: 0 : if (errno == EINVAL)
426 : 0 : return SHMSTATE_ENOENT; /* segment disappeared */
427 [ # # ]: 0 : if (errno == EACCES)
428 : 0 : return SHMSTATE_FOREIGN; /* must be non-Postgres */
429 : : #ifdef HAVE_LINUX_EIDRM_BUG
430 : : if (errno == EIDRM)
431 : : return SHMSTATE_ENOENT; /* segment disappeared */
432 : : #endif
433 : : /* Otherwise, be conservative. */
434 : 0 : return SHMSTATE_ANALYSIS_FAILURE;
435 : : }
436 : 0 : *addr = hdr;
437 : :
438 [ # # ]: 0 : if (hdr->magic != PGShmemMagic ||
439 [ # # # # ]: 0 : hdr->device != statbuf.st_dev ||
440 : 0 : hdr->inode != statbuf.st_ino)
441 : : {
442 : : /*
443 : : * It's either not a Postgres segment, or not one for my data
444 : : * directory.
445 : : */
446 : 0 : return SHMSTATE_FOREIGN;
447 : : }
448 : :
449 : : /*
450 : : * It does match our data directory, so now test whether any processes are
451 : : * still attached to it. (We are, now, but the shm_nattch result is from
452 : : * before we attached to it.)
453 : : */
454 : 0 : return shmStat.shm_nattch == 0 ? SHMSTATE_UNATTACHED : SHMSTATE_ATTACHED;
455 : 0 : }
456 : :
457 : : /*
458 : : * Identify the huge page size to use, and compute the related mmap flags.
459 : : *
460 : : * Some Linux kernel versions have a bug causing mmap() to fail on requests
461 : : * that are not a multiple of the hugepage size. Versions without that bug
462 : : * instead silently round the request up to the next hugepage multiple ---
463 : : * and then munmap() fails when we give it a size different from that.
464 : : * So we have to round our request up to a multiple of the actual hugepage
465 : : * size to avoid trouble.
466 : : *
467 : : * Doing the round-up ourselves also lets us make use of the extra memory,
468 : : * rather than just wasting it. Currently, we just increase the available
469 : : * space recorded in the shmem header, which will make the extra usable for
470 : : * purposes such as additional locktable entries. Someday, for very large
471 : : * hugepage sizes, we might want to think about more invasive strategies,
472 : : * such as increasing shared_buffers to absorb the extra space.
473 : : *
474 : : * Returns the (real, assumed or config provided) page size into
475 : : * *hugepagesize, and the hugepage-related mmap flags to use into
476 : : * *mmap_flags if requested by the caller. If huge pages are not supported,
477 : : * *hugepagesize and *mmap_flags are set to 0.
478 : : */
479 : : void
480 : 0 : GetHugePageSize(Size *hugepagesize, int *mmap_flags)
481 : : {
482 : : #ifdef MAP_HUGETLB
483 : :
484 : : Size default_hugepagesize = 0;
485 : : Size hugepagesize_local = 0;
486 : : int mmap_flags_local = 0;
487 : :
488 : : /*
489 : : * System-dependent code to find out the default huge page size.
490 : : *
491 : : * On Linux, read /proc/meminfo looking for a line like "Hugepagesize:
492 : : * nnnn kB". Ignore any failures, falling back to the preset default.
493 : : */
494 : : #ifdef __linux__
495 : :
496 : : {
497 : : FILE *fp = AllocateFile("/proc/meminfo", "r");
498 : : char buf[128];
499 : : unsigned int sz;
500 : : char ch;
501 : :
502 : : if (fp)
503 : : {
504 : : while (fgets(buf, sizeof(buf), fp))
505 : : {
506 : : if (sscanf(buf, "Hugepagesize: %u %c", &sz, &ch) == 2)
507 : : {
508 : : if (ch == 'k')
509 : : {
510 : : default_hugepagesize = sz * (Size) 1024;
511 : : break;
512 : : }
513 : : /* We could accept other units besides kB, if needed */
514 : : }
515 : : }
516 : : FreeFile(fp);
517 : : }
518 : : }
519 : : #endif /* __linux__ */
520 : :
521 : : if (huge_page_size != 0)
522 : : {
523 : : /* If huge page size is requested explicitly, use that. */
524 : : hugepagesize_local = (Size) huge_page_size * 1024;
525 : : }
526 : : else if (default_hugepagesize != 0)
527 : : {
528 : : /* Otherwise use the system default, if we have it. */
529 : : hugepagesize_local = default_hugepagesize;
530 : : }
531 : : else
532 : : {
533 : : /*
534 : : * If we fail to find out the system's default huge page size, or no
535 : : * huge page size is requested explicitly, assume it is 2MB. This will
536 : : * work fine when the actual size is less. If it's more, we might get
537 : : * mmap() or munmap() failures due to unaligned requests; but at this
538 : : * writing, there are no reports of any non-Linux systems being picky
539 : : * about that.
540 : : */
541 : : hugepagesize_local = 2 * 1024 * 1024;
542 : : }
543 : :
544 : : mmap_flags_local = MAP_HUGETLB;
545 : :
546 : : /*
547 : : * On recent enough Linux, also include the explicit page size, if
548 : : * necessary.
549 : : */
550 : : #if defined(MAP_HUGE_MASK) && defined(MAP_HUGE_SHIFT)
551 : : if (hugepagesize_local != default_hugepagesize)
552 : : {
553 : : int shift = pg_ceil_log2_64(hugepagesize_local);
554 : :
555 : : mmap_flags_local |= (shift & MAP_HUGE_MASK) << MAP_HUGE_SHIFT;
556 : : }
557 : : #endif
558 : :
559 : : /* assign the results found */
560 : : if (mmap_flags)
561 : : *mmap_flags = mmap_flags_local;
562 : : if (hugepagesize)
563 : : *hugepagesize = hugepagesize_local;
564 : :
565 : : #else
566 : :
567 [ # # ]: 0 : if (hugepagesize)
568 : 0 : *hugepagesize = 0;
569 [ # # ]: 0 : if (mmap_flags)
570 : 0 : *mmap_flags = 0;
571 : :
572 : : #endif /* MAP_HUGETLB */
573 : 0 : }
574 : :
575 : : /*
576 : : * GUC check_hook for huge_page_size
577 : : */
578 : : bool
579 : 0 : check_huge_page_size(int *newval, void **extra, GucSource source)
580 : : {
581 : : #if !(defined(MAP_HUGE_MASK) && defined(MAP_HUGE_SHIFT))
582 : : /* Recent enough Linux only, for now. See GetHugePageSize(). */
583 [ # # ]: 0 : if (*newval != 0)
584 : : {
585 : 0 : GUC_check_errdetail("\"huge_page_size\" must be 0 on this platform.");
586 : 0 : return false;
587 : : }
588 : : #endif
589 : 0 : return true;
590 : 0 : }
591 : :
592 : : /*
593 : : * Creates an anonymous mmap()ed shared memory segment.
594 : : *
595 : : * Pass the requested size in *size. This function will modify *size to the
596 : : * actual size of the allocation, if it ends up allocating a segment that is
597 : : * larger than requested.
598 : : */
599 : : static void *
600 : 0 : CreateAnonymousSegment(Size *size)
601 : : {
602 : 0 : Size allocsize = *size;
603 : 0 : void *ptr = MAP_FAILED;
604 : 0 : int mmap_errno = 0;
605 : 0 : int mmap_flags = MAP_SHARED | MAP_ANONYMOUS | MAP_HASSEMAPHORE;
606 : :
607 : : #ifndef MAP_HUGETLB
608 : : /* PGSharedMemoryCreate should have dealt with this case */
609 [ # # ]: 0 : Assert(huge_pages != HUGE_PAGES_ON);
610 : : #else
611 : : if (huge_pages == HUGE_PAGES_ON || huge_pages == HUGE_PAGES_TRY)
612 : : {
613 : : /*
614 : : * Round up the request size to a suitable large value.
615 : : */
616 : : Size hugepagesize;
617 : : int huge_mmap_flags;
618 : :
619 : : GetHugePageSize(&hugepagesize, &huge_mmap_flags);
620 : :
621 : : if (allocsize % hugepagesize != 0)
622 : : allocsize = add_size(allocsize, hugepagesize - (allocsize % hugepagesize));
623 : :
624 : : ptr = mmap(NULL, allocsize, PROT_READ | PROT_WRITE,
625 : : mmap_flags | huge_mmap_flags, -1, 0);
626 : : mmap_errno = errno;
627 : : if (huge_pages == HUGE_PAGES_TRY && ptr == MAP_FAILED)
628 : : elog(DEBUG1, "mmap(%zu) with MAP_HUGETLB failed, huge pages disabled: %m",
629 : : allocsize);
630 : : }
631 : : #endif
632 : :
633 : : /*
634 : : * Report whether huge pages are in use. This needs to be tracked before
635 : : * the second mmap() call if attempting to use huge pages failed
636 : : * previously.
637 : : */
638 : 0 : SetConfigOption("huge_pages_status", (ptr == MAP_FAILED) ? "off" : "on",
639 : : PGC_INTERNAL, PGC_S_DYNAMIC_DEFAULT);
640 : :
641 [ # # # # ]: 0 : if (ptr == MAP_FAILED && huge_pages != HUGE_PAGES_ON)
642 : : {
643 : : /*
644 : : * Use the original size, not the rounded-up value, when falling back
645 : : * to non-huge pages.
646 : : */
647 : 0 : allocsize = *size;
648 : 0 : ptr = mmap(NULL, allocsize, PROT_READ | PROT_WRITE,
649 : 0 : mmap_flags, -1, 0);
650 : 0 : mmap_errno = errno;
651 : 0 : }
652 : :
653 [ # # ]: 0 : if (ptr == MAP_FAILED)
654 : : {
655 : 0 : errno = mmap_errno;
656 [ # # # # : 0 : ereport(FATAL,
# # ]
657 : : (errmsg("could not map anonymous shared memory: %m"),
658 : : (mmap_errno == ENOMEM) ?
659 : : errhint("This error usually means that PostgreSQL's request "
660 : : "for a shared memory segment exceeded available memory, "
661 : : "swap space, or huge pages. To reduce the request size "
662 : : "(currently %zu bytes), reduce PostgreSQL's shared "
663 : : "memory usage, perhaps by reducing \"shared_buffers\" or "
664 : : "\"max_connections\".",
665 : : allocsize) : 0));
666 : 0 : }
667 : :
668 : 0 : *size = allocsize;
669 : 0 : return ptr;
670 : 0 : }
671 : :
672 : : /*
673 : : * AnonymousShmemDetach --- detach from an anonymous mmap'd block
674 : : * (called as an on_shmem_exit callback, hence funny argument list)
675 : : */
676 : : static void
677 : 0 : AnonymousShmemDetach(int status, Datum arg)
678 : : {
679 : : /* Release anonymous shared memory block, if any. */
680 [ # # ]: 0 : if (AnonymousShmem != NULL)
681 : : {
682 [ # # ]: 0 : if (munmap(AnonymousShmem, AnonymousShmemSize) < 0)
683 [ # # # # ]: 0 : elog(LOG, "munmap(%p, %zu) failed: %m",
684 : : AnonymousShmem, AnonymousShmemSize);
685 : 0 : AnonymousShmem = NULL;
686 : 0 : }
687 : 0 : }
688 : :
689 : : /*
690 : : * PGSharedMemoryCreate
691 : : *
692 : : * Create a shared memory segment of the given size and initialize its
693 : : * standard header. Also, register an on_shmem_exit callback to release
694 : : * the storage.
695 : : *
696 : : * Dead Postgres segments pertinent to this DataDir are recycled if found, but
697 : : * we do not fail upon collision with foreign shmem segments. The idea here
698 : : * is to detect and re-use keys that may have been assigned by a crashed
699 : : * postmaster or backend.
700 : : */
701 : : PGShmemHeader *
702 : 0 : PGSharedMemoryCreate(Size size,
703 : : PGShmemHeader **shim)
704 : : {
705 : 0 : IpcMemoryKey NextShmemSegID;
706 : 0 : void *memAddress;
707 : 0 : PGShmemHeader *hdr;
708 : 0 : struct stat statbuf;
709 : 0 : Size sysvsize;
710 : :
711 : : /*
712 : : * We use the data directory's ID info (inode and device numbers) to
713 : : * positively identify shmem segments associated with this data dir, and
714 : : * also as seeds for searching for a free shmem key.
715 : : */
716 [ # # ]: 0 : if (stat(DataDir, &statbuf) < 0)
717 [ # # # # ]: 0 : ereport(FATAL,
718 : : (errcode_for_file_access(),
719 : : errmsg("could not stat data directory \"%s\": %m",
720 : : DataDir)));
721 : :
722 : : /* Complain if hugepages demanded but we can't possibly support them */
723 : : #if !defined(MAP_HUGETLB)
724 [ # # ]: 0 : if (huge_pages == HUGE_PAGES_ON)
725 [ # # # # ]: 0 : ereport(ERROR,
726 : : (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
727 : : errmsg("huge pages not supported on this platform")));
728 : : #endif
729 : :
730 : : /* For now, we don't support huge pages in SysV memory */
731 [ # # # # ]: 0 : if (huge_pages == HUGE_PAGES_ON && shared_memory_type != SHMEM_TYPE_MMAP)
732 [ # # # # ]: 0 : ereport(ERROR,
733 : : (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
734 : : errmsg("huge pages not supported with the current \"shared_memory_type\" setting")));
735 : :
736 : : /* Room for a header? */
737 [ # # ]: 0 : Assert(size > MAXALIGN(sizeof(PGShmemHeader)));
738 : :
739 [ # # ]: 0 : if (shared_memory_type == SHMEM_TYPE_MMAP)
740 : : {
741 : 0 : AnonymousShmem = CreateAnonymousSegment(&size);
742 : 0 : AnonymousShmemSize = size;
743 : :
744 : : /* Register on-exit routine to unmap the anonymous segment */
745 : 0 : on_shmem_exit(AnonymousShmemDetach, (Datum) 0);
746 : :
747 : : /* Now we need only allocate a minimal-sized SysV shmem block. */
748 : 0 : sysvsize = sizeof(PGShmemHeader);
749 : 0 : }
750 : : else
751 : : {
752 : 0 : sysvsize = size;
753 : :
754 : : /* huge pages are only available with mmap */
755 : 0 : SetConfigOption("huge_pages_status", "off",
756 : : PGC_INTERNAL, PGC_S_DYNAMIC_DEFAULT);
757 : : }
758 : :
759 : : /*
760 : : * Loop till we find a free IPC key. Trust CreateDataDirLockFile() to
761 : : * ensure no more than one postmaster per data directory can enter this
762 : : * loop simultaneously. (CreateDataDirLockFile() does not entirely ensure
763 : : * that, but prefer fixing it over coping here.)
764 : : */
765 : 0 : NextShmemSegID = statbuf.st_ino;
766 : :
767 : 0 : for (;;)
768 : : {
769 : 0 : IpcMemoryId shmid;
770 : 0 : PGShmemHeader *oldhdr;
771 : 0 : IpcMemoryState state;
772 : :
773 : : /* Try to create new segment */
774 : 0 : memAddress = InternalIpcMemoryCreate(NextShmemSegID, sysvsize);
775 [ # # ]: 0 : if (memAddress)
776 : 0 : break; /* successful create and attach */
777 : :
778 : : /* Check shared memory and possibly remove and recreate */
779 : :
780 : : /*
781 : : * shmget() failure is typically EACCES, hence SHMSTATE_FOREIGN.
782 : : * ENOENT, a narrow possibility, implies SHMSTATE_ENOENT, but one can
783 : : * safely treat SHMSTATE_ENOENT like SHMSTATE_FOREIGN.
784 : : */
785 : 0 : shmid = shmget(NextShmemSegID, sizeof(PGShmemHeader), 0);
786 [ # # ]: 0 : if (shmid < 0)
787 : : {
788 : 0 : oldhdr = NULL;
789 : 0 : state = SHMSTATE_FOREIGN;
790 : 0 : }
791 : : else
792 : 0 : state = PGSharedMemoryAttach(shmid, NULL, &oldhdr);
793 : :
794 [ # # # # : 0 : switch (state)
# ]
795 : : {
796 : : case SHMSTATE_ANALYSIS_FAILURE:
797 : : case SHMSTATE_ATTACHED:
798 [ # # # # ]: 0 : ereport(FATAL,
799 : : (errcode(ERRCODE_LOCK_FILE_EXISTS),
800 : : errmsg("pre-existing shared memory block (key %lu, ID %lu) is still in use",
801 : : (unsigned long) NextShmemSegID,
802 : : (unsigned long) shmid),
803 : : errhint("Terminate any old server processes associated with data directory \"%s\".",
804 : : DataDir)));
805 : 0 : break;
806 : : case SHMSTATE_ENOENT:
807 : :
808 : : /*
809 : : * To our surprise, some other process deleted since our last
810 : : * InternalIpcMemoryCreate(). Moments earlier, we would have
811 : : * seen SHMSTATE_FOREIGN. Try that same ID again.
812 : : */
813 [ # # # # ]: 0 : elog(LOG,
814 : : "shared memory block (key %lu, ID %lu) deleted during startup",
815 : : (unsigned long) NextShmemSegID,
816 : : (unsigned long) shmid);
817 : 0 : break;
818 : : case SHMSTATE_FOREIGN:
819 : 0 : NextShmemSegID++;
820 : 0 : break;
821 : : case SHMSTATE_UNATTACHED:
822 : :
823 : : /*
824 : : * The segment pertains to DataDir, and every process that had
825 : : * used it has died or detached. Zap it, if possible, and any
826 : : * associated dynamic shared memory segments, as well. This
827 : : * shouldn't fail, but if it does, assume the segment belongs
828 : : * to someone else after all, and try the next candidate.
829 : : * Otherwise, try again to create the segment. That may fail
830 : : * if some other process creates the same shmem key before we
831 : : * do, in which case we'll try the next key.
832 : : */
833 [ # # ]: 0 : if (oldhdr->dsm_control != 0)
834 : 0 : dsm_cleanup_using_control_segment(oldhdr->dsm_control);
835 [ # # ]: 0 : if (shmctl(shmid, IPC_RMID, NULL) < 0)
836 : 0 : NextShmemSegID++;
837 : 0 : break;
838 : : }
839 : :
840 [ # # # # ]: 0 : if (oldhdr && shmdt(oldhdr) < 0)
841 [ # # # # ]: 0 : elog(LOG, "shmdt(%p) failed: %m", oldhdr);
842 [ # # # ]: 0 : }
843 : :
844 : : /* Initialize new segment. */
845 : 0 : hdr = (PGShmemHeader *) memAddress;
846 : 0 : hdr->creatorPID = getpid();
847 : 0 : hdr->magic = PGShmemMagic;
848 : 0 : hdr->dsm_control = 0;
849 : :
850 : : /* Fill in the data directory ID info, too */
851 : 0 : hdr->device = statbuf.st_dev;
852 : 0 : hdr->inode = statbuf.st_ino;
853 : :
854 : : /*
855 : : * Initialize space allocation status for segment.
856 : : */
857 : 0 : hdr->totalsize = size;
858 : 0 : hdr->freeoffset = MAXALIGN(sizeof(PGShmemHeader));
859 : 0 : *shim = hdr;
860 : :
861 : : /* Save info for possible future use */
862 : 0 : UsedShmemSegAddr = memAddress;
863 : 0 : UsedShmemSegID = (unsigned long) NextShmemSegID;
864 : :
865 : : /*
866 : : * If AnonymousShmem is NULL here, then we're not using anonymous shared
867 : : * memory, and should return a pointer to the System V shared memory
868 : : * block. Otherwise, the System V shared memory block is only a shim, and
869 : : * we must return a pointer to the real block.
870 : : */
871 [ # # ]: 0 : if (AnonymousShmem == NULL)
872 : 0 : return hdr;
873 : 0 : memcpy(AnonymousShmem, hdr, sizeof(PGShmemHeader));
874 : 0 : return (PGShmemHeader *) AnonymousShmem;
875 : 0 : }
876 : :
877 : : #ifdef EXEC_BACKEND
878 : :
879 : : /*
880 : : * PGSharedMemoryReAttach
881 : : *
882 : : * This is called during startup of a postmaster child process to re-attach to
883 : : * an already existing shared memory segment. This is needed only in the
884 : : * EXEC_BACKEND case; otherwise postmaster children inherit the shared memory
885 : : * segment attachment via fork().
886 : : *
887 : : * UsedShmemSegID and UsedShmemSegAddr are implicit parameters to this
888 : : * routine. The caller must have already restored them to the postmaster's
889 : : * values. Reports elog(FATAL) if the attach attempt does not yield SHMSTATE_ATTACHED, or if the segment attaches at an address other than the original UsedShmemSegAddr; on success passes hdr->dsm_control to dsm_set_control_handle().
890 : : */
891 : : void
892 : : PGSharedMemoryReAttach(void)
893 : : {
894 : : IpcMemoryId shmid;
895 : : PGShmemHeader *hdr;
896 : : IpcMemoryState state;
897 : : void *origUsedShmemSegAddr = UsedShmemSegAddr; /* remember the target address; the Cygwin path below resets UsedShmemSegAddr */
898 : :
899 : : Assert(UsedShmemSegAddr != NULL);
900 : : Assert(IsUnderPostmaster);
901 : :
902 : : #ifdef __CYGWIN__
903 : : /* cygipc (currently) appears to not detach on exec. */
904 : : PGSharedMemoryDetach();
905 : : UsedShmemSegAddr = origUsedShmemSegAddr; /* PGSharedMemoryDetach() set it to NULL; restore */
906 : : #endif
907 : :
908 : : elog(DEBUG3, "attaching to %p", UsedShmemSegAddr);
909 : : shmid = shmget(UsedShmemSegID, sizeof(PGShmemHeader), 0); /* look up the segment by key; no creation flags are passed */
910 : : if (shmid < 0)
911 : : state = SHMSTATE_FOREIGN; /* lookup failed; hdr is left unset on this path */
912 : : else
913 : : state = PGSharedMemoryAttach(shmid, UsedShmemSegAddr, &hdr);
914 : : if (state != SHMSTATE_ATTACHED) /* any other state, including the shmget() failure above, is an error */
915 : : elog(FATAL, "could not reattach to shared memory (key=%d, addr=%p): %m",
916 : : (int) UsedShmemSegID, UsedShmemSegAddr); /* NOTE(review): the key is printed with %lu elsewhere in this file; confirm %d is intended here */
917 : : if (hdr != origUsedShmemSegAddr) /* attached, but not at the requested address */
918 : : elog(FATAL, "reattaching to shared memory returned unexpected address (got %p, expected %p)",
919 : : hdr, origUsedShmemSegAddr);
920 : : dsm_set_control_handle(hdr->dsm_control); /* hand over the DSM control handle stored in the segment header */
921 : :
922 : : UsedShmemSegAddr = hdr; /* probably redundant */
923 : : }
924 : :
925 : : /*
926 : : * PGSharedMemoryNoReAttach
927 : : *
928 : : * This is called during startup of a postmaster child process when we choose
929 : : * *not* to re-attach to the existing shared memory segment. We must clean up
930 : : * to leave things in the appropriate state. This is not used in the non
931 : : * EXEC_BACKEND case, either.
932 : : *
933 : : * The child process startup logic might or might not call PGSharedMemoryDetach
934 : : * after this; make sure that it will be a no-op if called. (Its shmdt() step is skipped once UsedShmemSegAddr is NULL, which is what we set below.)
935 : : *
936 : : * UsedShmemSegID and UsedShmemSegAddr are implicit parameters to this
937 : : * routine. The caller must have already restored them to the postmaster's
938 : : * values.
939 : : */
940 : : void
941 : : PGSharedMemoryNoReAttach(void)
942 : : {
943 : : Assert(UsedShmemSegAddr != NULL); /* caller must have restored the postmaster's value */
944 : : Assert(IsUnderPostmaster);
945 : :
946 : : #ifdef __CYGWIN__
947 : : /* cygipc (currently) appears to not detach on exec, so detach explicitly here. */
948 : : PGSharedMemoryDetach();
949 : : #endif
950 : :
951 : : /* For cleanliness, reset UsedShmemSegAddr to show we're not attached. */
952 : : UsedShmemSegAddr = NULL;
953 : : /* And the same for UsedShmemSegID. */
954 : : UsedShmemSegID = 0;
955 : : }
956 : :
957 : : #endif /* EXEC_BACKEND */
958 : :
959 : : /*
960 : : * PGSharedMemoryDetach
961 : : *
962 : : * Detach from the shared memory segment, if still attached. This is not
963 : : * intended to be called explicitly by the process that originally created the
964 : : * segment (it will have on_shmem_exit callback(s) registered to do that).
965 : : * Rather, this is for subprocesses that have inherited an attachment and want
966 : : * to get rid of it.
967 : : *
968 : : * UsedShmemSegID and UsedShmemSegAddr are implicit parameters to this
969 : : * routine, also AnonymousShmem and AnonymousShmemSize. (Of these, the body reads only UsedShmemSegAddr, AnonymousShmem and AnonymousShmemSize.) Failures of shmdt() or munmap() are only logged at LOG level; both pointers are reset to NULL either way.
970 : : */
971 : : void
972 : 0 : PGSharedMemoryDetach(void)
973 : : {
974 [ # # ]: 0 : if (UsedShmemSegAddr != NULL)
975 : : {
976 [ # # ]: 0 : if ((shmdt(UsedShmemSegAddr) < 0)
977 : : #if defined(EXEC_BACKEND) && defined(__CYGWIN__)
978 : : /* Work-around for cygipc exec bug: retry with shmdt(NULL), and log only if that fails too */
979 : : && shmdt(NULL) < 0
980 : : #endif
981 : : )
982 [ # # # # ]: 0 : elog(LOG, "shmdt(%p) failed: %m", UsedShmemSegAddr);
983 : 0 : UsedShmemSegAddr = NULL; /* cleared even if shmdt() failed, so a repeat call skips this step */
984 : 0 : }
985 : :
986 [ # # ]: 0 : if (AnonymousShmem != NULL) /* also unmap the anonymous block, if one is recorded */
987 : : {
988 [ # # ]: 0 : if (munmap(AnonymousShmem, AnonymousShmemSize) < 0)
989 [ # # # # ]: 0 : elog(LOG, "munmap(%p, %zu) failed: %m",
990 : : AnonymousShmem, AnonymousShmemSize);
991 : 0 : AnonymousShmem = NULL; /* cleared regardless of the munmap() result */
992 : 0 : }
993 : 0 : }
|