LCOV - code coverage report
Current view: top level - src/backend/utils/misc - sampling.c (source / functions) Coverage Total Hit
Test: Code coverage Lines: 45.8 % 107 49
Test Date: 2026-01-26 10:56:24 Functions: 70.0 % 10 7
Legend: Lines:     hit not hit
Branches: + taken - not taken # not executed
Branches: 29.0 % 31 9

             Branch data     Line data    Source code
       1                 :             : /*-------------------------------------------------------------------------
       2                 :             :  *
       3                 :             :  * sampling.c
       4                 :             :  *        Relation block sampling routines.
       5                 :             :  *
       6                 :             :  * Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group
       7                 :             :  * Portions Copyright (c) 1994, Regents of the University of California
       8                 :             :  *
       9                 :             :  *
      10                 :             :  * IDENTIFICATION
      11                 :             :  *        src/backend/utils/misc/sampling.c
      12                 :             :  *
      13                 :             :  *-------------------------------------------------------------------------
      14                 :             :  */
      15                 :             : 
      16                 :             : #include "postgres.h"
      17                 :             : 
      18                 :             : #include <math.h>
      19                 :             : 
      20                 :             : #include "utils/sampling.h"
      21                 :             : 
      22                 :             : 
      23                 :             : /*
      24                 :             :  * BlockSampler_Init -- prepare for random sampling of blocknumbers
      25                 :             :  *
      26                 :             :  * BlockSampler provides algorithm for block level sampling of a relation
      27                 :             :  * as discussed on pgsql-hackers 2004-04-02 (subject "Large DB")
      28                 :             :  * It selects a random sample of samplesize blocks out of
      29                 :             :  * the nblocks blocks in the table. If the table has less than
      30                 :             :  * samplesize blocks, all blocks are selected.
      31                 :             :  *
      32                 :             :  * Since we know the total number of blocks in advance, we can use the
      33                 :             :  * straightforward Algorithm S from Knuth 3.4.2, rather than Vitter's
      34                 :             :  * algorithm.
      35                 :             :  *
      36                 :             :  * Returns the number of blocks that BlockSampler_Next will return.
      37                 :             :  */
      38                 :             : BlockNumber
      39                 :        1000 : BlockSampler_Init(BlockSampler bs, BlockNumber nblocks, int samplesize,
      40                 :             :                                   uint32 randseed)
      41                 :             : {
      42                 :        1000 :         bs->N = nblocks;                     /* measured table size */
      43                 :             : 
      44                 :             :         /*
      45                 :             :          * If we decide to reduce samplesize for tables that have less or not much
      46                 :             :          * more than samplesize blocks, here is the place to do it.
      47                 :             :          */
      48                 :        1000 :         bs->n = samplesize;
      49                 :        1000 :         bs->t = 0;                                   /* blocks scanned so far */
      50                 :        1000 :         bs->m = 0;                                   /* blocks selected so far */
      51                 :             : 
      52                 :        1000 :         sampler_random_init_state(randseed, &bs->randstate);
      53                 :             : 
      54         [ -  + ]:        1000 :         return Min(bs->n, bs->N);
      55                 :             : }
      56                 :             : 
      57                 :             : bool
      58                 :       22394 : BlockSampler_HasMore(BlockSampler bs)
      59                 :             : {
      60         [ +  + ]:       22394 :         return (bs->t < bs->N) && (bs->m < bs->n);
      61                 :             : }
      62                 :             : 
      63                 :             : BlockNumber
      64                 :       10697 : BlockSampler_Next(BlockSampler bs)
      65                 :             : {
      66                 :       10697 :         BlockNumber K = bs->N - bs->t;    /* remaining blocks */
      67                 :       10697 :         int                     k = bs->n - bs->m;        /* blocks still to sample */
      68                 :       10697 :         double          p;                              /* probability to skip block */
      69                 :       10697 :         double          V;                              /* random */
      70                 :             : 
      71         [ +  - ]:       10697 :         Assert(BlockSampler_HasMore(bs));       /* hence K > 0 and k > 0 */
      72                 :             : 
      73         [ +  - ]:       10697 :         if ((BlockNumber) k >= K)
      74                 :             :         {
      75                 :             :                 /* need all the rest */
      76                 :       10697 :                 bs->m++;
      77                 :       10697 :                 return bs->t++;
      78                 :             :         }
      79                 :             : 
      80                 :             :         /*----------
      81                 :             :          * It is not obvious that this code matches Knuth's Algorithm S.
      82                 :             :          * Knuth says to skip the current block with probability 1 - k/K.
      83                 :             :          * If we are to skip, we should advance t (hence decrease K), and
      84                 :             :          * repeat the same probabilistic test for the next block.  The naive
      85                 :             :          * implementation thus requires a sampler_random_fract() call for each
      86                 :             :          * block number.  But we can reduce this to one sampler_random_fract()
      87                 :             :          * call per selected block, by noting that each time the while-test
      88                 :             :          * succeeds, we can reinterpret V as a uniform random number in the range
      89                 :             :          * 0 to p. Therefore, instead of choosing a new V, we just adjust p to be
      90                 :             :          * the appropriate fraction of its former value, and our next loop
      91                 :             :          * makes the appropriate probabilistic test.
      92                 :             :          *
      93                 :             :          * We have initially K > k > 0.  If the loop reduces K to equal k,
      94                 :             :          * the next while-test must fail since p will become exactly zero
      95                 :             :          * (we assume there will not be roundoff error in the division).
      96                 :             :          * (Note: Knuth suggests a "<=" loop condition, but we use "<" just
      97                 :             :          * to be doubly sure about roundoff error.)  Therefore K cannot become
      98                 :             :          * less than k, which means that we cannot fail to select enough blocks.
      99                 :             :          *----------
     100                 :             :          */
     101                 :           0 :         V = sampler_random_fract(&bs->randstate);
     102                 :           0 :         p = 1.0 - (double) k / (double) K;
     103         [ #  # ]:           0 :         while (V < p)
     104                 :             :         {
     105                 :             :                 /* skip */
     106                 :           0 :                 bs->t++;
     107                 :           0 :                 K--;                                    /* keep K == N - t */
     108                 :             : 
     109                 :             :                 /* adjust p to be new cutoff point in reduced range */
     110                 :           0 :                 p *= 1.0 - (double) k / (double) K;
     111                 :             :         }
     112                 :             : 
     113                 :             :         /* select */
     114                 :           0 :         bs->m++;
     115                 :           0 :         return bs->t++;
     116                 :       10697 : }
     117                 :             : 
     118                 :             : /*
     119                 :             :  * These two routines embody Algorithm Z from "Random sampling with a
     120                 :             :  * reservoir" by Jeffrey S. Vitter, in ACM Trans. Math. Softw. 11, 1
     121                 :             :  * (Mar. 1985), Pages 37-57.  Vitter describes his algorithm in terms
     122                 :             :  * of the count S of records to skip before processing another record.
     123                 :             :  * It is computed primarily based on t, the number of records already read.
     124                 :             :  * The only extra state needed between calls is W, a random state variable.
     125                 :             :  *
     126                 :             :  * reservoir_init_selection_state computes the initial W value.
     127                 :             :  *
     128                 :             :  * Given that we've already read t records (t >= n), reservoir_get_next_S
     129                 :             :  * determines the number of records to skip before the next record is
     130                 :             :  * processed.
     131                 :             :  */
     132                 :             : void
     133                 :        1000 : reservoir_init_selection_state(ReservoirState rs, int n)
     134                 :             : {
     135                 :             :         /*
     136                 :             :          * Reservoir sampling is not used anywhere where it would need to return
     137                 :             :          * repeatable results so we can initialize it randomly.
     138                 :             :          */
     139                 :        2000 :         sampler_random_init_state(pg_prng_uint32(&pg_global_prng_state),
     140                 :        1000 :                                                           &rs->randstate);
     141                 :             : 
     142                 :             :         /* Initial value of W (for use when Algorithm Z is first applied) */
     143                 :        1000 :         rs->W = exp(-log(sampler_random_fract(&rs->randstate)) / n);
     144                 :        1000 : }
     145                 :             : 
     146                 :             : double
     147                 :      144492 : reservoir_get_next_S(ReservoirState rs, double t, int n)
     148                 :             : {
     149                 :      144492 :         double          S;
     150                 :             : 
     151                 :             :         /* The magic constant here is T from Vitter's paper */
     152         [ +  - ]:      144492 :         if (t <= (22.0 * n))
     153                 :             :         {
     154                 :             :                 /* Process records using Algorithm X until t is large enough */
     155                 :      144492 :                 double          V,
     156                 :             :                                         quot;
     157                 :             : 
     158                 :      144492 :                 V = sampler_random_fract(&rs->randstate);        /* Generate V */
     159                 :      144492 :                 S = 0;
     160                 :      144492 :                 t += 1;
     161                 :             :                 /* Note: "num" in Vitter's code is always equal to t - n */
     162                 :      144492 :                 quot = (t - (double) n) / t;
     163                 :             :                 /* Find min S satisfying (4.1) */
     164         [ +  + ]:      280171 :                 while (quot > V)
     165                 :             :                 {
     166                 :      135679 :                         S += 1;
     167                 :      135679 :                         t += 1;
     168                 :      135679 :                         quot *= (t - (double) n) / t;
     169                 :             :                 }
     170                 :      144492 :         }
     171                 :             :         else
     172                 :             :         {
     173                 :             :                 /* Now apply Algorithm Z */
     174                 :           0 :                 double          W = rs->W;
     175                 :           0 :                 double          term = t - (double) n + 1;
     176                 :             : 
     177                 :           0 :                 for (;;)
     178                 :             :                 {
     179                 :           0 :                         double          numer,
     180                 :             :                                                 numer_lim,
     181                 :             :                                                 denom;
     182                 :           0 :                         double          U,
     183                 :             :                                                 X,
     184                 :             :                                                 lhs,
     185                 :             :                                                 rhs,
     186                 :             :                                                 y,
     187                 :             :                                                 tmp;
     188                 :             : 
     189                 :             :                         /* Generate U and X */
     190                 :           0 :                         U = sampler_random_fract(&rs->randstate);
     191                 :           0 :                         X = t * (W - 1.0);
     192                 :           0 :                         S = floor(X);           /* S is tentatively set to floor(X) */
     193                 :             :                         /* Test if U <= h(S)/cg(X) in the manner of (6.3) */
     194                 :           0 :                         tmp = (t + 1) / term;
     195                 :           0 :                         lhs = exp(log(((U * tmp * tmp) * (term + S)) / (t + X)) / n);
     196                 :           0 :                         rhs = (((t + X) / (term + S)) * term) / t;
     197         [ #  # ]:           0 :                         if (lhs <= rhs)
     198                 :             :                         {
     199                 :           0 :                                 W = rhs / lhs;
     200                 :           0 :                                 break;
     201                 :             :                         }
     202                 :             :                         /* Test if U <= f(S)/cg(X) */
     203                 :           0 :                         y = (((U * (t + 1)) / term) * (t + S + 1)) / (t + X);
     204         [ #  # ]:           0 :                         if ((double) n < S)
     205                 :             :                         {
     206                 :           0 :                                 denom = t;
     207                 :           0 :                                 numer_lim = term + S;
     208                 :           0 :                         }
     209                 :             :                         else
     210                 :             :                         {
     211                 :           0 :                                 denom = t - (double) n + S;
     212                 :           0 :                                 numer_lim = t + 1;
     213                 :             :                         }
     214         [ #  # ]:           0 :                         for (numer = t + S; numer >= numer_lim; numer -= 1)
     215                 :             :                         {
     216                 :           0 :                                 y *= numer / denom;
     217                 :           0 :                                 denom -= 1;
     218                 :           0 :                         }
     219                 :           0 :                         W = exp(-log(sampler_random_fract(&rs->randstate)) / n); /* Generate W in advance */
     220         [ #  # ]:           0 :                         if (exp(log(y) / n) <= (t + X) / t)
     221                 :           0 :                                 break;
     222      [ #  #  # ]:           0 :                 }
     223                 :           0 :                 rs->W = W;
     224                 :           0 :         }
     225                 :      288984 :         return S;
     226                 :      144492 : }
     227                 :             : 
     228                 :             : 
     229                 :             : /*----------
     230                 :             :  * Random number generator used by sampling
     231                 :             :  *----------
     232                 :             :  */
     233                 :             : void
     234                 :        2000 : sampler_random_init_state(uint32 seed, pg_prng_state *randstate)
     235                 :             : {
     236                 :        2000 :         pg_prng_seed(randstate, (uint64) seed);
     237                 :        2000 : }
     238                 :             : 
     239                 :             : /* Select a random value R uniformly distributed in (0 - 1) */
     240                 :             : double
     241                 :      289982 : sampler_random_fract(pg_prng_state *randstate)
     242                 :             : {
     243                 :      289982 :         double          res;
     244                 :             : 
     245                 :             :         /* pg_prng_double returns a value in [0.0 - 1.0), so we must reject 0.0 */
     246                 :      289982 :         do
     247                 :             :         {
     248                 :      289982 :                 res = pg_prng_double(randstate);
     249         [ -  + ]:      289982 :         } while (unlikely(res == 0.0));
     250                 :      579964 :         return res;
     251                 :      289982 : }
     252                 :             : 
     253                 :             : 
     254                 :             : /*
     255                 :             :  * Backwards-compatible API for block sampling
     256                 :             :  *
     257                 :             :  * This code is now deprecated, but since it's still in use by many FDWs,
     258                 :             :  * we should keep it for awhile at least.  The functionality is the same as
     259                 :             :  * sampler_random_fract/reservoir_init_selection_state/reservoir_get_next_S,
     260                 :             :  * except that a common random state is used across all callers.
     261                 :             :  */
     262                 :             : static ReservoirStateData oldrs;
     263                 :             : static bool oldrs_initialized = false;
     264                 :             : 
     265                 :             : double
     266                 :           0 : anl_random_fract(void)
     267                 :             : {
     268                 :             :         /* initialize if first time through */
     269         [ #  # ]:           0 :         if (unlikely(!oldrs_initialized))
     270                 :             :         {
     271                 :           0 :                 sampler_random_init_state(pg_prng_uint32(&pg_global_prng_state),
     272                 :             :                                                                   &oldrs.randstate);
     273                 :           0 :                 oldrs_initialized = true;
     274                 :           0 :         }
     275                 :             : 
     276                 :             :         /* and compute a random fraction */
     277                 :           0 :         return sampler_random_fract(&oldrs.randstate);
     278                 :             : }
     279                 :             : 
     280                 :             : double
     281                 :           0 : anl_init_selection_state(int n)
     282                 :             : {
     283                 :             :         /* initialize if first time through */
     284         [ #  # ]:           0 :         if (unlikely(!oldrs_initialized))
     285                 :             :         {
     286                 :           0 :                 sampler_random_init_state(pg_prng_uint32(&pg_global_prng_state),
     287                 :             :                                                                   &oldrs.randstate);
     288                 :           0 :                 oldrs_initialized = true;
     289                 :           0 :         }
     290                 :             : 
     291                 :             :         /* Initial value of W (for use when Algorithm Z is first applied) */
     292                 :           0 :         return exp(-log(sampler_random_fract(&oldrs.randstate)) / n);
     293                 :             : }
     294                 :             : 
     295                 :             : double
     296                 :           0 : anl_get_next_S(double t, int n, double *stateptr)
     297                 :             : {
     298                 :           0 :         double          result;
     299                 :             : 
     300                 :           0 :         oldrs.W = *stateptr;
     301                 :           0 :         result = reservoir_get_next_S(&oldrs, t, n);
     302                 :           0 :         *stateptr = oldrs.W;
     303                 :           0 :         return result;
     304                 :           0 : }
        

Generated by: LCOV version 2.3.2-1