LCOV - code coverage report
Current view: top level - src/backend/lib - hyperloglog.c (source / functions) Coverage Total Hit
Test: Code coverage Lines: 74.0 % 73 54
Test Date: 2026-01-26 10:56:24 Functions: 83.3 % 6 5
Legend: Lines:     hit not hit
Branches: + taken - not taken # not executed
Branches: 48.6 % 37 18

             Branch data     Line data    Source code
       1                 :             : /*-------------------------------------------------------------------------
       2                 :             :  *
       3                 :             :  * hyperloglog.c
       4                 :             :  *        HyperLogLog cardinality estimator
       5                 :             :  *
       6                 :             :  * Portions Copyright (c) 2014-2026, PostgreSQL Global Development Group
       7                 :             :  *
       8                 :             :  * Based on Hideaki Ohno's C++ implementation.  This is probably not ideally
       9                 :             :  * suited to estimating the cardinality of very large sets;  in particular, we
      10                 :             :  * have not attempted to further optimize the implementation as described in
      11                 :             :  * the Heule, Nunkesser and Hall paper "HyperLogLog in Practice: Algorithmic
      12                 :             :  * Engineering of a State of The Art Cardinality Estimation Algorithm".
      13                 :             :  *
      14                 :             :  * A sparse representation of HyperLogLog state is used, with fixed space
      15                 :             :  * overhead.
      16                 :             :  *
      17                 :             :  * The copyright terms of Ohno's original version (the MIT license) follow.
      18                 :             :  *
      19                 :             :  * IDENTIFICATION
      20                 :             :  *        src/backend/lib/hyperloglog.c
      21                 :             :  *
      22                 :             :  *-------------------------------------------------------------------------
      23                 :             :  */
      24                 :             : 
      25                 :             : /*
      26                 :             :  * Copyright (c) 2013 Hideaki Ohno <hide.o.j55{at}gmail.com>
      27                 :             :  *
      28                 :             :  * Permission is hereby granted, free of charge, to any person obtaining a copy
      29                 :             :  * of this software and associated documentation files (the 'Software'), to
      30                 :             :  * deal in the Software without restriction, including without limitation the
      31                 :             :  * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
      32                 :             :  * sell copies of the Software, and to permit persons to whom the Software is
      33                 :             :  * furnished to do so, subject to the following conditions:
      34                 :             :  *
      35                 :             :  * The above copyright notice and this permission notice shall be included in
      36                 :             :  * all copies or substantial portions of the Software.
      37                 :             :  *
      38                 :             :  * THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
      39                 :             :  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
      40                 :             :  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
      41                 :             :  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
      42                 :             :  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
      43                 :             :  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
      44                 :             :  * IN THE SOFTWARE.
      45                 :             :  */
      46                 :             : 
      47                 :             : #include "postgres.h"
      48                 :             : 
      49                 :             : #include <math.h>
      50                 :             : 
      51                 :             : #include "lib/hyperloglog.h"
      52                 :             : #include "port/pg_bitutils.h"
      53                 :             : 
      54                 :             : #define POW_2_32                        (4294967296.0)
      55                 :             : #define NEG_POW_2_32            (-4294967296.0)
      56                 :             : 
      57                 :             : static inline uint8 rho(uint32 x, uint8 b);
      58                 :             : 
      59                 :             : /*
      60                 :             :  * Initialize HyperLogLog track state, by bit width
      61                 :             :  *
      62                 :             :  * bwidth is bit width (so register size will be 2 to the power of bwidth).
      63                 :             :  * Must be between 4 and 16 inclusive.
      64                 :             :  */
      65                 :             : void
      66                 :        9139 : initHyperLogLog(hyperLogLogState *cState, uint8 bwidth)
      67                 :             : {
      68                 :        9139 :         double          alpha;
      69                 :             : 
      70         [ +  - ]:        9139 :         if (bwidth < 4 || bwidth > 16)
      71   [ #  #  #  # ]:           0 :                 elog(ERROR, "bit width must be between 4 and 16 inclusive");
      72                 :             : 
      73                 :        9139 :         cState->registerWidth = bwidth;
      74                 :        9139 :         cState->nRegisters = (Size) 1 << bwidth;
      75                 :        9139 :         cState->arrSize = sizeof(uint8) * cState->nRegisters + 1;
      76                 :             : 
      77                 :             :         /*
      78                 :             :          * Initialize hashes array to zero, not negative infinity, per discussion
      79                 :             :          * of the coupon collector problem in the HyperLogLog paper
      80                 :             :          */
      81                 :        9139 :         cState->hashesArr = palloc0(cState->arrSize);
      82                 :             : 
      83                 :             :         /*
      84                 :             :          * "alpha" is a value that for each possible number of registers (m) is
      85                 :             :          * used to correct a systematic multiplicative bias present in m ^ 2 Z (Z
      86                 :             :          * is "the indicator function" through which we finally compute E,
      87                 :             :          * estimated cardinality).
      88                 :             :          */
      89   [ +  +  -  - ]:        9139 :         switch (cState->nRegisters)
      90                 :             :         {
      91                 :             :                 case 16:
      92                 :           0 :                         alpha = 0.673;
      93                 :           0 :                         break;
      94                 :             :                 case 32:
      95                 :        8404 :                         alpha = 0.697;
      96                 :        8404 :                         break;
      97                 :             :                 case 64:
      98                 :           0 :                         alpha = 0.709;
      99                 :           0 :                         break;
     100                 :             :                 default:
     101                 :         735 :                         alpha = 0.7213 / (1.0 + 1.079 / cState->nRegisters);
     102                 :         735 :         }
     103                 :             : 
     104                 :             :         /*
     105                 :             :          * Precalculate alpha m ^ 2, later used to generate "raw" HyperLogLog
     106                 :             :          * estimate E
     107                 :             :          */
     108                 :        9139 :         cState->alphaMM = alpha * cState->nRegisters * cState->nRegisters;
     109                 :        9139 : }
     110                 :             : 
     111                 :             : /*
     112                 :             :  * Initialize HyperLogLog track state, by error rate
     113                 :             :  *
     114                 :             :  * Instead of specifying bwidth (number of bits used for addressing the
     115                 :             :  * register), this method allows sizing the counter for particular error
     116                 :             :  * rate using a simple formula from the paper:
     117                 :             :  *
     118                 :             :  *       e = 1.04 / sqrt(m)
     119                 :             :  *
     120                 :             :  * where 'm' is the number of registers, i.e. (2^bwidth). The method
     121                 :             :  * finds the lowest bwidth with 'e' below the requested error rate, and
     122                 :             :  * then uses it to initialize the counter.
     123                 :             :  *
     124                 :             :  * As bwidth has to be between 4 and 16, the worst possible error rate
     125                 :             :  * is between ~25% (bwidth=4) and 0.4% (bwidth=16).
     126                 :             :  */
     127                 :             : void
     128                 :           0 : initHyperLogLogError(hyperLogLogState *cState, double error)
     129                 :             : {
     130                 :           0 :         uint8           bwidth = 4;
     131                 :             : 
     132         [ #  # ]:           0 :         while (bwidth < 16)
     133                 :             :         {
     134                 :           0 :                 double          m = (Size) 1 << bwidth;
     135                 :             : 
     136         [ #  # ]:           0 :                 if (1.04 / sqrt(m) < error)
     137                 :           0 :                         break;
     138                 :           0 :                 bwidth++;
     139      [ #  #  # ]:           0 :         }
     140                 :             : 
     141                 :           0 :         initHyperLogLog(cState, bwidth);
     142                 :           0 : }
     143                 :             : 
     144                 :             : /*
     145                 :             :  * Free HyperLogLog track state
     146                 :             :  *
     147                 :             :  * Releases allocated resources, but not the state itself (in case it's not
     148                 :             :  * allocated by palloc).
     149                 :             :  */
     150                 :             : void
     151                 :        4485 : freeHyperLogLog(hyperLogLogState *cState)
     152                 :             : {
     153         [ +  - ]:        4485 :         Assert(cState->hashesArr != NULL);
     154                 :        4485 :         pfree(cState->hashesArr);
     155                 :        4485 : }
     156                 :             : 
     157                 :             : /*
     158                 :             :  * Adds element to the estimator, from caller-supplied hash.
     159                 :             :  *
     160                 :             :  * It is critical that the hash value passed be an actual hash value, typically
     161                 :             :  * generated using hash_any().  The algorithm relies on a specific bit-pattern
     162                 :             :  * observable in conjunction with stochastic averaging.  There must be a
     163                 :             :  * uniform distribution of bits in hash values for each distinct original value
     164                 :             :  * observed.
     165                 :             :  */
     166                 :             : void
     167                 :      826409 : addHyperLogLog(hyperLogLogState *cState, uint32 hash)
     168                 :             : {
     169                 :      826409 :         uint8           count;
     170                 :      826409 :         uint32          index;
     171                 :             : 
     172                 :             :         /* Use the first "k" (registerWidth) bits as a zero based index */
     173                 :      826409 :         index = hash >> (BITS_PER_BYTE * sizeof(uint32) - cState->registerWidth);
     174                 :             : 
     175                 :             :         /* Compute the rank of the remaining 32 - "k" (registerWidth) bits */
     176                 :     1652818 :         count = rho(hash << cState->registerWidth,
     177                 :      826409 :                                 BITS_PER_BYTE * sizeof(uint32) - cState->registerWidth);
     178                 :             : 
     179         [ +  + ]:      826409 :         cState->hashesArr[index] = Max(count, cState->hashesArr[index]);
     180                 :      826409 : }
     181                 :             : 
     182                 :             : /*
     183                 :             :  * Estimates cardinality, based on elements added so far
     184                 :             :  */
     185                 :             : double
     186                 :        4735 : estimateHyperLogLog(hyperLogLogState *cState)
     187                 :             : {
     188                 :        4735 :         double          result;
     189                 :        4735 :         double          sum = 0.0;
     190                 :        4735 :         int                     i;
     191                 :             : 
     192         [ +  + ]:      404255 :         for (i = 0; i < cState->nRegisters; i++)
     193                 :             :         {
     194                 :      399520 :                 sum += 1.0 / pow(2.0, cState->hashesArr[i]);
     195                 :      399520 :         }
     196                 :             : 
     197                 :             :         /* result set to "raw" HyperLogLog estimate (E in the HyperLogLog paper) */
     198                 :        4735 :         result = cState->alphaMM / sum;
     199                 :             : 
     200         [ +  + ]:        4735 :         if (result <= (5.0 / 2.0) * cState->nRegisters)
     201                 :             :         {
     202                 :             :                 /* Small range correction */
     203                 :        4621 :                 int                     zero_count = 0;
     204                 :             : 
     205         [ +  + ]:      380653 :                 for (i = 0; i < cState->nRegisters; i++)
     206                 :             :                 {
     207         [ +  + ]:      376032 :                         if (cState->hashesArr[i] == 0)
     208                 :      267108 :                                 zero_count++;
     209                 :      376032 :                 }
     210                 :             : 
     211         [ -  + ]:        4621 :                 if (zero_count != 0)
     212                 :        9242 :                         result = cState->nRegisters * log((double) cState->nRegisters /
     213                 :        4621 :                                                                                           zero_count);
     214                 :        4621 :         }
     215         [ +  - ]:         114 :         else if (result > (1.0 / 30.0) * POW_2_32)
     216                 :             :         {
     217                 :             :                 /* Large range correction */
     218                 :           0 :                 result = NEG_POW_2_32 * log(1.0 - (result / POW_2_32));
     219                 :           0 :         }
     220                 :             : 
     221                 :        9470 :         return result;
     222                 :        4735 : }
     223                 :             : 
     224                 :             : /*
     225                 :             :  * Worker for addHyperLogLog().
     226                 :             :  *
     227                 :             :  * Calculates the position of the first set bit in first b bits of x argument
     228                 :             :  * starting from the first, reading from most significant to least significant
     229                 :             :  * bits.
     230                 :             :  *
     231                 :             :  * Example (when considering fist 10 bits of x):
     232                 :             :  *
     233                 :             :  * rho(x = 0b1000000000)   returns 1
     234                 :             :  * rho(x = 0b0010000000)   returns 3
     235                 :             :  * rho(x = 0b0000000000)   returns b + 1
     236                 :             :  *
     237                 :             :  * "The binary address determined by the first b bits of x"
     238                 :             :  *
     239                 :             :  * Return value "j" used to index bit pattern to watch.
     240                 :             :  */
     241                 :             : static inline uint8
     242                 :      826409 : rho(uint32 x, uint8 b)
     243                 :             : {
     244                 :      826409 :         uint8           j = 1;
     245                 :             : 
     246         [ +  - ]:      826409 :         if (x == 0)
     247                 :           0 :                 return b + 1;
     248                 :             : 
     249                 :      826409 :         j = 32 - pg_leftmost_one_pos32(x);
     250                 :             : 
     251         [ -  + ]:      826409 :         if (j > b)
     252                 :           0 :                 return b + 1;
     253                 :             : 
     254                 :      826409 :         return j;
     255                 :      826409 : }
        

Generated by: LCOV version 2.3.2-1