LCOV - code coverage report
Current view: top level - src/backend/regex - regprefix.c (source / functions) Coverage Total Hit
Test: Code coverage Lines: 88.1 % 101 89
Test Date: 2026-01-26 10:56:24 Functions: 100.0 % 2 2
Legend: Lines:     hit not hit
Branches: + taken - not taken # not executed
Branches: 72.5 % 80 58

             Branch data     Line data    Source code
       1                 :             : /*-------------------------------------------------------------------------
       2                 :             :  *
       3                 :             :  * regprefix.c
       4                 :             :  *        Extract a common prefix, if any, from a compiled regex.
       5                 :             :  *
       6                 :             :  *
       7                 :             :  * Portions Copyright (c) 2012-2026, PostgreSQL Global Development Group
       8                 :             :  * Portions Copyright (c) 1998, 1999 Henry Spencer
       9                 :             :  *
      10                 :             :  * IDENTIFICATION
      11                 :             :  *        src/backend/regex/regprefix.c
      12                 :             :  *
      13                 :             :  *-------------------------------------------------------------------------
      14                 :             :  */
      15                 :             : 
      16                 :             : #include "regex/regguts.h"
      17                 :             : 
      18                 :             : 
      19                 :             : /*
      20                 :             :  * forward declarations
      21                 :             :  */
      22                 :             : static int      findprefix(struct cnfa *cnfa, struct colormap *cm,
      23                 :             :                                            chr *string, size_t *slength);
      24                 :             : 
      25                 :             : 
      26                 :             : /*
      27                 :             :  * pg_regprefix - get common prefix for regular expression
      28                 :             :  *
      29                 :             :  * Returns one of:
      30                 :             :  *      REG_NOMATCH: there is no common prefix of strings matching the regex
      31                 :             :  *      REG_PREFIX: there is a common prefix of strings matching the regex
      32                 :             :  *      REG_EXACT: all strings satisfying the regex must match the same string
      33                 :             :  *      or a REG_XXX error code
      34                 :             :  *
      35                 :             :  * In the non-failure cases, *string is set to a palloc'd string containing
      36                 :             :  * the common prefix or exact value, of length *slength (measured in chrs
      37                 :             :  * not bytes!).
      38                 :             :  *
      39                 :             :  * This function does not analyze all complex cases (such as lookaround
      40                 :             :  * constraints) exactly.  Therefore it is possible that some strings matching
      41                 :             :  * the reported prefix or exact-match string do not satisfy the regex.  But
      42                 :             :  * it should never be the case that a string satisfying the regex does not
      43                 :             :  * match the reported prefix or exact-match string.
      44                 :             :  */
      45                 :             : int
      46                 :        2342 : pg_regprefix(regex_t *re,
      47                 :             :                          chr **string,
      48                 :             :                          size_t *slength)
      49                 :             : {
      50                 :        2342 :         struct guts *g;
      51                 :        2342 :         struct cnfa *cnfa;
      52                 :        2342 :         int                     st;
      53                 :             : 
      54                 :             :         /* sanity checks */
      55   [ +  -  -  + ]:        2342 :         if (string == NULL || slength == NULL)
      56                 :           0 :                 return REG_INVARG;
      57                 :        2342 :         *string = NULL;                         /* initialize for failure cases */
      58                 :        2342 :         *slength = 0;
      59   [ +  -  -  + ]:        2342 :         if (re == NULL || re->re_magic != REMAGIC)
      60                 :           0 :                 return REG_INVARG;
      61         [ -  + ]:        2342 :         if (re->re_csize != sizeof(chr))
      62                 :           0 :                 return REG_MIXED;
      63                 :             : 
      64                 :             :         /* Initialize locale-dependent support */
      65                 :        2342 :         pg_set_regex_collation(re->re_collation);
      66                 :             : 
      67                 :             :         /* setup */
      68                 :        2342 :         g = (struct guts *) re->re_guts;
      69         [ -  + ]:        2342 :         if (g->info & REG_UIMPOSSIBLE)
      70                 :           0 :                 return REG_NOMATCH;
      71                 :             : 
      72                 :             :         /*
      73                 :             :          * This implementation considers only the search NFA for the topmost regex
      74                 :             :          * tree node.  Therefore, constraints such as backrefs are not fully
      75                 :             :          * applied, which is allowed per the function's API spec.
      76                 :             :          */
      77         [ +  - ]:        2342 :         assert(g->tree != NULL);
      78                 :        2342 :         cnfa = &g->tree->cnfa;
      79                 :             : 
      80                 :             :         /* matchall NFAs never have a fixed prefix */
      81         [ +  + ]:        2342 :         if (cnfa->flags & MATCHALL)
      82                 :           2 :                 return REG_NOMATCH;
      83                 :             : 
      84                 :             :         /*
      85                 :             :          * Since a correct NFA should never contain any exit-free loops, it should
      86                 :             :          * not be possible for our traversal to return to a previously visited NFA
      87                 :             :          * state.  Hence we need at most nstates chrs in the output string.
      88                 :             :          */
      89                 :        2340 :         *string = (chr *) MALLOC(cnfa->nstates * sizeof(chr));
      90         [ +  - ]:        2340 :         if (*string == NULL)
      91                 :           0 :                 return REG_ESPACE;
      92                 :             : 
      93                 :             :         /* do it */
      94                 :        2340 :         st = findprefix(cnfa, &g->cmap, *string, slength);
      95                 :             : 
      96         [ +  - ]:        2340 :         assert(*slength <= cnfa->nstates);
      97                 :             : 
      98                 :             :         /* clean up */
      99   [ +  +  +  + ]:        2340 :         if (st != REG_PREFIX && st != REG_EXACT)
     100                 :             :         {
     101                 :          94 :                 FREE(*string);
     102                 :          94 :                 *string = NULL;
     103                 :          94 :                 *slength = 0;
     104                 :          94 :         }
     105                 :             : 
     106                 :        2340 :         return st;
     107                 :        2342 : }
     108                 :             : 
     109                 :             : /*
     110                 :             :  * findprefix - extract common prefix from cNFA
     111                 :             :  *
     112                 :             :  * Results are returned into the preallocated chr array string[], with
     113                 :             :  * *slength (which must be preset to zero) incremented for each chr.
     114                 :             :  */
     115                 :             : static int                                              /* regprefix return code */
     116                 :        2340 : findprefix(struct cnfa *cnfa,
     117                 :             :                    struct colormap *cm,
     118                 :             :                    chr *string,
     119                 :             :                    size_t *slength)
     120                 :             : {
     121                 :        2340 :         int                     st;
     122                 :        2340 :         int                     nextst;
     123                 :        2340 :         color           thiscolor;
     124                 :        2340 :         chr                     c;
     125                 :        2340 :         struct carc *ca;
     126                 :             : 
     127                 :             :         /*
     128                 :             :          * The "pre" state must have only BOS/BOL outarcs, else pattern isn't
     129                 :             :          * anchored left.  If we have both BOS and BOL, they must go to the same
     130                 :             :          * next state.
     131                 :             :          */
     132                 :        2340 :         st = cnfa->pre;
     133                 :        2340 :         nextst = -1;
     134         [ +  + ]:        4618 :         for (ca = cnfa->states[st]; ca->co != COLORLESS; ca++)
     135                 :             :         {
     136   [ +  -  +  + ]:        2342 :                 if (ca->co == cnfa->bos[0] || ca->co == cnfa->bos[1])
     137                 :             :                 {
     138         [ +  + ]:        2280 :                         if (nextst == -1)
     139                 :        2278 :                                 nextst = ca->to;
     140         [ -  + ]:           2 :                         else if (nextst != ca->to)
     141                 :           2 :                                 return REG_NOMATCH;
     142                 :        2278 :                 }
     143                 :             :                 else
     144                 :          62 :                         return REG_NOMATCH;
     145                 :        2278 :         }
     146         [ +  - ]:        2276 :         if (nextst == -1)
     147                 :           0 :                 return REG_NOMATCH;
     148                 :             : 
     149                 :             :         /*
     150                 :             :          * Scan through successive states, stopping as soon as we find one with
     151                 :             :          * more than one acceptable transition character (either multiple colors
     152                 :             :          * on out-arcs, or a color with more than one member chr).
     153                 :             :          *
     154                 :             :          * We could find a state with multiple out-arcs that are all labeled with
     155                 :             :          * the same singleton color; this comes from patterns like "^ab(cde|cxy)".
     156                 :             :          * In that case we add the chr "c" to the output string but then exit the
     157                 :             :          * loop with nextst == -1.  This leaves a little bit on the table: if the
     158                 :             :          * pattern is like "^ab(cde|cdy)", we won't notice that "d" could be added
     159                 :             :          * to the prefix.  But chasing multiple parallel state chains doesn't seem
     160                 :             :          * worth the trouble.
     161                 :             :          */
     162                 :        2276 :         do
     163                 :             :         {
     164                 :       29281 :                 st = nextst;
     165                 :       29281 :                 nextst = -1;
     166                 :       29281 :                 thiscolor = COLORLESS;
     167         [ +  + ]:       56314 :                 for (ca = cnfa->states[st]; ca->co != COLORLESS; ca++)
     168                 :             :                 {
     169                 :             :                         /* We can ignore BOS/BOL arcs */
     170   [ +  -  -  + ]:       29294 :                         if (ca->co == cnfa->bos[0] || ca->co == cnfa->bos[1])
     171                 :           0 :                                 continue;
     172                 :             : 
     173                 :             :                         /*
     174                 :             :                          * ... but EOS/EOL arcs terminate the search, as do RAINBOW arcs
     175                 :             :                          * and LACONs
     176                 :             :                          */
     177   [ +  -  +  + ]:       29294 :                         if (ca->co == cnfa->eos[0] || ca->co == cnfa->eos[1] ||
     178   [ +  +  +  + ]:       27215 :                                 ca->co == RAINBOW || ca->co >= cnfa->ncolors)
     179                 :             :                         {
     180                 :        2252 :                                 thiscolor = COLORLESS;
     181                 :        2252 :                                 break;
     182                 :             :                         }
     183         [ +  + ]:       27042 :                         if (thiscolor == COLORLESS)
     184                 :             :                         {
     185                 :             :                                 /* First plain outarc */
     186                 :       27031 :                                 thiscolor = ca->co;
     187                 :       27031 :                                 nextst = ca->to;
     188                 :       27031 :                         }
     189         [ +  + ]:          11 :                         else if (thiscolor == ca->co)
     190                 :             :                         {
     191                 :             :                                 /* Another plain outarc for same color */
     192                 :           2 :                                 nextst = -1;
     193                 :           2 :                         }
     194                 :             :                         else
     195                 :             :                         {
     196                 :             :                                 /* More than one plain outarc color terminates the search */
     197                 :           9 :                                 thiscolor = COLORLESS;
     198                 :           9 :                                 break;
     199                 :             :                         }
     200                 :       27033 :                 }
     201                 :             :                 /* Done if we didn't find exactly one color on plain outarcs */
     202         [ +  + ]:       29281 :                 if (thiscolor == COLORLESS)
     203                 :        2261 :                         break;
     204                 :             :                 /* The color must be a singleton */
     205         [ +  + ]:       27020 :                 if (cm->cd[thiscolor].nschrs != 1)
     206                 :          13 :                         break;
     207                 :             :                 /* Must not have any high-color-map entries */
     208         [ -  + ]:       27007 :                 if (cm->cd[thiscolor].nuchrs != 0)
     209                 :           0 :                         break;
     210                 :             : 
     211                 :             :                 /*
     212                 :             :                  * Identify the color's sole member chr and add it to the prefix
     213                 :             :                  * string.  In general the colormap data structure doesn't provide a
     214                 :             :                  * way to find color member chrs, except by trying GETCOLOR() on each
     215                 :             :                  * possible chr value, which won't do at all.  However, for the cases
     216                 :             :                  * we care about it should be sufficient to test the "firstchr" value,
     217                 :             :                  * that is the first chr ever added to the color.  There are cases
     218                 :             :                  * where this might no longer be a member of the color (so we do need
     219                 :             :                  * to test), but none of them are likely to arise for a character that
     220                 :             :                  * is a member of a common prefix.  If we do hit such a corner case,
     221                 :             :                  * we just fall out without adding anything to the prefix string.
     222                 :             :                  */
     223                 :       27007 :                 c = cm->cd[thiscolor].firstchr;
     224   [ +  -  +  - ]:       27007 :                 if (GETCOLOR(cm, c) != thiscolor)
     225                 :           0 :                         break;
     226                 :             : 
     227                 :       27007 :                 string[(*slength)++] = c;
     228                 :             : 
     229                 :             :                 /* Advance to next state, but only if we have a unique next state */
     230         [ +  + ]:       27007 :         } while (nextst != -1);
     231                 :             : 
     232                 :             :         /*
     233                 :             :          * If we ended at a state that only has EOS/EOL outarcs leading to the
     234                 :             :          * "post" state, then we have an exact-match string.  Note this is true
     235                 :             :          * even if the string is of zero length.
     236                 :             :          */
     237                 :        2276 :         nextst = -1;
     238         [ +  + ]:        4355 :         for (ca = cnfa->states[st]; ca->co != COLORLESS; ca++)
     239                 :             :         {
     240   [ +  -  +  + ]:        2276 :                 if (ca->co == cnfa->eos[0] || ca->co == cnfa->eos[1])
     241                 :             :                 {
     242         [ -  + ]:        2079 :                         if (nextst == -1)
     243                 :        2079 :                                 nextst = ca->to;
     244         [ #  # ]:           0 :                         else if (nextst != ca->to)
     245                 :             :                         {
     246                 :           0 :                                 nextst = -1;
     247                 :           0 :                                 break;
     248                 :             :                         }
     249                 :        2079 :                 }
     250                 :             :                 else
     251                 :             :                 {
     252                 :         197 :                         nextst = -1;
     253                 :         197 :                         break;
     254                 :             :                 }
     255                 :        2079 :         }
     256         [ +  + ]:        2276 :         if (nextst == cnfa->post)
     257                 :        2079 :                 return REG_EXACT;
     258                 :             : 
     259                 :             :         /*
     260                 :             :          * Otherwise, if we were unable to identify any prefix characters, say
     261                 :             :          * NOMATCH --- the pattern is anchored left, but doesn't specify any
     262                 :             :          * particular first character.
     263                 :             :          */
     264         [ +  + ]:         197 :         if (*slength > 0)
     265                 :         167 :                 return REG_PREFIX;
     266                 :             : 
     267                 :          30 :         return REG_NOMATCH;
     268                 :        2340 : }
        

Generated by: LCOV version 2.3.2-1