Branch data Line data Source code
1 : : /*-------------------------------------------------------------------------
2 : : *
3 : : * regprefix.c
4 : : * Extract a common prefix, if any, from a compiled regex.
5 : : *
6 : : *
7 : : * Portions Copyright (c) 2012-2026, PostgreSQL Global Development Group
8 : : * Portions Copyright (c) 1998, 1999 Henry Spencer
9 : : *
10 : : * IDENTIFICATION
11 : : * src/backend/regex/regprefix.c
12 : : *
13 : : *-------------------------------------------------------------------------
14 : : */
15 : :
16 : : #include "regex/regguts.h"
17 : :
18 : :
19 : : /*
20 : : * forward declarations
21 : : */
22 : : static int findprefix(struct cnfa *cnfa, struct colormap *cm,
23 : : chr *string, size_t *slength);
24 : :
25 : :
26 : : /*
27 : : * pg_regprefix - get common prefix for regular expression
28 : : *
29 : : * Returns one of:
30 : : * REG_NOMATCH: there is no common prefix of strings matching the regex
31 : : * REG_PREFIX: there is a common prefix of strings matching the regex
32 : : * REG_EXACT: all strings satisfying the regex must match the same string
33 : : * or a REG_XXX error code
34 : : *
35 : : * In the non-failure cases, *string is set to a palloc'd string containing
36 : : * the common prefix or exact value, of length *slength (measured in chrs
37 : : * not bytes!).
38 : : *
39 : : * This function does not analyze all complex cases (such as lookaround
40 : : * constraints) exactly. Therefore it is possible that some strings matching
41 : : * the reported prefix or exact-match string do not satisfy the regex. But
42 : : * it should never be the case that a string satisfying the regex does not
43 : : * match the reported prefix or exact-match string.
44 : : */
45 : : int
46 : 2342 : pg_regprefix(regex_t *re,
47 : : chr **string,
48 : : size_t *slength)
49 : : {
50 : 2342 : struct guts *g;
51 : 2342 : struct cnfa *cnfa;
52 : 2342 : int st;
53 : :
54 : : /* sanity checks */
55 [ + - - + ]: 2342 : if (string == NULL || slength == NULL)
56 : 0 : return REG_INVARG;
57 : 2342 : *string = NULL; /* initialize for failure cases */
58 : 2342 : *slength = 0;
59 [ + - - + ]: 2342 : if (re == NULL || re->re_magic != REMAGIC)
60 : 0 : return REG_INVARG;
61 [ - + ]: 2342 : if (re->re_csize != sizeof(chr))
62 : 0 : return REG_MIXED;
63 : :
64 : : /* Initialize locale-dependent support */
65 : 2342 : pg_set_regex_collation(re->re_collation);
66 : :
67 : : /* setup */
68 : 2342 : g = (struct guts *) re->re_guts;
69 [ - + ]: 2342 : if (g->info & REG_UIMPOSSIBLE)
70 : 0 : return REG_NOMATCH;
71 : :
72 : : /*
73 : : * This implementation considers only the search NFA for the topmost regex
74 : : * tree node. Therefore, constraints such as backrefs are not fully
75 : : * applied, which is allowed per the function's API spec.
76 : : */
77 [ + - ]: 2342 : assert(g->tree != NULL);
78 : 2342 : cnfa = &g->tree->cnfa;
79 : :
80 : : /* matchall NFAs never have a fixed prefix */
81 [ + + ]: 2342 : if (cnfa->flags & MATCHALL)
82 : 2 : return REG_NOMATCH;
83 : :
84 : : /*
85 : : * Since a correct NFA should never contain any exit-free loops, it should
86 : : * not be possible for our traversal to return to a previously visited NFA
87 : : * state. Hence we need at most nstates chrs in the output string.
88 : : */
89 : 2340 : *string = (chr *) MALLOC(cnfa->nstates * sizeof(chr));
90 [ + - ]: 2340 : if (*string == NULL)
91 : 0 : return REG_ESPACE;
92 : :
93 : : /* do it */
94 : 2340 : st = findprefix(cnfa, &g->cmap, *string, slength);
95 : :
96 [ + - ]: 2340 : assert(*slength <= cnfa->nstates);
97 : :
98 : : /* clean up */
99 [ + + + + ]: 2340 : if (st != REG_PREFIX && st != REG_EXACT)
100 : : {
101 : 94 : FREE(*string);
102 : 94 : *string = NULL;
103 : 94 : *slength = 0;
104 : 94 : }
105 : :
106 : 2340 : return st;
107 : 2342 : }
108 : :
109 : : /*
110 : : * findprefix - extract common prefix from cNFA
111 : : *
112 : : * Results are returned into the preallocated chr array string[], with
113 : : * *slength (which must be preset to zero) incremented for each chr.
114 : : */
115 : : static int /* regprefix return code */
116 : 2340 : findprefix(struct cnfa *cnfa,
117 : : struct colormap *cm,
118 : : chr *string,
119 : : size_t *slength)
120 : : {
121 : 2340 : int st;
122 : 2340 : int nextst;
123 : 2340 : color thiscolor;
124 : 2340 : chr c;
125 : 2340 : struct carc *ca;
126 : :
127 : : /*
128 : : * The "pre" state must have only BOS/BOL outarcs, else pattern isn't
129 : : * anchored left. If we have both BOS and BOL, they must go to the same
130 : : * next state.
131 : : */
132 : 2340 : st = cnfa->pre;
133 : 2340 : nextst = -1;
134 [ + + ]: 4618 : for (ca = cnfa->states[st]; ca->co != COLORLESS; ca++)
135 : : {
136 [ + - + + ]: 2342 : if (ca->co == cnfa->bos[0] || ca->co == cnfa->bos[1])
137 : : {
138 [ + + ]: 2280 : if (nextst == -1)
139 : 2278 : nextst = ca->to;
140 [ - + ]: 2 : else if (nextst != ca->to)
141 : 2 : return REG_NOMATCH;
142 : 2278 : }
143 : : else
144 : 62 : return REG_NOMATCH;
145 : 2278 : }
146 [ + - ]: 2276 : if (nextst == -1)
147 : 0 : return REG_NOMATCH;
148 : :
149 : : /*
150 : : * Scan through successive states, stopping as soon as we find one with
151 : : * more than one acceptable transition character (either multiple colors
152 : : * on out-arcs, or a color with more than one member chr).
153 : : *
154 : : * We could find a state with multiple out-arcs that are all labeled with
155 : : * the same singleton color; this comes from patterns like "^ab(cde|cxy)".
156 : : * In that case we add the chr "c" to the output string but then exit the
157 : : * loop with nextst == -1. This leaves a little bit on the table: if the
158 : : * pattern is like "^ab(cde|cdy)", we won't notice that "d" could be added
159 : : * to the prefix. But chasing multiple parallel state chains doesn't seem
160 : : * worth the trouble.
161 : : */
162 : 2276 : do
163 : : {
164 : 29281 : st = nextst;
165 : 29281 : nextst = -1;
166 : 29281 : thiscolor = COLORLESS;
167 [ + + ]: 56314 : for (ca = cnfa->states[st]; ca->co != COLORLESS; ca++)
168 : : {
169 : : /* We can ignore BOS/BOL arcs */
170 [ + - - + ]: 29294 : if (ca->co == cnfa->bos[0] || ca->co == cnfa->bos[1])
171 : 0 : continue;
172 : :
173 : : /*
174 : : * ... but EOS/EOL arcs terminate the search, as do RAINBOW arcs
175 : : * and LACONs
176 : : */
177 [ + - + + ]: 29294 : if (ca->co == cnfa->eos[0] || ca->co == cnfa->eos[1] ||
178 [ + + + + ]: 27215 : ca->co == RAINBOW || ca->co >= cnfa->ncolors)
179 : : {
180 : 2252 : thiscolor = COLORLESS;
181 : 2252 : break;
182 : : }
183 [ + + ]: 27042 : if (thiscolor == COLORLESS)
184 : : {
185 : : /* First plain outarc */
186 : 27031 : thiscolor = ca->co;
187 : 27031 : nextst = ca->to;
188 : 27031 : }
189 [ + + ]: 11 : else if (thiscolor == ca->co)
190 : : {
191 : : /* Another plain outarc for same color */
192 : 2 : nextst = -1;
193 : 2 : }
194 : : else
195 : : {
196 : : /* More than one plain outarc color terminates the search */
197 : 9 : thiscolor = COLORLESS;
198 : 9 : break;
199 : : }
200 : 27033 : }
201 : : /* Done if we didn't find exactly one color on plain outarcs */
202 [ + + ]: 29281 : if (thiscolor == COLORLESS)
203 : 2261 : break;
204 : : /* The color must be a singleton */
205 [ + + ]: 27020 : if (cm->cd[thiscolor].nschrs != 1)
206 : 13 : break;
207 : : /* Must not have any high-color-map entries */
208 [ - + ]: 27007 : if (cm->cd[thiscolor].nuchrs != 0)
209 : 0 : break;
210 : :
211 : : /*
212 : : * Identify the color's sole member chr and add it to the prefix
213 : : * string. In general the colormap data structure doesn't provide a
214 : : * way to find color member chrs, except by trying GETCOLOR() on each
215 : : * possible chr value, which won't do at all. However, for the cases
216 : : * we care about it should be sufficient to test the "firstchr" value,
217 : : * that is the first chr ever added to the color. There are cases
218 : : * where this might no longer be a member of the color (so we do need
219 : : * to test), but none of them are likely to arise for a character that
220 : : * is a member of a common prefix. If we do hit such a corner case,
221 : : * we just fall out without adding anything to the prefix string.
222 : : */
223 : 27007 : c = cm->cd[thiscolor].firstchr;
224 [ + - + - ]: 27007 : if (GETCOLOR(cm, c) != thiscolor)
225 : 0 : break;
226 : :
227 : 27007 : string[(*slength)++] = c;
228 : :
229 : : /* Advance to next state, but only if we have a unique next state */
230 [ + + ]: 27007 : } while (nextst != -1);
231 : :
232 : : /*
233 : : * If we ended at a state that only has EOS/EOL outarcs leading to the
234 : : * "post" state, then we have an exact-match string. Note this is true
235 : : * even if the string is of zero length.
236 : : */
237 : 2276 : nextst = -1;
238 [ + + ]: 4355 : for (ca = cnfa->states[st]; ca->co != COLORLESS; ca++)
239 : : {
240 [ + - + + ]: 2276 : if (ca->co == cnfa->eos[0] || ca->co == cnfa->eos[1])
241 : : {
242 [ - + ]: 2079 : if (nextst == -1)
243 : 2079 : nextst = ca->to;
244 [ # # ]: 0 : else if (nextst != ca->to)
245 : : {
246 : 0 : nextst = -1;
247 : 0 : break;
248 : : }
249 : 2079 : }
250 : : else
251 : : {
252 : 197 : nextst = -1;
253 : 197 : break;
254 : : }
255 : 2079 : }
256 [ + + ]: 2276 : if (nextst == cnfa->post)
257 : 2079 : return REG_EXACT;
258 : :
259 : : /*
260 : : * Otherwise, if we were unable to identify any prefix characters, say
261 : : * NOMATCH --- the pattern is anchored left, but doesn't specify any
262 : : * particular first character.
263 : : */
264 [ + + ]: 197 : if (*slength > 0)
265 : 167 : return REG_PREFIX;
266 : :
267 : 30 : return REG_NOMATCH;
268 : 2340 : }
|