Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * parser.c
4 : * Main entry point/driver for PostgreSQL grammar
5 : *
6 : * This should match src/backend/parser/parser.c, except that we do not
7 : * need to bother with re-entrant interfaces.
8 : *
9 : * Note: ECPG doesn't report error location like the backend does.
10 : * This file will need work if we ever want it to.
11 : *
12 : *
13 : * Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group
14 : * Portions Copyright (c) 1994, Regents of the University of California
15 : *
16 : * IDENTIFICATION
17 : * src/interfaces/ecpg/preproc/parser.c
18 : *
19 : *-------------------------------------------------------------------------
20 : */
21 :
22 : #include "postgres_fe.h"
23 :
24 : #include "preproc_extern.h"
25 : #include "preproc.h"
26 :
27 :
28 : static bool have_lookahead; /* is lookahead info valid? */
29 : static int lookahead_token; /* one-token lookahead */
30 : static YYSTYPE lookahead_yylval; /* yylval for lookahead token */
31 : static YYLTYPE lookahead_yylloc; /* yylloc for lookahead token */
32 : static char *lookahead_yytext; /* start of current token */
33 :
34 : static int base_yylex_location(void);
35 : static bool check_uescapechar(unsigned char escape);
36 : static bool ecpg_isspace(char ch);
37 :
38 :
39 : /*
40 : * Intermediate filter between parser and base lexer (base_yylex in scan.l).
41 : *
42 : * This filter is needed because in some cases the standard SQL grammar
43 : * requires more than one token lookahead. We reduce these cases to one-token
44 : * lookahead by replacing tokens here, in order to keep the grammar LALR(1).
45 : *
46 : * Using a filter is simpler than trying to recognize multiword tokens
47 : * directly in scan.l, because we'd have to allow for comments between the
48 : * words. Furthermore it's not clear how to do that without re-introducing
49 : * scanner backtrack, which would cost more performance than this filter
50 : * layer does.
51 : *
52 : * We also use this filter to convert UIDENT and USCONST sequences into
53 : * plain IDENT and SCONST tokens. While that could be handled by additional
54 : * productions in the main grammar, it's more efficient to do it like this.
55 : */
56 : int
57 0 : filtered_base_yylex(void)
58 : {
59 0 : int cur_token;
60 0 : int next_token;
61 0 : YYSTYPE cur_yylval;
62 0 : YYLTYPE cur_yylloc;
63 0 : char *cur_yytext;
64 :
65 : /* Get next token --- we might already have it */
66 0 : if (have_lookahead)
67 : {
68 0 : cur_token = lookahead_token;
69 0 : base_yylval = lookahead_yylval;
70 0 : base_yylloc = lookahead_yylloc;
71 0 : base_yytext = lookahead_yytext;
72 0 : have_lookahead = false;
73 0 : }
74 : else
75 0 : cur_token = base_yylex_location();
76 :
77 : /*
78 : * If this token isn't one that requires lookahead, just return it.
79 : */
80 0 : switch (cur_token)
81 : {
82 : case FORMAT:
83 : case NOT:
84 : case NULLS_P:
85 : case WITH:
86 : case WITHOUT:
87 : case UIDENT:
88 : case USCONST:
89 0 : break;
90 : default:
91 0 : return cur_token;
92 : }
93 :
94 : /* Save and restore lexer output variables around the call */
95 0 : cur_yylval = base_yylval;
96 0 : cur_yylloc = base_yylloc;
97 0 : cur_yytext = base_yytext;
98 :
99 : /* Get next token, saving outputs into lookahead variables */
100 0 : next_token = base_yylex_location();
101 :
102 0 : lookahead_token = next_token;
103 0 : lookahead_yylval = base_yylval;
104 0 : lookahead_yylloc = base_yylloc;
105 0 : lookahead_yytext = base_yytext;
106 :
107 0 : base_yylval = cur_yylval;
108 0 : base_yylloc = cur_yylloc;
109 0 : base_yytext = cur_yytext;
110 :
111 0 : have_lookahead = true;
112 :
113 : /* Replace cur_token if needed, based on lookahead */
114 0 : switch (cur_token)
115 : {
116 : case FORMAT:
117 : /* Replace FORMAT by FORMAT_LA if it's followed by JSON */
118 0 : switch (next_token)
119 : {
120 : case JSON:
121 0 : cur_token = FORMAT_LA;
122 0 : break;
123 : }
124 0 : break;
125 :
126 : case NOT:
127 : /* Replace NOT by NOT_LA if it's followed by BETWEEN, IN, etc */
128 0 : switch (next_token)
129 : {
130 : case BETWEEN:
131 : case IN_P:
132 : case LIKE:
133 : case ILIKE:
134 : case SIMILAR:
135 0 : cur_token = NOT_LA;
136 0 : break;
137 : }
138 0 : break;
139 :
140 : case NULLS_P:
141 : /* Replace NULLS_P by NULLS_LA if it's followed by FIRST or LAST */
142 0 : switch (next_token)
143 : {
144 : case FIRST_P:
145 : case LAST_P:
146 0 : cur_token = NULLS_LA;
147 0 : break;
148 : }
149 0 : break;
150 :
151 : case WITH:
152 : /* Replace WITH by WITH_LA if it's followed by TIME or ORDINALITY */
153 0 : switch (next_token)
154 : {
155 : case TIME:
156 : case ORDINALITY:
157 0 : cur_token = WITH_LA;
158 0 : break;
159 : }
160 0 : break;
161 :
162 : case WITHOUT:
163 : /* Replace WITHOUT by WITHOUT_LA if it's followed by TIME */
164 0 : switch (next_token)
165 : {
166 : case TIME:
167 0 : cur_token = WITHOUT_LA;
168 0 : break;
169 : }
170 0 : break;
171 : case UIDENT:
172 : case USCONST:
173 : /* Look ahead for UESCAPE */
174 0 : if (next_token == UESCAPE)
175 : {
176 : /* Yup, so get third token, which had better be SCONST */
177 0 : const char *escstr;
178 :
179 : /*
180 : * Again save and restore lexer output variables around the
181 : * call
182 : */
183 0 : cur_yylval = base_yylval;
184 0 : cur_yylloc = base_yylloc;
185 0 : cur_yytext = base_yytext;
186 :
187 : /* Get third token */
188 0 : next_token = base_yylex_location();
189 :
190 0 : if (next_token != SCONST)
191 0 : mmerror(PARSE_ERROR, ET_ERROR, "UESCAPE must be followed by a simple string literal");
192 :
193 : /*
194 : * Save and check escape string, which the scanner returns
195 : * with quotes
196 : */
197 0 : escstr = base_yylval.str;
198 0 : if (strlen(escstr) != 3 || !check_uescapechar(escstr[1]))
199 0 : mmerror(PARSE_ERROR, ET_ERROR, "invalid Unicode escape character");
200 :
201 0 : base_yylval = cur_yylval;
202 0 : base_yylloc = cur_yylloc;
203 0 : base_yytext = cur_yytext;
204 :
205 : /* Combine 3 tokens into 1 */
206 0 : base_yylval.str = make3_str(base_yylval.str,
207 : " UESCAPE ",
208 0 : escstr);
209 0 : base_yylloc = loc_strdup(base_yylval.str);
210 :
211 : /* Clear have_lookahead, thereby consuming all three tokens */
212 0 : have_lookahead = false;
213 0 : }
214 :
215 0 : if (cur_token == UIDENT)
216 0 : cur_token = IDENT;
217 0 : else if (cur_token == USCONST)
218 0 : cur_token = SCONST;
219 0 : break;
220 : }
221 :
222 0 : return cur_token;
223 0 : }
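
/*
 * Illustrative sketch (not part of parser.c): the same one-token lookahead
 * pattern as filtered_base_yylex(), reduced to a toy lexer so the buffering
 * logic is easier to follow in isolation.  The token names (TOK_NOT,
 * TOK_IN, TOK_NOT_LA, ...) and toy_lex() are invented for this example and
 * do not correspond to real grammar symbols.
 */
#include <stdbool.h>
#include <stdio.h>

enum toy_token
{
	TOK_EOF,
	TOK_IDENT,
	TOK_NOT,
	TOK_IN,
	TOK_NOT_LA
};

static const enum toy_token toy_input[] =
	{TOK_IDENT, TOK_NOT, TOK_IN, TOK_IDENT, TOK_EOF};
static int	toy_pos;

/* toy stand-in for the base lexer */
static enum toy_token
toy_lex(void)
{
	return toy_input[toy_pos++];
}

static bool toy_have_lookahead;
static enum toy_token toy_lookahead;

/* filter layer: merge NOT followed by IN into a single NOT_LA token */
static enum toy_token
filtered_toy_lex(void)
{
	enum toy_token cur_token;

	/* Get next token --- we might already have it */
	if (toy_have_lookahead)
	{
		cur_token = toy_lookahead;
		toy_have_lookahead = false;
	}
	else
		cur_token = toy_lex();

	/* Only NOT needs lookahead in this toy grammar */
	if (cur_token == TOK_NOT)
	{
		/* Peek at the following token and keep it for the next call */
		toy_lookahead = toy_lex();
		toy_have_lookahead = true;
		if (toy_lookahead == TOK_IN)
			cur_token = TOK_NOT_LA;
	}

	return cur_token;
}

int
main(void)
{
	enum toy_token tok;

	/* prints tokens 1 (IDENT), 4 (NOT_LA), 3 (IN), 1 (IDENT) */
	while ((tok = filtered_toy_lex()) != TOK_EOF)
		printf("token %d\n", (int) tok);
	return 0;
}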
224 :
225 : /*
226 : * Call base_yylex() and fill in base_yylloc.
227 : *
228 : * pgc.l does not worry about setting yylloc, and given what we want for
229 : * that, trying to set it there would be pretty inconvenient. What we
230 : * want is: if the returned token has type <str>, then duplicate its
231 : * string value as yylloc; otherwise, make a downcased copy of yytext.
232 : * The downcasing is ASCII-only because all that we care about there
233 : * is producing uniformly-cased output of keywords. (That's mostly
234 : * cosmetic, but there are places in ecpglib that expect to receive
235 : * downcased keywords, plus it keeps us regression-test-compatible
236 : * with the pre-v18 implementation of ecpg.)
237 : */
238 : static int
239 0 : base_yylex_location(void)
240 : {
241 0 : int token = base_yylex();
242 :
243 0 : switch (token)
244 : {
245 : /* List a token here if pgc.l assigns to base_yylval.str for it */
246 : case Op:
247 : case CSTRING:
248 : case CPP_LINE:
249 : case CVARIABLE:
250 : case BCONST:
251 : case SCONST:
252 : case USCONST:
253 : case XCONST:
254 : case FCONST:
255 : case IDENT:
256 : case UIDENT:
257 : case IP:
258 : /* Duplicate the <str> value */
259 0 : base_yylloc = loc_strdup(base_yylval.str);
260 0 : break;
261 : default:
262 : /* Else just use the input, i.e., yytext */
263 0 : base_yylloc = loc_strdup(base_yytext);
264 : /* Apply an ASCII-only downcasing */
265 0 : for (unsigned char *ptr = (unsigned char *) base_yylloc; *ptr; ptr++)
266 : {
267 0 : if (*ptr >= 'A' && *ptr <= 'Z')
268 0 : *ptr += 'a' - 'A';
269 0 : }
270 0 : break;
271 : }
272 0 : return token;
273 0 : }
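
/*
 * Illustrative sketch (not part of parser.c): the ASCII-only downcasing
 * applied to keyword text above, extracted into a standalone program.
 * ascii_downcase() is an invented name for this example; the loop body
 * mirrors the one in base_yylex_location().
 */
#include <stdio.h>

static void
ascii_downcase(char *str)
{
	/* Shift only ASCII 'A'..'Z'; leave all other bytes untouched */
	for (unsigned char *ptr = (unsigned char *) str; *ptr; ptr++)
	{
		if (*ptr >= 'A' && *ptr <= 'Z')
			*ptr += 'a' - 'A';
	}
}

int
main(void)
{
	char		keyword[] = "SELECT";

	ascii_downcase(keyword);
	printf("%s\n", keyword);	/* prints "select" */
	return 0;
}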
274 :
275 : /*
276 : * check_uescapechar() and ecpg_isspace() should match their equivalents
277 : * in pgc.l.
278 : */
279 :
280 : /* is 'escape' acceptable as Unicode escape character (UESCAPE syntax) ? */
281 : static bool
282 0 : check_uescapechar(unsigned char escape)
283 : {
284 0 : if (isxdigit(escape)
285 0 : || escape == '+'
286 0 : || escape == '\''
287 0 : || escape == '"'
288 0 : || ecpg_isspace(escape))
289 0 : return false;
290 : else
291 0 : return true;
292 0 : }
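
/*
 * Illustrative sketch (not part of parser.c): the acceptance rule above in
 * action.  uescapechar_ok() is an invented name, and the standard isspace()
 * stands in for ecpg_isspace(); otherwise it restates check_uescapechar().
 * For instance, U&'d!0061t!+000061' UESCAPE '!' spells 'data', whereas
 * UESCAPE 'a' or UESCAPE '+' would be rejected.
 */
#include <assert.h>
#include <ctype.h>
#include <stdbool.h>

static bool
uescapechar_ok(unsigned char escape)
{
	/* reject hex digits, '+', quotes, and whitespace */
	return !(isxdigit(escape) || escape == '+' ||
			 escape == '\'' || escape == '"' || isspace(escape));
}

int
main(void)
{
	assert(uescapechar_ok('!'));	/* ordinary punctuation is acceptable */
	assert(!uescapechar_ok('a'));	/* hex digit */
	assert(!uescapechar_ok('+'));	/* '+' introduces 6-digit escape forms */
	assert(!uescapechar_ok(' '));	/* whitespace */
	return 0;
}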
293 :
294 : /*
295 : * ecpg_isspace() --- return true if flex scanner considers char whitespace
296 : */
297 : static bool
298 0 : ecpg_isspace(char ch)
299 : {
300 0 : if (ch == ' ' ||
301 0 : ch == '\t' ||
302 0 : ch == '\n' ||
303 0 : ch == '\r' ||
304 0 : ch == '\f')
305 0 : return true;
306 0 : return false;
307 0 : }