Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * unaccent.c
4 : * Text search unaccent dictionary
5 : *
6 : * Copyright (c) 2009-2026, PostgreSQL Global Development Group
7 : *
8 : * IDENTIFICATION
9 : * contrib/unaccent/unaccent.c
10 : *
11 : *-------------------------------------------------------------------------
12 : */
13 :
14 : #include "postgres.h"
15 :
16 : #include "catalog/pg_ts_dict.h"
17 : #include "commands/defrem.h"
18 : #include "lib/stringinfo.h"
19 : #include "tsearch/ts_cache.h"
20 : #include "tsearch/ts_locale.h"
21 : #include "tsearch/ts_public.h"
22 : #include "utils/builtins.h"
23 : #include "utils/lsyscache.h"
24 : #include "utils/syscache.h"
25 :
/* Module magic block: declares this loadable module's name and version. */
26 0 : PG_MODULE_MAGIC_EXT(
27 : .name = "unaccent",
28 : .version = PG_VERSION
29 : );
30 :
31 : /*
32 : * An unaccent dictionary uses a trie to find a string to replace. Each node
33 : * of the trie is an array of 256 TrieChar structs; the N-th element of the
34 : * array corresponds to next byte value N. That element can contain both a
35 : * replacement string (to be used if the source string ends with this byte)
36 : * and a link to another trie node (to be followed if there are more bytes).
37 : *
38 : * Note that the trie search logic pays no attention to multibyte character
39 : * boundaries. This is OK as long as both the data entered into the trie and
40 : * the data we're trying to look up are validly encoded; no partial-character
41 : * matches will occur.
42 : */
43 : typedef struct TrieChar
44 : {
45 : struct TrieChar *nextChar; /* child node to follow when more bytes remain, or NULL */
46 : char *replaceTo; /* replacement bytes if a source string ends here, or NULL; stored without a terminating NUL */
47 : int replacelen; /* length of replaceTo, in bytes */
48 : } TrieChar;
49 :
50 : /*
51 : * placeChar - put str into trie's structure, byte by byte.
52 : *
53 : * If node is NULL, we need to make a new node, which will be returned;
54 : * otherwise the return value is the same as node.
55 : */
56 : static TrieChar *
57 0 : placeChar(TrieChar *node, const unsigned char *str, int lenstr,
58 : const char *replaceTo, int replacelen)
59 : {
60 0 : TrieChar *curnode;
61 :
62 0 : if (!node)
63 0 : node = palloc0_array(TrieChar, 256);
64 :
65 0 : Assert(lenstr > 0); /* else str[0] doesn't exist */
66 :
67 0 : curnode = node + *str;
68 :
69 0 : if (lenstr <= 1)
70 : {
71 0 : if (curnode->replaceTo)
72 0 : ereport(WARNING,
73 : (errcode(ERRCODE_CONFIG_FILE_ERROR),
74 : errmsg("duplicate source strings, first one will be used")));
75 : else
76 : {
77 0 : curnode->replacelen = replacelen;
78 0 : curnode->replaceTo = (char *) palloc(replacelen);
79 0 : memcpy(curnode->replaceTo, replaceTo, replacelen);
80 : }
81 0 : }
82 : else
83 : {
84 0 : curnode->nextChar = placeChar(curnode->nextChar, str + 1, lenstr - 1,
85 0 : replaceTo, replacelen);
86 : }
87 :
88 0 : return node;
89 0 : }
90 :
91 : /*
92 : * initTrie - create trie from file.
93 : *
94 : * Function converts UTF8-encoded file into current encoding.
95 : */
96 : static TrieChar *
97 0 : initTrie(const char *filename)
98 : {
99 0 : TrieChar *volatile rootTrie = NULL;
100 0 : MemoryContext ccxt = CurrentMemoryContext;
101 0 : tsearch_readline_state trst;
102 0 : volatile bool skip;
103 :
104 0 : filename = get_tsearch_config_filename(filename, "rules");
105 0 : if (!tsearch_readline_begin(&trst, filename))
106 0 : ereport(ERROR,
107 : (errcode(ERRCODE_CONFIG_FILE_ERROR),
108 : errmsg("could not open unaccent file \"%s\": %m",
109 : filename)));
110 :
111 0 : do
112 : {
113 : /*
114 : * pg_do_encoding_conversion() (called by tsearch_readline()) will
115 : * emit exception if it finds untranslatable characters in current
116 : * locale. We just skip such lines, continuing with the next.
117 : */
118 0 : skip = true;
119 :
120 0 : PG_TRY();
121 : {
122 0 : char *line;
123 :
124 0 : while ((line = tsearch_readline(&trst)) != NULL)
125 : {
126 : /*----------
127 : * The format of each line must be "src" or "src trg", where
128 : * src and trg are sequences of one or more non-whitespace
129 : * characters, separated by whitespace. Whitespace at start
130 : * or end of line is ignored. If trg is omitted, an empty
131 : * string is used as the replacement. trg can be optionally
132 : * quoted, in which case whitespaces are included in it.
133 : *
134 : * We use a simple state machine, with states
135 : * 0 initial (before src)
136 : * 1 in src
137 : * 2 in whitespace after src
138 : * 3 in trg (non-quoted)
139 : * 4 in trg (quoted)
140 : * 5 in whitespace after trg
141 : * -1 syntax error detected (two strings)
142 : * -2 syntax error detected (unfinished quoted string)
143 : *----------
144 : */
145 0 : int state;
146 0 : char *ptr;
147 0 : char *src = NULL;
148 0 : char *trg = NULL;
149 0 : char *trgstore = NULL;
150 0 : int ptrlen;
151 0 : int srclen = 0;
152 0 : int trglen = 0;
153 0 : int trgstorelen = 0;
154 0 : bool trgquoted = false;
155 :
156 0 : state = 0;
157 0 : for (ptr = line; *ptr; ptr += ptrlen)
158 : {
159 0 : ptrlen = pg_mblen(ptr);
160 : /* ignore whitespace, but end src or trg */
161 0 : if (isspace((unsigned char) *ptr))
162 : {
163 0 : if (state == 1)
164 0 : state = 2;
165 0 : else if (state == 3)
166 0 : state = 5;
167 : /* whitespaces are OK in quoted area */
168 0 : if (state != 4)
169 0 : continue;
170 0 : }
171 0 : switch (state)
172 : {
173 : case 0:
174 : /* start of src */
175 0 : src = ptr;
176 0 : srclen = ptrlen;
177 0 : state = 1;
178 0 : break;
179 : case 1:
180 : /* continue src */
181 0 : srclen += ptrlen;
182 0 : break;
183 : case 2:
184 : /* start of trg */
185 0 : if (*ptr == '"')
186 : {
187 0 : trgquoted = true;
188 0 : state = 4;
189 0 : }
190 : else
191 0 : state = 3;
192 :
193 0 : trg = ptr;
194 0 : trglen = ptrlen;
195 0 : break;
196 : case 3:
197 : /* continue non-quoted trg */
198 0 : trglen += ptrlen;
199 0 : break;
200 : case 4:
201 : /* continue quoted trg */
202 0 : trglen += ptrlen;
203 :
204 : /*
205 : * If this is a quote, consider it as the end of
206 : * trg except if the follow-up character is itself
207 : * a quote.
208 : */
209 0 : if (*ptr == '"')
210 : {
211 0 : if (*(ptr + 1) == '"')
212 : {
213 0 : ptr++;
214 0 : trglen += 1;
215 0 : }
216 : else
217 0 : state = 5;
218 0 : }
219 0 : break;
220 : default:
221 : /* bogus line format */
222 0 : state = -1;
223 0 : break;
224 : }
225 0 : }
226 :
227 0 : if (state == 1 || state == 2)
228 : {
229 : /* trg was omitted, so use "" */
230 0 : trg = "";
231 0 : trglen = 0;
232 0 : }
233 :
234 : /* If still in a quoted area, fallback to an error */
235 0 : if (state == 4)
236 0 : state = -2;
237 :
238 : /* If trg was quoted, remove its quotes and unescape it */
239 0 : if (trgquoted && state > 0)
240 : {
241 : /* Ignore first and end quotes */
242 0 : trgstore = palloc_array(char, trglen - 2);
243 0 : trgstorelen = 0;
244 0 : for (int i = 1; i < trglen - 1; i++)
245 : {
246 0 : trgstore[trgstorelen] = trg[i];
247 0 : trgstorelen++;
248 : /* skip second double quotes */
249 0 : if (trg[i] == '"' && trg[i + 1] == '"')
250 0 : i++;
251 0 : }
252 0 : }
253 : else
254 : {
255 0 : trgstore = palloc_array(char, trglen);
256 0 : trgstorelen = trglen;
257 0 : memcpy(trgstore, trg, trgstorelen);
258 : }
259 :
260 0 : if (state > 0)
261 0 : rootTrie = placeChar(rootTrie,
262 0 : (unsigned char *) src, srclen,
263 0 : trgstore, trgstorelen);
264 0 : else if (state == -1)
265 0 : ereport(WARNING,
266 : (errcode(ERRCODE_CONFIG_FILE_ERROR),
267 : errmsg("invalid syntax: more than two strings in unaccent rule")));
268 0 : else if (state == -2)
269 0 : ereport(WARNING,
270 : (errcode(ERRCODE_CONFIG_FILE_ERROR),
271 : errmsg("invalid syntax: unfinished quoted string in unaccent rule")));
272 :
273 0 : pfree(trgstore);
274 0 : pfree(line);
275 0 : }
276 0 : skip = false;
277 0 : }
278 0 : PG_CATCH();
279 : {
280 0 : ErrorData *errdata;
281 0 : MemoryContext ecxt;
282 :
283 0 : ecxt = MemoryContextSwitchTo(ccxt);
284 0 : errdata = CopyErrorData();
285 0 : if (errdata->sqlerrcode == ERRCODE_UNTRANSLATABLE_CHARACTER)
286 : {
287 0 : FlushErrorState();
288 0 : }
289 : else
290 : {
291 0 : MemoryContextSwitchTo(ecxt);
292 0 : PG_RE_THROW();
293 : }
294 0 : }
295 0 : PG_END_TRY();
296 0 : }
297 0 : while (skip);
298 :
299 0 : tsearch_readline_end(&trst);
300 :
301 0 : return rootTrie;
302 0 : }
303 :
304 : /*
305 : * findReplaceTo - find longest possible match in trie
306 : *
307 : * On success, returns pointer to ending subnode, plus length of matched
308 : * source string in *p_matchlen. On failure, returns NULL.
309 : */
310 : static TrieChar *
311 0 : findReplaceTo(TrieChar *node, const unsigned char *src, int srclen,
312 : int *p_matchlen)
313 : {
314 0 : TrieChar *result = NULL;
315 0 : int matchlen = 0;
316 :
317 0 : *p_matchlen = 0; /* prevent uninitialized-variable warnings */
318 :
319 0 : while (node && matchlen < srclen)
320 : {
321 0 : node = node + src[matchlen];
322 0 : matchlen++;
323 :
324 0 : if (node->replaceTo)
325 : {
326 0 : result = node;
327 0 : *p_matchlen = matchlen;
328 0 : }
329 :
330 0 : node = node->nextChar;
331 : }
332 :
333 0 : return result;
334 0 : }
335 :
336 0 : PG_FUNCTION_INFO_V1(unaccent_init);
337 : Datum
338 0 : unaccent_init(PG_FUNCTION_ARGS)
339 : {
340 0 : List *dictoptions = (List *) PG_GETARG_POINTER(0);
341 0 : TrieChar *rootTrie = NULL;
342 0 : bool fileloaded = false;
343 0 : ListCell *l;
344 :
345 0 : foreach(l, dictoptions)
346 : {
347 0 : DefElem *defel = (DefElem *) lfirst(l);
348 :
349 0 : if (strcmp(defel->defname, "rules") == 0)
350 : {
351 0 : if (fileloaded)
352 0 : ereport(ERROR,
353 : (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
354 : errmsg("multiple Rules parameters")));
355 0 : rootTrie = initTrie(defGetString(defel));
356 0 : fileloaded = true;
357 0 : }
358 : else
359 : {
360 0 : ereport(ERROR,
361 : (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
362 : errmsg("unrecognized Unaccent parameter: \"%s\"",
363 : defel->defname)));
364 : }
365 0 : }
366 :
367 0 : if (!fileloaded)
368 : {
369 0 : ereport(ERROR,
370 : (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
371 : errmsg("missing Rules parameter")));
372 0 : }
373 :
374 0 : PG_RETURN_POINTER(rootTrie);
375 0 : }
376 :
377 0 : PG_FUNCTION_INFO_V1(unaccent_lexize);
378 : Datum
379 0 : unaccent_lexize(PG_FUNCTION_ARGS)
380 : {
381 0 : TrieChar *rootTrie = (TrieChar *) PG_GETARG_POINTER(0);
382 0 : char *srcchar = (char *) PG_GETARG_POINTER(1);
383 0 : int32 len = PG_GETARG_INT32(2);
384 0 : char *srcstart = srcchar;
385 0 : TSLexeme *res;
386 0 : StringInfoData buf;
387 :
388 : /* we allocate storage for the buffer only if needed */
389 0 : buf.data = NULL;
390 :
391 0 : while (len > 0)
392 : {
393 0 : TrieChar *node;
394 0 : int matchlen;
395 :
396 0 : node = findReplaceTo(rootTrie, (unsigned char *) srcchar, len,
397 : &matchlen);
398 0 : if (node && node->replaceTo)
399 : {
400 0 : if (buf.data == NULL)
401 : {
402 : /* initialize buffer */
403 0 : initStringInfo(&buf);
404 : /* insert any data we already skipped over */
405 0 : if (srcchar != srcstart)
406 0 : appendBinaryStringInfo(&buf, srcstart, srcchar - srcstart);
407 0 : }
408 0 : appendBinaryStringInfo(&buf, node->replaceTo, node->replacelen);
409 0 : }
410 : else
411 : {
412 0 : matchlen = pg_mblen(srcchar);
413 0 : if (buf.data != NULL)
414 0 : appendBinaryStringInfo(&buf, srcchar, matchlen);
415 : }
416 :
417 0 : srcchar += matchlen;
418 0 : len -= matchlen;
419 0 : }
420 :
421 : /* return a result only if we made at least one substitution */
422 0 : if (buf.data != NULL)
423 : {
424 0 : res = palloc0_array(TSLexeme, 2);
425 0 : res->lexeme = buf.data;
426 0 : res->flags = TSL_FILTER;
427 0 : }
428 : else
429 0 : res = NULL;
430 :
431 0 : PG_RETURN_POINTER(res);
432 0 : }
433 :
434 : /*
435 : * Function-like wrapper for dictionary
436 : */
437 0 : PG_FUNCTION_INFO_V1(unaccent_dict);
438 : Datum
439 0 : unaccent_dict(PG_FUNCTION_ARGS)
440 : {
441 0 : text *str;
442 0 : int strArg;
443 0 : Oid dictOid;
444 0 : TSDictionaryCacheEntry *dict;
445 0 : TSLexeme *res;
446 :
447 0 : if (PG_NARGS() == 1)
448 : {
449 : /*
450 : * Use the "unaccent" dictionary that is in the same schema that this
451 : * function is in.
452 : */
453 0 : Oid procnspid = get_func_namespace(fcinfo->flinfo->fn_oid);
454 0 : const char *dictname = "unaccent";
455 :
456 0 : dictOid = GetSysCacheOid2(TSDICTNAMENSP, Anum_pg_ts_dict_oid,
457 : PointerGetDatum(dictname),
458 : ObjectIdGetDatum(procnspid));
459 0 : if (!OidIsValid(dictOid))
460 0 : ereport(ERROR,
461 : (errcode(ERRCODE_UNDEFINED_OBJECT),
462 : errmsg("text search dictionary \"%s.%s\" does not exist",
463 : get_namespace_name(procnspid), dictname)));
464 0 : strArg = 0;
465 0 : }
466 : else
467 : {
468 0 : dictOid = PG_GETARG_OID(0);
469 0 : strArg = 1;
470 : }
471 0 : str = PG_GETARG_TEXT_PP(strArg);
472 :
473 0 : dict = lookup_ts_dictionary_cache(dictOid);
474 :
475 0 : res = (TSLexeme *) DatumGetPointer(FunctionCall4(&(dict->lexize),
476 : PointerGetDatum(dict->dictData),
477 : PointerGetDatum(VARDATA_ANY(str)),
478 : Int32GetDatum(VARSIZE_ANY_EXHDR(str)),
479 : PointerGetDatum(NULL)));
480 :
481 0 : PG_FREE_IF_COPY(str, strArg);
482 :
483 0 : if (res == NULL)
484 : {
485 0 : PG_RETURN_TEXT_P(PG_GETARG_TEXT_P_COPY(strArg));
486 : }
487 0 : else if (res->lexeme == NULL)
488 : {
489 0 : pfree(res);
490 0 : PG_RETURN_TEXT_P(PG_GETARG_TEXT_P_COPY(strArg));
491 : }
492 : else
493 : {
494 0 : text *txt = cstring_to_text(res->lexeme);
495 :
496 0 : pfree(res->lexeme);
497 0 : pfree(res);
498 :
499 0 : PG_RETURN_TEXT_P(txt);
500 0 : }
501 0 : }
|