LCOV - Code coverage - contrib/unaccent/unaccent.c

LCOV - code coverage report

Current view:	top level - contrib/unaccent - unaccent.c (source / functions)		Coverage	Total	Hit
Test:	Code coverage	Lines:	0.0 %	232	0
Test Date:	2026-01-26 10:56:24	Functions:	0.0 %	10	0
Legend:	Lines: hit not hit

            Line data    Source code

       1              : /*-------------------------------------------------------------------------
       2              :  *
       3              :  * unaccent.c
       4              :  *        Text search unaccent dictionary
       5              :  *
       6              :  * Copyright (c) 2009-2026, PostgreSQL Global Development Group
       7              :  *
       8              :  * IDENTIFICATION
       9              :  *        contrib/unaccent/unaccent.c
      10              :  *
      11              :  *-------------------------------------------------------------------------
      12              :  */
      13              : 
      14              : #include "postgres.h"
      15              : 
      16              : #include "catalog/pg_ts_dict.h"
      17              : #include "commands/defrem.h"
      18              : #include "lib/stringinfo.h"
      19              : #include "tsearch/ts_cache.h"
      20              : #include "tsearch/ts_locale.h"
      21              : #include "tsearch/ts_public.h"
      22              : #include "utils/builtins.h"
      23              : #include "utils/lsyscache.h"
      24              : #include "utils/syscache.h"
      25              : 
      26            0 : PG_MODULE_MAGIC_EXT(                /* module declaration macro; given this module's name and PG_VERSION as its version */
      27              :                                         .name = "unaccent",
      28              :                                         .version = PG_VERSION
      29              : );
      30              : 
      31              : /*
      32              :  * An unaccent dictionary uses a trie to find a string to replace.  Each node
      33              :  * of the trie is an array of 256 TrieChar structs; the N-th element of the
      34              :  * array corresponds to next byte value N.  That element can contain both a
      35              :  * replacement string (to be used if the source string ends with this byte)
      36              :  * and a link to another trie node (to be followed if there are more bytes).
      37              :  *
      38              :  * Note that the trie search logic pays no attention to multibyte character
      39              :  * boundaries.  This is OK as long as both the data entered into the trie and
      40              :  * the data we're trying to look up are validly encoded; no partial-character
      41              :  * matches will occur.
      42              :  */
      43              : typedef struct TrieChar
      44              : {
      45              :         struct TrieChar *nextChar;        /* child node (256-entry array) for the following byte, or NULL */
      46              :         char       *replaceTo;        /* replacement bytes if a source string ends here, else NULL; stored without a terminating NUL */
      47              :         int                     replacelen;        /* length in bytes of replaceTo */
      48              : } TrieChar;
      50              : /*
      51              :  * placeChar - put str into trie's structure, byte by byte.
      52              :  *
      53              :  * If node is NULL, we need to make a new node, which will be returned;
      54              :  * otherwise the return value is the same as node.
                      :  *
                      :  * node:       trie node (array of 256 TrieChar) to insert into, or NULL
                      :  * str:        remaining bytes of the source string; str[0] selects the slot
                      :  * lenstr:     number of bytes left in str, must be > 0
                      :  * replaceTo:  replacement bytes, copied into the trie (replacelen bytes,
                      :  *             no terminating NUL is added)
                      :  * replacelen: length of replaceTo in bytes
                      :  *
                      :  * The function recurses once per source byte.  When the last byte's slot
                      :  * already holds a replacement, a WARNING is emitted and the existing
                      :  * replacement is left untouched.
      55              :  */
      56              : static TrieChar *
      57            0 : placeChar(TrieChar *node, const unsigned char *str, int lenstr,
      58              :                   const char *replaceTo, int replacelen)
      59              : {
      60            0 :         TrieChar   *curnode;
      61              : 
      62            0 :         if (!node)
      63            0 :                 node = palloc0_array(TrieChar, 256);        /* new node; the code below relies on its slots starting out NULL */
      64              : 
      65            0 :         Assert(lenstr > 0);                  /* else str[0] doesn't exist */
      66              : 
      67            0 :         curnode = node + *str;        /* slot indexed by the current byte value */
      68              : 
      69            0 :         if (lenstr <= 1)
      70              :         {
                      :                 /* last byte of the source string: this slot carries the replacement */
      71            0 :                 if (curnode->replaceTo)
      72            0 :                         ereport(WARNING,
      73              :                                         (errcode(ERRCODE_CONFIG_FILE_ERROR),
      74              :                                          errmsg("duplicate source strings, first one will be used")));
      75              :                 else
      76              :                 {
      77            0 :                         curnode->replacelen = replacelen;
      78            0 :                         curnode->replaceTo = (char *) palloc(replacelen);
      79            0 :                         memcpy(curnode->replaceTo, replaceTo, replacelen);
      80              :                 }
      81            0 :         }
      82              :         else
      83              :         {
                      :                 /* more bytes follow: descend, creating the child node if it is missing */
      84            0 :                 curnode->nextChar = placeChar(curnode->nextChar, str + 1, lenstr - 1,
      85            0 :                                                                           replaceTo, replacelen);
      86              :         }
      87              : 
      88            0 :         return node;
      89            0 : }
      90              : 
      91              : /*
      92              :  * initTrie  - create trie from file.
      93              :  *
      94              :  * Function converts UTF8-encoded file into current encoding.
                      :  *
                      :  * filename: rules file name as given in the dictionary options; it is
                      :  *           resolved through get_tsearch_config_filename(filename, "rules").
                      :  *
                      :  * Returns the root node of the trie, or NULL if no rule was placed.
                      :  * Raises ERROR if the file cannot be opened.  Syntactically invalid lines
                      :  * produce a WARNING and are not added.  An error with sqlerrcode
                      :  * ERRCODE_UNTRANSLATABLE_CHARACTER raised while reading is swallowed and
                      :  * reading resumes; any other error is re-thrown.
      95              :  */
      96              : static TrieChar *
      97            0 : initTrie(const char *filename)
      98              : {
      99            0 :         TrieChar   *volatile rootTrie = NULL;
     100            0 :         MemoryContext ccxt = CurrentMemoryContext;        /* switched back to inside PG_CATCH */
     101            0 :         tsearch_readline_state trst;
     102            0 :         volatile bool skip;        /* true until a read pass finishes without an error */
     103              : 
     104            0 :         filename = get_tsearch_config_filename(filename, "rules");
     105            0 :         if (!tsearch_readline_begin(&trst, filename))
     106            0 :                 ereport(ERROR,
     107              :                                 (errcode(ERRCODE_CONFIG_FILE_ERROR),
     108              :                                  errmsg("could not open unaccent file \"%s\": %m",
     109              :                                                 filename)));
     110              : 
     111            0 :         do
     112              :         {
     113              :                 /*
     114              :                  * pg_do_encoding_conversion() (called by tsearch_readline()) will
     115              :                  * emit exception if it finds untranslatable characters in current
     116              :                  * locale. We just skip such lines, continuing with the next.
     117              :                  */
     118            0 :                 skip = true;
     119              : 
     120            0 :                 PG_TRY();
     121              :                 {
     122            0 :                         char       *line;
     123              : 
     124            0 :                         while ((line = tsearch_readline(&trst)) != NULL)
     125              :                         {
     126              :                                 /*----------
     127              :                                  * The format of each line must be "src" or "src trg", where
     128              :                                  * src and trg are sequences of one or more non-whitespace
     129              :                                  * characters, separated by whitespace.  Whitespace at start
     130              :                                  * or end of line is ignored.  If trg is omitted, an empty
     131              :                                  * string is used as the replacement.  trg can be optionally
     132              :                                  * quoted, in which case whitespaces are included in it.
     133              :                                  *
     134              :                                  * We use a simple state machine, with states
     135              :                                  *      0       initial (before src)
     136              :                                  *      1       in src
     137              :                                  *      2       in whitespace after src
     138              :                                  *      3       in trg (non-quoted)
     139              :                                  *      4       in trg (quoted)
     140              :                                  *      5       in whitespace after trg
     141              :                                  *      -1      syntax error detected (two strings)
     142              :                                  *      -2      syntax error detected (unfinished quoted string)
     143              :                                  *----------
     144              :                                  */
     145            0 :                                 int                     state;
     146            0 :                                 char       *ptr;
     147            0 :                                 char       *src = NULL;        /* start of src within line */
     148            0 :                                 char       *trg = NULL;        /* start of trg within line (includes the opening quote if quoted) */
     149            0 :                                 char       *trgstore = NULL;        /* palloc'd copy of trg handed to placeChar() */
     150            0 :                                 int                     ptrlen;        /* byte length of the current character, from pg_mblen() */
     151            0 :                                 int                     srclen = 0;
     152            0 :                                 int                     trglen = 0;
     153            0 :                                 int                     trgstorelen = 0;
     154            0 :                                 bool            trgquoted = false;
     155              : 
     156            0 :                                 state = 0;
     157            0 :                                 for (ptr = line; *ptr; ptr += ptrlen)
     158              :                                 {
     159            0 :                                         ptrlen = pg_mblen(ptr);
     160              :                                         /* ignore whitespace, but end src or trg */
     161            0 :                                         if (isspace((unsigned char) *ptr))
     162              :                                         {
     163            0 :                                                 if (state == 1)
     164            0 :                                                         state = 2;
     165            0 :                                                 else if (state == 3)
     166            0 :                                                         state = 5;
     167              :                                                 /* whitespaces are OK in quoted area */
     168            0 :                                                 if (state != 4)
     169            0 :                                                         continue;
     170            0 :                                         }
     171            0 :                                         switch (state)
     172              :                                         {
     173              :                                                 case 0:
     174              :                                                         /* start of src */
     175            0 :                                                         src = ptr;
     176            0 :                                                         srclen = ptrlen;
     177            0 :                                                         state = 1;
     178            0 :                                                         break;
     179              :                                                 case 1:
     180              :                                                         /* continue src */
     181            0 :                                                         srclen += ptrlen;
     182            0 :                                                         break;
     183              :                                                 case 2:
     184              :                                                         /* start of trg */
     185            0 :                                                         if (*ptr == '"')
     186              :                                                         {
     187            0 :                                                                 trgquoted = true;
     188            0 :                                                                 state = 4;
     189            0 :                                                         }
     190              :                                                         else
     191            0 :                                                                 state = 3;
     192              : 
     193            0 :                                                         trg = ptr;
     194            0 :                                                         trglen = ptrlen;
     195            0 :                                                         break;
     196              :                                                 case 3:
     197              :                                                         /* continue non-quoted trg */
     198            0 :                                                         trglen += ptrlen;
     199            0 :                                                         break;
     200              :                                                 case 4:
     201              :                                                         /* continue quoted trg */
     202            0 :                                                         trglen += ptrlen;
     203              : 
     204              :                                                         /*
     205              :                                                          * If this is a quote, consider it as the end of
     206              :                                                          * trg except if the follow-up character is itself
     207              :                                                          * a quote.
     208              :                                                          */
     209            0 :                                                         if (*ptr == '"')
     210              :                                                         {
     211            0 :                                                                 if (*(ptr + 1) == '"')
     212              :                                                                 {
     213            0 :                                                                         ptr++;        /* step onto the second quote; the loop increment then moves past it */
     214            0 :                                                                         trglen += 1;
     215            0 :                                                                 }
     216              :                                                                 else
     217            0 :                                                                         state = 5;
     218            0 :                                                         }
     219            0 :                                                         break;
     220              :                                                 default:
     221              :                                                         /* bogus line format */
     222            0 :                                                         state = -1;
     223            0 :                                                         break;
     224              :                                         }
     225            0 :                                 }
     226              : 
     227            0 :                                 if (state == 1 || state == 2)
     228              :                                 {
     229              :                                         /* trg was omitted, so use "" */
     230            0 :                                         trg = "";
     231            0 :                                         trglen = 0;
     232            0 :                                 }
     233              : 
     234              :                                 /* If still in a quoted area, fallback to an error */
     235            0 :                                 if (state == 4)
     236            0 :                                         state = -2;
     237              : 
     238              :                                 /* If trg was quoted, remove its quotes and unescape it */
     239            0 :                                 if (trgquoted && state > 0)
     240              :                                 {
     241              :                                         /* Ignore first and end quotes */
     242            0 :                                         trgstore = palloc_array(char, trglen - 2);
     243            0 :                                         trgstorelen = 0;
     244            0 :                                         for (int i = 1; i < trglen - 1; i++)
     245              :                                         {
     246            0 :                                                 trgstore[trgstorelen] = trg[i];
     247            0 :                                                 trgstorelen++;
     248              :                                                 /* skip second double quotes */
     249            0 :                                                 if (trg[i] == '"' && trg[i + 1] == '"')
     250            0 :                                                         i++;
     251            0 :                                         }
     252            0 :                                 }
     253              :                                 else
     254              :                                 {
                      :                                         /*
                      :                                          * Unquoted trg, or a line that will be rejected below:
                      :                                          * copy trg verbatim so the pfree() at the bottom of
                      :                                          * the loop always has something to release.
                      :                                          *
                      :                                          * NOTE(review): when the line contained no src at all
                      :                                          * (state is still 0), trg is NULL and trglen is 0, so
                      :                                          * memcpy() gets a NULL source with length 0 - confirm
                      :                                          * this is acceptable on all supported platforms.
                      :                                          */
     255            0 :                                         trgstore = palloc_array(char, trglen);
     256            0 :                                         trgstorelen = trglen;
     257            0 :                                         memcpy(trgstore, trg, trgstorelen);
     258              :                                 }
     259              : 
     260            0 :                                 if (state > 0)
     261            0 :                                         rootTrie = placeChar(rootTrie,
     262            0 :                                                                                  (unsigned char *) src, srclen,
     263            0 :                                                                                  trgstore, trgstorelen);
     264            0 :                                 else if (state == -1)
     265            0 :                                         ereport(WARNING,
     266              :                                                         (errcode(ERRCODE_CONFIG_FILE_ERROR),
     267              :                                                          errmsg("invalid syntax: more than two strings in unaccent rule")));
     268            0 :                                 else if (state == -2)
     269            0 :                                         ereport(WARNING,
     270              :                                                         (errcode(ERRCODE_CONFIG_FILE_ERROR),
     271              :                                                          errmsg("invalid syntax: unfinished quoted string in unaccent rule")));
     272              : 
     273            0 :                                 pfree(trgstore);        /* placeChar() stored its own copy of the bytes */
     274            0 :                                 pfree(line);
     275            0 :                         }
     276            0 :                         skip = false;        /* tsearch_readline() returned NULL without an error: leave the retry loop */
     277            0 :                 }
     278            0 :                 PG_CATCH();
     279              :                 {
     280            0 :                         ErrorData  *errdata;
     281            0 :                         MemoryContext ecxt;
     282              : 
     283            0 :                         ecxt = MemoryContextSwitchTo(ccxt);
     284            0 :                         errdata = CopyErrorData();
     285            0 :                         if (errdata->sqlerrcode == ERRCODE_UNTRANSLATABLE_CHARACTER)
     286              :                         {
                      :                                 /*
                      :                                  * Swallow the error; skip is still true, so the outer
                      :                                  * loop runs another read pass on the same trst.
                      :                                  *
                      :                                  * NOTE(review): the copied errdata is not released on
                      :                                  * this path - confirm the leak into ccxt is intended.
                      :                                  */
     287            0 :                                 FlushErrorState();
     288            0 :                         }
     289              :                         else
     290              :                         {
     291            0 :                                 MemoryContextSwitchTo(ecxt);
     292            0 :                                 PG_RE_THROW();
     293              :                         }
     294            0 :                 }
     295            0 :                 PG_END_TRY();
     296            0 :         }
     297            0 :         while (skip);
     298              : 
     299            0 :         tsearch_readline_end(&trst);
     300              : 
     301            0 :         return rootTrie;
     302            0 : }
     303              : 
     304              : /*
     305              :  * findReplaceTo - find longest possible match in trie
     306              :  *
     307              :  * On success, returns pointer to ending subnode, plus length of matched
     308              :  * source string in *p_matchlen.  On failure, returns NULL.
                      :  *
                      :  * node:       root node of the trie; may be NULL, in which case nothing
                      :  *             matches
                      :  * src:        bytes to look up, starting at the position to be matched
                      :  * srclen:     number of bytes available at src
                      :  * p_matchlen: output; set to 0 on entry and updated each time a longer
                      :  *             match is found
                      :  *
                      :  * The walk continues past a match as long as child nodes exist, so the
                      :  * last slot seen with a replacement (the longest match) is the one returned.
     309              :  */
     310              : static TrieChar *
     311            0 : findReplaceTo(TrieChar *node, const unsigned char *src, int srclen,
     312              :                           int *p_matchlen)
     313              : {
     314            0 :         TrieChar   *result = NULL;
     315            0 :         int                     matchlen = 0;        /* number of source bytes consumed so far */
     316              : 
     317            0 :         *p_matchlen = 0;                        /* prevent uninitialized-variable warnings */
     318              : 
     319            0 :         while (node && matchlen < srclen)
     320              :         {
     321            0 :                 node = node + src[matchlen];        /* slot for the current byte within this node */
     322            0 :                 matchlen++;
     323              : 
     324            0 :                 if (node->replaceTo)
     325              :                 {
                      :                         /* a source string ends here: remember it as the best match so far */
     326            0 :                         result = node;
     327            0 :                         *p_matchlen = matchlen;
     328            0 :                 }
     329              : 
     330            0 :                 node = node->nextChar;        /* NULL here ends the walk */
     331              :         }
     332              : 
     333            0 :         return result;
     334            0 : }
     335              : /*
                      :  * unaccent_init - build the dictionary's trie from its options
                      :  *
                      :  * Argument 0 is a List of DefElem options.  The only option accepted is
                      :  * "rules" (compared with strcmp), whose string value is passed to
                      :  * initTrie().  Raises ERROR if "rules" is given more than once, if any
                      :  * other option is present, or if "rules" is missing.
                      :  *
                      :  * Returns the trie root as a pointer Datum; it may be NULL if initTrie()
                      :  * placed no rules.
                      :  */
     336            0 : PG_FUNCTION_INFO_V1(unaccent_init);
     337              : Datum
     338            0 : unaccent_init(PG_FUNCTION_ARGS)
     339              : {
     340            0 :         List       *dictoptions = (List *) PG_GETARG_POINTER(0);
     341            0 :         TrieChar   *rootTrie = NULL;
     342            0 :         bool            fileloaded = false;        /* set once a "rules" option has been processed */
     343            0 :         ListCell   *l;
     344              : 
     345            0 :         foreach(l, dictoptions)
     346              :         {
     347            0 :                 DefElem    *defel = (DefElem *) lfirst(l);
     348              : 
     349            0 :                 if (strcmp(defel->defname, "rules") == 0)
     350              :                 {
     351            0 :                         if (fileloaded)
     352            0 :                                 ereport(ERROR,
     353              :                                                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
     354              :                                                  errmsg("multiple Rules parameters")));
     355            0 :                         rootTrie = initTrie(defGetString(defel));
     356            0 :                         fileloaded = true;
     357            0 :                 }
     358              :                 else
     359              :                 {
     360            0 :                         ereport(ERROR,
     361              :                                         (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
     362              :                                          errmsg("unrecognized Unaccent parameter: \"%s\"",
     363              :                                                         defel->defname)));
     364              :                 }
     365            0 :         }
     366              : 
     367            0 :         if (!fileloaded)
     368              :         {
     369            0 :                 ereport(ERROR,
     370              :                                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
     371              :                                  errmsg("missing Rules parameter")));
     372            0 :         }
     373              : 
     374            0 :         PG_RETURN_POINTER(rootTrie);
     375            0 : }
     376              : /*
                      :  * unaccent_lexize - apply the trie's replacements to one input string
                      :  *
                      :  * Argument 0 is the trie root, argument 1 a pointer to the input bytes and
                      :  * argument 2 the input length (int32).
                      :  *
                      :  * The input is scanned left to right.  At each position the longest trie
                      :  * match is replaced; when nothing matches, one character (as measured by
                      :  * pg_mblen) is passed through unchanged.  The output buffer is created
                      :  * only on the first replacement, at which point the bytes skipped so far
                      :  * are copied into it.
                      :  *
                      :  * Returns NULL if no replacement was made.  Otherwise returns a
                      :  * zero-initialized two-element TSLexeme array whose first element holds
                      :  * the rewritten string and the TSL_FILTER flag.
                      :  */
     377            0 : PG_FUNCTION_INFO_V1(unaccent_lexize);
     378              : Datum
     379            0 : unaccent_lexize(PG_FUNCTION_ARGS)
     380              : {
     381            0 :         TrieChar   *rootTrie = (TrieChar *) PG_GETARG_POINTER(0);
     382            0 :         char       *srcchar = (char *) PG_GETARG_POINTER(1);        /* current read position */
     383            0 :         int32           len = PG_GETARG_INT32(2);        /* bytes remaining from srcchar */
     384            0 :         char       *srcstart = srcchar;        /* start of input, for copying the skipped prefix */
     385            0 :         TSLexeme   *res;
     386            0 :         StringInfoData buf;
     387              : 
     388              :         /* we allocate storage for the buffer only if needed */
     389            0 :         buf.data = NULL;
     390              : 
     391            0 :         while (len > 0)
     392              :         {
     393            0 :                 TrieChar   *node;
     394            0 :                 int                     matchlen;
     395              : 
     396            0 :                 node = findReplaceTo(rootTrie, (unsigned char *) srcchar, len,
     397              :                                                          &matchlen);
     398            0 :                 if (node && node->replaceTo)
     399              :                 {
     400            0 :                         if (buf.data == NULL)
     401              :                         {
     402              :                                 /* initialize buffer */
     403            0 :                                 initStringInfo(&buf);
     404              :                                 /* insert any data we already skipped over */
     405            0 :                                 if (srcchar != srcstart)
     406            0 :                                         appendBinaryStringInfo(&buf, srcstart, srcchar - srcstart);
     407            0 :                         }
     408            0 :                         appendBinaryStringInfo(&buf, node->replaceTo, node->replacelen);
     409            0 :                 }
     410              :                 else
     411              :                 {
                      :                         /*
                      :                          * No match here: pass one character through unchanged.
                      :                          *
                      :                          * NOTE(review): the pg_mblen() result is not checked against
                      :                          * the remaining len - presumably the input is validly
                      :                          * encoded; confirm against callers.
                      :                          */
     412            0 :                         matchlen = pg_mblen(srcchar);
     413            0 :                         if (buf.data != NULL)
     414            0 :                                 appendBinaryStringInfo(&buf, srcchar, matchlen);
     415              :                 }
     416              : 
     417            0 :                 srcchar += matchlen;
     418            0 :                 len -= matchlen;
     419            0 :         }
     420              : 
     421              :         /* return a result only if we made at least one substitution */
     422            0 :         if (buf.data != NULL)
     423              :         {
     424            0 :                 res = palloc0_array(TSLexeme, 2);        /* only element 0 is filled in below */
     425            0 :                 res->lexeme = buf.data;
     426            0 :                 res->flags = TSL_FILTER;
     427            0 :         }
     428              :         else
     429            0 :                 res = NULL;
     430              : 
     431            0 :         PG_RETURN_POINTER(res);
     432            0 : }
     433              : 
     434              : /*
     435              :  * Function-like wrapper for dictionary
                      :  *
                      :  * Called with one argument (the text to process) or two (dictionary OID,
                      :  * then the text).  In the one-argument form the dictionary named
                      :  * "unaccent" is looked up in the schema this function belongs to, and an
                      :  * ERROR is raised if it does not exist.
                      :  *
                      :  * The dictionary's lexize function is invoked on the text's payload bytes.
                      :  * If it returns NULL, or a result whose lexeme is NULL, a copy of the input
                      :  * text is returned; otherwise the returned lexeme is converted to text.
     436              :  */
     437            0 : PG_FUNCTION_INFO_V1(unaccent_dict);
     438              : Datum
     439            0 : unaccent_dict(PG_FUNCTION_ARGS)
     440              : {
     441            0 :         text       *str;
     442            0 :         int                     strArg;        /* index of the text argument: 0 or 1 */
     443            0 :         Oid                     dictOid;
     444            0 :         TSDictionaryCacheEntry *dict;
     445            0 :         TSLexeme   *res;
     446              : 
     447            0 :         if (PG_NARGS() == 1)
     448              :         {
     449              :                 /*
     450              :                  * Use the "unaccent" dictionary that is in the same schema that this
     451              :                  * function is in.
     452              :                  */
     453            0 :                 Oid                     procnspid = get_func_namespace(fcinfo->flinfo->fn_oid);
     454            0 :                 const char *dictname = "unaccent";
     455              : 
     456            0 :                 dictOid = GetSysCacheOid2(TSDICTNAMENSP, Anum_pg_ts_dict_oid,
     457              :                                                                   PointerGetDatum(dictname),
     458              :                                                                   ObjectIdGetDatum(procnspid));
     459            0 :                 if (!OidIsValid(dictOid))
     460            0 :                         ereport(ERROR,
     461              :                                         (errcode(ERRCODE_UNDEFINED_OBJECT),
     462              :                                          errmsg("text search dictionary \"%s.%s\" does not exist",
     463              :                                                         get_namespace_name(procnspid), dictname)));
     464            0 :                 strArg = 0;
     465            0 :         }
     466              :         else
     467              :         {
     468            0 :                 dictOid = PG_GETARG_OID(0);
     469            0 :                 strArg = 1;
     470              :         }
     471            0 :         str = PG_GETARG_TEXT_PP(strArg);
     472              : 
     473            0 :         dict = lookup_ts_dictionary_cache(dictOid);
     474              : 
                      :         /* lexize(dictData, payload bytes, payload length, NULL) */
     475            0 :         res = (TSLexeme *) DatumGetPointer(FunctionCall4(&(dict->lexize),
     476              :                                                                                                          PointerGetDatum(dict->dictData),
     477              :                                                                                                          PointerGetDatum(VARDATA_ANY(str)),
     478              :                                                                                                          Int32GetDatum(VARSIZE_ANY_EXHDR(str)),
     479              :                                                                                                          PointerGetDatum(NULL)));
     480              : 
     481            0 :         PG_FREE_IF_COPY(str, strArg);        /* str is not used after this; the unchanged paths re-fetch the argument */
     482              : 
     483            0 :         if (res == NULL)
     484              :         {
                      :                 /* no result from lexize: hand back a copy of the input */
     485            0 :                 PG_RETURN_TEXT_P(PG_GETARG_TEXT_P_COPY(strArg));
     486              :         }
     487            0 :         else if (res->lexeme == NULL)
     488              :         {
                      :                 /* result without a lexeme: release it and return the input copy */
     489            0 :                 pfree(res);
     490            0 :                 PG_RETURN_TEXT_P(PG_GETARG_TEXT_P_COPY(strArg));
     491              :         }
     492              :         else
     493              :         {
     494            0 :                 text       *txt = cstring_to_text(res->lexeme);
     495              : 
     496            0 :                 pfree(res->lexeme);
     497            0 :                 pfree(res);
     498              : 
     499            0 :                 PG_RETURN_TEXT_P(txt);
     500            0 :         }
     501            0 : }

Generated by: LCOV version 2.3.2-1