LCOV - Code coverage - src/backend/snowball/dict

LCOV - code coverage report

Current view:	top level - src/backend/snowball - dict_snowball.c (source / functions)		Coverage	Total	Hit
Test:	Code coverage	Lines:	68.9 %	90	62
Test Date:	2026-01-26 10:56:24	Functions:	100.0 %	6	6
Legend:	Lines: hit not hit Branches: + taken - not taken # not executed	Branches:	35.3 %	68	24

             Branch data     Line data    Source code

       1                 :             : /*-------------------------------------------------------------------------
       2                 :             :  *
       3                 :             :  * dict_snowball.c
       4                 :             :  *              Snowball dictionary
       5                 :             :  *
       6                 :             :  * Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group
       7                 :             :  *
       8                 :             :  * IDENTIFICATION
       9                 :             :  *        src/backend/snowball/dict_snowball.c
      10                 :             :  *
      11                 :             :  *-------------------------------------------------------------------------
      12                 :             :  */
      13                 :             : #include "postgres.h"
      14                 :             : 
      15                 :             : #include "catalog/pg_collation_d.h"
      16                 :             : #include "commands/defrem.h"
      17                 :             : #include "mb/pg_wchar.h"
      18                 :             : #include "tsearch/ts_public.h"
      19                 :             : #include "utils/formatting.h"
      20                 :             : 
      21                 :             : /* Some platforms define MAXINT and/or MININT, causing conflicts */
      22                 :             : #ifdef MAXINT
      23                 :             : #undef MAXINT
      24                 :             : #endif
      25                 :             : #ifdef MININT
      26                 :             : #undef MININT
      27                 :             : #endif
      28                 :             : 
      29                 :             : /* Now we can include the original Snowball snowball_runtime.h */
      30                 :             : #include "snowball/libstemmer/snowball_runtime.h"
      31                 :             : #include "snowball/libstemmer/stem_ISO_8859_1_basque.h"
      32                 :             : #include "snowball/libstemmer/stem_ISO_8859_1_catalan.h"
      33                 :             : #include "snowball/libstemmer/stem_ISO_8859_1_danish.h"
      34                 :             : #include "snowball/libstemmer/stem_ISO_8859_1_dutch.h"
      35                 :             : #include "snowball/libstemmer/stem_ISO_8859_1_dutch_porter.h"
      36                 :             : #include "snowball/libstemmer/stem_ISO_8859_1_english.h"
      37                 :             : #include "snowball/libstemmer/stem_ISO_8859_1_finnish.h"
      38                 :             : #include "snowball/libstemmer/stem_ISO_8859_1_french.h"
      39                 :             : #include "snowball/libstemmer/stem_ISO_8859_1_german.h"
      40                 :             : #include "snowball/libstemmer/stem_ISO_8859_1_indonesian.h"
      41                 :             : #include "snowball/libstemmer/stem_ISO_8859_1_irish.h"
      42                 :             : #include "snowball/libstemmer/stem_ISO_8859_1_italian.h"
      43                 :             : #include "snowball/libstemmer/stem_ISO_8859_1_norwegian.h"
      44                 :             : #include "snowball/libstemmer/stem_ISO_8859_1_porter.h"
      45                 :             : #include "snowball/libstemmer/stem_ISO_8859_1_portuguese.h"
      46                 :             : #include "snowball/libstemmer/stem_ISO_8859_1_spanish.h"
      47                 :             : #include "snowball/libstemmer/stem_ISO_8859_1_swedish.h"
      48                 :             : #include "snowball/libstemmer/stem_ISO_8859_2_hungarian.h"
      49                 :             : #include "snowball/libstemmer/stem_ISO_8859_2_polish.h"
      50                 :             : #include "snowball/libstemmer/stem_KOI8_R_russian.h"
      51                 :             : #include "snowball/libstemmer/stem_UTF_8_arabic.h"
      52                 :             : #include "snowball/libstemmer/stem_UTF_8_armenian.h"
      53                 :             : #include "snowball/libstemmer/stem_UTF_8_basque.h"
      54                 :             : #include "snowball/libstemmer/stem_UTF_8_catalan.h"
      55                 :             : #include "snowball/libstemmer/stem_UTF_8_danish.h"
      56                 :             : #include "snowball/libstemmer/stem_UTF_8_dutch.h"
      57                 :             : #include "snowball/libstemmer/stem_UTF_8_dutch_porter.h"
      58                 :             : #include "snowball/libstemmer/stem_UTF_8_english.h"
      59                 :             : #include "snowball/libstemmer/stem_UTF_8_esperanto.h"
      60                 :             : #include "snowball/libstemmer/stem_UTF_8_estonian.h"
      61                 :             : #include "snowball/libstemmer/stem_UTF_8_finnish.h"
      62                 :             : #include "snowball/libstemmer/stem_UTF_8_french.h"
      63                 :             : #include "snowball/libstemmer/stem_UTF_8_german.h"
      64                 :             : #include "snowball/libstemmer/stem_UTF_8_greek.h"
      65                 :             : #include "snowball/libstemmer/stem_UTF_8_hindi.h"
      66                 :             : #include "snowball/libstemmer/stem_UTF_8_hungarian.h"
      67                 :             : #include "snowball/libstemmer/stem_UTF_8_indonesian.h"
      68                 :             : #include "snowball/libstemmer/stem_UTF_8_irish.h"
      69                 :             : #include "snowball/libstemmer/stem_UTF_8_italian.h"
      70                 :             : #include "snowball/libstemmer/stem_UTF_8_lithuanian.h"
      71                 :             : #include "snowball/libstemmer/stem_UTF_8_nepali.h"
      72                 :             : #include "snowball/libstemmer/stem_UTF_8_norwegian.h"
      73                 :             : #include "snowball/libstemmer/stem_UTF_8_polish.h"
      74                 :             : #include "snowball/libstemmer/stem_UTF_8_porter.h"
      75                 :             : #include "snowball/libstemmer/stem_UTF_8_portuguese.h"
      76                 :             : #include "snowball/libstemmer/stem_UTF_8_romanian.h"
      77                 :             : #include "snowball/libstemmer/stem_UTF_8_russian.h"
      78                 :             : #include "snowball/libstemmer/stem_UTF_8_serbian.h"
      79                 :             : #include "snowball/libstemmer/stem_UTF_8_spanish.h"
      80                 :             : #include "snowball/libstemmer/stem_UTF_8_swedish.h"
      81                 :             : #include "snowball/libstemmer/stem_UTF_8_tamil.h"
      82                 :             : #include "snowball/libstemmer/stem_UTF_8_turkish.h"
      83                 :             : #include "snowball/libstemmer/stem_UTF_8_yiddish.h"
      84                 :             : /* Module magic block: names this library "dict_snowball" and stamps it with PG_VERSION */
      85                 :           6 : PG_MODULE_MAGIC_EXT(
      86                 :             :                                         .name = "dict_snowball",
      87                 :             :                                         .version = PG_VERSION
      88                 :             : );
      89                 :             : /* Function-info records (V1 macro) for the two Datum-returning entry points defined in this file */
      90                 :           6 : PG_FUNCTION_INFO_V1(dsnowball_init);
      91                 :             : 
      92                 :           6 : PG_FUNCTION_INFO_V1(dsnowball_lexize);
      93                 :             : 
      94                 :             : /* One supported stemmer module: language name, encoding, and its three entry points */
      95                 :             : typedef struct stemmer_module
      96                 :             : {
      97                 :             :         const char *name;  /* language name; locate_stem_module compares it case-insensitively */
      98                 :             :         pg_enc          enc;  /* PG encoding id the stemmer is listed under */
      99                 :             :         struct SN_env *(*create) (void);  /* returns the SN_env later handed to stem */
     100                 :             :         void            (*close) (struct SN_env *);  /* counterpart of create; not called in this file */
     101                 :             :         int                     (*stem) (struct SN_env *);  /* stems the word loaded into the SN_env */
     102                 :             : } stemmer_module;
     103                 :             : 
     104                 :             : /* Builds one stemmer_module initializer. Args: stemmer name (also stringified into .name), PG code for encoding, Snowball's name for encoding (pasted into the function names) */
     105                 :             : #define STEMMER_MODULE(name,enc,senc) \
     106                 :             :         {#name, enc, name##_##senc##_create_env, name##_##senc##_close_env, name##_##senc##_stem}
     107                 :             : /* Stemmer table; locate_stem_module scans it front to back and stops at the NULL-name entry */
     108                 :             : static const stemmer_module stemmer_modules[] =
     109                 :             : {
     110                 :             :         /*
     111                 :             :          * Stemmers list from Snowball distribution
     112                 :             :          */
     113                 :             :         STEMMER_MODULE(basque, PG_LATIN1, ISO_8859_1),
     114                 :             :         STEMMER_MODULE(catalan, PG_LATIN1, ISO_8859_1),
     115                 :             :         STEMMER_MODULE(danish, PG_LATIN1, ISO_8859_1),
     116                 :             :         STEMMER_MODULE(dutch, PG_LATIN1, ISO_8859_1),
     117                 :             :         STEMMER_MODULE(dutch_porter, PG_LATIN1, ISO_8859_1),
     118                 :             :         STEMMER_MODULE(english, PG_LATIN1, ISO_8859_1),
     119                 :             :         STEMMER_MODULE(finnish, PG_LATIN1, ISO_8859_1),
     120                 :             :         STEMMER_MODULE(french, PG_LATIN1, ISO_8859_1),
     121                 :             :         STEMMER_MODULE(german, PG_LATIN1, ISO_8859_1),
     122                 :             :         STEMMER_MODULE(indonesian, PG_LATIN1, ISO_8859_1),
     123                 :             :         STEMMER_MODULE(irish, PG_LATIN1, ISO_8859_1),
     124                 :             :         STEMMER_MODULE(italian, PG_LATIN1, ISO_8859_1),
     125                 :             :         STEMMER_MODULE(norwegian, PG_LATIN1, ISO_8859_1),
     126                 :             :         STEMMER_MODULE(porter, PG_LATIN1, ISO_8859_1),
     127                 :             :         STEMMER_MODULE(portuguese, PG_LATIN1, ISO_8859_1),
     128                 :             :         STEMMER_MODULE(spanish, PG_LATIN1, ISO_8859_1),
     129                 :             :         STEMMER_MODULE(swedish, PG_LATIN1, ISO_8859_1),
     130                 :             :         STEMMER_MODULE(hungarian, PG_LATIN2, ISO_8859_2),
     131                 :             :         STEMMER_MODULE(polish, PG_LATIN2, ISO_8859_2),
     132                 :             :         STEMMER_MODULE(russian, PG_KOI8R, KOI8_R),
     133                 :             :         STEMMER_MODULE(arabic, PG_UTF8, UTF_8),
     134                 :             :         STEMMER_MODULE(armenian, PG_UTF8, UTF_8),
     135                 :             :         STEMMER_MODULE(basque, PG_UTF8, UTF_8),
     136                 :             :         STEMMER_MODULE(catalan, PG_UTF8, UTF_8),
     137                 :             :         STEMMER_MODULE(danish, PG_UTF8, UTF_8),
     138                 :             :         STEMMER_MODULE(dutch, PG_UTF8, UTF_8),
     139                 :             :         STEMMER_MODULE(dutch_porter, PG_UTF8, UTF_8),
     140                 :             :         STEMMER_MODULE(english, PG_UTF8, UTF_8),
     141                 :             :         STEMMER_MODULE(esperanto, PG_UTF8, UTF_8),
     142                 :             :         STEMMER_MODULE(estonian, PG_UTF8, UTF_8),
     143                 :             :         STEMMER_MODULE(finnish, PG_UTF8, UTF_8),
     144                 :             :         STEMMER_MODULE(french, PG_UTF8, UTF_8),
     145                 :             :         STEMMER_MODULE(german, PG_UTF8, UTF_8),
     146                 :             :         STEMMER_MODULE(greek, PG_UTF8, UTF_8),
     147                 :             :         STEMMER_MODULE(hindi, PG_UTF8, UTF_8),
     148                 :             :         STEMMER_MODULE(hungarian, PG_UTF8, UTF_8),
     149                 :             :         STEMMER_MODULE(indonesian, PG_UTF8, UTF_8),
     150                 :             :         STEMMER_MODULE(irish, PG_UTF8, UTF_8),
     151                 :             :         STEMMER_MODULE(italian, PG_UTF8, UTF_8),
     152                 :             :         STEMMER_MODULE(lithuanian, PG_UTF8, UTF_8),
     153                 :             :         STEMMER_MODULE(nepali, PG_UTF8, UTF_8),
     154                 :             :         STEMMER_MODULE(norwegian, PG_UTF8, UTF_8),
     155                 :             :         STEMMER_MODULE(porter, PG_UTF8, UTF_8),
     156                 :             :         STEMMER_MODULE(polish, PG_UTF8, UTF_8),
     157                 :             :         STEMMER_MODULE(portuguese, PG_UTF8, UTF_8),
     158                 :             :         STEMMER_MODULE(romanian, PG_UTF8, UTF_8),
     159                 :             :         STEMMER_MODULE(russian, PG_UTF8, UTF_8),
     160                 :             :         STEMMER_MODULE(serbian, PG_UTF8, UTF_8),
     161                 :             :         STEMMER_MODULE(spanish, PG_UTF8, UTF_8),
     162                 :             :         STEMMER_MODULE(swedish, PG_UTF8, UTF_8),
     163                 :             :         STEMMER_MODULE(tamil, PG_UTF8, UTF_8),
     164                 :             :         STEMMER_MODULE(turkish, PG_UTF8, UTF_8),
     165                 :             :         STEMMER_MODULE(yiddish, PG_UTF8, UTF_8),
     166                 :             : 
     167                 :             :         /*
     168                 :             :          * Stemmer with PG_SQL_ASCII encoding should be valid for any server
     169                 :             :          * encoding
     170                 :             :          */
     171                 :             :         STEMMER_MODULE(english, PG_SQL_ASCII, ISO_8859_1),  /* expands to the english_ISO_8859_1_* functions */
     172                 :             : 
     173                 :             :         {NULL, 0, NULL, NULL, NULL} /* list end marker */
     174                 :             : };
     175                 :             : 
     176                 :             : /* Dictionary state: allocated and filled by dsnowball_init, read by dsnowball_lexize from its first argument */
     177                 :             : typedef struct DictSnowball
     178                 :             : {
     179                 :             :         struct SN_env *z;  /* stemmer environment returned by the chosen module's create() */
     180                 :             :         StopList        stoplist;  /* filled by readstoplist when a "stopwords" option is given */
     181                 :             :         bool            needrecode;             /* needs recoding to/from UTF8 before/after calling stem */
     182                 :             :         int                     (*stem) (struct SN_env *z);  /* stem function of the chosen module; NULL until a language is set */
     183                 :             : 
     184                 :             :         /*
     185                 :             :          * snowball keeps allocated memory between calls, so we should run it in
     186                 :             :          * our private memory context. Note that the init function is executed
     187                 :             :          * in a long-lived context, so we just remember CurrentMemoryContext
     188                 :             :          */
     189                 :             :         MemoryContext dictCtx;
     190                 :             : } DictSnowball;
     191                 :             : 
     192                 :             : /* Choose the stemmer for "lang": sets d->stem, d->z and d->needrecode, or raises ERROR when no table entry fits */
     193                 :             : static void
     194                 :           6 : locate_stem_module(DictSnowball *d, const char *lang)
     195                 :             : {
     196                 :           6 :         const stemmer_module *m;
     197                 :             : 
     198                 :             :         /*
     199                 :             :          * First, try to find an exact match of the stemmer module. A stemmer
     200                 :             :          * with PG_SQL_ASCII encoding is treated as working with any server encoding
     201                 :             :          */
     202         [ +  - ]:         168 :         for (m = stemmer_modules; m->name; m++)
     203                 :             :         {
     204   [ +  -  +  + ]:         168 :                 if ((m->enc == PG_SQL_ASCII || m->enc == GetDatabaseEncoding()) &&
     205                 :         168 :                         pg_strcasecmp(m->name, lang) == 0)
     206                 :             :                 {
     207                 :           6 :                         d->stem = m->stem;
     208                 :           6 :                         d->z = m->create();
     209                 :           6 :                         d->needrecode = false;  /* stemmer already matches the database encoding */
     210                 :           6 :                         return;
     211                 :             :                 }
     212                 :         162 :         }
     213                 :             : 
     214                 :             :         /*
     215                 :             :          * Second, fall back to the UTF8 stemmer for the needed language.
     216                 :             :          */
     217         [ #  # ]:           0 :         for (m = stemmer_modules; m->name; m++)
     218                 :             :         {
     219   [ #  #  #  # ]:           0 :                 if (m->enc == PG_UTF8 && pg_strcasecmp(m->name, lang) == 0)
     220                 :             :                 {
     221                 :           0 :                         d->stem = m->stem;
     222                 :           0 :                         d->z = m->create();
     223                 :           0 :                         d->needrecode = true;  /* dsnowball_lexize converts to and from UTF8 around the stem call */
     224                 :           0 :                         return;
     225                 :             :                 }
     226                 :           0 :         }
     227                 :             :         /* Neither pass found an entry for this language */
     228   [ #  #  #  # ]:           0 :         ereport(ERROR,
     229                 :             :                         (errcode(ERRCODE_UNDEFINED_OBJECT),
     230                 :             :                          errmsg("no Snowball stemmer available for language \"%s\" and encoding \"%s\"",
     231                 :             :                                         lang, GetDatabaseEncodingName())));
     232         [ -  + ]:           6 : }
     233                 :             : /* Init entry point: reads the option list (argument 0) and returns a pointer to a new DictSnowball */
     234                 :             : Datum
     235                 :           6 : dsnowball_init(PG_FUNCTION_ARGS)
     236                 :             : {
     237                 :           6 :         List       *dictoptions = (List *) PG_GETARG_POINTER(0);
     238                 :           6 :         DictSnowball *d;
     239                 :           6 :         bool            stoploaded = false;
     240                 :           6 :         ListCell   *l;
     241                 :             : 
     242                 :           6 :         d = palloc0_object(DictSnowball);
     243                 :             :         /* Accepted option names are "stopwords" and "language", each at most once; anything else is an error */
     244   [ +  -  +  +  :          18 :         foreach(l, dictoptions)
                   +  + ]
     245                 :             :         {
     246                 :          12 :                 DefElem    *defel = (DefElem *) lfirst(l);
     247                 :             : 
     248         [ +  + ]:          12 :                 if (strcmp(defel->defname, "stopwords") == 0)
     249                 :             :                 {
     250         [ +  - ]:           6 :                         if (stoploaded)
     251   [ #  #  #  # ]:           0 :                                 ereport(ERROR,
     252                 :             :                                                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
     253                 :             :                                                  errmsg("multiple StopWords parameters")));
     254                 :           6 :                         readstoplist(defGetString(defel), &d->stoplist, str_tolower);  /* presumably str_tolower normalizes the entries; confirm in readstoplist */
     255                 :           6 :                         stoploaded = true;
     256                 :           6 :                 }
     257         [ +  - ]:           6 :                 else if (strcmp(defel->defname, "language") == 0)
     258                 :             :                 {
     259         [ +  - ]:           6 :                         if (d->stem)  /* zeroed at allocation; set only by locate_stem_module */
     260   [ #  #  #  # ]:           0 :                                 ereport(ERROR,
     261                 :             :                                                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
     262                 :             :                                                  errmsg("multiple Language parameters")));
     263                 :           6 :                         locate_stem_module(d, defGetString(defel));
     264                 :           6 :                 }
     265                 :             :                 else
     266                 :             :                 {
     267   [ #  #  #  # ]:           0 :                         ereport(ERROR,
     268                 :             :                                         (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
     269                 :             :                                          errmsg("unrecognized Snowball parameter: \"%s\"",
     270                 :             :                                                         defel->defname)));
     271                 :             :                 }
     272                 :          12 :         }
     273                 :             :         /* A language is mandatory; a stopword list is optional */
     274         [ +  - ]:           6 :         if (!d->stem)
     275   [ #  #  #  # ]:           0 :                 ereport(ERROR,
     276                 :             :                                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
     277                 :             :                                  errmsg("missing Language parameter")));
     278                 :             :         /* Remember the calling context; dsnowball_lexize switches to it around the stemmer calls */
     279                 :           6 :         d->dictCtx = CurrentMemoryContext;
     280                 :             : 
     281                 :          12 :         PG_RETURN_POINTER(d);
     282                 :           6 : }
     283                 :             : /* Lexize entry point: arguments are the DictSnowball, the input bytes and their length; returns a TSLexeme array */
     284                 :             : Datum
     285                 :        1705 : dsnowball_lexize(PG_FUNCTION_ARGS)
     286                 :             : {
     287                 :        1705 :         DictSnowball *d = (DictSnowball *) PG_GETARG_POINTER(0);
     288                 :        1705 :         char       *in = (char *) PG_GETARG_POINTER(1);
     289                 :        1705 :         int32           len = PG_GETARG_INT32(2);
     290                 :        1705 :         char       *txt = str_tolower(in, len, DEFAULT_COLLATION_OID);  /* lowercased form of the input */
     291                 :        1705 :         TSLexeme   *res = palloc0_array(TSLexeme, 2);  /* two zeroed entries; only the first is ever filled in below */
     292                 :             : 
     293                 :             :         /*
     294                 :             :          * Do not pass strings exceeding 1000 bytes to the stemmer, as they're
     295                 :             :          * surely not words in any human language.  This restriction avoids
     296                 :             :          * wasting cycles on stuff like base64-encoded data, and it protects us
     297                 :             :          * against possible inefficiency or misbehavior in the stemmer.  (For
     298                 :             :          * example, the Turkish stemmer has an indefinite recursion, so it can
     299                 :             :          * crash on long-enough strings.)  However, Snowball dictionaries are
     300                 :             :          * defined to recognize all strings, so we can't reject the string as an
     301                 :             :          * unknown word.
     302                 :             :          */
     303         [ -  + ]:        1705 :         if (len > 1000)
     304                 :             :         {
     305                 :             :                 /* return the lexeme lowercased, but otherwise unmodified */
     306                 :           0 :                 res->lexeme = txt;
     307                 :           0 :         }
     308   [ +  -  +  + ]:        1705 :         else if (*txt == '\0' || searchstoplist(&(d->stoplist), txt))
     309                 :             :         {
     310                 :             :                 /* empty or stopword, so report as stopword */
     311                 :         578 :                 pfree(txt);  /* res->lexeme is left NULL from the zeroed allocation */
     312                 :         578 :         }
     313                 :             :         else
     314                 :             :         {
     315                 :        1127 :                 MemoryContext saveCtx;
     316                 :             : 
     317                 :             :                 /*
     318                 :             :                  * recode to utf8 if stemmer is utf8 and doesn't match server encoding
     319                 :             :                  */
     320         [ +  - ]:        1127 :                 if (d->needrecode)
     321                 :             :                 {
     322                 :           0 :                         char       *recoded;
     323                 :             : 
     324                 :           0 :                         recoded = pg_server_to_any(txt, strlen(txt), PG_UTF8);
     325         [ #  # ]:           0 :                         if (recoded != txt)  /* a different pointer means a new string was returned */
     326                 :             :                         {
     327                 :           0 :                                 pfree(txt);
     328                 :           0 :                                 txt = recoded;
     329                 :           0 :                         }
     330                 :           0 :                 }
     331                 :             : 
     332                 :             :                 /* see comment about d->dictCtx */
     333                 :        1127 :                 saveCtx = MemoryContextSwitchTo(d->dictCtx);
     334                 :        1127 :                 SN_set_current(d->z, strlen(txt), (symbol *) txt);
     335                 :        1127 :                 d->stem(d->z);
     336                 :        1127 :                 MemoryContextSwitchTo(saveCtx);
     337                 :             :                 /* Replace txt with the stemmer's buffer (p, length l) when both are non-zero; otherwise keep txt */
     338   [ +  -  -  + ]:        1127 :                 if (d->z->p && d->z->l)
     339                 :             :                 {
     340                 :        1127 :                         txt = repalloc(txt, d->z->l + 1);  /* l bytes plus the terminating NUL */
     341                 :        1127 :                         memcpy(txt, d->z->p, d->z->l);
     342                 :        1127 :                         txt[d->z->l] = '\0';
     343                 :        1127 :                 }
     344                 :             : 
     345                 :             :                 /* back recode if needed */
     346         [ +  - ]:        1127 :                 if (d->needrecode)
     347                 :             :                 {
     348                 :           0 :                         char       *recoded;
     349                 :             : 
     350                 :           0 :                         recoded = pg_any_to_server(txt, strlen(txt), PG_UTF8);
     351         [ #  # ]:           0 :                         if (recoded != txt)
     352                 :             :                         {
     353                 :           0 :                                 pfree(txt);
     354                 :           0 :                                 txt = recoded;
     355                 :           0 :                         }
     356                 :           0 :                 }
     357                 :             : 
     358                 :        1127 :                 res->lexeme = txt;
     359                 :        1127 :         }
     360                 :             : 
     361                 :        3410 :         PG_RETURN_POINTER(res);
     362                 :        1705 : }

Generated by: LCOV version 2.3.2-1