LCOV - Code coverage - src/common/unicode

LCOV - code coverage report

Current view:	top level - src/common - unicode_case.c (source / functions)		Coverage	Total	Hit
Test:	Code coverage	Lines:	89.4 %	141	126
Test Date:	2026-01-26 10:56:24	Functions:	84.6 %	13	11
Legend:	Lines: hit not hit Branches: + taken - not taken # not executed	Branches:	77.8 %	108	84

             Branch data     Line data    Source code

       1                 :             : /*-------------------------------------------------------------------------
       2                 :             :  * unicode_case.c
       3                 :             :  *              Unicode case mapping and case conversion.
       4                 :             :  *
       5                 :             :  * Portions Copyright (c) 2017-2026, PostgreSQL Global Development Group
       6                 :             :  *
       7                 :             :  * IDENTIFICATION
       8                 :             :  *        src/common/unicode_case.c
       9                 :             :  *
      10                 :             :  *-------------------------------------------------------------------------
      11                 :             :  */
      12                 :             : #ifndef FRONTEND
      13                 :             : #include "postgres.h"
      14                 :             : #else
      15                 :             : #include "postgres_fe.h"
      16                 :             : #endif
      17                 :             : 
      18                 :             : #include "common/unicode_case.h"
      19                 :             : #include "common/unicode_case_table.h"
      20                 :             : #include "common/unicode_category.h"
      21                 :             : #include "mb/pg_wchar.h"
      22                 :             : 
      23                 :             : enum CaseMapResult
      24                 :             : {
      25                 :             :         CASEMAP_SELF,
      26                 :             :         CASEMAP_SIMPLE,
      27                 :             :         CASEMAP_SPECIAL,
      28                 :             : };
      29                 :             : 
      30                 :             : /*
      31                 :             :  * Map for each case kind.
      32                 :             :  */
      33                 :             : static const char32_t *const casekind_map[NCaseKind] =
      34                 :             : {
      35                 :             :         [CaseLower] = case_map_lower,
      36                 :             :         [CaseTitle] = case_map_title,
      37                 :             :         [CaseUpper] = case_map_upper,
      38                 :             :         [CaseFold] = case_map_fold,
      39                 :             : };
      40                 :             : 
      41                 :             : static char32_t find_case_map(char32_t ucs, const char32_t *map);
      42                 :             : static size_t convert_case(char *dst, size_t dstsize, const char *src, ssize_t srclen,
      43                 :             :                                                    CaseKind str_casekind, bool full, WordBoundaryNext wbnext,
      44                 :             :                                                    void *wbstate);
      45                 :             : static enum CaseMapResult casemap(char32_t u1, CaseKind casekind, bool full,
      46                 :             :                                                                   const char *src, size_t srclen, size_t srcoff,
      47                 :             :                                                                   char32_t *simple, const char32_t **special);
      48                 :             : 
      49                 :             : char32_t
      50                 :          52 : unicode_lowercase_simple(char32_t code)
      51                 :             : {
      52                 :          52 :         char32_t        cp = find_case_map(code, case_map_lower);
      53                 :             : 
      54         [ +  - ]:          52 :         return cp != 0 ? cp : code;
      55                 :          52 : }
      56                 :             : 
      57                 :             : char32_t
      58                 :           0 : unicode_titlecase_simple(char32_t code)
      59                 :             : {
      60                 :           0 :         char32_t        cp = find_case_map(code, case_map_title);
      61                 :             : 
      62         [ #  # ]:           0 :         return cp != 0 ? cp : code;
      63                 :           0 : }
      64                 :             : 
      65                 :             : char32_t
      66                 :          52 : unicode_uppercase_simple(char32_t code)
      67                 :             : {
      68                 :          52 :         char32_t        cp = find_case_map(code, case_map_upper);
      69                 :             : 
      70         [ +  - ]:          52 :         return cp != 0 ? cp : code;
      71                 :          52 : }
      72                 :             : 
      73                 :             : char32_t
      74                 :           0 : unicode_casefold_simple(char32_t code)
      75                 :             : {
      76                 :           0 :         char32_t        cp = find_case_map(code, case_map_fold);
      77                 :             : 
      78         [ #  # ]:           0 :         return cp != 0 ? cp : code;
      79                 :           0 : }
      80                 :             : 
      81                 :             : /*
      82                 :             :  * unicode_strlower()
      83                 :             :  *
      84                 :             :  * Convert src to lowercase, and return the result length (not including
      85                 :             :  * terminating NUL).
      86                 :             :  *
      87                 :             :  * String src must be encoded in UTF-8. If srclen < 0, src must be
      88                 :             :  * NUL-terminated.
      89                 :             :  *
      90                 :             :  * Result string is stored in dst, truncating if larger than dstsize. If
      91                 :             :  * dstsize is greater than the result length, dst will be NUL-terminated;
      92                 :             :  * otherwise not.
      93                 :             :  *
      94                 :             :  * If dstsize is zero, dst may be NULL. This is useful for calculating the
      95                 :             :  * required buffer size before allocating.
      96                 :             :  *
      97                 :             :  * If full is true, use special case mappings if available and if the
      98                 :             :  * conditions are satisfied.
      99                 :             :  */
     100                 :             : size_t
     101                 :          48 : unicode_strlower(char *dst, size_t dstsize, const char *src, ssize_t srclen,
     102                 :             :                                  bool full)
     103                 :             : {
     104                 :          48 :         return convert_case(dst, dstsize, src, srclen, CaseLower, full, NULL,
     105                 :             :                                                 NULL);
     106                 :             : }
     107                 :             : 
     108                 :             : /*
     109                 :             :  * unicode_strtitle()
     110                 :             :  *
     111                 :             :  * Convert src to titlecase, and return the result length (not including
     112                 :             :  * terminating NUL).
     113                 :             :  *
     114                 :             :  * String src must be encoded in UTF-8. If srclen < 0, src must be
     115                 :             :  * NUL-terminated.
     116                 :             :  *
     117                 :             :  * Result string is stored in dst, truncating if larger than dstsize. If
     118                 :             :  * dstsize is greater than the result length, dst will be NUL-terminated;
     119                 :             :  * otherwise not.
     120                 :             :  *
     121                 :             :  * If dstsize is zero, dst may be NULL. This is useful for calculating the
     122                 :             :  * required buffer size before allocating.
     123                 :             :  *
     124                 :             :  * If full is true, use special case mappings if available and if the
     125                 :             :  * conditions are satisfied. Otherwise, use only simple mappings and use
     126                 :             :  * uppercase instead of titlecase.
     127                 :             :  *
     128                 :             :  * Titlecasing requires knowledge about word boundaries, which is provided by
     129                 :             :  * the callback wbnext. A word boundary is the offset of the start of a word
     130                 :             :  * or the offset of the character immediately following a word.
     131                 :             :  *
     132                 :             :  * The caller is expected to initialize and free the callback state
     133                 :             :  * wbstate. The callback should first return offset 0 for the first boundary;
     134                 :             :  * then the offset of each subsequent word boundary; then the total length of
     135                 :             :  * the string to indicate the final boundary.
     136                 :             :  */
     137                 :             : size_t
     138                 :          32 : unicode_strtitle(char *dst, size_t dstsize, const char *src, ssize_t srclen,
     139                 :             :                                  bool full, WordBoundaryNext wbnext, void *wbstate)
     140                 :             : {
     141                 :          64 :         return convert_case(dst, dstsize, src, srclen, CaseTitle, full, wbnext,
     142                 :          32 :                                                 wbstate);
     143                 :             : }
     144                 :             : 
     145                 :             : /*
     146                 :             :  * unicode_strupper()
     147                 :             :  *
     148                 :             :  * Convert src to uppercase, and return the result length (not including
     149                 :             :  * terminating NUL).
     150                 :             :  *
     151                 :             :  * String src must be encoded in UTF-8. If srclen < 0, src must be
     152                 :             :  * NUL-terminated.
     153                 :             :  *
     154                 :             :  * Result string is stored in dst, truncating if larger than dstsize. If
     155                 :             :  * dstsize is greater than the result length, dst will be NUL-terminated;
     156                 :             :  * otherwise not.
     157                 :             :  *
     158                 :             :  * If dstsize is zero, dst may be NULL. This is useful for calculating the
     159                 :             :  * required buffer size before allocating.
     160                 :             :  *
     161                 :             :  * If full is true, use special case mappings if available and if the
     162                 :             :  * conditions are satisfied.
     163                 :             :  */
     164                 :             : size_t
     165                 :          28 : unicode_strupper(char *dst, size_t dstsize, const char *src, ssize_t srclen,
     166                 :             :                                  bool full)
     167                 :             : {
     168                 :          28 :         return convert_case(dst, dstsize, src, srclen, CaseUpper, full, NULL,
     169                 :             :                                                 NULL);
     170                 :             : }
     171                 :             : 
     172                 :             : /*
     173                 :             :  * unicode_strfold()
     174                 :             :  *
     175                 :             :  * Case fold src, and return the result length (not including terminating
     176                 :             :  * NUL).
     177                 :             :  *
     178                 :             :  * String src must be encoded in UTF-8. If srclen < 0, src must be
     179                 :             :  * NUL-terminated.
     180                 :             :  *
     181                 :             :  * Result string is stored in dst, truncating if larger than dstsize. If
     182                 :             :  * dstsize is greater than the result length, dst will be NUL-terminated;
     183                 :             :  * otherwise not.
     184                 :             :  *
     185                 :             :  * If dstsize is zero, dst may be NULL. This is useful for calculating the
     186                 :             :  * required buffer size before allocating.
     187                 :             :  */
     188                 :             : size_t
     189                 :           2 : unicode_strfold(char *dst, size_t dstsize, const char *src, ssize_t srclen,
     190                 :             :                                 bool full)
     191                 :             : {
     192                 :           2 :         return convert_case(dst, dstsize, src, srclen, CaseFold, full, NULL,
     193                 :             :                                                 NULL);
     194                 :             : }
     195                 :             : 
     196                 :             : /*
     197                 :             :  * Implement Unicode Default Case Conversion algorithm.
     198                 :             :  *
     199                 :             :  * If str_casekind is CaseLower or CaseUpper, map each character in the string
     200                 :             :  * for which a mapping is available.
     201                 :             :  *
     202                 :             :  * If str_casekind is CaseTitle, maps characters found on a word boundary to
     203                 :             :  * titlecase (or uppercase if full is false) and other characters to
     204                 :             :  * lowercase. NB: does not currently implement the Unicode behavior in which
     205                 :             :  * the word boundary is adjusted to the next Cased character. That behavior
     206                 :             :  * could be implemented as an option, but it doesn't match the default
     207                 :             :  * behavior of ICU, nor does it match the documented behavior of INITCAP().
     208                 :             :  *
     209                 :             :  * If full is true, use special mappings for relevant characters, which can
     210                 :             :  * map a single codepoint to multiple codepoints, or depend on conditions.
     211                 :             :  */
     212                 :             : static size_t
     213                 :         110 : convert_case(char *dst, size_t dstsize, const char *src, ssize_t srclen,
     214                 :             :                          CaseKind str_casekind, bool full, WordBoundaryNext wbnext,
     215                 :             :                          void *wbstate)
     216                 :             : {
     217                 :             :         /* character CaseKind varies while titlecasing */
     218                 :         110 :         CaseKind        chr_casekind = str_casekind;
     219                 :         110 :         size_t          srcoff = 0;
     220                 :         110 :         size_t          result_len = 0;
     221                 :         110 :         size_t          boundary = 0;
     222                 :             : 
     223   [ +  +  +  -  :         110 :         Assert((str_casekind == CaseTitle && wbnext && wbstate) ||
             +  -  +  - ]
     224                 :             :                    (str_casekind != CaseTitle && !wbnext && !wbstate));
     225                 :             : 
     226         [ +  + ]:         110 :         if (str_casekind == CaseTitle)
     227                 :             :         {
     228                 :          32 :                 boundary = wbnext(wbstate);
     229         [ +  - ]:          32 :                 Assert(boundary == 0);  /* start of text is always a boundary */
     230                 :          32 :         }
     231                 :             : 
     232   [ +  -  +  +  :         947 :         while ((srclen < 0 || srcoff < srclen) && src[srcoff] != '\0')
                   +  + ]
     233                 :             :         {
     234                 :         837 :                 char32_t        u1 = utf8_to_unicode((unsigned char *) src + srcoff);
     235                 :         837 :                 int                     u1len = unicode_utf8len(u1);
     236                 :         837 :                 char32_t        simple = 0;
     237                 :         837 :                 const char32_t *special = NULL;
     238                 :         837 :                 enum CaseMapResult casemap_result;
     239                 :             : 
     240         [ +  + ]:         837 :                 if (str_casekind == CaseTitle)
     241                 :             :                 {
     242         [ +  + ]:         248 :                         if (srcoff == boundary)
     243                 :             :                         {
     244                 :         104 :                                 chr_casekind = full ? CaseTitle : CaseUpper;
     245                 :         104 :                                 boundary = wbnext(wbstate);
     246                 :         104 :                         }
     247                 :             :                         else
     248                 :         144 :                                 chr_casekind = CaseLower;
     249                 :         248 :                 }
     250                 :             : 
     251                 :         837 :                 casemap_result = casemap(u1, chr_casekind, full, src, srclen, srcoff,
     252                 :             :                                                                  &simple, &special);
     253                 :             : 
     254   [ -  +  +  + ]:         837 :                 switch (casemap_result)
     255                 :             :                 {
     256                 :             :                         case CASEMAP_SELF:
     257                 :             :                                 /* no mapping; copy bytes from src */
     258         [ +  - ]:          20 :                                 Assert(simple == 0);
     259         [ +  - ]:          20 :                                 Assert(special == NULL);
     260         [ -  + ]:          20 :                                 if (result_len + u1len <= dstsize)
     261                 :          20 :                                         memcpy(dst + result_len, src + srcoff, u1len);
     262                 :             : 
     263                 :          20 :                                 result_len += u1len;
     264                 :          20 :                                 break;
     265                 :             :                         case CASEMAP_SIMPLE:
     266                 :             :                                 {
     267                 :             :                                         /* replace with single character */
     268                 :         798 :                                         char32_t        u2 = simple;
     269                 :         798 :                                         char32_t        u2len = unicode_utf8len(u2);
     270                 :             : 
     271         [ +  - ]:         798 :                                         Assert(special == NULL);
     272         [ +  + ]:         798 :                                         if (result_len + u2len <= dstsize)
     273                 :         790 :                                                 unicode_to_utf8(u2, (unsigned char *) dst + result_len);
     274                 :             : 
     275                 :         798 :                                         result_len += u2len;
     276                 :         798 :                                 }
     277                 :         798 :                                 break;
     278                 :             :                         case CASEMAP_SPECIAL:
     279                 :             :                                 /* replace with up to MAX_CASE_EXPANSION characters */
     280         [ +  - ]:          19 :                                 Assert(simple == 0);
     281   [ -  +  +  + ]:          47 :                                 for (int i = 0; i < MAX_CASE_EXPANSION && special[i]; i++)
     282                 :             :                                 {
     283                 :          28 :                                         char32_t        u2 = special[i];
     284                 :          28 :                                         size_t          u2len = unicode_utf8len(u2);
     285                 :             : 
     286         [ -  + ]:          28 :                                         if (result_len + u2len <= dstsize)
     287                 :          28 :                                                 unicode_to_utf8(u2, (unsigned char *) dst + result_len);
     288                 :             : 
     289                 :          28 :                                         result_len += u2len;
     290                 :          28 :                                 }
     291                 :          19 :                                 break;
     292                 :             :                 }
     293                 :             : 
     294                 :         837 :                 srcoff += u1len;
     295                 :         837 :         }
     296                 :             : 
     297         [ +  + ]:         110 :         if (result_len < dstsize)
     298                 :          98 :                 dst[result_len] = '\0';
     299                 :             : 
     300                 :         220 :         return result_len;
     301                 :         110 : }
     302                 :             : 
     303                 :             : /*
     304                 :             :  * Check that the condition matches Final_Sigma, described in Unicode Table
     305                 :             :  * 3-17. The character at the given offset must be directly preceded by a
     306                 :             :  * Cased character, and must not be directly followed by a Cased character.
     307                 :             :  *
     308                 :             :  * Case_Ignorable characters are ignored. NB: some characters may be both
     309                 :             :  * Cased and Case_Ignorable, in which case they are ignored.
     310                 :             :  */
     311                 :             : static bool
     312                 :          10 : check_final_sigma(const unsigned char *str, size_t len, size_t offset)
     313                 :             : {
     314                 :             :         /* the start of the string is not preceded by a Cased character */
     315         [ +  + ]:          10 :         if (offset == 0)
     316                 :           1 :                 return false;
     317                 :             : 
     318                 :             :         /* iterate backwards, looking for Cased character */
     319   [ -  +  +  + ]:          33 :         for (int i = offset - 1; i >= 0; i--)
     320                 :             :         {
     321   [ +  +  +  + ]:          24 :                 if ((str[i] & 0x80) == 0 || (str[i] & 0xC0) == 0xC0)
     322                 :             :                 {
     323                 :          13 :                         char32_t        curr = utf8_to_unicode(str + i);
     324                 :             : 
     325         [ +  + ]:          13 :                         if (pg_u_prop_case_ignorable(curr))
     326                 :           4 :                                 continue;
     327         [ +  + ]:           9 :                         else if (pg_u_prop_cased(curr))
     328                 :           7 :                                 break;
     329                 :             :                         else
     330                 :           2 :                                 return false;
     331         [ +  + ]:          13 :                 }
     332         [ +  - ]:          11 :                 else if ((str[i] & 0xC0) == 0x80)
     333                 :          11 :                         continue;
     334                 :             : 
     335                 :           0 :                 Assert(false);                  /* invalid UTF-8 */
     336                 :           0 :         }
     337                 :             : 
     338                 :             :         /* end of string is not followed by a Cased character */
     339         [ -  + ]:           7 :         if (offset == len)
     340                 :           0 :                 return true;
     341                 :             : 
     342                 :             :         /* iterate forwards, looking for Cased character */
     343   [ +  +  +  +  :          26 :         for (int i = offset + 1; i < len && str[i] != '\0'; i++)
                   +  + ]
     344                 :             :         {
     345   [ +  +  +  + ]:          19 :                 if ((str[i] & 0x80) == 0 || (str[i] & 0xC0) == 0xC0)
     346                 :             :                 {
     347                 :           8 :                         char32_t        curr = utf8_to_unicode(str + i);
     348                 :             : 
     349         [ +  + ]:           8 :                         if (pg_u_prop_case_ignorable(curr))
     350                 :           4 :                                 continue;
     351         [ +  + ]:           4 :                         else if (pg_u_prop_cased(curr))
     352                 :           3 :                                 return false;
     353                 :             :                         else
     354                 :           1 :                                 break;
     355         [ +  + ]:           8 :                 }
     356         [ +  - ]:          11 :                 else if ((str[i] & 0xC0) == 0x80)
     357                 :          11 :                         continue;
     358                 :             : 
     359                 :           0 :                 Assert(false);                  /* invalid UTF-8 */
     360                 :           0 :         }
     361                 :             : 
     362                 :           4 :         return true;
     363                 :          10 : }
     364                 :             : 
     365                 :             : /*
     366                 :             :  * Unicode allows for special casing to be applied only under certain
     367                 :             :  * circumstances. The only currently-supported condition is Final_Sigma.
     368                 :             :  */
     369                 :             : static bool
     370                 :          25 : check_special_conditions(int conditions, const char *str, size_t len,
     371                 :             :                                                  size_t offset)
     372                 :             : {
     373         [ +  + ]:          25 :         if (conditions == 0)
     374                 :          15 :                 return true;
     375         [ +  - ]:          10 :         else if (conditions == PG_U_FINAL_SIGMA)
     376                 :          10 :                 return check_final_sigma((unsigned char *) str, len, offset);
     377                 :             : 
     378                 :             :         /* no other conditions supported */
     379                 :           0 :         Assert(false);
     380                 :           0 :         return false;
     381                 :          25 : }
     382                 :             : 
     383                 :             : /*
     384                 :             :  * Map the given character to the requested case.
     385                 :             :  *
     386                 :             :  * If full is true, and a special case mapping is found and the conditions are
     387                 :             :  * met, 'special' is set to the mapping result (which is an array of up to
     388                 :             :  * MAX_CASE_EXPANSION characters) and CASEMAP_SPECIAL is returned.
     389                 :             :  *
     390                 :             :  * Otherwise, search for a simple mapping, and if found, set 'simple' to the
     391                 :             :  * result and return CASEMAP_SIMPLE.
     392                 :             :  *
     393                 :             :  * If no mapping is found, return CASEMAP_SELF, and the caller should copy the
     394                 :             :  * character without modification.
     395                 :             :  */
     396                 :             : static enum CaseMapResult
     397                 :         837 : casemap(char32_t u1, CaseKind casekind, bool full,
     398                 :             :                 const char *src, size_t srclen, size_t srcoff,
     399                 :             :                 char32_t *simple, const char32_t **special)
     400                 :             : {
     401                 :         837 :         uint16          idx;
     402                 :             : 
     403                 :             :         /* Fast path for codepoints < 0x80 */
     404         [ +  + ]:         837 :         if (u1 < 0x80)
     405                 :             :         {
     406                 :             :                 /*
     407                 :             :                  * The first elements in all tables are reserved as 0 (as NULL). The
     408                 :             :                  * data starts at index 1, not 0.
     409                 :             :                  */
     410                 :         496 :                 *simple = casekind_map[casekind][u1 + 1];
     411                 :             : 
     412                 :         496 :                 return CASEMAP_SIMPLE;
     413                 :             :         }
     414                 :             : 
     415                 :         341 :         idx = case_index(u1);
     416                 :             : 
     417         [ +  + ]:         341 :         if (idx == 0)
     418                 :          20 :                 return CASEMAP_SELF;
     419                 :             : 
     420   [ +  +  +  +  :         321 :         if (full && case_map_special[idx] &&
                   +  + ]
     421                 :          50 :                 check_special_conditions(special_case[case_map_special[idx]].conditions,
     422                 :          25 :                                                                  src, srclen, srcoff))
     423                 :             :         {
     424                 :          19 :                 *special = special_case[case_map_special[idx]].map[casekind];
     425                 :          19 :                 return CASEMAP_SPECIAL;
     426                 :             :         }
     427                 :             : 
     428                 :         302 :         *simple = casekind_map[casekind][idx];
     429                 :             : 
     430                 :         302 :         return CASEMAP_SIMPLE;
     431                 :         837 : }
     432                 :             : 
     433                 :             : /*
     434                 :             :  * Find entry in simple case map.
     435                 :             :  * If the entry does not exist, 0 will be returned.
     436                 :             :  */
     437                 :             : static char32_t
     438                 :         104 : find_case_map(char32_t ucs, const char32_t *map)
     439                 :             : {
     440                 :             :         /* Fast path for codepoints < 0x80 */
     441         [ +  + ]:         104 :         if (ucs < 0x80)
     442                 :             :                 /* The first elements in all tables are reserved as 0 (as NULL). */
     443                 :          32 :                 return map[ucs + 1];
     444                 :          72 :         return map[case_index(ucs)];
     445                 :         104 : }

Generated by: LCOV version 2.3.2-1