LCOV - Code coverage - src/common/wchar.c

LCOV - code coverage report

Current view:	top level - src/common - wchar.c (source / functions)		Coverage	Total	Hit
Test:	Code coverage	Lines:	57.9 %	1020	591
Test Date:	2026-01-26 10:56:24	Functions:	65.9 %	82	54
Legend:	Lines: hit not hit Branches: + taken - not taken # not executed	Branches:	50.5 %	683	345

             Branch data     Line data    Source code

       1                 :             : /*-------------------------------------------------------------------------
       2                 :             :  *
       3                 :             :  * wchar.c
       4                 :             :  *        Functions for working with multibyte characters in various encodings.
       5                 :             :  *
       6                 :             :  * Portions Copyright (c) 1998-2026, PostgreSQL Global Development Group
       7                 :             :  *
       8                 :             :  * IDENTIFICATION
       9                 :             :  *        src/common/wchar.c
      10                 :             :  *
      11                 :             :  *-------------------------------------------------------------------------
      12                 :             :  */
      13                 :             : #include "c.h"
      14                 :             : 
      15                 :             : #include <limits.h>
      16                 :             : 
      17                 :             : #include "mb/pg_wchar.h"
      18                 :             : #include "utils/ascii.h"
      19                 :             : 
      20                 :             : 
      21                 :             : /*
      22                 :             :  * In today's multibyte encodings other than UTF8, this two-byte sequence
      23                 :             :  * ensures pg_encoding_mblen() == 2 && pg_encoding_verifymbstr() == 0.
      24                 :             :  *
      25                 :             :  * For historical reasons, several verifychar implementations opt to reject
      26                 :             :  * this pair specifically.  Byte pair range constraints, in encoding
      27                 :             :  * originator documentation, always excluded this pair.  No core conversion
      28                 :             :  * could translate it.  However, longstanding verifychar implementations
      29                 :             :  * accepted any non-NUL byte.  big5_to_euc_tw and big5_to_mic even translate
      30                 :             :  * pairs not valid per encoding originator documentation.  To avoid tightening
      31                 :             :  * core or non-core conversions in a security patch, we sought this one pair.
      32                 :             :  *
      33                 :             :  * PQescapeString() historically used spaces for BYTE1; many other values
      34                 :             :  * could suffice for BYTE1.
      35                 :             :  */
      36                 :             : #define NONUTF8_INVALID_BYTE0 (0x8d)
      37                 :             : #define NONUTF8_INVALID_BYTE1 (' ')
      38                 :             : 
      39                 :             : 
      40                 :             : /*
      41                 :             :  * Operations on multi-byte encodings are driven by a table of helper
      42                 :             :  * functions.
      43                 :             :  *
      44                 :             :  * To add support for an encoding, define mblen(), dsplen(), verifychar() and
      45                 :             :  * verifystr() for the encoding.  For server-encodings, also define mb2wchar()
      46                 :             :  * and wchar2mb() conversion functions.
      47                 :             :  *
      48                 :             :  * These functions generally assume that their input is validly formed.
      49                 :             :  * The "verifier" functions, further down in the file, have to be more
      50                 :             :  * paranoid.
      51                 :             :  *
      52                 :             :  * We expect that mblen() does not need to examine more than the first byte
      53                 :             :  * of the character to discover the correct length.  GB18030 is an exception
      54                 :             :  * to that rule, though, as it also looks at the second byte.  But even that
      55                 :             :  * behaves in a predictable way, if you only pass the first byte: it will
      56                 :             :  * treat 4-byte encoded characters as two 2-byte encoded characters, which is
      57                 :             :  * good enough for all current uses.
      58                 :             :  *
      59                 :             :  * Note: for the display output of psql to work properly, the return values
      60                 :             :  * of the dsplen functions must conform to the Unicode standard. In particular
      61                 :             :  * the NUL character is zero width and control characters are generally
      62                 :             :  * width -1. It is recommended that non-ASCII encodings refer their ASCII
      63                 :             :  * subset to the ASCII routines to ensure consistency.
      64                 :             :  */
      65                 :             : 
      66                 :             : /*
      67                 :             :  * SQL/ASCII
      68                 :             :  */
      69                 :             : static int
      70                 :           0 : pg_ascii2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
      71                 :             : {
      72                 :           0 :         int                     cnt = 0;
      73                 :             : 
      74   [ #  #  #  # ]:           0 :         while (len > 0 && *from)
      75                 :             :         {
      76                 :           0 :                 *to++ = *from++;
      77                 :           0 :                 len--;
      78                 :           0 :                 cnt++;
      79                 :             :         }
      80                 :           0 :         *to = 0;
      81                 :           0 :         return cnt;
      82                 :           0 : }
      83                 :             : 
      84                 :             : static int
      85                 :           0 : pg_ascii_mblen(const unsigned char *s)
      86                 :             : {
      87                 :           0 :         return 1;
      88                 :             : }
      89                 :             : 
      90                 :             : static int
      91                 :           0 : pg_ascii_dsplen(const unsigned char *s)
      92                 :             : {
      93         [ #  # ]:           0 :         if (*s == '\0')
      94                 :           0 :                 return 0;
      95   [ #  #  #  # ]:           0 :         if (*s < 0x20 || *s == 0x7f)
      96                 :           0 :                 return -1;
      97                 :             : 
      98                 :           0 :         return 1;
      99                 :           0 : }
     100                 :             : 
     101                 :             : /*
     102                 :             :  * EUC
     103                 :             :  */
     104                 :             : static int
     105                 :           0 : pg_euc2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
     106                 :             : {
     107                 :           0 :         int                     cnt = 0;
     108                 :             : 
     109   [ #  #  #  # ]:           0 :         while (len > 0 && *from)
     110                 :             :         {
     111   [ #  #  #  # ]:           0 :                 if (*from == SS2 && len >= 2)        /* JIS X 0201 (so called "1 byte
     112                 :             :                                                                                  * KANA") */
     113                 :             :                 {
     114                 :           0 :                         from++;
     115                 :           0 :                         *to = (SS2 << 8) | *from++;
     116                 :           0 :                         len -= 2;
     117                 :           0 :                 }
     118   [ #  #  #  # ]:           0 :                 else if (*from == SS3 && len >= 3)   /* JIS X 0212 KANJI */
     119                 :             :                 {
     120                 :           0 :                         from++;
     121                 :           0 :                         *to = (SS3 << 16) | (*from++ << 8);
     122                 :           0 :                         *to |= *from++;
     123                 :           0 :                         len -= 3;
     124                 :           0 :                 }
     125   [ #  #  #  # ]:           0 :                 else if (IS_HIGHBIT_SET(*from) && len >= 2) /* JIS X 0208 KANJI */
     126                 :             :                 {
     127                 :           0 :                         *to = *from++ << 8;
     128                 :           0 :                         *to |= *from++;
     129                 :           0 :                         len -= 2;
     130                 :           0 :                 }
     131                 :             :                 else                                    /* must be ASCII */
     132                 :             :                 {
     133                 :           0 :                         *to = *from++;
     134                 :           0 :                         len--;
     135                 :             :                 }
     136                 :           0 :                 to++;
     137                 :           0 :                 cnt++;
     138                 :             :         }
     139                 :           0 :         *to = 0;
     140                 :           0 :         return cnt;
     141                 :           0 : }
     142                 :             : 
     143                 :             : static inline int
     144                 :          39 : pg_euc_mblen(const unsigned char *s)
     145                 :             : {
     146                 :          39 :         int                     len;
     147                 :             : 
     148         [ -  + ]:          39 :         if (*s == SS2)
     149                 :           0 :                 len = 2;
     150         [ -  + ]:          39 :         else if (*s == SS3)
     151                 :           0 :                 len = 3;
     152         [ +  + ]:          39 :         else if (IS_HIGHBIT_SET(*s))
     153                 :          27 :                 len = 2;
     154                 :             :         else
     155                 :          12 :                 len = 1;
     156                 :          78 :         return len;
     157                 :          39 : }
     158                 :             : 
     159                 :             : static inline int
     160                 :           0 : pg_euc_dsplen(const unsigned char *s)
     161                 :             : {
     162                 :           0 :         int                     len;
     163                 :             : 
     164         [ #  # ]:           0 :         if (*s == SS2)
     165                 :           0 :                 len = 2;
     166         [ #  # ]:           0 :         else if (*s == SS3)
     167                 :           0 :                 len = 2;
     168         [ #  # ]:           0 :         else if (IS_HIGHBIT_SET(*s))
     169                 :           0 :                 len = 2;
     170                 :             :         else
     171                 :           0 :                 len = pg_ascii_dsplen(s);
     172                 :           0 :         return len;
     173                 :           0 : }
     174                 :             : 
     175                 :             : /*
     176                 :             :  * EUC_JP
     177                 :             :  */
     178                 :             : static int
     179                 :           0 : pg_eucjp2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
     180                 :             : {
     181                 :           0 :         return pg_euc2wchar_with_len(from, to, len);
     182                 :             : }
     183                 :             : 
     184                 :             : static int
     185                 :          34 : pg_eucjp_mblen(const unsigned char *s)
     186                 :             : {
     187                 :          34 :         return pg_euc_mblen(s);
     188                 :             : }
     189                 :             : 
     190                 :             : static int
     191                 :           0 : pg_eucjp_dsplen(const unsigned char *s)
     192                 :             : {
     193                 :           0 :         int                     len;
     194                 :             : 
     195         [ #  # ]:           0 :         if (*s == SS2)
     196                 :           0 :                 len = 1;
     197         [ #  # ]:           0 :         else if (*s == SS3)
     198                 :           0 :                 len = 2;
     199         [ #  # ]:           0 :         else if (IS_HIGHBIT_SET(*s))
     200                 :           0 :                 len = 2;
     201                 :             :         else
     202                 :           0 :                 len = pg_ascii_dsplen(s);
     203                 :           0 :         return len;
     204                 :           0 : }
     205                 :             : 
     206                 :             : /*
     207                 :             :  * EUC_KR
     208                 :             :  */
     209                 :             : static int
     210                 :           0 : pg_euckr2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
     211                 :             : {
     212                 :           0 :         return pg_euc2wchar_with_len(from, to, len);
     213                 :             : }
     214                 :             : 
     215                 :             : static int
     216                 :           1 : pg_euckr_mblen(const unsigned char *s)
     217                 :             : {
     218                 :           1 :         return pg_euc_mblen(s);
     219                 :             : }
     220                 :             : 
     221                 :             : static int
     222                 :           0 : pg_euckr_dsplen(const unsigned char *s)
     223                 :             : {
     224                 :           0 :         return pg_euc_dsplen(s);
     225                 :             : }
     226                 :             : 
     227                 :             : /*
     228                 :             :  * EUC_CN
     229                 :             :  *
     230                 :             :  */
     231                 :             : static int
     232                 :           0 : pg_euccn2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
     233                 :             : {
     234                 :           0 :         int                     cnt = 0;
     235                 :             : 
     236   [ #  #  #  # ]:           0 :         while (len > 0 && *from)
     237                 :             :         {
     238   [ #  #  #  # ]:           0 :                 if (*from == SS2 && len >= 3)        /* code set 2 (unused?) */
     239                 :             :                 {
     240                 :           0 :                         from++;
     241                 :           0 :                         *to = (SS2 << 16) | (*from++ << 8);
     242                 :           0 :                         *to |= *from++;
     243                 :           0 :                         len -= 3;
     244                 :           0 :                 }
     245   [ #  #  #  # ]:           0 :                 else if (*from == SS3 && len >= 3)   /* code set 3 (unused ?) */
     246                 :             :                 {
     247                 :           0 :                         from++;
     248                 :           0 :                         *to = (SS3 << 16) | (*from++ << 8);
     249                 :           0 :                         *to |= *from++;
     250                 :           0 :                         len -= 3;
     251                 :           0 :                 }
     252   [ #  #  #  # ]:           0 :                 else if (IS_HIGHBIT_SET(*from) && len >= 2) /* code set 1 */
     253                 :             :                 {
     254                 :           0 :                         *to = *from++ << 8;
     255                 :           0 :                         *to |= *from++;
     256                 :           0 :                         len -= 2;
     257                 :           0 :                 }
     258                 :             :                 else
     259                 :             :                 {
     260                 :           0 :                         *to = *from++;
     261                 :           0 :                         len--;
     262                 :             :                 }
     263                 :           0 :                 to++;
     264                 :           0 :                 cnt++;
     265                 :             :         }
     266                 :           0 :         *to = 0;
     267                 :           0 :         return cnt;
     268                 :           0 : }
     269                 :             : 
     270                 :             : static int
     271                 :           1 : pg_euccn_mblen(const unsigned char *s)
     272                 :             : {
     273                 :           1 :         int                     len;
     274                 :             : 
     275         [ +  - ]:           1 :         if (IS_HIGHBIT_SET(*s))
     276                 :           1 :                 len = 2;
     277                 :             :         else
     278                 :           0 :                 len = 1;
     279                 :           2 :         return len;
     280                 :           1 : }
     281                 :             : 
     282                 :             : static int
     283                 :           0 : pg_euccn_dsplen(const unsigned char *s)
     284                 :             : {
     285                 :           0 :         int                     len;
     286                 :             : 
     287         [ #  # ]:           0 :         if (IS_HIGHBIT_SET(*s))
     288                 :           0 :                 len = 2;
     289                 :             :         else
     290                 :           0 :                 len = pg_ascii_dsplen(s);
     291                 :           0 :         return len;
     292                 :           0 : }
     293                 :             : 
     294                 :             : /*
     295                 :             :  * EUC_TW
     296                 :             :  *
     297                 :             :  */
     298                 :             : static int
     299                 :           0 : pg_euctw2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
     300                 :             : {
     301                 :           0 :         int                     cnt = 0;
     302                 :             : 
     303   [ #  #  #  # ]:           0 :         while (len > 0 && *from)
     304                 :             :         {
     305   [ #  #  #  # ]:           0 :                 if (*from == SS2 && len >= 4)        /* code set 2 */
     306                 :             :                 {
     307                 :           0 :                         from++;
     308                 :           0 :                         *to = (((uint32) SS2) << 24) | (*from++ << 16);
     309                 :           0 :                         *to |= *from++ << 8;
     310                 :           0 :                         *to |= *from++;
     311                 :           0 :                         len -= 4;
     312                 :           0 :                 }
     313   [ #  #  #  # ]:           0 :                 else if (*from == SS3 && len >= 3)   /* code set 3 (unused?) */
     314                 :             :                 {
     315                 :           0 :                         from++;
     316                 :           0 :                         *to = (SS3 << 16) | (*from++ << 8);
     317                 :           0 :                         *to |= *from++;
     318                 :           0 :                         len -= 3;
     319                 :           0 :                 }
      320   [ #  #  #  # ]:           0 :                 else if (IS_HIGHBIT_SET(*from) && len >= 2) /* code set 1 */
     321                 :             :                 {
     322                 :           0 :                         *to = *from++ << 8;
     323                 :           0 :                         *to |= *from++;
     324                 :           0 :                         len -= 2;
     325                 :           0 :                 }
     326                 :             :                 else
     327                 :             :                 {
     328                 :           0 :                         *to = *from++;
     329                 :           0 :                         len--;
     330                 :             :                 }
     331                 :           0 :                 to++;
     332                 :           0 :                 cnt++;
     333                 :             :         }
     334                 :           0 :         *to = 0;
     335                 :           0 :         return cnt;
     336                 :           0 : }
     337                 :             : 
     338                 :             : static int
     339                 :           1 : pg_euctw_mblen(const unsigned char *s)
     340                 :             : {
     341                 :           1 :         int                     len;
     342                 :             : 
     343         [ -  + ]:           1 :         if (*s == SS2)
     344                 :           0 :                 len = 4;
     345         [ -  + ]:           1 :         else if (*s == SS3)
     346                 :           0 :                 len = 3;
     347         [ +  - ]:           1 :         else if (IS_HIGHBIT_SET(*s))
     348                 :           1 :                 len = 2;
     349                 :             :         else
     350                 :           0 :                 len = 1;
     351                 :           2 :         return len;
     352                 :           1 : }
     353                 :             : 
     354                 :             : static int
     355                 :           0 : pg_euctw_dsplen(const unsigned char *s)
     356                 :             : {
     357                 :           0 :         int                     len;
     358                 :             : 
     359         [ #  # ]:           0 :         if (*s == SS2)
     360                 :           0 :                 len = 2;
     361         [ #  # ]:           0 :         else if (*s == SS3)
     362                 :           0 :                 len = 2;
     363         [ #  # ]:           0 :         else if (IS_HIGHBIT_SET(*s))
     364                 :           0 :                 len = 2;
     365                 :             :         else
     366                 :           0 :                 len = pg_ascii_dsplen(s);
     367                 :           0 :         return len;
     368                 :           0 : }
     369                 :             : 
     370                 :             : /*
     371                 :             :  * Convert pg_wchar to EUC_* encoding.
     372                 :             :  * caller must allocate enough space for "to", including a trailing zero!
     373                 :             :  * len: length of from.
     374                 :             :  * "from" not necessarily null terminated.
     375                 :             :  */
     376                 :             : static int
     377                 :           0 : pg_wchar2euc_with_len(const pg_wchar *from, unsigned char *to, int len)
     378                 :             : {
     379                 :           0 :         int                     cnt = 0;
     380                 :             : 
     381   [ #  #  #  # ]:           0 :         while (len > 0 && *from)
     382                 :             :         {
     383                 :           0 :                 unsigned char c;
     384                 :             : 
     385         [ #  # ]:           0 :                 if ((c = (*from >> 24)))
     386                 :             :                 {
     387                 :           0 :                         *to++ = c;
     388                 :           0 :                         *to++ = (*from >> 16) & 0xff;
     389                 :           0 :                         *to++ = (*from >> 8) & 0xff;
     390                 :           0 :                         *to++ = *from & 0xff;
     391                 :           0 :                         cnt += 4;
     392                 :           0 :                 }
     393         [ #  # ]:           0 :                 else if ((c = (*from >> 16)))
     394                 :             :                 {
     395                 :           0 :                         *to++ = c;
     396                 :           0 :                         *to++ = (*from >> 8) & 0xff;
     397                 :           0 :                         *to++ = *from & 0xff;
     398                 :           0 :                         cnt += 3;
     399                 :           0 :                 }
     400         [ #  # ]:           0 :                 else if ((c = (*from >> 8)))
     401                 :             :                 {
     402                 :           0 :                         *to++ = c;
     403                 :           0 :                         *to++ = *from & 0xff;
     404                 :           0 :                         cnt += 2;
     405                 :           0 :                 }
     406                 :             :                 else
     407                 :             :                 {
     408                 :           0 :                         *to++ = *from;
     409                 :           0 :                         cnt++;
     410                 :             :                 }
     411                 :           0 :                 from++;
     412                 :           0 :                 len--;
     413                 :           0 :         }
     414                 :           0 :         *to = 0;
     415                 :           0 :         return cnt;
     416                 :           0 : }
     417                 :             : 
     418                 :             : 
     419                 :             : /*
     420                 :             :  * JOHAB
     421                 :             :  */
     422                 :             : static int
     423                 :           4 : pg_johab_mblen(const unsigned char *s)
     424                 :             : {
     425                 :           4 :         return pg_euc_mblen(s);
     426                 :             : }
     427                 :             : 
     428                 :             : static int
     429                 :           0 : pg_johab_dsplen(const unsigned char *s)
     430                 :             : {
     431                 :           0 :         return pg_euc_dsplen(s);
     432                 :             : }
     433                 :             : 
     434                 :             : /*
     435                 :             :  * convert UTF8 string to pg_wchar (UCS-4)
     436                 :             :  * caller must allocate enough space for "to", including a trailing zero!
     437                 :             :  * len: length of from.
     438                 :             :  * "from" not necessarily null terminated.
     439                 :             :  */
     440                 :             : static int
     441                 :     1148491 : pg_utf2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
     442                 :             : {
     443                 :     1148491 :         int                     cnt = 0;
     444                 :     1148491 :         uint32          c1,
     445                 :             :                                 c2,
     446                 :             :                                 c3,
     447                 :             :                                 c4;
     448                 :             : 
     449   [ +  +  +  + ]:    19064252 :         while (len > 0 && *from)
     450                 :             :         {
     451         [ +  + ]:    17915761 :                 if ((*from & 0x80) == 0)
     452                 :             :                 {
     453                 :    17915699 :                         *to = *from++;
     454                 :    17915699 :                         len--;
     455                 :    17915699 :                 }
     456         [ +  + ]:          62 :                 else if ((*from & 0xe0) == 0xc0)
     457                 :             :                 {
     458         [ +  - ]:          60 :                         if (len < 2)
     459                 :           0 :                                 break;                  /* drop trailing incomplete char */
     460                 :          60 :                         c1 = *from++ & 0x1f;
     461                 :          60 :                         c2 = *from++ & 0x3f;
     462                 :          60 :                         *to = (c1 << 6) | c2;
     463                 :          60 :                         len -= 2;
     464                 :          60 :                 }
     465         [ +  - ]:           2 :                 else if ((*from & 0xf0) == 0xe0)
     466                 :             :                 {
     467         [ -  + ]:           2 :                         if (len < 3)
     468                 :           0 :                                 break;                  /* drop trailing incomplete char */
     469                 :           2 :                         c1 = *from++ & 0x0f;
     470                 :           2 :                         c2 = *from++ & 0x3f;
     471                 :           2 :                         c3 = *from++ & 0x3f;
     472                 :           2 :                         *to = (c1 << 12) | (c2 << 6) | c3;
     473                 :           2 :                         len -= 3;
     474                 :           2 :                 }
     475         [ #  # ]:           0 :                 else if ((*from & 0xf8) == 0xf0)
     476                 :             :                 {
     477         [ #  # ]:           0 :                         if (len < 4)
     478                 :           0 :                                 break;                  /* drop trailing incomplete char */
     479                 :           0 :                         c1 = *from++ & 0x07;
     480                 :           0 :                         c2 = *from++ & 0x3f;
     481                 :           0 :                         c3 = *from++ & 0x3f;
     482                 :           0 :                         c4 = *from++ & 0x3f;
     483                 :           0 :                         *to = (c1 << 18) | (c2 << 12) | (c3 << 6) | c4;
     484                 :           0 :                         len -= 4;
     485                 :           0 :                 }
     486                 :             :                 else
     487                 :             :                 {
     488                 :             :                         /* treat a bogus char as length 1; not ours to raise error */
     489                 :           0 :                         *to = *from++;
     490                 :           0 :                         len--;
     491                 :             :                 }
     492                 :    17915761 :                 to++;
     493                 :    17915761 :                 cnt++;
     494                 :             :         }
     495                 :     1148491 :         *to = 0;
     496                 :     2296982 :         return cnt;
     497                 :     1148491 : }
     498                 :             : 
     499                 :             : 
     500                 :             : /*
     501                 :             :  * Trivial conversion from pg_wchar to UTF-8.
     502                 :             :  * caller should allocate enough space for "to"
     503                 :             :  * len: length of from.
     504                 :             :  * "from" not necessarily null terminated.
     505                 :             :  */
     506                 :             : static int
     507                 :       18424 : pg_wchar2utf_with_len(const pg_wchar *from, unsigned char *to, int len)
     508                 :             : {
     509                 :       18424 :         int                     cnt = 0;
     510                 :             : 
     511   [ +  +  +  + ]:      105409 :         while (len > 0 && *from)
     512                 :             :         {
     513                 :       86985 :                 int                     char_len;
     514                 :             : 
     515                 :       86985 :                 unicode_to_utf8(*from, to);
     516                 :       86985 :                 char_len = pg_utf_mblen(to);
     517                 :       86985 :                 cnt += char_len;
     518                 :       86985 :                 to += char_len;
     519                 :       86985 :                 from++;
     520                 :       86985 :                 len--;
     521                 :       86985 :         }
     522                 :       18424 :         *to = 0;
     523                 :       36848 :         return cnt;
     524                 :       18424 : }
     525                 :             : 
     526                 :             : /*
     527                 :             :  * Return the byte length of a UTF8 character pointed to by s
     528                 :             :  *
     529                 :             :  * Note: in the current implementation we do not support UTF8 sequences
     530                 :             :  * of more than 4 bytes; hence do NOT return a value larger than 4.
     531                 :             :  * We return "1" for any leading byte that is either flat-out illegal or
     532                 :             :  * indicates a length larger than we support.
     533                 :             :  *
     534                 :             :  * pg_utf2wchar_with_len(), utf8_to_unicode(), pg_utf8_islegal(), and perhaps
     535                 :             :  * other places would need to be fixed to change this.
     536                 :             :  */
     537                 :             : int
     538                 :    41161753 : pg_utf_mblen(const unsigned char *s)
     539                 :             : {
     540                 :    41161753 :         int                     len;
     541                 :             : 
     542         [ +  + ]:    41161753 :         if ((*s & 0x80) == 0)
     543                 :    41159547 :                 len = 1;
     544         [ +  + ]:        2206 :         else if ((*s & 0xe0) == 0xc0)
     545                 :        1374 :                 len = 2;
     546         [ +  + ]:         832 :         else if ((*s & 0xf0) == 0xe0)
     547                 :         734 :                 len = 3;
     548         [ +  + ]:          98 :         else if ((*s & 0xf8) == 0xf0)
     549                 :          70 :                 len = 4;
     550                 :             : #ifdef NOT_USED
     551                 :             :         else if ((*s & 0xfc) == 0xf8)
     552                 :             :                 len = 5;
     553                 :             :         else if ((*s & 0xfe) == 0xfc)
     554                 :             :                 len = 6;
     555                 :             : #endif
     556                 :             :         else
     557                 :          28 :                 len = 1;
     558                 :    82323506 :         return len;
     559                 :    41161753 : }
     560                 :             : 
     561                 :             : /*
     562                 :             :  * This is an implementation of wcwidth() and wcswidth() as defined in
     563                 :             :  * "The Single UNIX Specification, Version 2, The Open Group, 1997"
     564                 :             :  * <http://www.unix.org/online.html>
     565                 :             :  *
     566                 :             :  * Markus Kuhn -- 2001-09-08 -- public domain
     567                 :             :  *
     568                 :             :  * customised for PostgreSQL
     569                 :             :  *
     570                 :             :  * original available at : http://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c
     571                 :             :  */
     572                 :             : 
     573                 :             : struct mbinterval
     574                 :             : {
     575                 :             :         unsigned int first;
     576                 :             :         unsigned int last;
     577                 :             : };
     578                 :             : 
     579                 :             : /* auxiliary function for binary search in interval table */
     580                 :             : static int
     581                 :    13568355 : mbbisearch(pg_wchar ucs, const struct mbinterval *table, int max)
     582                 :             : {
     583                 :    13568355 :         int                     min = 0;
     584                 :    13568355 :         int                     mid;
     585                 :             : 
     586   [ +  +  -  + ]:    13568355 :         if (ucs < table[0].first || ucs > table[max].last)
     587                 :    13567041 :                 return 0;
     588         [ +  + ]:       11508 :         while (max >= min)
     589                 :             :         {
     590                 :       10311 :                 mid = (min + max) / 2;
     591         [ +  + ]:       10311 :                 if (ucs > table[mid].last)
     592                 :        2091 :                         min = mid + 1;
     593         [ +  + ]:        8220 :                 else if (ucs < table[mid].first)
     594                 :        8103 :                         max = mid - 1;
     595                 :             :                 else
     596                 :         117 :                         return 1;
     597                 :             :         }
     598                 :             : 
     599                 :        1197 :         return 0;
     600                 :    13568355 : }
     601                 :             : 
     602                 :             : 
     603                 :             : /* The following functions define the column width of an ISO 10646
     604                 :             :  * character as follows:
     605                 :             :  *
     606                 :             :  *        - The null character (U+0000) has a column width of 0.
     607                 :             :  *
     608                 :             :  *        - Other C0/C1 control characters and DEL will lead to a return
     609                 :             :  *              value of -1.
     610                 :             :  *
     611                 :             :  *        - Non-spacing and enclosing combining characters (general
     612                 :             :  *              category code Mn, Me or Cf in the Unicode database) have a
     613                 :             :  *              column width of 0.
     614                 :             :  *
     615                 :             :  *        - Spacing characters in the East Asian Wide (W) or East Asian
     616                 :             :  *              FullWidth (F) category as defined in Unicode Technical
     617                 :             :  *              Report #11 have a column width of 2.
     618                 :             :  *
     619                 :             :  *        - All remaining characters (including all printable
     620                 :             :  *              ISO 8859-1 and WGL4 characters, Unicode control characters,
     621                 :             :  *              etc.) have a column width of 1.
     622                 :             :  *
     623                 :             :  * This implementation assumes that wchar_t characters are encoded
     624                 :             :  * in ISO 10646.
     625                 :             :  */
     626                 :             : 
     627                 :             : static int
     628                 :     6791356 : ucs_wcwidth(pg_wchar ucs)
     629                 :             : {
     630                 :             : #include "common/unicode_nonspacing_table.h"
     631                 :             : #include "common/unicode_east_asian_fw_table.h"
     632                 :             : 
     633                 :             :         /* test for 8-bit control characters */
     634         [ +  - ]:     6791356 :         if (ucs == 0)
     635                 :           0 :                 return 0;
     636                 :             : 
     637   [ +  +  +  +  :     6791356 :         if (ucs < 0x20 || (ucs >= 0x7f && ucs < 0xa0) || ucs > 0x0010ffff)
                   +  + ]
     638                 :        9496 :                 return -1;
     639                 :             : 
     640                 :             :         /*
     641                 :             :          * binary search in table of non-spacing characters
     642                 :             :          *
     643                 :             :          * XXX: In the official Unicode sources, it is possible for a character to
     644                 :             :          * be described as both non-spacing and wide at the same time. As of
     645                 :             :          * Unicode 13.0, treating the non-spacing property as the determining
     646                 :             :          * factor for display width leads to the correct behavior, so do that
     647                 :             :          * search first.
     648                 :             :          */
     649         [ +  + ]:     6784218 :         if (mbbisearch(ucs, nonspacing,
     650                 :             :                                    sizeof(nonspacing) / sizeof(struct mbinterval) - 1))
     651                 :          81 :                 return 0;
     652                 :             : 
     653                 :             :         /* binary search in table of wide characters */
     654         [ +  + ]:     6784137 :         if (mbbisearch(ucs, east_asian_fw,
     655                 :             :                                    sizeof(east_asian_fw) / sizeof(struct mbinterval) - 1))
     656                 :          36 :                 return 2;
     657                 :             : 
     658                 :     6784101 :         return 1;
     659                 :     6793714 : }
     660                 :             : 
     661                 :             : static int
     662                 :     6791356 : pg_utf_dsplen(const unsigned char *s)
     663                 :             : {
     664                 :     6791356 :         return ucs_wcwidth(utf8_to_unicode(s));
     665                 :             : }
     666                 :             : 
     667                 :             : /*
     668                 :             :  * convert mule internal code to pg_wchar
     669                 :             :  * caller should allocate enough space for "to"
     670                 :             :  * len: length of from.
     671                 :             :  * "from" not necessarily null terminated.
     672                 :             :  */
     673                 :             : static int
     674                 :           0 : pg_mule2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
     675                 :             : {
     676                 :           0 :         int                     cnt = 0;
     677                 :             : 
     678   [ #  #  #  # ]:           0 :         while (len > 0 && *from)
     679                 :             :         {
     680   [ #  #  #  #  :           0 :                 if (IS_LC1(*from) && len >= 2)
                   #  # ]
     681                 :             :                 {
     682                 :           0 :                         *to = *from++ << 16;
     683                 :           0 :                         *to |= *from++;
     684                 :           0 :                         len -= 2;
     685                 :           0 :                 }
     686   [ #  #  #  # ]:           0 :                 else if (IS_LCPRV1(*from) && len >= 3)
     687                 :             :                 {
     688                 :           0 :                         from++;
     689                 :           0 :                         *to = *from++ << 16;
     690                 :           0 :                         *to |= *from++;
     691                 :           0 :                         len -= 3;
     692                 :           0 :                 }
     693   [ #  #  #  #  :           0 :                 else if (IS_LC2(*from) && len >= 3)
                   #  # ]
     694                 :             :                 {
     695                 :           0 :                         *to = *from++ << 16;
     696                 :           0 :                         *to |= *from++ << 8;
     697                 :           0 :                         *to |= *from++;
     698                 :           0 :                         len -= 3;
     699                 :           0 :                 }
     700   [ #  #  #  # ]:           0 :                 else if (IS_LCPRV2(*from) && len >= 4)
     701                 :             :                 {
     702                 :           0 :                         from++;
     703                 :           0 :                         *to = *from++ << 16;
     704                 :           0 :                         *to |= *from++ << 8;
     705                 :           0 :                         *to |= *from++;
     706                 :           0 :                         len -= 4;
     707                 :           0 :                 }
     708                 :             :                 else
     709                 :             :                 {                                               /* assume ASCII */
     710                 :           0 :                         *to = (unsigned char) *from++;
     711                 :           0 :                         len--;
     712                 :             :                 }
     713                 :           0 :                 to++;
     714                 :           0 :                 cnt++;
     715                 :             :         }
     716                 :           0 :         *to = 0;
     717                 :           0 :         return cnt;
     718                 :           0 : }
     719                 :             : 
     720                 :             : /*
     721                 :             :  * convert pg_wchar to mule internal code
     722                 :             :  * caller should allocate enough space for "to"
     723                 :             :  * len: length of from.
     724                 :             :  * "from" not necessarily null terminated.
     725                 :             :  */
     726                 :             : static int
     727                 :           0 : pg_wchar2mule_with_len(const pg_wchar *from, unsigned char *to, int len)
     728                 :             : {
     729                 :           0 :         int                     cnt = 0;
     730                 :             : 
     731   [ #  #  #  # ]:           0 :         while (len > 0 && *from)
     732                 :             :         {
     733                 :           0 :                 unsigned char lb;
     734                 :             : 
     735                 :           0 :                 lb = (*from >> 16) & 0xff;
     736   [ #  #  #  # ]:           0 :                 if (IS_LC1(lb))
     737                 :             :                 {
     738                 :           0 :                         *to++ = lb;
     739                 :           0 :                         *to++ = *from & 0xff;
     740                 :           0 :                         cnt += 2;
     741                 :           0 :                 }
     742   [ #  #  #  # ]:           0 :                 else if (IS_LC2(lb))
     743                 :             :                 {
     744                 :           0 :                         *to++ = lb;
     745                 :           0 :                         *to++ = (*from >> 8) & 0xff;
     746                 :           0 :                         *to++ = *from & 0xff;
     747                 :           0 :                         cnt += 3;
     748                 :           0 :                 }
     749   [ #  #  #  # ]:           0 :                 else if (IS_LCPRV1_A_RANGE(lb))
     750                 :             :                 {
     751                 :           0 :                         *to++ = LCPRV1_A;
     752                 :           0 :                         *to++ = lb;
     753                 :           0 :                         *to++ = *from & 0xff;
     754                 :           0 :                         cnt += 3;
     755                 :           0 :                 }
     756   [ #  #  #  # ]:           0 :                 else if (IS_LCPRV1_B_RANGE(lb))
     757                 :             :                 {
     758                 :           0 :                         *to++ = LCPRV1_B;
     759                 :           0 :                         *to++ = lb;
     760                 :           0 :                         *to++ = *from & 0xff;
     761                 :           0 :                         cnt += 3;
     762                 :           0 :                 }
     763   [ #  #  #  # ]:           0 :                 else if (IS_LCPRV2_A_RANGE(lb))
     764                 :             :                 {
     765                 :           0 :                         *to++ = LCPRV2_A;
     766                 :           0 :                         *to++ = lb;
     767                 :           0 :                         *to++ = (*from >> 8) & 0xff;
     768                 :           0 :                         *to++ = *from & 0xff;
     769                 :           0 :                         cnt += 4;
     770                 :           0 :                 }
     771   [ #  #  #  # ]:           0 :                 else if (IS_LCPRV2_B_RANGE(lb))
     772                 :             :                 {
     773                 :           0 :                         *to++ = LCPRV2_B;
     774                 :           0 :                         *to++ = lb;
     775                 :           0 :                         *to++ = (*from >> 8) & 0xff;
     776                 :           0 :                         *to++ = *from & 0xff;
     777                 :           0 :                         cnt += 4;
     778                 :           0 :                 }
     779                 :             :                 else
     780                 :             :                 {
     781                 :           0 :                         *to++ = *from & 0xff;
     782                 :           0 :                         cnt += 1;
     783                 :             :                 }
     784                 :           0 :                 from++;
     785                 :           0 :                 len--;
     786                 :           0 :         }
     787                 :           0 :         *to = 0;
     788                 :           0 :         return cnt;
     789                 :           0 : }
     790                 :             : 
     791                 :             : /* exported for direct use by conv.c */
     792                 :             : int
     793                 :         496 : pg_mule_mblen(const unsigned char *s)
     794                 :             : {
     795                 :         496 :         int                     len;
     796                 :             : 
     797   [ +  +  +  + ]:         496 :         if (IS_LC1(*s))
     798                 :         202 :                 len = 2;
     799   [ +  -  -  + ]:         294 :         else if (IS_LCPRV1(*s))
     800                 :           0 :                 len = 3;
     801   [ +  +  -  + ]:         294 :         else if (IS_LC2(*s))
     802                 :         285 :                 len = 3;
     803   [ +  -  -  + ]:           9 :         else if (IS_LCPRV2(*s))
     804                 :           0 :                 len = 4;
     805                 :             :         else
     806                 :           9 :                 len = 1;                                /* assume ASCII */
     807                 :         992 :         return len;
     808                 :         496 : }
     809                 :             : 
     810                 :             : static int
     811                 :           0 : pg_mule_dsplen(const unsigned char *s)
     812                 :             : {
     813                 :           0 :         int                     len;
     814                 :             : 
     815                 :             :         /*
     816                 :             :          * Note: it's not really appropriate to assume that all multibyte charsets
     817                 :             :          * are double-wide on screen.  But this seems an okay approximation for
     818                 :             :          * the MULE charsets we currently support.
     819                 :             :          */
     820                 :             : 
     821   [ #  #  #  # ]:           0 :         if (IS_LC1(*s))
     822                 :           0 :                 len = 1;
     823   [ #  #  #  # ]:           0 :         else if (IS_LCPRV1(*s))
     824                 :           0 :                 len = 1;
     825   [ #  #  #  # ]:           0 :         else if (IS_LC2(*s))
     826                 :           0 :                 len = 2;
     827   [ #  #  #  # ]:           0 :         else if (IS_LCPRV2(*s))
     828                 :           0 :                 len = 2;
     829                 :             :         else
     830                 :           0 :                 len = 1;                                /* assume ASCII */
     831                 :             : 
     832                 :           0 :         return len;
     833                 :           0 : }
     834                 :             : 
     835                 :             : /*
     836                 :             :  * ISO8859-1
     837                 :             :  */
     838                 :             : static int
     839                 :           0 : pg_latin12wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
     840                 :             : {
     841                 :           0 :         int                     cnt = 0;
     842                 :             : 
     843   [ #  #  #  # ]:           0 :         while (len > 0 && *from)
     844                 :             :         {
     845                 :           0 :                 *to++ = *from++;
     846                 :           0 :                 len--;
     847                 :           0 :                 cnt++;
     848                 :             :         }
     849                 :           0 :         *to = 0;
     850                 :           0 :         return cnt;
     851                 :           0 : }
     852                 :             : 
     853                 :             : /*
     854                 :             :  * Trivial conversion from pg_wchar to single byte encoding. Just ignores
     855                 :             :  * high bits.
     856                 :             :  * caller should allocate enough space for "to"
     857                 :             :  * len: length of from.
     858                 :             :  * "from" not necessarily null terminated.
     859                 :             :  */
     860                 :             : static int
     861                 :           0 : pg_wchar2single_with_len(const pg_wchar *from, unsigned char *to, int len)
     862                 :             : {
     863                 :           0 :         int                     cnt = 0;
     864                 :             : 
     865   [ #  #  #  # ]:           0 :         while (len > 0 && *from)
     866                 :             :         {
     867                 :           0 :                 *to++ = *from++;
     868                 :           0 :                 len--;
     869                 :           0 :                 cnt++;
     870                 :             :         }
     871                 :           0 :         *to = 0;
     872                 :           0 :         return cnt;
     873                 :           0 : }
     874                 :             : 
     875                 :             : static int
     876                 :          36 : pg_latin1_mblen(const unsigned char *s)
     877                 :             : {
     878                 :          36 :         return 1;
     879                 :             : }
     880                 :             : 
     881                 :             : static int
     882                 :           0 : pg_latin1_dsplen(const unsigned char *s)
     883                 :             : {
     884                 :           0 :         return pg_ascii_dsplen(s);
     885                 :             : }
     886                 :             : 
     887                 :             : /*
     888                 :             :  * SJIS
     889                 :             :  */
     890                 :             : static int
     891                 :         170 : pg_sjis_mblen(const unsigned char *s)
     892                 :             : {
     893                 :         170 :         int                     len;
     894                 :             : 
     895   [ -  +  #  # ]:         170 :         if (*s >= 0xa1 && *s <= 0xdf)
     896                 :           0 :                 len = 1;                                /* 1 byte kana? */
     897         [ +  + ]:         170 :         else if (IS_HIGHBIT_SET(*s))
     898                 :         152 :                 len = 2;                                /* kanji? */
     899                 :             :         else
     900                 :          18 :                 len = 1;                                /* should be ASCII */
     901                 :         340 :         return len;
     902                 :         170 : }
     903                 :             : 
     904                 :             : static int
     905                 :           0 : pg_sjis_dsplen(const unsigned char *s)
     906                 :             : {
     907                 :           0 :         int                     len;
     908                 :             : 
     909   [ #  #  #  # ]:           0 :         if (*s >= 0xa1 && *s <= 0xdf)
     910                 :           0 :                 len = 1;                                /* 1 byte kana? */
     911         [ #  # ]:           0 :         else if (IS_HIGHBIT_SET(*s))
     912                 :           0 :                 len = 2;                                /* kanji? */
     913                 :             :         else
     914                 :           0 :                 len = pg_ascii_dsplen(s);       /* should be ASCII */
     915                 :           0 :         return len;
     916                 :           0 : }
     917                 :             : 
     918                 :             : /*
     919                 :             :  * Big5
     920                 :             :  */
     921                 :             : static int
     922                 :          82 : pg_big5_mblen(const unsigned char *s)
     923                 :             : {
     924                 :          82 :         int                     len;
     925                 :             : 
     926         [ +  + ]:          82 :         if (IS_HIGHBIT_SET(*s))
     927                 :          73 :                 len = 2;                                /* kanji? */
     928                 :             :         else
     929                 :           9 :                 len = 1;                                /* should be ASCII */
     930                 :         164 :         return len;
     931                 :          82 : }
     932                 :             : 
     933                 :             : static int
     934                 :           0 : pg_big5_dsplen(const unsigned char *s)
     935                 :             : {
     936                 :           0 :         int                     len;
     937                 :             : 
     938         [ #  # ]:           0 :         if (IS_HIGHBIT_SET(*s))
     939                 :           0 :                 len = 2;                                /* kanji? */
     940                 :             :         else
     941                 :           0 :                 len = pg_ascii_dsplen(s);       /* should be ASCII */
     942                 :           0 :         return len;
     943                 :           0 : }
     944                 :             : 
     945                 :             : /*
     946                 :             :  * GBK
     947                 :             :  */
     948                 :             : static int
     949                 :           4 : pg_gbk_mblen(const unsigned char *s)
     950                 :             : {
     951                 :           4 :         int                     len;
     952                 :             : 
     953         [ +  - ]:           4 :         if (IS_HIGHBIT_SET(*s))
     954                 :           4 :                 len = 2;                                /* kanji? */
     955                 :             :         else
     956                 :           0 :                 len = 1;                                /* should be ASCII */
     957                 :           8 :         return len;
     958                 :           4 : }
     959                 :             : 
     960                 :             : static int
     961                 :           0 : pg_gbk_dsplen(const unsigned char *s)
     962                 :             : {
     963                 :           0 :         int                     len;
     964                 :             : 
     965         [ #  # ]:           0 :         if (IS_HIGHBIT_SET(*s))
     966                 :           0 :                 len = 2;                                /* kanji? */
     967                 :             :         else
     968                 :           0 :                 len = pg_ascii_dsplen(s);       /* should be ASCII */
     969                 :           0 :         return len;
     970                 :           0 : }
     971                 :             : 
     972                 :             : /*
     973                 :             :  * UHC
     974                 :             :  */
     975                 :             : static int
     976                 :           4 : pg_uhc_mblen(const unsigned char *s)
     977                 :             : {
     978                 :           4 :         int                     len;
     979                 :             : 
     980         [ +  - ]:           4 :         if (IS_HIGHBIT_SET(*s))
     981                 :           4 :                 len = 2;                                /* 2byte? */
     982                 :             :         else
     983                 :           0 :                 len = 1;                                /* should be ASCII */
     984                 :           8 :         return len;
     985                 :           4 : }
     986                 :             : 
     987                 :             : static int
     988                 :           0 : pg_uhc_dsplen(const unsigned char *s)
     989                 :             : {
     990                 :           0 :         int                     len;
     991                 :             : 
     992         [ #  # ]:           0 :         if (IS_HIGHBIT_SET(*s))
     993                 :           0 :                 len = 2;                                /* 2byte? */
     994                 :             :         else
     995                 :           0 :                 len = pg_ascii_dsplen(s);       /* should be ASCII */
     996                 :           0 :         return len;
     997                 :           0 : }
     998                 :             : 
     999                 :             : /*
    1000                 :             :  * GB18030
    1001                 :             :  *      Added by Bill Huang <bhuang@redhat.com>,<bill_huanghb@ybb.ne.jp>
    1002                 :             :  */
    1003                 :             : 
    1004                 :             : /*
    1005                 :             :  * Unlike all other mblen() functions, this also looks at the second byte of
    1006                 :             :  * the input.  However, if you only pass the first byte of a multi-byte
    1007                 :             :  * string, and \0 as the second byte, this still works in a predictable way:
    1008                 :             :  * a 4-byte character will be reported as two 2-byte characters.  That's
    1009                 :             :  * enough for all current uses, as a client-only encoding.  It works that
    1010                 :             :  * way, because in any valid 4-byte GB18030-encoded character, the third and
    1011                 :             :  * fourth bytes look like a 2-byte encoded character, when looked at
    1012                 :             :  * separately.
    1013                 :             :  */
    1014                 :             : static int
    1015                 :          28 : pg_gb18030_mblen(const unsigned char *s)
    1016                 :             : {
    1017                 :          28 :         int                     len;
    1018                 :             : 
    1019         [ +  + ]:          28 :         if (!IS_HIGHBIT_SET(*s))
    1020                 :           6 :                 len = 1;                                /* ASCII */
    1021   [ +  +  -  + ]:          22 :         else if (*(s + 1) >= 0x30 && *(s + 1) <= 0x39)
    1022                 :          21 :                 len = 4;
    1023                 :             :         else
    1024                 :           1 :                 len = 2;
    1025                 :          56 :         return len;
    1026                 :          28 : }
    1027                 :             : 
    1028                 :             : static int
    1029                 :           0 : pg_gb18030_dsplen(const unsigned char *s)
    1030                 :             : {
    1031                 :           0 :         int                     len;
    1032                 :             : 
    1033         [ #  # ]:           0 :         if (IS_HIGHBIT_SET(*s))
    1034                 :           0 :                 len = 2;
    1035                 :             :         else
    1036                 :           0 :                 len = pg_ascii_dsplen(s);       /* ASCII */
    1037                 :           0 :         return len;
    1038                 :           0 : }
    1039                 :             : 
    1040                 :             : /*
    1041                 :             :  *-------------------------------------------------------------------
    1042                 :             :  * multibyte sequence validators
    1043                 :             :  *
    1044                 :             :  * The verifychar functions accept "s", a pointer to the first byte of a
    1045                 :             :  * string, and "len", the remaining length of the string.  If there is a
    1046                 :             :  * validly encoded character beginning at *s, return its length in bytes;
    1047                 :             :  * else return -1.
    1048                 :             :  *
    1049                 :             :  * The verifystr functions also accept "s", a pointer to a string and "len",
    1050                 :             :  * the length of the string.  They verify the whole string, and return the
    1051                 :             :  * number of input bytes (<= len) that are valid.  In other words, if the
    1052                 :             :  * whole string is valid, verifystr returns "len", otherwise it returns the
    1053                 :             :  * byte offset of the first invalid character.  The verifystr functions must
    1054                 :             :  * test for and reject zeroes in the input.
    1055                 :             :  *
    1056                 :             :  * The verifychar functions can assume that len > 0 and that *s != '\0', but
    1057                 :             :  * they must test for and reject zeroes in any additional bytes of a
    1058                 :             :  * multibyte character.  Note that this definition allows the function for a
    1059                 :             :  * single-byte encoding to be just "return 1".
    1060                 :             :  *-------------------------------------------------------------------
    1061                 :             :  */
    1062                 :             : static int
    1063                 :           0 : pg_ascii_verifychar(const unsigned char *s, int len)
    1064                 :             : {
    1065                 :           0 :         return 1;
    1066                 :             : }
    1067                 :             : 
    1068                 :             : static int
    1069                 :           1 : pg_ascii_verifystr(const unsigned char *s, int len)
    1070                 :             : {
    1071                 :           1 :         const unsigned char *nullpos = memchr(s, 0, len);
    1072                 :             : 
    1073         [ -  + ]:           1 :         if (nullpos == NULL)
    1074                 :           1 :                 return len;
    1075                 :             :         else
    1076                 :           0 :                 return nullpos - s;
    1077                 :           1 : }
    1078                 :             : 
    1079                 :             : #define IS_EUC_RANGE_VALID(c)   ((c) >= 0xa1 && (c) <= 0xfe)
    1080                 :             : 
    1081                 :             : static int
    1082                 :          84 : pg_eucjp_verifychar(const unsigned char *s, int len)
    1083                 :             : {
    1084                 :          84 :         int                     l;
    1085                 :          84 :         unsigned char c1,
    1086                 :             :                                 c2;
    1087                 :             : 
    1088                 :          84 :         c1 = *s++;
    1089                 :             : 
    1090      [ -  -  + ]:          84 :         switch (c1)
    1091                 :             :         {
    1092                 :             :                 case SS2:                               /* JIS X 0201 */
    1093                 :           0 :                         l = 2;
    1094         [ #  # ]:           0 :                         if (l > len)
    1095                 :           0 :                                 return -1;
    1096                 :           0 :                         c2 = *s++;
    1097   [ #  #  #  # ]:           0 :                         if (c2 < 0xa1 || c2 > 0xdf)
    1098                 :           0 :                                 return -1;
    1099                 :           0 :                         break;
    1100                 :             : 
    1101                 :             :                 case SS3:                               /* JIS X 0212 */
    1102                 :           0 :                         l = 3;
    1103         [ #  # ]:           0 :                         if (l > len)
    1104                 :           0 :                                 return -1;
    1105                 :           0 :                         c2 = *s++;
    1106   [ #  #  #  # ]:           0 :                         if (!IS_EUC_RANGE_VALID(c2))
    1107                 :           0 :                                 return -1;
    1108                 :           0 :                         c2 = *s++;
    1109   [ #  #  #  # ]:           0 :                         if (!IS_EUC_RANGE_VALID(c2))
    1110                 :           0 :                                 return -1;
    1111                 :           0 :                         break;
    1112                 :             : 
    1113                 :             :                 default:
    1114         [ +  - ]:          84 :                         if (IS_HIGHBIT_SET(c1)) /* JIS X 0208? */
    1115                 :             :                         {
    1116                 :          84 :                                 l = 2;
    1117         [ +  + ]:          84 :                                 if (l > len)
    1118                 :          14 :                                         return -1;
    1119   [ +  +  -  + ]:          70 :                                 if (!IS_EUC_RANGE_VALID(c1))
    1120                 :           4 :                                         return -1;
    1121                 :          66 :                                 c2 = *s++;
    1122   [ +  +  -  + ]:          66 :                                 if (!IS_EUC_RANGE_VALID(c2))
    1123                 :          30 :                                         return -1;
    1124                 :          36 :                         }
    1125                 :             :                         else
    1126                 :             :                                 /* must be ASCII */
    1127                 :             :                         {
    1128                 :           0 :                                 l = 1;
    1129                 :             :                         }
    1130                 :          36 :                         break;
    1131                 :             :         }
    1132                 :             : 
    1133                 :          36 :         return l;
    1134                 :          84 : }
    1135                 :             : 
    1136                 :             : static int
    1137                 :          50 : pg_eucjp_verifystr(const unsigned char *s, int len)
    1138                 :             : {
    1139                 :          50 :         const unsigned char *start = s;
    1140                 :             : 
    1141         [ +  + ]:         155 :         while (len > 0)
    1142                 :             :         {
    1143                 :         141 :                 int                     l;
    1144                 :             : 
    1145                 :             :                 /* fast path for ASCII-subset characters */
    1146         [ +  + ]:         141 :                 if (!IS_HIGHBIT_SET(*s))
    1147                 :             :                 {
    1148         [ +  + ]:          99 :                         if (*s == '\0')
    1149                 :          12 :                                 break;
    1150                 :          87 :                         l = 1;
    1151                 :          87 :                 }
    1152                 :             :                 else
    1153                 :             :                 {
    1154                 :          42 :                         l = pg_eucjp_verifychar(s, len);
    1155         [ +  + ]:          42 :                         if (l == -1)
    1156                 :          24 :                                 break;
    1157                 :             :                 }
    1158                 :         105 :                 s += l;
    1159                 :         105 :                 len -= l;
    1160      [ -  +  + ]:         141 :         }
    1161                 :             : 
    1162                 :         100 :         return s - start;
    1163                 :          50 : }
    1164                 :             : 
    1165                 :             : static int
    1166                 :           6 : pg_euckr_verifychar(const unsigned char *s, int len)
    1167                 :             : {
    1168                 :           6 :         int                     l;
    1169                 :           6 :         unsigned char c1,
    1170                 :             :                                 c2;
    1171                 :             : 
    1172                 :           6 :         c1 = *s++;
    1173                 :             : 
    1174         [ +  - ]:           6 :         if (IS_HIGHBIT_SET(c1))
    1175                 :             :         {
    1176                 :           6 :                 l = 2;
    1177         [ +  + ]:           6 :                 if (l > len)
    1178                 :           2 :                         return -1;
    1179   [ -  +  #  # ]:           4 :                 if (!IS_EUC_RANGE_VALID(c1))
    1180                 :           4 :                         return -1;
    1181                 :           0 :                 c2 = *s++;
    1182   [ #  #  #  # ]:           0 :                 if (!IS_EUC_RANGE_VALID(c2))
    1183                 :           0 :                         return -1;
    1184                 :           0 :         }
    1185                 :             :         else
    1186                 :             :                 /* must be ASCII */
    1187                 :             :         {
    1188                 :           0 :                 l = 1;
    1189                 :             :         }
    1190                 :             : 
    1191                 :           0 :         return l;
    1192                 :           6 : }
    1193                 :             : 
    1194                 :             : static int
    1195                 :          10 : pg_euckr_verifystr(const unsigned char *s, int len)
    1196                 :             : {
    1197                 :          10 :         const unsigned char *start = s;
    1198                 :             : 
    1199         [ +  + ]:          22 :         while (len > 0)
    1200                 :             :         {
    1201                 :          18 :                 int                     l;
    1202                 :             : 
    1203                 :             :                 /* fast path for ASCII-subset characters */
    1204         [ +  + ]:          18 :                 if (!IS_HIGHBIT_SET(*s))
    1205                 :             :                 {
    1206         [ +  - ]:          12 :                         if (*s == '\0')
    1207                 :           0 :                                 break;
    1208                 :          12 :                         l = 1;
    1209                 :          12 :                 }
    1210                 :             :                 else
    1211                 :             :                 {
    1212                 :           6 :                         l = pg_euckr_verifychar(s, len);
    1213         [ -  + ]:           6 :                         if (l == -1)
    1214                 :           6 :                                 break;
    1215                 :             :                 }
    1216                 :          12 :                 s += l;
    1217                 :          12 :                 len -= l;
    1218      [ -  +  + ]:          18 :         }
    1219                 :             : 
    1220                 :          20 :         return s - start;
    1221                 :          10 : }
    1222                 :             : 
    1223                 :             : /* EUC-CN byte sequences are exactly the same as EUC-KR */
    1224                 :             : #define pg_euccn_verifychar     pg_euckr_verifychar
    1225                 :             : #define pg_euccn_verifystr      pg_euckr_verifystr
    1226                 :             : 
    1227                 :             : static int
    1228                 :           3 : pg_euctw_verifychar(const unsigned char *s, int len)
    1229                 :             : {
    1230                 :           3 :         int                     l;
    1231                 :           3 :         unsigned char c1,
    1232                 :             :                                 c2;
    1233                 :             : 
    1234                 :           3 :         c1 = *s++;
    1235                 :             : 
    1236      [ -  +  - ]:           3 :         switch (c1)
    1237                 :             :         {
    1238                 :             :                 case SS2:                               /* CNS 11643 Plane 1-7 */
    1239                 :           0 :                         l = 4;
    1240         [ #  # ]:           0 :                         if (l > len)
    1241                 :           0 :                                 return -1;
    1242                 :           0 :                         c2 = *s++;
    1243   [ #  #  #  # ]:           0 :                         if (c2 < 0xa1 || c2 > 0xa7)
    1244                 :           0 :                                 return -1;
    1245                 :           0 :                         c2 = *s++;
    1246   [ #  #  #  # ]:           0 :                         if (!IS_EUC_RANGE_VALID(c2))
    1247                 :           0 :                                 return -1;
    1248                 :           0 :                         c2 = *s++;
    1249   [ #  #  #  # ]:           0 :                         if (!IS_EUC_RANGE_VALID(c2))
    1250                 :           0 :                                 return -1;
    1251                 :           0 :                         break;
    1252                 :             : 
    1253                 :             :                 case SS3:                               /* unused */
    1254                 :           0 :                         return -1;
    1255                 :             : 
    1256                 :             :                 default:
    1257         [ +  - ]:           3 :                         if (IS_HIGHBIT_SET(c1)) /* CNS 11643 Plane 1 */
    1258                 :             :                         {
    1259                 :           3 :                                 l = 2;
    1260         [ +  + ]:           3 :                                 if (l > len)
    1261                 :           1 :                                         return -1;
    1262                 :             :                                 /* no further range check on c1? */
    1263                 :           2 :                                 c2 = *s++;
    1264   [ -  +  #  # ]:           2 :                                 if (!IS_EUC_RANGE_VALID(c2))
    1265                 :           2 :                                         return -1;
    1266                 :           0 :                         }
    1267                 :             :                         else
    1268                 :             :                                 /* must be ASCII */
    1269                 :             :                         {
    1270                 :           0 :                                 l = 1;
    1271                 :             :                         }
    1272                 :           0 :                         break;
    1273                 :             :         }
    1274                 :           0 :         return l;
    1275                 :           3 : }
    1276                 :             : 
    1277                 :             : static int
    1278                 :           6 : pg_euctw_verifystr(const unsigned char *s, int len)
    1279                 :             : {
    1280                 :           6 :         const unsigned char *start = s;
    1281                 :             : 
    1282         [ +  + ]:          15 :         while (len > 0)
    1283                 :             :         {
    1284                 :          12 :                 int                     l;
    1285                 :             : 
    1286                 :             :                 /* fast path for ASCII-subset characters */
    1287         [ +  + ]:          12 :                 if (!IS_HIGHBIT_SET(*s))
    1288                 :             :                 {
    1289         [ +  - ]:           9 :                         if (*s == '\0')
    1290                 :           0 :                                 break;
    1291                 :           9 :                         l = 1;
    1292                 :           9 :                 }
    1293                 :             :                 else
    1294                 :             :                 {
    1295                 :           3 :                         l = pg_euctw_verifychar(s, len);
    1296         [ -  + ]:           3 :                         if (l == -1)
    1297                 :           3 :                                 break;
    1298                 :             :                 }
    1299                 :           9 :                 s += l;
    1300                 :           9 :                 len -= l;
    1301      [ -  +  + ]:          12 :         }
    1302                 :             : 
    1303                 :          12 :         return s - start;
    1304                 :           6 : }
    1305                 :             : 
    1306                 :             : static int
    1307                 :           3 : pg_johab_verifychar(const unsigned char *s, int len)
    1308                 :             : {
    1309                 :           3 :         int                     l,
    1310                 :             :                                 mbl;
    1311                 :           3 :         unsigned char c;
    1312                 :             : 
    1313                 :           3 :         l = mbl = pg_johab_mblen(s);
    1314                 :             : 
    1315         [ +  + ]:           3 :         if (len < l)
    1316                 :           1 :                 return -1;
    1317                 :             : 
    1318         [ +  - ]:           2 :         if (!IS_HIGHBIT_SET(*s))
    1319                 :           0 :                 return mbl;
    1320                 :             : 
    1321         [ +  - ]:           2 :         while (--l > 0)
    1322                 :             :         {
    1323                 :           2 :                 c = *++s;
    1324   [ -  +  #  # ]:           2 :                 if (!IS_EUC_RANGE_VALID(c))
    1325                 :           2 :                         return -1;
    1326                 :             :         }
    1327                 :           0 :         return mbl;
    1328                 :           3 : }
    1329                 :             : 
    1330                 :             : static int
    1331                 :           4 : pg_johab_verifystr(const unsigned char *s, int len)
    1332                 :             : {
    1333                 :           4 :         const unsigned char *start = s;
    1334                 :             : 
    1335         [ +  + ]:           7 :         while (len > 0)
    1336                 :             :         {
    1337                 :           6 :                 int                     l;
    1338                 :             : 
    1339                 :             :                 /* fast path for ASCII-subset characters */
    1340         [ +  + ]:           6 :                 if (!IS_HIGHBIT_SET(*s))
    1341                 :             :                 {
    1342         [ +  - ]:           3 :                         if (*s == '\0')
    1343                 :           0 :                                 break;
    1344                 :           3 :                         l = 1;
    1345                 :           3 :                 }
    1346                 :             :                 else
    1347                 :             :                 {
    1348                 :           3 :                         l = pg_johab_verifychar(s, len);
    1349         [ -  + ]:           3 :                         if (l == -1)
    1350                 :           3 :                                 break;
    1351                 :             :                 }
    1352                 :           3 :                 s += l;
    1353                 :           3 :                 len -= l;
    1354      [ -  +  + ]:           6 :         }
    1355                 :             : 
    1356                 :           8 :         return s - start;
    1357                 :           4 : }
    1358                 :             : 
    1359                 :             : static int
    1360                 :         219 : pg_mule_verifychar(const unsigned char *s, int len)
    1361                 :             : {
    1362                 :         219 :         int                     l,
    1363                 :             :                                 mbl;
    1364                 :         219 :         unsigned char c;
    1365                 :             : 
    1366                 :         219 :         l = mbl = pg_mule_mblen(s);
    1367                 :             : 
    1368         [ +  + ]:         219 :         if (len < l)
    1369                 :          55 :                 return -1;
    1370                 :             : 
    1371         [ +  + ]:         335 :         while (--l > 0)
    1372                 :             :         {
    1373                 :         221 :                 c = *++s;
    1374         [ +  + ]:         221 :                 if (!IS_HIGHBIT_SET(c))
    1375                 :          50 :                         return -1;
    1376                 :             :         }
    1377                 :         114 :         return mbl;
    1378                 :         219 : }
    1379                 :             : 
    1380                 :             : static int
    1381                 :          66 : pg_mule_verifystr(const unsigned char *s, int len)
    1382                 :             : {
    1383                 :          66 :         const unsigned char *start = s;
    1384                 :             : 
    1385         [ +  + ]:         180 :         while (len > 0)
    1386                 :             :         {
    1387                 :         153 :                 int                     l;
    1388                 :             : 
    1389                 :             :                 /* fast path for ASCII-subset characters */
    1390         [ +  + ]:         153 :                 if (!IS_HIGHBIT_SET(*s))
    1391                 :             :                 {
    1392         [ +  + ]:          87 :                         if (*s == '\0')
    1393                 :           6 :                                 break;
    1394                 :          81 :                         l = 1;
    1395                 :          81 :                 }
    1396                 :             :                 else
    1397                 :             :                 {
    1398                 :          66 :                         l = pg_mule_verifychar(s, len);
    1399         [ +  + ]:          66 :                         if (l == -1)
    1400                 :          33 :                                 break;
    1401                 :             :                 }
    1402                 :         114 :                 s += l;
    1403                 :         114 :                 len -= l;
    1404      [ -  +  + ]:         153 :         }
    1405                 :             : 
    1406                 :         132 :         return s - start;
    1407                 :          66 : }
    1408                 :             : 
    1409                 :             : static int
    1410                 :          33 : pg_latin1_verifychar(const unsigned char *s, int len)
    1411                 :             : {
    1412                 :          33 :         return 1;
    1413                 :             : }
    1414                 :             : 
    1415                 :             : static int
    1416                 :          74 : pg_latin1_verifystr(const unsigned char *s, int len)
    1417                 :             : {
    1418                 :          74 :         const unsigned char *nullpos = memchr(s, 0, len);
    1419                 :             : 
    1420         [ +  + ]:          74 :         if (nullpos == NULL)
    1421                 :          56 :                 return len;
    1422                 :             :         else
    1423                 :          18 :                 return nullpos - s;
    1424                 :          74 : }
    1425                 :             : 
    1426                 :             : static int
    1427                 :         123 : pg_sjis_verifychar(const unsigned char *s, int len)
    1428                 :             : {
    1429                 :         123 :         int                     l,
    1430                 :             :                                 mbl;
    1431                 :         123 :         unsigned char c1,
    1432                 :             :                                 c2;
    1433                 :             : 
    1434                 :         123 :         l = mbl = pg_sjis_mblen(s);
    1435                 :             : 
    1436         [ +  + ]:         123 :         if (len < l)
    1437                 :          20 :                 return -1;
    1438                 :             : 
    1439         [ -  + ]:         103 :         if (l == 1)                                     /* pg_sjis_mblen already verified it */
    1440                 :           0 :                 return mbl;
    1441                 :             : 
    1442                 :         103 :         c1 = *s++;
    1443                 :         103 :         c2 = *s;
    1444   [ +  -  +  +  :         103 :         if (!ISSJISHEAD(c1) || !ISSJISTAIL(c2))
          -  +  +  -  -  
                      + ]
    1445                 :          40 :                 return -1;
    1446                 :          63 :         return mbl;
    1447                 :         123 : }
    1448                 :             : 
    1449                 :             : static int
    1450                 :          53 : pg_sjis_verifystr(const unsigned char *s, int len)
    1451                 :             : {
    1452                 :          53 :         const unsigned char *start = s;
    1453                 :             : 
    1454         [ +  + ]:         215 :         while (len > 0)
    1455                 :             :         {
    1456                 :         198 :                 int                     l;
    1457                 :             : 
    1458                 :             :                 /* fast path for ASCII-subset characters */
    1459         [ +  + ]:         198 :                 if (!IS_HIGHBIT_SET(*s))
    1460                 :             :                 {
    1461         [ +  + ]:         153 :                         if (*s == '\0')
    1462                 :          12 :                                 break;
    1463                 :         141 :                         l = 1;
    1464                 :         141 :                 }
    1465                 :             :                 else
    1466                 :             :                 {
    1467                 :          45 :                         l = pg_sjis_verifychar(s, len);
    1468         [ +  + ]:          45 :                         if (l == -1)
    1469                 :          24 :                                 break;
    1470                 :             :                 }
    1471                 :         162 :                 s += l;
    1472                 :         162 :                 len -= l;
    1473      [ -  +  + ]:         198 :         }
    1474                 :             : 
    1475                 :         106 :         return s - start;
    1476                 :          53 : }
    1477                 :             : 
    1478                 :             : static int
    1479                 :          60 : pg_big5_verifychar(const unsigned char *s, int len)
    1480                 :             : {
    1481                 :          60 :         int                     l,
    1482                 :             :                                 mbl;
    1483                 :             : 
    1484                 :          60 :         l = mbl = pg_big5_mblen(s);
    1485                 :             : 
    1486         [ +  + ]:          60 :         if (len < l)
    1487                 :           1 :                 return -1;
    1488                 :             : 
    1489         [ +  - ]:          59 :         if (l == 2 &&
    1490   [ +  +  -  + ]:          59 :                 s[0] == NONUTF8_INVALID_BYTE0 &&
    1491                 :           2 :                 s[1] == NONUTF8_INVALID_BYTE1)
    1492                 :           2 :                 return -1;
    1493                 :             : 
    1494         [ +  + ]:          96 :         while (--l > 0)
    1495                 :             :         {
    1496         [ +  + ]:          57 :                 if (*++s == '\0')
    1497                 :          18 :                         return -1;
    1498                 :             :         }
    1499                 :             : 
    1500                 :          39 :         return mbl;
    1501                 :          60 : }
    1502                 :             : 
    1503                 :             : static int
    1504                 :          27 : pg_big5_verifystr(const unsigned char *s, int len)
    1505                 :             : {
    1506                 :          27 :         const unsigned char *start = s;
    1507                 :             : 
    1508         [ +  + ]:         111 :         while (len > 0)
    1509                 :             :         {
    1510                 :          99 :                 int                     l;
    1511                 :             : 
    1512                 :             :                 /* fast path for ASCII-subset characters */
    1513         [ +  + ]:          99 :                 if (!IS_HIGHBIT_SET(*s))
    1514                 :             :                 {
    1515         [ +  + ]:          78 :                         if (*s == '\0')
    1516                 :           6 :                                 break;
    1517                 :          72 :                         l = 1;
    1518                 :          72 :                 }
    1519                 :             :                 else
    1520                 :             :                 {
    1521                 :          21 :                         l = pg_big5_verifychar(s, len);
    1522         [ +  + ]:          21 :                         if (l == -1)
    1523                 :           9 :                                 break;
    1524                 :             :                 }
    1525                 :          84 :                 s += l;
    1526                 :          84 :                 len -= l;
    1527      [ -  +  + ]:          99 :         }
    1528                 :             : 
    1529                 :          54 :         return s - start;
    1530                 :          27 : }
    1531                 :             : 
    1532                 :             : static int
    1533                 :           3 : pg_gbk_verifychar(const unsigned char *s, int len)
    1534                 :             : {
    1535                 :           3 :         int                     l,
    1536                 :             :                                 mbl;
    1537                 :             : 
    1538                 :           3 :         l = mbl = pg_gbk_mblen(s);
    1539                 :             : 
    1540         [ +  + ]:           3 :         if (len < l)
    1541                 :           1 :                 return -1;
    1542                 :             : 
    1543         [ +  - ]:           2 :         if (l == 2 &&
    1544   [ +  -  -  + ]:           2 :                 s[0] == NONUTF8_INVALID_BYTE0 &&
    1545                 :           2 :                 s[1] == NONUTF8_INVALID_BYTE1)
    1546                 :           2 :                 return -1;
    1547                 :             : 
    1548         [ #  # ]:           0 :         while (--l > 0)
    1549                 :             :         {
    1550         [ #  # ]:           0 :                 if (*++s == '\0')
    1551                 :           0 :                         return -1;
    1552                 :             :         }
    1553                 :             : 
    1554                 :           0 :         return mbl;
    1555                 :           3 : }
    1556                 :             : 
    1557                 :             : static int
    1558                 :           4 : pg_gbk_verifystr(const unsigned char *s, int len)
    1559                 :             : {
    1560                 :           4 :         const unsigned char *start = s;
    1561                 :             : 
    1562         [ +  + ]:           7 :         while (len > 0)
    1563                 :             :         {
    1564                 :           6 :                 int                     l;
    1565                 :             : 
    1566                 :             :                 /* fast path for ASCII-subset characters */
    1567         [ +  + ]:           6 :                 if (!IS_HIGHBIT_SET(*s))
    1568                 :             :                 {
    1569         [ +  - ]:           3 :                         if (*s == '\0')
    1570                 :           0 :                                 break;
    1571                 :           3 :                         l = 1;
    1572                 :           3 :                 }
    1573                 :             :                 else
    1574                 :             :                 {
    1575                 :           3 :                         l = pg_gbk_verifychar(s, len);
    1576         [ -  + ]:           3 :                         if (l == -1)
    1577                 :           3 :                                 break;
    1578                 :             :                 }
    1579                 :           3 :                 s += l;
    1580                 :           3 :                 len -= l;
    1581      [ -  +  + ]:           6 :         }
    1582                 :             : 
    1583                 :           8 :         return s - start;
    1584                 :           4 : }
    1585                 :             : 
    1586                 :             : static int
    1587                 :           3 : pg_uhc_verifychar(const unsigned char *s, int len)
    1588                 :             : {
    1589                 :           3 :         int                     l,
    1590                 :             :                                 mbl;
    1591                 :             : 
    1592                 :           3 :         l = mbl = pg_uhc_mblen(s);
    1593                 :             : 
    1594         [ +  + ]:           3 :         if (len < l)
    1595                 :           1 :                 return -1;
    1596                 :             : 
    1597         [ +  - ]:           2 :         if (l == 2 &&
    1598   [ +  -  -  + ]:           2 :                 s[0] == NONUTF8_INVALID_BYTE0 &&
    1599                 :           2 :                 s[1] == NONUTF8_INVALID_BYTE1)
    1600                 :           2 :                 return -1;
    1601                 :             : 
    1602         [ #  # ]:           0 :         while (--l > 0)
    1603                 :             :         {
    1604         [ #  # ]:           0 :                 if (*++s == '\0')
    1605                 :           0 :                         return -1;
    1606                 :             :         }
    1607                 :             : 
    1608                 :           0 :         return mbl;
    1609                 :           3 : }
    1610                 :             : 
    1611                 :             : static int
    1612                 :           4 : pg_uhc_verifystr(const unsigned char *s, int len)
    1613                 :             : {
    1614                 :           4 :         const unsigned char *start = s;
    1615                 :             : 
    1616         [ +  + ]:           7 :         while (len > 0)
    1617                 :             :         {
    1618                 :           6 :                 int                     l;
    1619                 :             : 
    1620                 :             :                 /* fast path for ASCII-subset characters */
    1621         [ +  + ]:           6 :                 if (!IS_HIGHBIT_SET(*s))
    1622                 :             :                 {
    1623         [ +  - ]:           3 :                         if (*s == '\0')
    1624                 :           0 :                                 break;
    1625                 :           3 :                         l = 1;
    1626                 :           3 :                 }
    1627                 :             :                 else
    1628                 :             :                 {
    1629                 :           3 :                         l = pg_uhc_verifychar(s, len);
    1630         [ -  + ]:           3 :                         if (l == -1)
    1631                 :           3 :                                 break;
    1632                 :             :                 }
    1633                 :           3 :                 s += l;
    1634                 :           3 :                 len -= l;
    1635      [ -  +  + ]:           6 :         }
    1636                 :             : 
    1637                 :           8 :         return s - start;
    1638                 :           4 : }
    1639                 :             : 
    1640                 :             : static int
    1641                 :          90 : pg_gb18030_verifychar(const unsigned char *s, int len)
    1642                 :             : {
    1643                 :          90 :         int                     l;
    1644                 :             : 
    1645         [ -  + ]:          90 :         if (!IS_HIGHBIT_SET(*s))
    1646                 :           0 :                 l = 1;                                  /* ASCII */
    1647   [ +  +  +  +  :          90 :         else if (len >= 4 && *(s + 1) >= 0x30 && *(s + 1) <= 0x39)
                   -  + ]
    1648                 :             :         {
    1649                 :             :                 /* Should be 4-byte, validate remaining bytes */
    1650   [ +  -  +  - ]:          51 :                 if (*s >= 0x81 && *s <= 0xfe &&
    1651   [ +  -  +  - ]:          51 :                         *(s + 2) >= 0x81 && *(s + 2) <= 0xfe &&
    1652   [ +  +  -  + ]:          51 :                         *(s + 3) >= 0x30 && *(s + 3) <= 0x39)
    1653                 :          27 :                         l = 4;
    1654                 :             :                 else
    1655                 :          24 :                         l = -1;
    1656                 :          51 :         }
    1657   [ +  +  +  -  :          39 :         else if (len >= 2 && *s >= 0x81 && *s <= 0xfe)
                   -  + ]
    1658                 :             :         {
    1659                 :             :                 /* Should be 2-byte, validate */
    1660   [ +  +  -  + ]:          38 :                 if ((*(s + 1) >= 0x40 && *(s + 1) <= 0x7e) ||
    1661         [ +  + ]:          26 :                         (*(s + 1) >= 0x80 && *(s + 1) <= 0xfe))
    1662                 :          12 :                         l = 2;
    1663                 :             :                 else
    1664                 :          14 :                         l = -1;
    1665                 :          26 :         }
    1666                 :             :         else
    1667                 :          13 :                 l = -1;
    1668                 :         180 :         return l;
    1669                 :          90 : }
    1670                 :             : 
    1671                 :             : static int
    1672                 :          49 : pg_gb18030_verifystr(const unsigned char *s, int len)
    1673                 :             : {
    1674                 :          49 :         const unsigned char *start = s;
    1675                 :             : 
    1676         [ +  + ]:         196 :         while (len > 0)
    1677                 :             :         {
    1678                 :         180 :                 int                     l;
    1679                 :             : 
    1680                 :             :                 /* fast path for ASCII-subset characters */
    1681         [ +  + ]:         180 :                 if (!IS_HIGHBIT_SET(*s))
    1682                 :             :                 {
    1683         [ +  + ]:         135 :                         if (*s == '\0')
    1684                 :           6 :                                 break;
    1685                 :         129 :                         l = 1;
    1686                 :         129 :                 }
    1687                 :             :                 else
    1688                 :             :                 {
    1689                 :          45 :                         l = pg_gb18030_verifychar(s, len);
    1690         [ +  + ]:          45 :                         if (l == -1)
    1691                 :          27 :                                 break;
    1692                 :             :                 }
    1693                 :         147 :                 s += l;
    1694                 :         147 :                 len -= l;
    1695      [ -  +  + ]:         180 :         }
    1696                 :             : 
    1697                 :          98 :         return s - start;
    1698                 :          49 : }
    1699                 :             : 
    1700                 :             : static int
    1701                 :         632 : pg_utf8_verifychar(const unsigned char *s, int len)
    1702                 :             : {
    1703                 :         632 :         int                     l;
    1704                 :             : 
    1705         [ -  + ]:         632 :         if ((*s & 0x80) == 0)
    1706                 :             :         {
    1707         [ #  # ]:           0 :                 if (*s == '\0')
    1708                 :           0 :                         return -1;
    1709                 :           0 :                 return 1;
    1710                 :             :         }
    1711         [ +  + ]:         632 :         else if ((*s & 0xe0) == 0xc0)
    1712                 :         240 :                 l = 2;
    1713         [ +  + ]:         392 :         else if ((*s & 0xf0) == 0xe0)
    1714                 :         244 :                 l = 3;
    1715         [ +  + ]:         148 :         else if ((*s & 0xf8) == 0xf0)
    1716                 :         104 :                 l = 4;
    1717                 :             :         else
    1718                 :          44 :                 l = 1;
    1719                 :             : 
    1720         [ +  + ]:         632 :         if (l > len)
    1721                 :          31 :                 return -1;
    1722                 :             : 
    1723         [ +  + ]:         601 :         if (!pg_utf8_islegal(s, l))
    1724                 :         304 :                 return -1;
    1725                 :             : 
    1726                 :         297 :         return l;
    1727                 :         632 : }
    1728                 :             : 
    1729                 :             : /*
    1730                 :             :  * The fast path of the UTF-8 verifier uses a deterministic finite automaton
    1731                 :             :  * (DFA) for multibyte characters. In a traditional table-driven DFA, the
    1732                 :             :  * input byte and current state are used to compute an index into an array of
    1733                 :             :  * state transitions. Since the address of the next transition is dependent
    1734                 :             :  * on this computation, there is latency in executing the load instruction,
    1735                 :             :  * and the CPU is not kept busy.
    1736                 :             :  *
    1737                 :             :  * Instead, we use a "shift-based" DFA as described by Per Vognsen:
    1738                 :             :  *
    1739                 :             :  * https://gist.github.com/pervognsen/218ea17743e1442e59bb60d29b1aa725
    1740                 :             :  *
    1741                 :             :  * In a shift-based DFA, the input byte is an index into array of integers
    1742                 :             :  * whose bit pattern encodes the state transitions. To compute the next
    1743                 :             :  * state, we simply right-shift the integer by the current state and apply a
    1744                 :             :  * mask. In this scheme, the address of the transition only depends on the
    1745                 :             :  * input byte, so there is better pipelining.
    1746                 :             :  *
    1747                 :             :  * The naming convention for states and transitions was adopted from a UTF-8
    1748                 :             :  * to UTF-16/32 transcoder, whose table is reproduced below:
    1749                 :             :  *
    1750                 :             :  * https://github.com/BobSteagall/utf_utils/blob/6b7a465265de2f5fa6133d653df0c9bdd73bbcf8/src/utf_utils.cpp
    1751                 :             :  *
    1752                 :             :  * ILL  ASC  CR1  CR2  CR3  L2A  L3A  L3B  L3C  L4A  L4B  L4C CLASS / STATE
    1753                 :             :  * ==========================================================================
    1754                 :             :  * err, END, err, err, err, CS1, P3A, CS2, P3B, P4A, CS3, P4B,      | BGN/END
    1755                 :             :  * err, err, err, err, err, err, err, err, err, err, err, err,      | ERR
    1756                 :             :  *                                                                  |
    1757                 :             :  * err, err, END, END, END, err, err, err, err, err, err, err,      | CS1
    1758                 :             :  * err, err, CS1, CS1, CS1, err, err, err, err, err, err, err,      | CS2
    1759                 :             :  * err, err, CS2, CS2, CS2, err, err, err, err, err, err, err,      | CS3
    1760                 :             :  *                                                                  |
    1761                 :             :  * err, err, err, err, CS1, err, err, err, err, err, err, err,      | P3A
    1762                 :             :  * err, err, CS1, CS1, err, err, err, err, err, err, err, err,      | P3B
    1763                 :             :  *                                                                  |
    1764                 :             :  * err, err, err, CS2, CS2, err, err, err, err, err, err, err,      | P4A
    1765                 :             :  * err, err, CS2, err, err, err, err, err, err, err, err, err,      | P4B
    1766                 :             :  *
    1767                 :             :  * In the most straightforward implementation, a shift-based DFA for UTF-8
    1768                 :             :  * requires 64-bit integers to encode the transitions, but with an SMT solver
    1769                 :             :  * it's possible to find state numbers such that the transitions fit within
    1770                 :             :  * 32-bit integers, as Dougall Johnson demonstrated:
    1771                 :             :  *
    1772                 :             :  * https://gist.github.com/dougallj/166e326de6ad4cf2c94be97a204c025f
    1773                 :             :  *
    1774                 :             :  * This packed representation is the reason for the seemingly odd choice of
    1775                 :             :  * state values below.
    1776                 :             :  */
    1777                 :             : 
    1778                 :             : /* Error */
    1779                 :             : #define ERR  0
    1780                 :             : /* Begin */
    1781                 :             : #define BGN 11
    1782                 :             : /* Continuation states, expect 1/2/3 continuation bytes */
    1783                 :             : #define CS1 16
    1784                 :             : #define CS2  1
    1785                 :             : #define CS3  5
    1786                 :             : /* Partial states, where the first continuation byte has a restricted range */
    1787                 :             : #define P3A  6                                  /* Lead was E0, check for 3-byte overlong */
    1788                 :             : #define P3B 20                                  /* Lead was ED, check for surrogate */
    1789                 :             : #define P4A 25                                  /* Lead was F0, check for 4-byte overlong */
    1790                 :             : #define P4B 30                                  /* Lead was F4, check for too-large */
    1791                 :             : /* Begin and End are the same state */
    1792                 :             : #define END BGN
    1793                 :             : 
    1794                 :             : /* the encoded state transitions for the lookup table */
    1795                 :             : 
    1796                 :             : /* ASCII */
    1797                 :             : #define ASC (END << BGN)
    1798                 :             : /* 2-byte lead */
    1799                 :             : #define L2A (CS1 << BGN)
    1800                 :             : /* 3-byte lead */
    1801                 :             : #define L3A (P3A << BGN)
    1802                 :             : #define L3B (CS2 << BGN)
    1803                 :             : #define L3C (P3B << BGN)
    1804                 :             : /* 4-byte lead */
    1805                 :             : #define L4A (P4A << BGN)
    1806                 :             : #define L4B (CS3 << BGN)
    1807                 :             : #define L4C (P4B << BGN)
    1808                 :             : /* continuation byte */
    1809                 :             : #define CR1 (END << CS1) | (CS1 << CS2) | (CS2 << CS3) | (CS1 << P3B) | (CS2 << P4B)
    1810                 :             : #define CR2 (END << CS1) | (CS1 << CS2) | (CS2 << CS3) | (CS1 << P3B) | (CS2 << P4A)
    1811                 :             : #define CR3 (END << CS1) | (CS1 << CS2) | (CS2 << CS3) | (CS1 << P3A) | (CS2 << P4A)
    1812                 :             : /* invalid byte */
    1813                 :             : #define ILL ERR
    1814                 :             : 
    1815                 :             : static const uint32 Utf8Transition[256] =
    1816                 :             : {
    1817                 :             :         /* ASCII */
    1818                 :             : 
    1819                 :             :         ILL, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
    1820                 :             :         ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
    1821                 :             :         ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
    1822                 :             :         ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
    1823                 :             : 
    1824                 :             :         ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
    1825                 :             :         ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
    1826                 :             :         ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
    1827                 :             :         ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
    1828                 :             : 
    1829                 :             :         ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
    1830                 :             :         ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
    1831                 :             :         ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
    1832                 :             :         ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
    1833                 :             : 
    1834                 :             :         ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
    1835                 :             :         ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
    1836                 :             :         ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
    1837                 :             :         ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
    1838                 :             : 
    1839                 :             :         /* continuation bytes */
    1840                 :             : 
    1841                 :             :         /* 80..8F */
    1842                 :             :         CR1, CR1, CR1, CR1, CR1, CR1, CR1, CR1,
    1843                 :             :         CR1, CR1, CR1, CR1, CR1, CR1, CR1, CR1,
    1844                 :             : 
    1845                 :             :         /* 90..9F */
    1846                 :             :         CR2, CR2, CR2, CR2, CR2, CR2, CR2, CR2,
    1847                 :             :         CR2, CR2, CR2, CR2, CR2, CR2, CR2, CR2,
    1848                 :             : 
    1849                 :             :         /* A0..BF */
    1850                 :             :         CR3, CR3, CR3, CR3, CR3, CR3, CR3, CR3,
    1851                 :             :         CR3, CR3, CR3, CR3, CR3, CR3, CR3, CR3,
    1852                 :             :         CR3, CR3, CR3, CR3, CR3, CR3, CR3, CR3,
    1853                 :             :         CR3, CR3, CR3, CR3, CR3, CR3, CR3, CR3,
    1854                 :             : 
    1855                 :             :         /* leading bytes */
    1856                 :             : 
    1857                 :             :         /* C0..DF */
    1858                 :             :         ILL, ILL, L2A, L2A, L2A, L2A, L2A, L2A,
    1859                 :             :         L2A, L2A, L2A, L2A, L2A, L2A, L2A, L2A,
    1860                 :             :         L2A, L2A, L2A, L2A, L2A, L2A, L2A, L2A,
    1861                 :             :         L2A, L2A, L2A, L2A, L2A, L2A, L2A, L2A,
    1862                 :             : 
    1863                 :             :         /* E0..EF */
    1864                 :             :         L3A, L3B, L3B, L3B, L3B, L3B, L3B, L3B,
    1865                 :             :         L3B, L3B, L3B, L3B, L3B, L3C, L3B, L3B,
    1866                 :             : 
    1867                 :             :         /* F0..FF */
    1868                 :             :         L4A, L4B, L4B, L4B, L4C, ILL, ILL, ILL,
    1869                 :             :         ILL, ILL, ILL, ILL, ILL, ILL, ILL, ILL
    1870                 :             : };
    1871                 :             : 
    1872                 :             : static void
    1873                 :         266 : utf8_advance(const unsigned char *s, uint32 *state, int len)
    1874                 :             : {
    1875                 :             :         /* Run len bytes starting at s through Utf8Transition, updating *state in place.  Note: We deliberately don't check the state's value here. */
    1876         [ +  + ]:        8778 :         while (len > 0)
    1877                 :             :         {
    1878                 :             :                 /*
    1879                 :             :                  * It's important that the mask value is 31: In most instruction sets,
    1880                 :             :                  * a shift by a 32-bit operand is understood to be a shift by its mod
    1881                 :             :                  * 32, so the compiler should elide the mask operation.
    1882                 :             :                  */
    1883                 :        8512 :                 *state = Utf8Transition[*s++] >> (*state & 31);   /* table entry for this byte, shifted down by the current state */
    1884                 :        8512 :                 len--;
    1885                 :             :         }
    1886                 :             : 
    1887                 :         266 :         *state &= 31;   /* drop everything above the low five bits before handing the state back */
    1888                 :         266 : }
    1889                 :             : 
    1890                 :             : static int   /* returns the length of the prefix of s (<= len) accepted before the first zero byte or bad character */
    1891                 :      112186 : pg_utf8_verifystr(const unsigned char *s, int len)
    1892                 :             : {
    1893                 :      112186 :         const unsigned char *start = s;   /* kept for the return value and for restarting */
    1894                 :      112186 :         const int       orig_len = len;   /* restored when the fast path ends in ERR */
    1895                 :      112186 :         uint32          state = BGN;   /* utf8_advance() state, carried from one stride to the next */
    1896                 :             : 
    1897                 :             : /*
    1898                 :             :  * With a stride of two vector widths, gcc will unroll the loop. Even if
    1899                 :             :  * the compiler can unroll a longer loop, it's not worth it because we
    1900                 :             :  * must fall back to the byte-wise algorithm if we find any non-ASCII.
    1901                 :             :  */
    1902                 :             : #define STRIDE_LENGTH (2 * sizeof(Vector8))
    1903                 :             : 
    1904         [ +  + ]:      112186 :         if (len >= STRIDE_LENGTH)
    1905                 :             :         {
    1906         [ +  + ]:      416724 :                 while (len >= STRIDE_LENGTH)
    1907                 :             :                 {
    1908                 :             :                         /*
    1909                 :             :                          * If the chunk is all ASCII, we can skip the full UTF-8 check,
    1910                 :             :                          * but we must first check for a non-END state, which means the
    1911                 :             :                          * previous chunk ended in the middle of a multibyte sequence.
    1912                 :             :                          */
    1913   [ +  +  +  + ]:      326778 :                         if (state != END || !is_valid_ascii(s, STRIDE_LENGTH))
    1914                 :         266 :                                 utf8_advance(s, &state, STRIDE_LENGTH);
    1915                 :             : 
    1916                 :      326778 :                         s += STRIDE_LENGTH;
    1917                 :      326778 :                         len -= STRIDE_LENGTH;
    1918                 :             :                 }
    1919                 :             : 
    1920                 :             :                 /* The error state persists, so we only need to check for it here. */
    1921         [ +  + ]:       89946 :                 if (state == ERR)
    1922                 :             :                 {
    1923                 :             :                         /*
    1924                 :             :                          * Start over from the beginning with the slow path so we can
    1925                 :             :                          * count the valid bytes.
    1926                 :             :                          */
    1927                 :          84 :                         len = orig_len;
    1928                 :          84 :                         s = start;
    1929                 :          84 :                 }
    1930         [ +  + ]:       89862 :                 else if (state != END)
    1931                 :             :                 {
    1932                 :             :                         /*
    1933                 :             :                          * The fast path exited in the middle of a multibyte sequence.
    1934                 :             :                          * Walk backwards to find the leading byte so that the slow path
    1935                 :             :                          * can resume checking from there. We must always backtrack at
    1936                 :             :                          * least one byte, since the current byte could be e.g. an ASCII
    1937                 :             :                          * byte after a 2-byte lead, which is invalid.
    1938                 :             :                          */
    1939                 :           9 :                         do
    1940                 :             :                         {
    1941         [ +  - ]:          15 :                                 Assert(s > start);
    1942                 :          15 :                                 s--;
    1943                 :          15 :                                 len++;
    1944         [ +  - ]:          15 :                                 Assert(IS_HIGHBIT_SET(*s));
    1945         [ +  + ]:          15 :                         } while (pg_utf_mblen(s) <= 1);
    1946                 :           9 :                 }
    1947                 :       89946 :         }
    1948                 :             : 
    1949                 :             :         /* check whatever is left one character at a time: the tail, or the whole input if the fast path was skipped or ended in ERR */
    1950         [ +  + ]:     1260954 :         while (len > 0)
    1951                 :             :         {
    1952                 :     1149132 :                 int                     l;
    1953                 :             : 
    1954                 :             :                 /* fast path for ASCII-subset characters */
    1955         [ +  + ]:     1149132 :                 if (!IS_HIGHBIT_SET(*s))
    1956                 :             :                 {
    1957         [ +  + ]:     1148500 :                         if (*s == '\0')   /* a zero byte ends the accepted prefix */
    1958                 :          29 :                                 break;
    1959                 :     1148471 :                         l = 1;
    1960                 :     1148471 :                 }
    1961                 :             :                 else
    1962                 :             :                 {
    1963                 :         632 :                         l = pg_utf8_verifychar(s, len);
    1964         [ +  + ]:         632 :                         if (l == -1)   /* the character at s is bad; stop counting here */
    1965                 :         335 :                                 break;
    1966                 :             :                 }
    1967                 :     1148768 :                 s += l;
    1968                 :     1148768 :                 len -= l;
    1969      [ -  +  + ]:     1149132 :         }
    1970                 :             : 
    1971                 :      224372 :         return s - start;   /* number of bytes accepted */
    1972                 :      112186 : }
    1973                 :             : 
    1974                 :             : /*
    1975                 :             :  * Check for validity of a single UTF-8 encoded character; returns true if legal, false if not
    1976                 :             :  *
    1977                 :             :  * This directly implements the rules in RFC3629.  The bizarre-looking
    1978                 :             :  * restrictions on the second byte are meant to ensure that there isn't
    1979                 :             :  * more than one encoding of a given Unicode code point; that is,
    1980                 :             :  * you may not use a longer-than-necessary byte sequence with high order
    1981                 :             :  * zero bits to represent a character that would fit in fewer bytes.
    1982                 :             :  * To do otherwise is to create security hazards (eg, create an apparent
    1983                 :             :  * non-ASCII character that decodes to plain ASCII).
    1984                 :             :  *
    1985                 :             :  * length is assumed to have been obtained by pg_utf_mblen(), and the
    1986                 :             :  * caller must have checked that that many bytes are present in the buffer.
    1987                 :             :  */
    1988                 :             : bool
    1989                 :        1681 : pg_utf8_islegal(const unsigned char *source, int length)
    1990                 :             : {
    1991                 :        1681 :         unsigned char a;
    1992                 :             : 
    1993   [ +  +  +  -  :        1681 :         switch (length)   /* bytes are checked last to first; each case falls through to the next shorter one */
                      + ]
    1994                 :             :         {
    1995                 :             :                 default:
    1996                 :             :                         /* reject lengths 5 and 6 for now */
    1997                 :           0 :                         return false;
    1998                 :             :                 case 4:   /* fourth byte: must lie in 80..BF */
    1999                 :          98 :                         a = source[3];
    2000   [ +  +  -  + ]:          98 :                         if (a < 0x80 || a > 0xBF)
    2001                 :          16 :                                 return false;
    2002                 :             :                         /* FALL THRU */
    2003                 :             :                 case 3:   /* third byte: must lie in 80..BF */
    2004                 :         590 :                         a = source[2];
    2005   [ +  +  +  + ]:         590 :                         if (a < 0x80 || a > 0xBF)
    2006                 :         100 :                                 return false;
    2007                 :             :                         /* FALL THRU */
    2008                 :             :                 case 2:   /* second byte: the allowed range depends on the first byte */
    2009                 :         813 :                         a = source[1];
    2010   [ +  +  +  +  :         813 :                         switch (*source)
                      + ]
    2011                 :             :                         {
    2012                 :             :                                 case 0xE0:   /* A0..BF only: anything lower is a longer-than-necessary encoding */
    2013   [ +  +  -  + ]:          52 :                                         if (a < 0xA0 || a > 0xBF)
    2014                 :          44 :                                                 return false;
    2015                 :           8 :                                         break;
    2016                 :             :                                 case 0xED:   /* 80..9F only: anything higher would encode U+D800..U+DFFF */
    2017   [ +  -  +  + ]:          52 :                                         if (a < 0x80 || a > 0x9F)
    2018                 :          44 :                                                 return false;
    2019                 :           8 :                                         break;
    2020                 :             :                                 case 0xF0:   /* 90..BF only: anything lower is a longer-than-necessary encoding */
    2021   [ +  +  -  + ]:          52 :                                         if (a < 0x90 || a > 0xBF)
    2022                 :          44 :                                                 return false;
    2023                 :           8 :                                         break;
    2024                 :             :                                 case 0xF4:   /* 80..8F only: anything higher would exceed U+10FFFF */
    2025   [ +  -  +  + ]:          30 :                                         if (a < 0x80 || a > 0x8F)
    2026                 :          22 :                                                 return false;
    2027                 :           8 :                                         break;
    2028                 :             :                                 default:
    2029   [ +  +  -  + ]:         627 :                                         if (a < 0x80 || a > 0xBF)
    2030                 :          18 :                                                 return false;
    2031                 :         609 :                                         break;
    2032                 :         641 :                         }
    2033                 :             :                         /* FALL THRU */
    2034                 :             :                 case 1:   /* first byte: reject 80..C1 and anything above F4 */
    2035                 :        1393 :                         a = *source;
    2036   [ +  +  +  + ]:        1393 :                         if (a >= 0x80 && a < 0xC2)
    2037                 :          66 :                                 return false;
    2038         [ +  + ]:        1327 :                         if (a > 0xF4)
    2039                 :          22 :                                 return false;
    2040                 :        1305 :                         break;
    2041                 :             :         }
    2042                 :        1305 :         return true;
    2043                 :        1681 : }
    2044                 :             : 
    2045                 :             : 
    2046                 :             : /*
    2047                 :             :  * Fills the provided buffer with two bytes (dst[0] and dst[1]; nothing else is written) such that:
    2048                 :             :  *   pg_encoding_mblen(dst) == 2 && pg_encoding_verifymbstr(dst) == 0
    2049                 :             :  */
    2050                 :             : void
    2051                 :          14 : pg_encoding_set_invalid(int encoding, char *dst)
    2052                 :             : {
    2053         [ +  - ]:          14 :         Assert(pg_encoding_max_length(encoding) > 1);   /* callers must pass a multibyte encoding */
    2054                 :             : 
    2055                 :          14 :         dst[0] = (encoding == PG_UTF8 ? 0xc0 : NONUTF8_INVALID_BYTE0);   /* UTF8 gets its own lead byte: 0xc0 is ILL in Utf8Transition */
    2056                 :          14 :         dst[1] = NONUTF8_INVALID_BYTE1;   /* the second byte is the same for every encoding */
    2057                 :          14 : }
    2058                 :             : 
    2059                 :             : /*
    2060                 :             :  *-------------------------------------------------------------------
    2061                 :             :  * encoding info table, indexed by encoding ID: two wchar converter slots (0 where none is given), then mblen, dsplen, mbverifychar, mbverifystr, maxmblen
    2062                 :             :  *-------------------------------------------------------------------
    2063                 :             :  */
    2064                 :             : const pg_wchar_tbl pg_wchar_table[] = {
    2065                 :             :         [PG_SQL_ASCII] = {pg_ascii2wchar_with_len, pg_wchar2single_with_len, pg_ascii_mblen, pg_ascii_dsplen, pg_ascii_verifychar, pg_ascii_verifystr, 1},
    2066                 :             :         [PG_EUC_JP] = {pg_eucjp2wchar_with_len, pg_wchar2euc_with_len, pg_eucjp_mblen, pg_eucjp_dsplen, pg_eucjp_verifychar, pg_eucjp_verifystr, 3},
    2067                 :             :         [PG_EUC_CN] = {pg_euccn2wchar_with_len, pg_wchar2euc_with_len, pg_euccn_mblen, pg_euccn_dsplen, pg_euccn_verifychar, pg_euccn_verifystr, 2},
    2068                 :             :         [PG_EUC_KR] = {pg_euckr2wchar_with_len, pg_wchar2euc_with_len, pg_euckr_mblen, pg_euckr_dsplen, pg_euckr_verifychar, pg_euckr_verifystr, 3},
    2069                 :             :         [PG_EUC_TW] = {pg_euctw2wchar_with_len, pg_wchar2euc_with_len, pg_euctw_mblen, pg_euctw_dsplen, pg_euctw_verifychar, pg_euctw_verifystr, 4},
    2070                 :             :         [PG_EUC_JIS_2004] = {pg_eucjp2wchar_with_len, pg_wchar2euc_with_len, pg_eucjp_mblen, pg_eucjp_dsplen, pg_eucjp_verifychar, pg_eucjp_verifystr, 3},
    2071                 :             :         [PG_UTF8] = {pg_utf2wchar_with_len, pg_wchar2utf_with_len, pg_utf_mblen, pg_utf_dsplen, pg_utf8_verifychar, pg_utf8_verifystr, 4},
    2072                 :             :         [PG_MULE_INTERNAL] = {pg_mule2wchar_with_len, pg_wchar2mule_with_len, pg_mule_mblen, pg_mule_dsplen, pg_mule_verifychar, pg_mule_verifystr, 4},
    2073                 :             :         [PG_LATIN1] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
    2074                 :             :         [PG_LATIN2] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
    2075                 :             :         [PG_LATIN3] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
    2076                 :             :         [PG_LATIN4] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
    2077                 :             :         [PG_LATIN5] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
    2078                 :             :         [PG_LATIN6] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
    2079                 :             :         [PG_LATIN7] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
    2080                 :             :         [PG_LATIN8] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
    2081                 :             :         [PG_LATIN9] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
    2082                 :             :         [PG_LATIN10] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
    2083                 :             :         [PG_WIN1256] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
    2084                 :             :         [PG_WIN1258] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
    2085                 :             :         [PG_WIN866] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
    2086                 :             :         [PG_WIN874] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
    2087                 :             :         [PG_KOI8R] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
    2088                 :             :         [PG_WIN1251] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
    2089                 :             :         [PG_WIN1252] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
    2090                 :             :         [PG_ISO_8859_5] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
    2091                 :             :         [PG_ISO_8859_6] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
    2092                 :             :         [PG_ISO_8859_7] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
    2093                 :             :         [PG_ISO_8859_8] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
    2094                 :             :         [PG_WIN1250] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
    2095                 :             :         [PG_WIN1253] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
    2096                 :             :         [PG_WIN1254] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
    2097                 :             :         [PG_WIN1255] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
    2098                 :             :         [PG_WIN1257] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
    2099                 :             :         [PG_KOI8U] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
    2100                 :             :         [PG_SJIS] = {0, 0, pg_sjis_mblen, pg_sjis_dsplen, pg_sjis_verifychar, pg_sjis_verifystr, 2},
    2101                 :             :         [PG_BIG5] = {0, 0, pg_big5_mblen, pg_big5_dsplen, pg_big5_verifychar, pg_big5_verifystr, 2},
    2102                 :             :         [PG_GBK] = {0, 0, pg_gbk_mblen, pg_gbk_dsplen, pg_gbk_verifychar, pg_gbk_verifystr, 2},
    2103                 :             :         [PG_UHC] = {0, 0, pg_uhc_mblen, pg_uhc_dsplen, pg_uhc_verifychar, pg_uhc_verifystr, 2},
    2104                 :             :         [PG_GB18030] = {0, 0, pg_gb18030_mblen, pg_gb18030_dsplen, pg_gb18030_verifychar, pg_gb18030_verifystr, 4},
    2105                 :             :         [PG_JOHAB] = {0, 0, pg_johab_mblen, pg_johab_dsplen, pg_johab_verifychar, pg_johab_verifystr, 3},
    2106                 :             :         [PG_SHIFT_JIS_2004] = {0, 0, pg_sjis_mblen, pg_sjis_dsplen, pg_sjis_verifychar, pg_sjis_verifystr, 2},
    2107                 :             : };
    2108                 :             : 
    2109                 :             : /*
    2110                 :             :  * Returns the byte length of the multibyte character at mbstr, as reported by the encoding's mblen routine.
    2111                 :             :  *
    2112                 :             :  * Choose "mblen" functions based on the input string characteristics.
    2113                 :             :  * pg_encoding_mblen() can be used when ANY of these conditions are met:
    2114                 :             :  *
    2115                 :             :  * - The input string is zero-terminated
    2116                 :             :  *
    2117                 :             :  * - The input string is known to be valid in the encoding (e.g., string
    2118                 :             :  *   converted from database encoding)
    2119                 :             :  *
    2120                 :             :  * - The encoding is not GB18030 (e.g., when only database encodings are
    2121                 :             :  *   passed to 'encoding' parameter)
    2122                 :             :  *
    2123                 :             :  * encoding==GB18030 requires examining up to two bytes to determine character
    2124                 :             :  * length.  Therefore, callers satisfying none of those conditions must use
    2125                 :             :  * pg_encoding_mblen_or_incomplete() instead, as access to mbstr[1] cannot be
    2126                 :             :  * guaranteed to be within allocation bounds.
    2127                 :             :  *
    2128                 :             :  * When dealing with text that is not certainly valid in the specified
    2129                 :             :  * encoding, the result may exceed the actual remaining string length.
    2130                 :             :  * Callers that are not prepared to deal with that should use Min(remaining,
    2131                 :             :  * pg_encoding_mblen_or_incomplete()).  For zero-terminated strings, that and
    2132                 :             :  * pg_encoding_mblen_bounded() are interchangeable.
    2133                 :             :  */
    2134                 :             : int
    2135                 :     6811070 : pg_encoding_mblen(int encoding, const char *mbstr)
    2136                 :             : {
    2137   [ +  -  -  + ]:     6811070 :         return (PG_VALID_ENCODING(encoding) ?
    2138                 :     6811070 :                         pg_wchar_table[encoding].mblen((const unsigned char *) mbstr) :
    2139                 :           0 :                         pg_wchar_table[PG_SQL_ASCII].mblen((const unsigned char *) mbstr));   /* invalid encoding ID: use the PG_SQL_ASCII routine */
    2140                 :             : }
    2141                 :             : 
    2142                 :             : /*
    2143                 :             :  * Returns the byte length of a multibyte character (possibly not
    2144                 :             :  * zero-terminated), or INT_MAX if too few bytes remain to determine a length.
    2145                 :             :  */
    2146                 :             : int
    2147                 :         674 : pg_encoding_mblen_or_incomplete(int encoding, const char *mbstr,
    2148                 :             :                                                                 size_t remaining)
    2149                 :             : {
    2150                 :             :         /*
    2151                 :             :          * Define zero remaining as too few, even for single-byte encodings.
    2152                 :             :          * pg_gb18030_mblen() reads one or two bytes; single-byte encodings read
    2153                 :             :          * zero; others read one.
    2154                 :             :          */
    2155   [ +  -  +  + ]:         701 :         if (remaining < 1 ||
    2156   [ +  +  +  + ]:         674 :                 (encoding == PG_GB18030 && IS_HIGHBIT_SET(*mbstr) && remaining < 2))   /* GB18030 with a high-bit first byte and only one byte left */
    2157                 :           6 :                 return INT_MAX;
    2158                 :         668 :         return pg_encoding_mblen(encoding, mbstr);   /* enough bytes are present for the mblen routine to read */
    2159                 :         674 : }
    2160                 :             : 
    2161                 :             : /*
    2162                 :             :  * Returns the byte length of a multibyte character; but not more than the
    2163                 :             :  * distance to the terminating zero byte.  For input that might lack a
    2164                 :             :  * terminating zero, use Min(remaining, pg_encoding_mblen_or_incomplete()).
    2165                 :             :  */
    2166                 :             : int
    2167                 :           0 : pg_encoding_mblen_bounded(int encoding, const char *mbstr)
    2168                 :             : {
    2169                 :           0 :         return strnlen(mbstr, pg_encoding_mblen(encoding, mbstr));   /* strnlen() stops at a zero byte, capping the reported length */
    2170                 :             : }
    2171                 :             : 
    2172                 :             : /*
    2173                 :             :  * Returns the display length of the multibyte character at mbstr, as reported by the encoding's dsplen routine.
    2174                 :             :  */
    2175                 :             : int
    2176                 :     6789902 : pg_encoding_dsplen(int encoding, const char *mbstr)
    2177                 :             : {
    2178   [ +  -  -  + ]:     6789902 :         return (PG_VALID_ENCODING(encoding) ?
    2179                 :     6789902 :                         pg_wchar_table[encoding].dsplen((const unsigned char *) mbstr) :
    2180                 :           0 :                         pg_wchar_table[PG_SQL_ASCII].dsplen((const unsigned char *) mbstr));   /* invalid encoding ID: use the PG_SQL_ASCII routine */
    2181                 :             : }
    2182                 :             : 
    2183                 :             : /*
    2184                 :             :  * Verify the first multibyte character of the given string (len is passed through to the encoding's mbverifychar routine).
    2185                 :             :  * Return its byte length if good, -1 if bad.  (See comments above for
    2186                 :             :  * full details of the mbverifychar API.)
    2187                 :             :  */
    2188                 :             : int
    2189                 :         390 : pg_encoding_verifymbchar(int encoding, const char *mbstr, int len)
    2190                 :             : {
    2191   [ +  -  -  + ]:         390 :         return (PG_VALID_ENCODING(encoding) ?
    2192                 :         390 :                         pg_wchar_table[encoding].mbverifychar((const unsigned char *) mbstr, len) :
    2193                 :           0 :                         pg_wchar_table[PG_SQL_ASCII].mbverifychar((const unsigned char *) mbstr, len));   /* invalid encoding ID: use the PG_SQL_ASCII routine */
    2194                 :             : }
    2195                 :             : 
    2196                 :             : /*
    2197                 :             :  * Verify that a string is valid for the given encoding, using that encoding's mbverifystr routine.
    2198                 :             :  * Returns the number of input bytes (<= len) that form a valid string.
    2199                 :             :  * (See comments above for full details of the mbverifystr API.)
    2200                 :             :  */
    2201                 :             : int
    2202                 :         936 : pg_encoding_verifymbstr(int encoding, const char *mbstr, int len)
    2203                 :             : {
    2204   [ +  -  -  + ]:         936 :         return (PG_VALID_ENCODING(encoding) ?
    2205                 :         936 :                         pg_wchar_table[encoding].mbverifystr((const unsigned char *) mbstr, len) :
    2206                 :           0 :                         pg_wchar_table[PG_SQL_ASCII].mbverifystr((const unsigned char *) mbstr, len));   /* invalid encoding ID: use the PG_SQL_ASCII routine */
    2207                 :             : }
    2208                 :             : 
    2209                 :             : /*
    2210                 :             :  * Fetch the maximum length of a character in the given encoding (the maxmblen member of its pg_wchar_table entry)
    2211                 :             :  */
    2212                 :             : int
    2213                 :       26499 : pg_encoding_max_length(int encoding)
    2214                 :             : {
    2215         [ +  - ]:       26499 :         Assert(PG_VALID_ENCODING(encoding));
           [ -  +  +  - ]
    2216                 :             : 
    2217                 :             :         /*
    2218                 :             :          * Check for the encoding despite the assert, due to some mingw versions
    2219                 :             :          * otherwise issuing bogus warnings.
    2220                 :             :          */
    2221   [ +  -  -  + ]:       26499 :         return PG_VALID_ENCODING(encoding) ?
    2222                 :       26499 :                 pg_wchar_table[encoding].maxmblen :
    2223                 :             :                 pg_wchar_table[PG_SQL_ASCII].maxmblen;   /* reached only with an invalid ID, which the Assert above rejects in assert-enabled builds */
    2224                 :             : }

Generated by: LCOV version 2.3.2-1