LCOV - code coverage report
Current view: top level - contrib/fuzzystrmatch - fuzzystrmatch.c (source / functions) Coverage Total Hit
Test: Code coverage Lines: 0.0 % 313 0
Test Date: 2026-01-26 10:56:24 Functions: 0.0 % 21 0
Legend: Lines:     hit not hit

            Line data    Source code
       1              : /*
       2              :  * fuzzystrmatch.c
       3              :  *
       4              :  * Functions for "fuzzy" comparison of strings
       5              :  *
       6              :  * Joe Conway <mail@joeconway.com>
       7              :  *
       8              :  * contrib/fuzzystrmatch/fuzzystrmatch.c
       9              :  * Copyright (c) 2001-2026, PostgreSQL Global Development Group
      10              :  * ALL RIGHTS RESERVED;
      11              :  *
      12              :  * metaphone()
      13              :  * -----------
      14              :  * Modified for PostgreSQL by Joe Conway.
      15              :  * Based on CPAN's "Text-Metaphone-1.96" by Michael G Schwern <schwern@pobox.com>
      16              :  * Code slightly modified for use as PostgreSQL function (palloc, elog, etc).
      17              :  * Metaphone was originally created by Lawrence Philips and presented in article
      18              :  * in "Computer Language" December 1990 issue.
      19              :  *
      20              :  * Permission to use, copy, modify, and distribute this software and its
      21              :  * documentation for any purpose, without fee, and without a written agreement
      22              :  * is hereby granted, provided that the above copyright notice and this
      23              :  * paragraph and the following two paragraphs appear in all copies.
      24              :  *
      25              :  * IN NO EVENT SHALL THE AUTHORS OR DISTRIBUTORS BE LIABLE TO ANY PARTY FOR
      26              :  * DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, INCLUDING
      27              :  * LOST PROFITS, ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS
      28              :  * DOCUMENTATION, EVEN IF THE AUTHOR OR DISTRIBUTORS HAVE BEEN ADVISED OF THE
      29              :  * POSSIBILITY OF SUCH DAMAGE.
      30              :  *
      31              :  * THE AUTHORS AND DISTRIBUTORS SPECIFICALLY DISCLAIM ANY WARRANTIES,
      32              :  * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
      33              :  * AND FITNESS FOR A PARTICULAR PURPOSE.  THE SOFTWARE PROVIDED HEREUNDER IS
      34              :  * ON AN "AS IS" BASIS, AND THE AUTHOR AND DISTRIBUTORS HAS NO OBLIGATIONS TO
      35              :  * PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS.
      36              :  *
      37              :  */
      38              : 
      39              : #include "postgres.h"
      40              : 
      41              : #include <ctype.h>
      42              : 
      43              : #include "utils/builtins.h"
      44              : #include "utils/varlena.h"
      45              : #include "varatt.h"
      46              : 
      47            0 : PG_MODULE_MAGIC_EXT(    /* module magic block; the designated fields below supply this module's name and version */
      48              :                                         .name = "fuzzystrmatch",
      49              :                                         .version = PG_VERSION
      50              : );
      51              : 
      52              : /*
      53              :  * Soundex
      54              :  */
      55              : static void _soundex(const char *instr, char *outstr);       /* forward declaration */
      56              : 
      57              : #define SOUNDEX_LEN 4        /* NOTE(review): presumably the length of a Soundex code; its use is outside this block -- confirm */
      58              : 
      59              : /*                                                                      ABCDEFGHIJKLMNOPQRSTUVWXYZ */
      60              : static const char *const soundex_table = "01230120022455012623010202";       /* one code character per letter A..Z; soundex_code() indexes it with (letter - 'A') */
      61              : 
      62              : static char          /* Map one character to its entry in soundex_table */
      63            0 : soundex_code(char letter)
      64              : {
      65            0 :         letter = pg_ascii_toupper((unsigned char) letter);
      66              :         /* Defend against non-ASCII letters: only 'A'..'Z' may index the 26-entry table */
      67            0 :         if (letter >= 'A' && letter <= 'Z')
      68            0 :                 return soundex_table[letter - 'A'];
      69            0 :         return letter;       /* anything else comes back as produced by pg_ascii_toupper() */
      70            0 : }
      71              : 
      72              : /*
      73              :  * Metaphone
      74              :  */
      75              : #define MAX_METAPHONE_STRLEN            255  /* byte limit that metaphone() enforces on both its input length and its requested output length */
      76              : 
      77              : /*
      78              :  * Original code by Michael G Schwern starts here.
      79              :  * Code slightly modified for use as PostgreSQL function.
      80              :  */
      81              : 
      82              : 
      83              : /**************************************************************************
      84              :         metaphone -- Breaks english phrases down into their phonemes.
      85              : 
      86              :         Input
      87              :                 word                    --      An english word to be phonized
      88              :                 max_phonemes    --      How many phonemes to calculate.  If 0, then it
      89              :                                                         will phonize the entire phrase.  (In this file both metaphone() and _metaphone() raise an error unless the value is > 0.)
      90              :                 phoned_word             --      The final phonized word.  (We'll allocate the
      91              :                                                         memory.)
      92              :         Output
      93              :                 error   --      A simple error flag, returns true or false.  (Not the case in this file: _metaphone() is declared void and reports problems with elog(ERROR).)
      94              : 
      95              :         NOTES:  ALL non-alpha characters are ignored, this includes whitespace,
      96              :         although non-alpha characters will break up phonemes.
      97              : ****************************************************************************/
      98              : 
      99              : 
     100              : /*      I add modifications to the traditional metaphone algorithm that you
     101              :         might find in books.  Define this if you want metaphone to behave
     102              :         traditionally */
     103              : #undef USE_TRADITIONAL_METAPHONE     /* left undefined, so the #ifndef USE_TRADITIONAL_METAPHONE branches are the ones compiled */
     104              : 
     105              : /* Special encodings: output characters used for sounds spelled with more than one letter */
     106              : #define  SH             'X'  /* emitted by _metaphone() for -CIA- and for -CH- outside the K cases */
     107              : #define  TH             '0'  /* NOTE(review): presumably the "th" sound; its use site is not visible here -- confirm */
     108              : 
     109              : static char Lookahead(char *word, int how_far);      /* forward declaration; defined further down */
     110              : static void _metaphone(char *word, int max_phonemes, char **phoned_word);    /* forward declaration; defined further down */
     111              : 
     112              : /* Metachar.h ... little bits about characters for metaphone */
     113              : 
     114              : 
     115              : /*-- Character encoding array & accessing macros --*/
     116              : /* Stolen directly out of the book... */
     117              : static const char _codes[26] = {     /* per-letter bit flags read via getcode(): 1 = isvowel, 2 = NOCHANGE, 4 = AFFECTH, 8 = MAKESOFT, 16 = NOGHTOF */
     118              :         1, 16, 4, 16, 9, 2, 4, 16, 9, 2, 0, 2, 2, 2, 1, 4, 0, 2, 4, 4, 1, 0, 0, 0, 8, 0
     119              : /*      a  b c  d e f g  h i j k l m n o p q r s t u v w x y z */
     120              : };
     121              : 
     122              : static int           /* Return the _codes[] flag bits for a letter, or 0 for anything outside A-Z */
     123            0 : getcode(char c)
     124              : {
     125            0 :         c = pg_ascii_toupper((unsigned char) c);
     126              :         /* Defend against non-ASCII letters: only 'A'..'Z' may index the 26-entry table */
     127            0 :         if (c >= 'A' && c <= 'Z')
     128            0 :                 return _codes[c - 'A'];
     129              : 
     130            0 :         return 0;    /* no flags set, so every classification macro built on getcode() is false */
     131            0 : }
     132              : 
     133              : static bool          /* True only for the ASCII letters A-Z and a-z; plain range comparisons, no <ctype.h> call */
     134            0 : ascii_isalpha(char c)
     135              : {
     136            0 :         return (c >= 'A' && c <= 'Z') ||
     137            0 :                 (c >= 'a' && c <= 'z');
     138              : }
     139              : 
     140              : #define isvowel(c)      (getcode(c) & 1)    /* AEIOU */
     141              : 
     142              : /* These letters are passed through unchanged */
     143              : #define NOCHANGE(c) (getcode(c) & 2)        /* FJLMNR (the _codes table sets this bit for L as well) */
     144              : 
     145              : /* These form diphthongs when preceding H */
     146              : #define AFFECTH(c)      (getcode(c) & 4)    /* CGPST */
     147              : 
     148              : /* These make C and G soft */
     149              : #define MAKESOFT(c) (getcode(c) & 8)        /* EIY */
     150              : 
     151              : /* These prevent GH from becoming F */
     152              : #define NOGHTOF(c)      (getcode(c) & 16)   /* BDH */
     153              : 
     154            0 : PG_FUNCTION_INFO_V1(levenshtein_with_costs);    /* entry point: returns the int32 result of varstr_levenshtein() for two text arguments and three caller-supplied int32 values */
     155              : Datum
     156            0 : levenshtein_with_costs(PG_FUNCTION_ARGS)
     157              : {
     158            0 :         text       *src = PG_GETARG_TEXT_PP(0);
     159            0 :         text       *dst = PG_GETARG_TEXT_PP(1);
     160            0 :         int                     ins_c = PG_GETARG_INT32(2);     /* NOTE(review): presumably the insertion cost -- confirm against varstr_levenshtein() */
     161            0 :         int                     del_c = PG_GETARG_INT32(3);     /* NOTE(review): presumably the deletion cost -- confirm */
     162            0 :         int                     sub_c = PG_GETARG_INT32(4);     /* NOTE(review): presumably the substitution cost -- confirm */
     163            0 :         const char *s_data;
     164            0 :         const char *t_data;
     165            0 :         int                     s_bytes,
     166              :                                 t_bytes;
     167              : 
     168              :         /* Extract a pointer to the actual character data */
     169            0 :         s_data = VARDATA_ANY(src);
     170            0 :         t_data = VARDATA_ANY(dst);
     171              :         /* Determine length of each string in bytes */
     172            0 :         s_bytes = VARSIZE_ANY_EXHDR(src);
     173            0 :         t_bytes = VARSIZE_ANY_EXHDR(dst);
     174              :         /* The trailing 'false' flag is interpreted by varstr_levenshtein(), which is declared outside this file */
     175            0 :         PG_RETURN_INT32(varstr_levenshtein(s_data, s_bytes, t_data, t_bytes,
     176              :                                                                            ins_c, del_c, sub_c, false));
     177            0 : }
     178              : 
     179              : 
     180            0 : PG_FUNCTION_INFO_V1(levenshtein);       /* entry point: returns the int32 result of varstr_levenshtein() for two text arguments */
     181              : Datum
     182            0 : levenshtein(PG_FUNCTION_ARGS)
     183              : {
     184            0 :         text       *src = PG_GETARG_TEXT_PP(0);
     185            0 :         text       *dst = PG_GETARG_TEXT_PP(1);
     186            0 :         const char *s_data;
     187            0 :         const char *t_data;
     188            0 :         int                     s_bytes,
     189              :                                 t_bytes;
     190              : 
     191              :         /* Extract a pointer to the actual character data */
     192            0 :         s_data = VARDATA_ANY(src);
     193            0 :         t_data = VARDATA_ANY(dst);
     194              :         /* Determine length of each string in bytes */
     195            0 :         s_bytes = VARSIZE_ANY_EXHDR(src);
     196            0 :         t_bytes = VARSIZE_ANY_EXHDR(dst);
     197              :         /* Same call as levenshtein_with_costs(), but with all three cost arguments fixed at 1 */
     198            0 :         PG_RETURN_INT32(varstr_levenshtein(s_data, s_bytes, t_data, t_bytes,
     199              :                                                                            1, 1, 1, false));
     200            0 : }
     201              : 
     202              : 
     203            0 : PG_FUNCTION_INFO_V1(levenshtein_less_equal_with_costs); /* entry point: returns the int32 result of varstr_levenshtein_less_equal() for two text arguments and four caller-supplied int32 values */
     204              : Datum
     205            0 : levenshtein_less_equal_with_costs(PG_FUNCTION_ARGS)
     206              : {
     207            0 :         text       *src = PG_GETARG_TEXT_PP(0);
     208            0 :         text       *dst = PG_GETARG_TEXT_PP(1);
     209            0 :         int                     ins_c = PG_GETARG_INT32(2);     /* NOTE(review): presumably the insertion cost -- confirm against varstr_levenshtein_less_equal() */
     210            0 :         int                     del_c = PG_GETARG_INT32(3);     /* NOTE(review): presumably the deletion cost -- confirm */
     211            0 :         int                     sub_c = PG_GETARG_INT32(4);     /* NOTE(review): presumably the substitution cost -- confirm */
     212            0 :         int                     max_d = PG_GETARG_INT32(5);     /* NOTE(review): presumably the distance bound implied by "less_equal" -- confirm */
     213            0 :         const char *s_data;
     214            0 :         const char *t_data;
     215            0 :         int                     s_bytes,
     216              :                                 t_bytes;
     217              : 
     218              :         /* Extract a pointer to the actual character data */
     219            0 :         s_data = VARDATA_ANY(src);
     220            0 :         t_data = VARDATA_ANY(dst);
     221              :         /* Determine length of each string in bytes */
     222            0 :         s_bytes = VARSIZE_ANY_EXHDR(src);
     223            0 :         t_bytes = VARSIZE_ANY_EXHDR(dst);
     224              :         /* The trailing 'false' flag is interpreted by varstr_levenshtein_less_equal(), which is declared outside this file */
     225            0 :         PG_RETURN_INT32(varstr_levenshtein_less_equal(s_data, s_bytes,
     226              :                                                                                                   t_data, t_bytes,
     227              :                                                                                                   ins_c, del_c, sub_c,
     228              :                                                                                                   max_d, false));
     229            0 : }
     230              : 
     231              : 
     232            0 : PG_FUNCTION_INFO_V1(levenshtein_less_equal);    /* entry point: returns the int32 result of varstr_levenshtein_less_equal() for two text arguments and one int32 value */
     233              : Datum
     234            0 : levenshtein_less_equal(PG_FUNCTION_ARGS)
     235              : {
     236            0 :         text       *src = PG_GETARG_TEXT_PP(0);
     237            0 :         text       *dst = PG_GETARG_TEXT_PP(1);
     238            0 :         int                     max_d = PG_GETARG_INT32(2);     /* NOTE(review): presumably the distance bound implied by "less_equal" -- confirm */
     239            0 :         const char *s_data;
     240            0 :         const char *t_data;
     241            0 :         int                     s_bytes,
     242              :                                 t_bytes;
     243              : 
     244              :         /* Extract a pointer to the actual character data */
     245            0 :         s_data = VARDATA_ANY(src);
     246            0 :         t_data = VARDATA_ANY(dst);
     247              :         /* Determine length of each string in bytes */
     248            0 :         s_bytes = VARSIZE_ANY_EXHDR(src);
     249            0 :         t_bytes = VARSIZE_ANY_EXHDR(dst);
     250              :         /* Same call as levenshtein_less_equal_with_costs(), but with all three cost arguments fixed at 1 */
     251            0 :         PG_RETURN_INT32(varstr_levenshtein_less_equal(s_data, s_bytes,
     252              :                                                                                                   t_data, t_bytes,
     253              :                                                                                                   1, 1, 1,
     254              :                                                                                                   max_d, false));
     255            0 : }
     256              : 
     257              : 
     258              : /*
     259              :  * Calculates the metaphone of an input string (argument 0).
     260              :  * Argument 1 is the number of characters requested
     261              :  * (suggested value is 4); it is handed to _metaphone() as max_phonemes.
     262              :  */
     263            0 : PG_FUNCTION_INFO_V1(metaphone);
     264              : Datum
     265            0 : metaphone(PG_FUNCTION_ARGS)
     266              : {
     267            0 :         char       *str_i = TextDatumGetCString(PG_GETARG_DATUM(0));    /* argument 0 as a NUL-terminated C string */
     268            0 :         size_t          str_i_len = strlen(str_i);
     269            0 :         int                     reqlen;
     270            0 :         char       *metaph;
     271              : 
     272              :         /* return an empty string if we receive one */
     273            0 :         if (!(str_i_len > 0))
     274            0 :                 PG_RETURN_TEXT_P(cstring_to_text(""));
     275              :         /* input longer than MAX_METAPHONE_STRLEN bytes is rejected */
     276            0 :         if (str_i_len > MAX_METAPHONE_STRLEN)
     277            0 :                 ereport(ERROR,
     278              :                                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
     279              :                                  errmsg("argument exceeds the maximum length of %d bytes",
     280              :                                                 MAX_METAPHONE_STRLEN)));
     281              :         /* the requested output length must lie in 1..MAX_METAPHONE_STRLEN */
     282            0 :         reqlen = PG_GETARG_INT32(1);
     283            0 :         if (reqlen > MAX_METAPHONE_STRLEN)
     284            0 :                 ereport(ERROR,
     285              :                                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
     286              :                                  errmsg("output exceeds the maximum length of %d bytes",
     287              :                                                 MAX_METAPHONE_STRLEN)));
     288              : 
     289            0 :         if (!(reqlen > 0))
     290            0 :                 ereport(ERROR,
     291              :                                 (errcode(ERRCODE_ZERO_LENGTH_CHARACTER_STRING),
     292              :                                  errmsg("output cannot be empty string")));
     293              :         /* _metaphone() allocates *metaph with palloc and fills it in */
     294            0 :         _metaphone(str_i, reqlen, &metaph);
     295            0 :         PG_RETURN_TEXT_P(cstring_to_text(metaph));
     296            0 : }
     297              : 
     298              : 
     299              : /*
     300              :  * Original code by Michael G Schwern starts here.
     301              :  * Code slightly modified for use as PostgreSQL
     302              :  * function (palloc, etc).
     303              :  */
     304              : 
     305              : /* I suppose I could have been using a character pointer instead of
     306              :  * accessing the array directly... */
     307              : 
     308              : /* Look at the next letter in the word.  Like the macros below, this expects 'word' and 'w_idx' to be in scope where it is used, and passes the character through pg_ascii_toupper(). */
     309              : #define Next_Letter (pg_ascii_toupper((unsigned char) word[w_idx+1]))
     310              : /* Look at the current letter in the word */
     311              : #define Curr_Letter (pg_ascii_toupper((unsigned char) word[w_idx]))
     312              : /* Go N letters back; yields '\0' when fewer than n letters precede w_idx. */
     313              : #define Look_Back_Letter(n) \
     314              :         (w_idx >= (n) ? pg_ascii_toupper((unsigned char) word[w_idx-(n)]) : '\0')
     315              : /* Previous letter; '\0' when w_idx is 0 (see Look_Back_Letter). */
     316              : #define Prev_Letter (Look_Back_Letter(1))
     317              : /* Look two letters down.  word[w_idx+2] is only read when the next letter is not the terminator, so you don't walk off the string. */
     318              : #define After_Next_Letter \
     319              :         (Next_Letter != '\0' ? pg_ascii_toupper((unsigned char) word[w_idx+2]) : '\0')
     320              : #define Look_Ahead_Letter(n) pg_ascii_toupper((unsigned char) Lookahead(word+w_idx, n))     /* letter n positions past w_idx, fetched via Lookahead() */
     321              : 
     322              : 
     323              : /* Allows us to safely look ahead an arbitrary # of letters: returns word[how_far], or '\0' if the string ends before that position */
     324              : /* I probably could have just used strlen... */
     325              : static char
     326            0 : Lookahead(char *word, int how_far)
     327              : {
     328            0 :         char            letter_ahead = '\0';    /* null by default; always overwritten below */
     329            0 :         int                     idx;
     330              : 
     331            0 :         for (idx = 0; word[idx] != '\0' && idx < how_far; idx++);
     332              :         /* Edge forward in the string... (the loop body is empty on purpose; it only advances idx) */
     333              : 
     334            0 :         letter_ahead = word[idx];       /* idx will be either == to how_far or at the
     335              :                                                                  * end of the string */
     336            0 :         return letter_ahead;
     337            0 : }
     338              : 
     339              : 
     340              : /* phonize one letter: store c at (*phoned_word)[p_idx] and advance p_idx; the macro itself does no bounds check */
     341              : #define Phonize(c)      do {(*phoned_word)[p_idx++] = c;} while (0)
     342              : /* Slap a null character on the end of the phoned word (p_idx is not advanced) */
     343              : #define End_Phoned_Word do {(*phoned_word)[p_idx] = '\0';} while (0)
     344              : /* How long is the phoned word? */
     345              : #define Phone_Len       (p_idx)
     346              : 
     347              : /* Note if a letter is a 'break' in the word, i.e. anything that is not an ASCII letter */
     348              : #define Isbreak(c)      (!ascii_isalpha((unsigned char) (c)))
     349              : 
     350              : 
     351              : static void
     352            0 : _metaphone(char *word,                  /* IN */
     353              :                    int max_phonemes,
     354              :                    char **phoned_word)  /* OUT */
     355              : {
     356            0 :         int                     w_idx = 0;              /* point in the phonization we're at. */
     357            0 :         int                     p_idx = 0;              /* end of the phoned phrase */
     358              : 
     359              :         /*-- Parameter checks --*/
     360              : 
     361              :         /*
     362              :          * Shouldn't be necessary, but left these here anyway jec Aug 3, 2001
     363              :          */
     364              : 
     365              :         /* Negative phoneme length is meaningless */
     366            0 :         if (!(max_phonemes > 0))
     367              :                 /* internal error */
     368            0 :                 elog(ERROR, "metaphone: Requested output length must be > 0");
     369              : 
     370              :         /* Empty/null string is meaningless */
     371            0 :         if ((word == NULL) || !(strlen(word) > 0))
     372              :                 /* internal error */
     373            0 :                 elog(ERROR, "metaphone: Input string length must be > 0");
     374              : 
     375              :         /*-- Allocate memory for our phoned_phrase --*/
     376            0 :         if (max_phonemes == 0)
     377              :         {                                                       /* Assume largest possible */
     378            0 :                 *phoned_word = palloc(sizeof(char) * strlen(word) + 1);
     379            0 :         }
     380              :         else
     381              :         {
     382            0 :                 *phoned_word = palloc(sizeof(char) * max_phonemes + 1);
     383              :         }
     384              : 
     385              :         /*-- The first phoneme has to be processed specially. --*/
     386              :         /* Find our first letter */
     387            0 :         for (; !ascii_isalpha((unsigned char) (Curr_Letter)); w_idx++)
     388              :         {
     389              :                 /* On the off chance we were given nothing but crap... */
     390            0 :                 if (Curr_Letter == '\0')
     391              :                 {
     392            0 :                         End_Phoned_Word;
     393            0 :                         return;
     394              :                 }
     395            0 :         }
     396              : 
     397            0 :         switch (Curr_Letter)
     398              :         {
     399              :                         /* AE becomes E */
     400              :                 case 'A':
     401            0 :                         if (Next_Letter == 'E')
     402              :                         {
     403            0 :                                 Phonize('E');
     404            0 :                                 w_idx += 2;
     405            0 :                         }
     406              :                         /* Remember, preserve vowels at the beginning */
     407              :                         else
     408              :                         {
     409            0 :                                 Phonize('A');
     410            0 :                                 w_idx++;
     411              :                         }
     412            0 :                         break;
     413              :                         /* [GKP]N becomes N */
     414              :                 case 'G':
     415              :                 case 'K':
     416              :                 case 'P':
     417            0 :                         if (Next_Letter == 'N')
     418              :                         {
     419            0 :                                 Phonize('N');
     420            0 :                                 w_idx += 2;
     421            0 :                         }
     422            0 :                         break;
     423              : 
     424              :                         /*
     425              :                          * WH becomes H, WR becomes R; W is kept if followed by a vowel
     426              :                          */
     427              :                 case 'W':
     428            0 :                         if (Next_Letter == 'H' ||
     429            0 :                                 Next_Letter == 'R')
     430              :                         {
     431            0 :                                 Phonize(Next_Letter);
     432            0 :                                 w_idx += 2;
     433            0 :                         }
     434            0 :                         else if (isvowel(Next_Letter))
     435              :                         {
     436            0 :                                 Phonize('W');
     437            0 :                                 w_idx += 2;
     438            0 :                         }
     439              :                         /* else ignore */
     440            0 :                         break;
     441              :                         /* X becomes S */
     442              :                 case 'X':
     443            0 :                         Phonize('S');
     444            0 :                         w_idx++;
     445            0 :                         break;
     446              :                         /* Vowels are kept */
     447              : 
     448              :                         /*
     449              :                          * We did A already case 'A': case 'a':
     450              :                          */
     451              :                 case 'E':
     452              :                 case 'I':
     453              :                 case 'O':
     454              :                 case 'U':
     455            0 :                         Phonize(Curr_Letter);
     456            0 :                         w_idx++;
     457            0 :                         break;
     458              :                 default:
     459              :                         /* do nothing */
     460            0 :                         break;
     461              :         }
     462              : 
     463              : 
     464              : 
     465              :         /* On to the metaphoning */
     466            0 :         for (; Curr_Letter != '\0' &&
     467            0 :                  (max_phonemes == 0 || Phone_Len < max_phonemes);
     468            0 :                  w_idx++)
     469              :         {
     470              :                 /*
     471              :                  * How many letters to skip because an earlier encoding handled
     472              :                  * multiple letters
     473              :                  */
     474            0 :                 unsigned short int skip_letter = 0;
     475              : 
     476              : 
     477              :                 /*
     478              :                  * THOUGHT:  It would be nice if, rather than having things like...
     479              :                  * well, SCI.  For SCI you encode the S, then have to remember to skip
     480              :                  * the C.  So the phonome SCI invades both S and C.  It would be
     481              :                  * better, IMHO, to skip the C from the S part of the encoding. Hell,
     482              :                  * I'm trying it.
     483              :                  */
     484              : 
     485              :                 /* Ignore non-alphas */
     486            0 :                 if (!ascii_isalpha((unsigned char) (Curr_Letter)))
     487            0 :                         continue;
     488              : 
     489              :                 /* Drop duplicates, except CC */
     490            0 :                 if (Curr_Letter == Prev_Letter &&
     491            0 :                         Curr_Letter != 'C')
     492            0 :                         continue;
     493              : 
     494            0 :                 switch (Curr_Letter)
     495              :                 {
     496              :                                 /* B -> B unless in MB */
     497              :                         case 'B':
     498            0 :                                 if (Prev_Letter != 'M')
     499            0 :                                         Phonize('B');
     500            0 :                                 break;
     501              : 
     502              :                                 /*
     503              :                                  * 'sh' if -CIA- or -CH, but not SCH, except SCHW. (SCHW is
     504              :                                  * handled in S.)  S if -CI-, -CE- or -CY-; dropped if -SCI-,
     505              :                                  * -SCE-, -SCY- (handled in S); else K
     506              :                                  */
     507              :                         case 'C':
     508            0 :                                 if (MAKESOFT(Next_Letter))
     509              :                                 {                               /* C[IEY] */
     510            0 :                                         if (After_Next_Letter == 'A' &&
     511            0 :                                                 Next_Letter == 'I')
     512              :                                         {                       /* CIA */
     513            0 :                                                 Phonize(SH);
     514            0 :                                         }
     515              :                                         /* SC[IEY] */
     516            0 :                                         else if (Prev_Letter == 'S')
     517              :                                         {
     518              :                                                 /* Dropped */
     519            0 :                                         }
     520              :                                         else
     521            0 :                                                 Phonize('S');
     522            0 :                                 }
     523            0 :                                 else if (Next_Letter == 'H')
     524              :                                 {
     525              : #ifndef USE_TRADITIONAL_METAPHONE
     526            0 :                                         if (After_Next_Letter == 'R' ||
     527            0 :                                                 Prev_Letter == 'S')
     528              :                                         {                       /* Christ, School */
     529            0 :                                                 Phonize('K');
     530            0 :                                         }
     531              :                                         else
     532            0 :                                                 Phonize(SH);
     533              : #else
     534              :                                         Phonize(SH);
     535              : #endif
     536            0 :                                         skip_letter++;
     537            0 :                                 }
     538              :                                 else
     539            0 :                                         Phonize('K');
     540            0 :                                 break;
     541              : 
     542              :                                 /*
     543              :                                  * J if in -DGE-, -DGI- or -DGY- else T
     544              :                                  */
     545              :                         case 'D':
     546            0 :                                 if (Next_Letter == 'G' &&
     547            0 :                                         MAKESOFT(After_Next_Letter))
     548              :                                 {
     549            0 :                                         Phonize('J');
     550            0 :                                         skip_letter++;
     551            0 :                                 }
     552              :                                 else
     553            0 :                                         Phonize('T');
     554            0 :                                 break;
     555              : 
     556              :                                 /*
     557              :                                  * F if in -GH and not B--GH, D--GH, -H--GH, -H---GH else
     558              :                                  * dropped if -GNED, -GN, else dropped if -DGE-, -DGI- or
     559              :                                  * -DGY- (handled in D) else J if in -GE-, -GI, -GY and not GG
     560              :                                  * else K
     561              :                                  */
     562              :                         case 'G':
     563            0 :                                 if (Next_Letter == 'H')
     564              :                                 {
     565            0 :                                         if (!(NOGHTOF(Look_Back_Letter(3)) ||
     566            0 :                                                   Look_Back_Letter(4) == 'H'))
     567              :                                         {
     568            0 :                                                 Phonize('F');
     569            0 :                                                 skip_letter++;
     570            0 :                                         }
     571              :                                         else
     572              :                                         {
     573              :                                                 /* silent */
     574              :                                         }
     575            0 :                                 }
     576            0 :                                 else if (Next_Letter == 'N')
     577              :                                 {
     578            0 :                                         if (Isbreak(After_Next_Letter) ||
     579            0 :                                                 (After_Next_Letter == 'E' &&
     580            0 :                                                  Look_Ahead_Letter(3) == 'D'))
     581              :                                         {
     582              :                                                 /* dropped */
     583            0 :                                         }
     584              :                                         else
     585            0 :                                                 Phonize('K');
     586            0 :                                 }
     587            0 :                                 else if (MAKESOFT(Next_Letter) &&
     588            0 :                                                  Prev_Letter != 'G')
     589            0 :                                         Phonize('J');
     590              :                                 else
     591            0 :                                         Phonize('K');
     592            0 :                                 break;
     593              :                                 /* H if before a vowel and not after C,G,P,S,T */
     594              :                         case 'H':
     595            0 :                                 if (isvowel(Next_Letter) &&
     596            0 :                                         !AFFECTH(Prev_Letter))
     597            0 :                                         Phonize('H');
     598            0 :                                 break;
     599              : 
     600              :                                 /*
     601              :                                  * dropped if after C else K
     602              :                                  */
     603              :                         case 'K':
     604            0 :                                 if (Prev_Letter != 'C')
     605            0 :                                         Phonize('K');
     606            0 :                                 break;
     607              : 
     608              :                                 /*
     609              :                                  * F if before H else P
     610              :                                  */
     611              :                         case 'P':
     612            0 :                                 if (Next_Letter == 'H')
     613            0 :                                         Phonize('F');
     614              :                                 else
     615            0 :                                         Phonize('P');
     616            0 :                                 break;
     617              : 
     618              :                                 /*
     619              :                                  * K
     620              :                                  */
     621              :                         case 'Q':
     622            0 :                                 Phonize('K');
     623            0 :                                 break;
     624              : 
     625              :                                 /*
     626              :                                  * 'sh' in -SH-, -SIO- or -SIA- or -SCHW- else S
     627              :                                  */
     628              :                         case 'S':
     629            0 :                                 if (Next_Letter == 'I' &&
     630            0 :                                         (After_Next_Letter == 'O' ||
     631            0 :                                          After_Next_Letter == 'A'))
     632            0 :                                         Phonize(SH);
     633            0 :                                 else if (Next_Letter == 'H')
     634              :                                 {
     635            0 :                                         Phonize(SH);
     636            0 :                                         skip_letter++;
     637            0 :                                 }
     638              : #ifndef USE_TRADITIONAL_METAPHONE
     639            0 :                                 else if (Next_Letter == 'C' &&
     640            0 :                                                  Look_Ahead_Letter(2) == 'H' &&
     641            0 :                                                  Look_Ahead_Letter(3) == 'W')
     642              :                                 {
     643            0 :                                         Phonize(SH);
     644            0 :                                         skip_letter += 2;
     645            0 :                                 }
     646              : #endif
     647              :                                 else
     648            0 :                                         Phonize('S');
     649            0 :                                 break;
     650              : 
     651              :                                 /*
     652              :                                  * 'sh' in -TIA- or -TIO- else 'th' before H else T
     653              :                                  */
     654              :                         case 'T':
     655            0 :                                 if (Next_Letter == 'I' &&
     656            0 :                                         (After_Next_Letter == 'O' ||
     657            0 :                                          After_Next_Letter == 'A'))
     658            0 :                                         Phonize(SH);
     659            0 :                                 else if (Next_Letter == 'H')
     660              :                                 {
     661            0 :                                         Phonize(TH);
     662            0 :                                         skip_letter++;
     663            0 :                                 }
     664              :                                 else
     665            0 :                                         Phonize('T');
     666            0 :                                 break;
     667              :                                 /* F */
     668              :                         case 'V':
     669            0 :                                 Phonize('F');
     670            0 :                                 break;
     671              :                                 /* W before a vowel, else dropped */
     672              :                         case 'W':
     673            0 :                                 if (isvowel(Next_Letter))
     674            0 :                                         Phonize('W');
     675            0 :                                 break;
     676              :                                 /* KS */
     677              :                         case 'X':
     678            0 :                                 Phonize('K');
     679            0 :                                 if (max_phonemes == 0 || Phone_Len < max_phonemes)
     680            0 :                                         Phonize('S');
     681            0 :                                 break;
     682              :                                 /* Y if followed by a vowel */
     683              :                         case 'Y':
     684            0 :                                 if (isvowel(Next_Letter))
     685            0 :                                         Phonize('Y');
     686            0 :                                 break;
     687              :                                 /* S */
     688              :                         case 'Z':
     689            0 :                                 Phonize('S');
     690            0 :                                 break;
     691              :                                 /* No transformation */
     692              :                         case 'F':
     693              :                         case 'J':
     694              :                         case 'L':
     695              :                         case 'M':
     696              :                         case 'N':
     697              :                         case 'R':
     698            0 :                                 Phonize(Curr_Letter);
     699            0 :                                 break;
     700              :                         default:
     701              :                                 /* nothing */
     702            0 :                                 break;
     703              :                 }                                               /* END SWITCH */
     704              : 
     705            0 :                 w_idx += skip_letter;
     706            0 :         }                                                       /* END FOR */
     707              : 
     708            0 :         End_Phoned_Word;
     709            0 : }                                                               /* END metaphone */
     710              : 
     711              : 
     712              : /*
     713              :  * SQL function: soundex(text) returns text
     714              :  */
     715            0 : PG_FUNCTION_INFO_V1(soundex);
     716              : 
     717              : Datum
     718            0 : soundex(PG_FUNCTION_ARGS)
     719              : {
     720            0 :         char            outstr[SOUNDEX_LEN + 1];
     721            0 :         char       *arg;
     722              : 
     723            0 :         arg = text_to_cstring(PG_GETARG_TEXT_PP(0));
     724              : 
     725            0 :         _soundex(arg, outstr);
     726              : 
     727            0 :         PG_RETURN_TEXT_P(cstring_to_text(outstr));
     728            0 : }
     729              : 
     730              : static void
     731            0 : _soundex(const char *instr, char *outstr)
     732              : {
     733            0 :         int                     count;
     734              : 
     735            0 :         Assert(instr);
     736            0 :         Assert(outstr);
     737              : 
     738              :         /* Skip leading non-alphabetic characters */
     739            0 :         while (*instr && !ascii_isalpha((unsigned char) *instr))
     740            0 :                 ++instr;
     741              : 
     742              :         /* If no string left, return all-zeroes buffer */
     743            0 :         if (!*instr)
     744              :         {
     745            0 :                 memset(outstr, '\0', SOUNDEX_LEN + 1);
     746            0 :                 return;
     747              :         }
     748              : 
     749              :         /* Take the first letter as is */
     750            0 :         *outstr++ = (char) pg_ascii_toupper((unsigned char) *instr++);
     751              : 
     752            0 :         count = 1;
     753            0 :         while (*instr && count < SOUNDEX_LEN)
     754              :         {
     755            0 :                 if (ascii_isalpha((unsigned char) *instr) &&
     756            0 :                         soundex_code(*instr) != soundex_code(*(instr - 1)))
     757              :                 {
     758            0 :                         *outstr = soundex_code(*instr);
     759            0 :                         if (*outstr != '0')
     760              :                         {
     761            0 :                                 ++outstr;
     762            0 :                                 ++count;
     763            0 :                         }
     764            0 :                 }
     765            0 :                 ++instr;
     766              :         }
     767              : 
     768              :         /* Fill with 0's */
     769            0 :         while (count < SOUNDEX_LEN)
     770              :         {
     771            0 :                 *outstr = '0';
     772            0 :                 ++outstr;
     773            0 :                 ++count;
     774              :         }
     775              : 
     776              :         /* And null-terminate */
     777            0 :         *outstr = '\0';
     778            0 : }
     779              : 
     780            0 : PG_FUNCTION_INFO_V1(difference);
     781              : 
     782              : Datum
     783            0 : difference(PG_FUNCTION_ARGS)
     784              : {
     785            0 :         char            sndx1[SOUNDEX_LEN + 1],
     786              :                                 sndx2[SOUNDEX_LEN + 1];
     787            0 :         int                     i,
     788              :                                 result;
     789              : 
     790            0 :         _soundex(text_to_cstring(PG_GETARG_TEXT_PP(0)), sndx1);
     791            0 :         _soundex(text_to_cstring(PG_GETARG_TEXT_PP(1)), sndx2);
     792              : 
     793            0 :         result = 0;
     794            0 :         for (i = 0; i < SOUNDEX_LEN; i++)
     795              :         {
     796            0 :                 if (sndx1[i] == sndx2[i])
     797            0 :                         result++;
     798            0 :         }
     799              : 
     800            0 :         PG_RETURN_INT32(result);
     801            0 : }
        

Generated by: LCOV version 2.3.2-1