Line data Source code
1 : /*
2 : * fuzzystrmatch.c
3 : *
4 : * Functions for "fuzzy" comparison of strings
5 : *
6 : * Joe Conway <mail@joeconway.com>
7 : *
8 : * contrib/fuzzystrmatch/fuzzystrmatch.c
9 : * Copyright (c) 2001-2026, PostgreSQL Global Development Group
10 : * ALL RIGHTS RESERVED;
11 : *
12 : * metaphone()
13 : * -----------
14 : * Modified for PostgreSQL by Joe Conway.
15 : * Based on CPAN's "Text-Metaphone-1.96" by Michael G Schwern <schwern@pobox.com>
16 : * Code slightly modified for use as PostgreSQL function (palloc, elog, etc).
17 : * Metaphone was originally created by Lawrence Philips and presented in article
18 : * in "Computer Language" December 1990 issue.
19 : *
20 : * Permission to use, copy, modify, and distribute this software and its
21 : * documentation for any purpose, without fee, and without a written agreement
22 : * is hereby granted, provided that the above copyright notice and this
23 : * paragraph and the following two paragraphs appear in all copies.
24 : *
25 : * IN NO EVENT SHALL THE AUTHORS OR DISTRIBUTORS BE LIABLE TO ANY PARTY FOR
26 : * DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, INCLUDING
27 : * LOST PROFITS, ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS
28 : * DOCUMENTATION, EVEN IF THE AUTHOR OR DISTRIBUTORS HAVE BEEN ADVISED OF THE
29 : * POSSIBILITY OF SUCH DAMAGE.
30 : *
31 : * THE AUTHORS AND DISTRIBUTORS SPECIFICALLY DISCLAIM ANY WARRANTIES,
32 : * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
33 : * AND FITNESS FOR A PARTICULAR PURPOSE. THE SOFTWARE PROVIDED HEREUNDER IS
34 : * ON AN "AS IS" BASIS, AND THE AUTHOR AND DISTRIBUTORS HAS NO OBLIGATIONS TO
35 : * PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS.
36 : *
37 : */
38 :
39 : #include "postgres.h"
40 :
41 : #include <ctype.h>
42 :
43 : #include "utils/builtins.h"
44 : #include "utils/varlena.h"
45 : #include "varatt.h"
46 :
47 0 : PG_MODULE_MAGIC_EXT(
48 : .name = "fuzzystrmatch",
49 : .version = PG_VERSION
50 : );
51 :
52 : /*
53 : * Soundex
54 : */
55 : static void _soundex(const char *instr, char *outstr);
56 :
57 : #define SOUNDEX_LEN 4
58 :
59 : /* ABCDEFGHIJKLMNOPQRSTUVWXYZ */
60 : static const char *const soundex_table = "01230120022455012623010202";
61 :
62 : static char
63 0 : soundex_code(char letter)
64 : {
65 0 : letter = pg_ascii_toupper((unsigned char) letter);
66 : /* Defend against non-ASCII letters */
67 0 : if (letter >= 'A' && letter <= 'Z')
68 0 : return soundex_table[letter - 'A'];
69 0 : return letter;
70 0 : }
71 :
72 : /*
73 : * Metaphone
74 : */
75 : #define MAX_METAPHONE_STRLEN 255
76 :
77 : /*
78 : * Original code by Michael G Schwern starts here.
79 : * Code slightly modified for use as PostgreSQL function.
80 : */
81 :
82 :
83 : /**************************************************************************
84 : metaphone -- Breaks english phrases down into their phonemes.
85 :
86 : Input
87 : word -- An english word to be phonized
	max_phonemes	-- How many phonemes to calculate.  Must be greater
					   than zero; zero and negative values are rejected.
90 : phoned_word -- The final phonized word. (We'll allocate the
91 : memory.)
92 : Output
	(none)			-- Nothing is returned; invalid arguments are reported
					   with elog(ERROR).
94 :
95 : NOTES: ALL non-alpha characters are ignored, this includes whitespace,
96 : although non-alpha characters will break up phonemes.
97 : ****************************************************************************/
98 :
99 :
100 : /* I add modifications to the traditional metaphone algorithm that you
101 : might find in books. Define this if you want metaphone to behave
102 : traditionally */
103 : #undef USE_TRADITIONAL_METAPHONE
104 :
105 : /* Special encodings */
106 : #define SH 'X'
107 : #define TH '0'
108 :
109 : static char Lookahead(char *word, int how_far);
110 : static void _metaphone(char *word, int max_phonemes, char **phoned_word);
111 :
112 : /* Metachar.h ... little bits about characters for metaphone */
113 :
114 :
/*
 * Per-letter class masks for metaphone, indexed by (letter - 'A').
 *
 * Bit meanings, as used by the class-test macros in this file:
 *	 1	vowel
 *	 2	passed through unchanged
 *	 4	forms a diphthong when followed by H
 *	 8	makes a preceding C or G soft
 *	16	prevents GH from becoming F
 */
static const char _codes[26] = {
	1,							/* A */
	16,							/* B */
	4,							/* C */
	16,							/* D */
	9,							/* E: vowel + softener */
	2,							/* F */
	4,							/* G */
	16,							/* H */
	9,							/* I: vowel + softener */
	2,							/* J */
	0,							/* K */
	2,							/* L */
	2,							/* M */
	2,							/* N */
	1,							/* O */
	4,							/* P */
	0,							/* Q */
	2,							/* R */
	4,							/* S */
	4,							/* T */
	1,							/* U */
	0,							/* V */
	0,							/* W */
	0,							/* X */
	8,							/* Y: softener only */
	0							/* Z */
};
121 :
122 : static int
123 0 : getcode(char c)
124 : {
125 0 : c = pg_ascii_toupper((unsigned char) c);
126 : /* Defend against non-ASCII letters */
127 0 : if (c >= 'A' && c <= 'Z')
128 0 : return _codes[c - 'A'];
129 :
130 0 : return 0;
131 0 : }
132 :
/*
 * Is "c" an ASCII letter?
 *
 * Only 'A'..'Z' and 'a'..'z' qualify; the answer does not depend on the
 * locale.
 */
static bool
ascii_isalpha(char c)
{
	if (c >= 'A' && c <= 'Z')
		return true;
	if (c >= 'a' && c <= 'z')
		return true;
	return false;
}
139 :
/*
 * Letter-class tests.  Each one expands to the matching bit of getcode(c),
 * so the result is nonzero when the letter belongs to the class and zero
 * otherwise (including for non-letters).
 */

/* Vowels: A E I O U */
#define isvowel(c)	(getcode(c) & 0x01)

/* Letters copied to the output as-is: F J L M N R */
#define NOCHANGE(c) (getcode(c) & 0x02)

/* Letters that combine with a following H: C G P S T */
#define AFFECTH(c)	(getcode(c) & 0x04)

/* Letters that soften a preceding C or G: E I Y */
#define MAKESOFT(c) (getcode(c) & 0x08)

/* Letters that keep GH from turning into F: B D H */
#define NOGHTOF(c)	(getcode(c) & 0x10)
153 :
154 0 : PG_FUNCTION_INFO_V1(levenshtein_with_costs);
155 : Datum
156 0 : levenshtein_with_costs(PG_FUNCTION_ARGS)
157 : {
158 0 : text *src = PG_GETARG_TEXT_PP(0);
159 0 : text *dst = PG_GETARG_TEXT_PP(1);
160 0 : int ins_c = PG_GETARG_INT32(2);
161 0 : int del_c = PG_GETARG_INT32(3);
162 0 : int sub_c = PG_GETARG_INT32(4);
163 0 : const char *s_data;
164 0 : const char *t_data;
165 0 : int s_bytes,
166 : t_bytes;
167 :
168 : /* Extract a pointer to the actual character data */
169 0 : s_data = VARDATA_ANY(src);
170 0 : t_data = VARDATA_ANY(dst);
171 : /* Determine length of each string in bytes */
172 0 : s_bytes = VARSIZE_ANY_EXHDR(src);
173 0 : t_bytes = VARSIZE_ANY_EXHDR(dst);
174 :
175 0 : PG_RETURN_INT32(varstr_levenshtein(s_data, s_bytes, t_data, t_bytes,
176 : ins_c, del_c, sub_c, false));
177 0 : }
178 :
179 :
180 0 : PG_FUNCTION_INFO_V1(levenshtein);
181 : Datum
182 0 : levenshtein(PG_FUNCTION_ARGS)
183 : {
184 0 : text *src = PG_GETARG_TEXT_PP(0);
185 0 : text *dst = PG_GETARG_TEXT_PP(1);
186 0 : const char *s_data;
187 0 : const char *t_data;
188 0 : int s_bytes,
189 : t_bytes;
190 :
191 : /* Extract a pointer to the actual character data */
192 0 : s_data = VARDATA_ANY(src);
193 0 : t_data = VARDATA_ANY(dst);
194 : /* Determine length of each string in bytes */
195 0 : s_bytes = VARSIZE_ANY_EXHDR(src);
196 0 : t_bytes = VARSIZE_ANY_EXHDR(dst);
197 :
198 0 : PG_RETURN_INT32(varstr_levenshtein(s_data, s_bytes, t_data, t_bytes,
199 : 1, 1, 1, false));
200 0 : }
201 :
202 :
203 0 : PG_FUNCTION_INFO_V1(levenshtein_less_equal_with_costs);
204 : Datum
205 0 : levenshtein_less_equal_with_costs(PG_FUNCTION_ARGS)
206 : {
207 0 : text *src = PG_GETARG_TEXT_PP(0);
208 0 : text *dst = PG_GETARG_TEXT_PP(1);
209 0 : int ins_c = PG_GETARG_INT32(2);
210 0 : int del_c = PG_GETARG_INT32(3);
211 0 : int sub_c = PG_GETARG_INT32(4);
212 0 : int max_d = PG_GETARG_INT32(5);
213 0 : const char *s_data;
214 0 : const char *t_data;
215 0 : int s_bytes,
216 : t_bytes;
217 :
218 : /* Extract a pointer to the actual character data */
219 0 : s_data = VARDATA_ANY(src);
220 0 : t_data = VARDATA_ANY(dst);
221 : /* Determine length of each string in bytes */
222 0 : s_bytes = VARSIZE_ANY_EXHDR(src);
223 0 : t_bytes = VARSIZE_ANY_EXHDR(dst);
224 :
225 0 : PG_RETURN_INT32(varstr_levenshtein_less_equal(s_data, s_bytes,
226 : t_data, t_bytes,
227 : ins_c, del_c, sub_c,
228 : max_d, false));
229 0 : }
230 :
231 :
232 0 : PG_FUNCTION_INFO_V1(levenshtein_less_equal);
233 : Datum
234 0 : levenshtein_less_equal(PG_FUNCTION_ARGS)
235 : {
236 0 : text *src = PG_GETARG_TEXT_PP(0);
237 0 : text *dst = PG_GETARG_TEXT_PP(1);
238 0 : int max_d = PG_GETARG_INT32(2);
239 0 : const char *s_data;
240 0 : const char *t_data;
241 0 : int s_bytes,
242 : t_bytes;
243 :
244 : /* Extract a pointer to the actual character data */
245 0 : s_data = VARDATA_ANY(src);
246 0 : t_data = VARDATA_ANY(dst);
247 : /* Determine length of each string in bytes */
248 0 : s_bytes = VARSIZE_ANY_EXHDR(src);
249 0 : t_bytes = VARSIZE_ANY_EXHDR(dst);
250 :
251 0 : PG_RETURN_INT32(varstr_levenshtein_less_equal(s_data, s_bytes,
252 : t_data, t_bytes,
253 : 1, 1, 1,
254 : max_d, false));
255 0 : }
256 :
257 :
258 : /*
259 : * Calculates the metaphone of an input string.
260 : * Returns number of characters requested
261 : * (suggested value is 4)
262 : */
263 0 : PG_FUNCTION_INFO_V1(metaphone);
264 : Datum
265 0 : metaphone(PG_FUNCTION_ARGS)
266 : {
267 0 : char *str_i = TextDatumGetCString(PG_GETARG_DATUM(0));
268 0 : size_t str_i_len = strlen(str_i);
269 0 : int reqlen;
270 0 : char *metaph;
271 :
272 : /* return an empty string if we receive one */
273 0 : if (!(str_i_len > 0))
274 0 : PG_RETURN_TEXT_P(cstring_to_text(""));
275 :
276 0 : if (str_i_len > MAX_METAPHONE_STRLEN)
277 0 : ereport(ERROR,
278 : (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
279 : errmsg("argument exceeds the maximum length of %d bytes",
280 : MAX_METAPHONE_STRLEN)));
281 :
282 0 : reqlen = PG_GETARG_INT32(1);
283 0 : if (reqlen > MAX_METAPHONE_STRLEN)
284 0 : ereport(ERROR,
285 : (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
286 : errmsg("output exceeds the maximum length of %d bytes",
287 : MAX_METAPHONE_STRLEN)));
288 :
289 0 : if (!(reqlen > 0))
290 0 : ereport(ERROR,
291 : (errcode(ERRCODE_ZERO_LENGTH_CHARACTER_STRING),
292 : errmsg("output cannot be empty string")));
293 :
294 0 : _metaphone(str_i, reqlen, &metaph);
295 0 : PG_RETURN_TEXT_P(cstring_to_text(metaph));
296 0 : }
297 :
298 :
299 : /*
300 : * Original code by Michael G Schwern starts here.
301 : * Code slightly modified for use as PostgreSQL
302 : * function (palloc, etc).
303 : */
304 :
305 : /* I suppose I could have been using a character pointer instead of
306 : * accessing the array directly... */
307 :
/*
 * Letter-access helpers for _metaphone().  They rely on the local names
 * "word" (the input string) and "w_idx" (the current position) and always
 * deliver the letter upper-cased through pg_ascii_toupper().
 */

/* The letter at the current position */
#define Curr_Letter (pg_ascii_toupper((unsigned char) word[w_idx]))
/* The letter right after the current one (may be the terminator) */
#define Next_Letter (pg_ascii_toupper((unsigned char) word[w_idx + 1]))
/* The letter n positions back, or '\0' if that is before the string start */
#define Look_Back_Letter(n) \
	(w_idx < (n) ? '\0' : pg_ascii_toupper((unsigned char) word[w_idx - (n)]))
/* The letter just before the current one, or '\0' at the string start */
#define Prev_Letter (Look_Back_Letter(1))
/* The letter two positions ahead; '\0' if the string ends before that */
#define After_Next_Letter \
	(Next_Letter == '\0' ? '\0' : pg_ascii_toupper((unsigned char) word[w_idx + 2]))
/* The letter n positions ahead, stopping at the end of the string */
#define Look_Ahead_Letter(n) \
	(pg_ascii_toupper((unsigned char) Lookahead(word + w_idx, (n))))
321 :
322 :
/*
 * Safely peek "how_far" characters ahead of the start of "word".
 *
 * The scan stops early at the string terminator, so the result is either
 * the character at offset how_far or '\0' when the string is shorter than
 * that.  A how_far of zero or less returns the first character.
 */
static char
Lookahead(char *word, int how_far)
{
	int			pos = 0;

	/* Step forward until the requested offset or the terminator */
	while (word[pos] != '\0' && pos < how_far)
		pos++;

	return word[pos];
}
338 :
339 :
/*
 * Output helpers for _metaphone().  They rely on the local names
 * "phoned_word" (pointer to the output buffer pointer) and "p_idx" (the
 * next free output position).
 */

/* Append one phoneme character to the output */
#define Phonize(c)	do { (*phoned_word)[p_idx++] = (c); } while (0)
/* Terminate the output string at the current position */
#define End_Phoned_Word do { (*phoned_word)[p_idx] = '\0'; } while (0)
/* Number of phonemes written so far */
#define Phone_Len	(p_idx)

/* True if the character is not an ASCII letter, i.e. it breaks the word */
#define Isbreak(c)	(!ascii_isalpha((unsigned char) (c)))
349 :
350 :
351 : static void
352 0 : _metaphone(char *word, /* IN */
353 : int max_phonemes,
354 : char **phoned_word) /* OUT */
355 : {
356 0 : int w_idx = 0; /* point in the phonization we're at. */
357 0 : int p_idx = 0; /* end of the phoned phrase */
358 :
359 : /*-- Parameter checks --*/
360 :
361 : /*
362 : * Shouldn't be necessary, but left these here anyway jec Aug 3, 2001
363 : */
364 :
365 : /* Negative phoneme length is meaningless */
366 0 : if (!(max_phonemes > 0))
367 : /* internal error */
368 0 : elog(ERROR, "metaphone: Requested output length must be > 0");
369 :
370 : /* Empty/null string is meaningless */
371 0 : if ((word == NULL) || !(strlen(word) > 0))
372 : /* internal error */
373 0 : elog(ERROR, "metaphone: Input string length must be > 0");
374 :
375 : /*-- Allocate memory for our phoned_phrase --*/
376 0 : if (max_phonemes == 0)
377 : { /* Assume largest possible */
378 0 : *phoned_word = palloc(sizeof(char) * strlen(word) + 1);
379 0 : }
380 : else
381 : {
382 0 : *phoned_word = palloc(sizeof(char) * max_phonemes + 1);
383 : }
384 :
385 : /*-- The first phoneme has to be processed specially. --*/
386 : /* Find our first letter */
387 0 : for (; !ascii_isalpha((unsigned char) (Curr_Letter)); w_idx++)
388 : {
389 : /* On the off chance we were given nothing but crap... */
390 0 : if (Curr_Letter == '\0')
391 : {
392 0 : End_Phoned_Word;
393 0 : return;
394 : }
395 0 : }
396 :
397 0 : switch (Curr_Letter)
398 : {
399 : /* AE becomes E */
400 : case 'A':
401 0 : if (Next_Letter == 'E')
402 : {
403 0 : Phonize('E');
404 0 : w_idx += 2;
405 0 : }
406 : /* Remember, preserve vowels at the beginning */
407 : else
408 : {
409 0 : Phonize('A');
410 0 : w_idx++;
411 : }
412 0 : break;
413 : /* [GKP]N becomes N */
414 : case 'G':
415 : case 'K':
416 : case 'P':
417 0 : if (Next_Letter == 'N')
418 : {
419 0 : Phonize('N');
420 0 : w_idx += 2;
421 0 : }
422 0 : break;
423 :
424 : /*
425 : * WH becomes H, WR becomes R W if followed by a vowel
426 : */
427 : case 'W':
428 0 : if (Next_Letter == 'H' ||
429 0 : Next_Letter == 'R')
430 : {
431 0 : Phonize(Next_Letter);
432 0 : w_idx += 2;
433 0 : }
434 0 : else if (isvowel(Next_Letter))
435 : {
436 0 : Phonize('W');
437 0 : w_idx += 2;
438 0 : }
439 : /* else ignore */
440 0 : break;
441 : /* X becomes S */
442 : case 'X':
443 0 : Phonize('S');
444 0 : w_idx++;
445 0 : break;
446 : /* Vowels are kept */
447 :
448 : /*
449 : * We did A already case 'A': case 'a':
450 : */
451 : case 'E':
452 : case 'I':
453 : case 'O':
454 : case 'U':
455 0 : Phonize(Curr_Letter);
456 0 : w_idx++;
457 0 : break;
458 : default:
459 : /* do nothing */
460 0 : break;
461 : }
462 :
463 :
464 :
465 : /* On to the metaphoning */
466 0 : for (; Curr_Letter != '\0' &&
467 0 : (max_phonemes == 0 || Phone_Len < max_phonemes);
468 0 : w_idx++)
469 : {
470 : /*
471 : * How many letters to skip because an earlier encoding handled
472 : * multiple letters
473 : */
474 0 : unsigned short int skip_letter = 0;
475 :
476 :
477 : /*
478 : * THOUGHT: It would be nice if, rather than having things like...
479 : * well, SCI. For SCI you encode the S, then have to remember to skip
480 : * the C. So the phonome SCI invades both S and C. It would be
481 : * better, IMHO, to skip the C from the S part of the encoding. Hell,
482 : * I'm trying it.
483 : */
484 :
485 : /* Ignore non-alphas */
486 0 : if (!ascii_isalpha((unsigned char) (Curr_Letter)))
487 0 : continue;
488 :
489 : /* Drop duplicates, except CC */
490 0 : if (Curr_Letter == Prev_Letter &&
491 0 : Curr_Letter != 'C')
492 0 : continue;
493 :
494 0 : switch (Curr_Letter)
495 : {
496 : /* B -> B unless in MB */
497 : case 'B':
498 0 : if (Prev_Letter != 'M')
499 0 : Phonize('B');
500 0 : break;
501 :
502 : /*
503 : * 'sh' if -CIA- or -CH, but not SCH, except SCHW. (SCHW is
504 : * handled in S) S if -CI-, -CE- or -CY- dropped if -SCI-,
505 : * SCE-, -SCY- (handed in S) else K
506 : */
507 : case 'C':
508 0 : if (MAKESOFT(Next_Letter))
509 : { /* C[IEY] */
510 0 : if (After_Next_Letter == 'A' &&
511 0 : Next_Letter == 'I')
512 : { /* CIA */
513 0 : Phonize(SH);
514 0 : }
515 : /* SC[IEY] */
516 0 : else if (Prev_Letter == 'S')
517 : {
518 : /* Dropped */
519 0 : }
520 : else
521 0 : Phonize('S');
522 0 : }
523 0 : else if (Next_Letter == 'H')
524 : {
525 : #ifndef USE_TRADITIONAL_METAPHONE
526 0 : if (After_Next_Letter == 'R' ||
527 0 : Prev_Letter == 'S')
528 : { /* Christ, School */
529 0 : Phonize('K');
530 0 : }
531 : else
532 0 : Phonize(SH);
533 : #else
534 : Phonize(SH);
535 : #endif
536 0 : skip_letter++;
537 0 : }
538 : else
539 0 : Phonize('K');
540 0 : break;
541 :
542 : /*
543 : * J if in -DGE-, -DGI- or -DGY- else T
544 : */
545 : case 'D':
546 0 : if (Next_Letter == 'G' &&
547 0 : MAKESOFT(After_Next_Letter))
548 : {
549 0 : Phonize('J');
550 0 : skip_letter++;
551 0 : }
552 : else
553 0 : Phonize('T');
554 0 : break;
555 :
556 : /*
557 : * F if in -GH and not B--GH, D--GH, -H--GH, -H---GH else
558 : * dropped if -GNED, -GN, else dropped if -DGE-, -DGI- or
559 : * -DGY- (handled in D) else J if in -GE-, -GI, -GY and not GG
560 : * else K
561 : */
562 : case 'G':
563 0 : if (Next_Letter == 'H')
564 : {
565 0 : if (!(NOGHTOF(Look_Back_Letter(3)) ||
566 0 : Look_Back_Letter(4) == 'H'))
567 : {
568 0 : Phonize('F');
569 0 : skip_letter++;
570 0 : }
571 : else
572 : {
573 : /* silent */
574 : }
575 0 : }
576 0 : else if (Next_Letter == 'N')
577 : {
578 0 : if (Isbreak(After_Next_Letter) ||
579 0 : (After_Next_Letter == 'E' &&
580 0 : Look_Ahead_Letter(3) == 'D'))
581 : {
582 : /* dropped */
583 0 : }
584 : else
585 0 : Phonize('K');
586 0 : }
587 0 : else if (MAKESOFT(Next_Letter) &&
588 0 : Prev_Letter != 'G')
589 0 : Phonize('J');
590 : else
591 0 : Phonize('K');
592 0 : break;
593 : /* H if before a vowel and not after C,G,P,S,T */
594 : case 'H':
595 0 : if (isvowel(Next_Letter) &&
596 0 : !AFFECTH(Prev_Letter))
597 0 : Phonize('H');
598 0 : break;
599 :
600 : /*
601 : * dropped if after C else K
602 : */
603 : case 'K':
604 0 : if (Prev_Letter != 'C')
605 0 : Phonize('K');
606 0 : break;
607 :
608 : /*
609 : * F if before H else P
610 : */
611 : case 'P':
612 0 : if (Next_Letter == 'H')
613 0 : Phonize('F');
614 : else
615 0 : Phonize('P');
616 0 : break;
617 :
618 : /*
619 : * K
620 : */
621 : case 'Q':
622 0 : Phonize('K');
623 0 : break;
624 :
625 : /*
626 : * 'sh' in -SH-, -SIO- or -SIA- or -SCHW- else S
627 : */
628 : case 'S':
629 0 : if (Next_Letter == 'I' &&
630 0 : (After_Next_Letter == 'O' ||
631 0 : After_Next_Letter == 'A'))
632 0 : Phonize(SH);
633 0 : else if (Next_Letter == 'H')
634 : {
635 0 : Phonize(SH);
636 0 : skip_letter++;
637 0 : }
638 : #ifndef USE_TRADITIONAL_METAPHONE
639 0 : else if (Next_Letter == 'C' &&
640 0 : Look_Ahead_Letter(2) == 'H' &&
641 0 : Look_Ahead_Letter(3) == 'W')
642 : {
643 0 : Phonize(SH);
644 0 : skip_letter += 2;
645 0 : }
646 : #endif
647 : else
648 0 : Phonize('S');
649 0 : break;
650 :
651 : /*
652 : * 'sh' in -TIA- or -TIO- else 'th' before H else T
653 : */
654 : case 'T':
655 0 : if (Next_Letter == 'I' &&
656 0 : (After_Next_Letter == 'O' ||
657 0 : After_Next_Letter == 'A'))
658 0 : Phonize(SH);
659 0 : else if (Next_Letter == 'H')
660 : {
661 0 : Phonize(TH);
662 0 : skip_letter++;
663 0 : }
664 : else
665 0 : Phonize('T');
666 0 : break;
667 : /* F */
668 : case 'V':
669 0 : Phonize('F');
670 0 : break;
671 : /* W before a vowel, else dropped */
672 : case 'W':
673 0 : if (isvowel(Next_Letter))
674 0 : Phonize('W');
675 0 : break;
676 : /* KS */
677 : case 'X':
678 0 : Phonize('K');
679 0 : if (max_phonemes == 0 || Phone_Len < max_phonemes)
680 0 : Phonize('S');
681 0 : break;
682 : /* Y if followed by a vowel */
683 : case 'Y':
684 0 : if (isvowel(Next_Letter))
685 0 : Phonize('Y');
686 0 : break;
687 : /* S */
688 : case 'Z':
689 0 : Phonize('S');
690 0 : break;
691 : /* No transformation */
692 : case 'F':
693 : case 'J':
694 : case 'L':
695 : case 'M':
696 : case 'N':
697 : case 'R':
698 0 : Phonize(Curr_Letter);
699 0 : break;
700 : default:
701 : /* nothing */
702 0 : break;
703 : } /* END SWITCH */
704 :
705 0 : w_idx += skip_letter;
706 0 : } /* END FOR */
707 :
708 0 : End_Phoned_Word;
709 0 : } /* END metaphone */
710 :
711 :
712 : /*
713 : * SQL function: soundex(text) returns text
714 : */
715 0 : PG_FUNCTION_INFO_V1(soundex);
716 :
717 : Datum
718 0 : soundex(PG_FUNCTION_ARGS)
719 : {
720 0 : char outstr[SOUNDEX_LEN + 1];
721 0 : char *arg;
722 :
723 0 : arg = text_to_cstring(PG_GETARG_TEXT_PP(0));
724 :
725 0 : _soundex(arg, outstr);
726 :
727 0 : PG_RETURN_TEXT_P(cstring_to_text(outstr));
728 0 : }
729 :
730 : static void
731 0 : _soundex(const char *instr, char *outstr)
732 : {
733 0 : int count;
734 :
735 0 : Assert(instr);
736 0 : Assert(outstr);
737 :
738 : /* Skip leading non-alphabetic characters */
739 0 : while (*instr && !ascii_isalpha((unsigned char) *instr))
740 0 : ++instr;
741 :
742 : /* If no string left, return all-zeroes buffer */
743 0 : if (!*instr)
744 : {
745 0 : memset(outstr, '\0', SOUNDEX_LEN + 1);
746 0 : return;
747 : }
748 :
749 : /* Take the first letter as is */
750 0 : *outstr++ = (char) pg_ascii_toupper((unsigned char) *instr++);
751 :
752 0 : count = 1;
753 0 : while (*instr && count < SOUNDEX_LEN)
754 : {
755 0 : if (ascii_isalpha((unsigned char) *instr) &&
756 0 : soundex_code(*instr) != soundex_code(*(instr - 1)))
757 : {
758 0 : *outstr = soundex_code(*instr);
759 0 : if (*outstr != '0')
760 : {
761 0 : ++outstr;
762 0 : ++count;
763 0 : }
764 0 : }
765 0 : ++instr;
766 : }
767 :
768 : /* Fill with 0's */
769 0 : while (count < SOUNDEX_LEN)
770 : {
771 0 : *outstr = '0';
772 0 : ++outstr;
773 0 : ++count;
774 : }
775 :
776 : /* And null-terminate */
777 0 : *outstr = '\0';
778 0 : }
779 :
780 0 : PG_FUNCTION_INFO_V1(difference);
781 :
782 : Datum
783 0 : difference(PG_FUNCTION_ARGS)
784 : {
785 0 : char sndx1[SOUNDEX_LEN + 1],
786 : sndx2[SOUNDEX_LEN + 1];
787 0 : int i,
788 : result;
789 :
790 0 : _soundex(text_to_cstring(PG_GETARG_TEXT_PP(0)), sndx1);
791 0 : _soundex(text_to_cstring(PG_GETARG_TEXT_PP(1)), sndx2);
792 :
793 0 : result = 0;
794 0 : for (i = 0; i < SOUNDEX_LEN; i++)
795 : {
796 0 : if (sndx1[i] == sndx2[i])
797 0 : result++;
798 0 : }
799 :
800 0 : PG_RETURN_INT32(result);
801 0 : }
|