1/*
2   +----------------------------------------------------------------------+
3   | PHP Version 7                                                        |
4   +----------------------------------------------------------------------+
5   | Copyright (c) 1997-2014 The PHP Group                                |
6   +----------------------------------------------------------------------+
7   | This source file is subject to version 3.01 of the PHP license,      |
8   | that is bundled with this package in the file LICENSE, and is        |
9   | available through the world-wide-web at the following url:           |
10   | http://www.php.net/license/3_01.txt                                  |
11   | If you did not receive a copy of the PHP license and are unable to   |
12   | obtain it through the world-wide-web, please send a note to          |
13   | license@php.net so we can mail you a copy immediately.               |
14   +----------------------------------------------------------------------+
15   | Author: Thies C. Arntzen <thies@thieso.net>                          |
16   +----------------------------------------------------------------------+
17*/
18
19/* $Id$ */
20
21/*
22    Based on CPANs "Text-Metaphone-1.96" by Michael G Schwern <schwern@pobox.com>
23*/
24
25#include "php.h"
26#include "php_metaphone.h"
27
28static int metaphone(unsigned char *word, size_t word_len, zend_long max_phonemes, zend_string **phoned_word, int traditional);
29
30/* {{{ proto string metaphone(string text[, int phones])
31   Break english phrases down into their phonemes */
32PHP_FUNCTION(metaphone)
33{
34    zend_string *str;
35    zend_string *result = NULL;
36    zend_long phones = 0;
37
38    if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "S|l", &str, &phones) == FAILURE) {
39        return;
40    }
41
42    if (metaphone((unsigned char *)str->val, str->len, phones, &result, 1) == 0) {
43        RETVAL_STR(result);
44    } else {
45        if (result) {
46            zend_string_free(result);
47        }
48        RETURN_FALSE;
49    }
50}
51/* }}} */
52
53/*
54   this is now the original code by Michael G Schwern:
55   i've changed it just a slightly bit (use emalloc,
56   get rid of includes etc)
57    - thies - 13.09.1999
58*/
59
60/*-----------------------------  */
61/* this used to be "metaphone.h" */
62/*-----------------------------  */
63
64/* Special encodings */
65#define  SH     'X'
66#define  TH     '0'
67
68/*-----------------------------  */
69/* end of "metaphone.h"          */
70/*-----------------------------  */
71
72/*----------------------------- */
73/* this used to be "metachar.h" */
74/*----------------------------- */
75
76/* Metachar.h ... little bits about characters for metaphone */
77/*-- Character encoding array & accessing macros --*/
78/* Stolen directly out of the book... */
79char _codes[26] =
80{
81    1, 16, 4, 16, 9, 2, 4, 16, 9, 2, 0, 2, 2, 2, 1, 4, 0, 2, 4, 4, 1, 0, 0, 0, 8, 0
82/*  a  b c  d e f g  h i j k l m n o p q r s t u v w x y z */
83};
84
85
86#define ENCODE(c) (isalpha(c) ? _codes[((toupper(c)) - 'A')] : 0)
87
88#define isvowel(c)  (ENCODE(c) & 1)     /* AEIOU */
89
90/* These letters are passed through unchanged */
91#define NOCHANGE(c) (ENCODE(c) & 2)     /* FJMNR */
92
93/* These form diphthongs when preceding H */
94#define AFFECTH(c)  (ENCODE(c) & 4)     /* CGPST */
95
96/* These make C and G soft */
97#define MAKESOFT(c) (ENCODE(c) & 8)     /* EIY */
98
99/* These prevent GH from becoming F */
100#define NOGHTOF(c)  (ENCODE(c) & 16)    /* BDH */
101
102/*----------------------------- */
103/* end of "metachar.h"          */
104/*----------------------------- */
105
106/* I suppose I could have been using a character pointer instead of
107 * accesssing the array directly... */
108
109/* Look at the next letter in the word */
110#define Next_Letter (toupper(word[w_idx+1]))
111/* Look at the current letter in the word */
112#define Curr_Letter (toupper(word[w_idx]))
113/* Go N letters back. */
114#define Look_Back_Letter(n) (w_idx >= n ? toupper(word[w_idx-n]) : '\0')
115/* Previous letter.  I dunno, should this return null on failure? */
116#define Prev_Letter (Look_Back_Letter(1))
117/* Look two letters down.  It makes sure you don't walk off the string. */
118#define After_Next_Letter   (Next_Letter != '\0' ? toupper(word[w_idx+2]) \
119                                                 : '\0')
120#define Look_Ahead_Letter(n) (toupper(Lookahead(word+w_idx, n)))
121
122
123/* Allows us to safely look ahead an arbitrary # of letters */
124/* I probably could have just used strlen... */
125static char Lookahead(char *word, int how_far)
126{
127    char letter_ahead = '\0';   /* null by default */
128    int idx;
129    for (idx = 0; word[idx] != '\0' && idx < how_far; idx++);
130    /* Edge forward in the string... */
131
132    letter_ahead = word[idx];   /* idx will be either == to how_far or
133                                 * at the end of the string
134                                 */
135    return letter_ahead;
136}
137
138
139/* phonize one letter
140 * We don't know the buffers size in advance. On way to solve this is to just
141 * re-allocate the buffer size. We're using an extra of 2 characters (this
142 * could be one though; or more too). */
143#define Phonize(c)  { \
144                        if (p_idx >= max_buffer_len) { \
145                            *phoned_word = zend_string_realloc(*phoned_word, 2 * sizeof(char) + max_buffer_len, 0); \
146                            max_buffer_len += 2; \
147                        } \
148                        (*phoned_word)->val[p_idx++] = c; \
149                        (*phoned_word)->len = p_idx; \
150                    }
151/* Slap a null character on the end of the phoned word */
152#define End_Phoned_Word { \
153                            if (p_idx == max_buffer_len) { \
154                                *phoned_word = zend_string_realloc(*phoned_word, 1 * sizeof(char) + max_buffer_len, 0); \
155                                max_buffer_len += 1; \
156                            } \
157                            (*phoned_word)->val[p_idx] = '\0'; \
158                            (*phoned_word)->len = p_idx; \
159                        }
160/* How long is the phoned word? */
161#define Phone_Len   (p_idx)
162
163/* Note is a letter is a 'break' in the word */
164#define Isbreak(c)  (!isalpha(c))
165
166/* {{{ metaphone
167 */
168static int metaphone(unsigned char *word, size_t word_len, zend_long max_phonemes, zend_string **phoned_word, int traditional)
169{
170    int w_idx = 0;              /* point in the phonization we're at. */
171    int p_idx = 0;              /* end of the phoned phrase */
172    size_t max_buffer_len = 0;      /* maximum length of the destination buffer */
173
174/*-- Parameter checks --*/
175    /* Negative phoneme length is meaningless */
176
177    if (max_phonemes < 0)
178        return -1;
179
180    /* Empty/null string is meaningless */
181    /* Overly paranoid */
182    /* assert(word != NULL && word[0] != '\0'); */
183
184    if (word == NULL)
185        return -1;
186
187/*-- Allocate memory for our phoned_phrase --*/
188    if (max_phonemes == 0) {    /* Assume largest possible */
189        max_buffer_len = word_len;
190        *phoned_word = zend_string_alloc(sizeof(char) * word_len + 1, 0);
191    } else {
192        max_buffer_len = max_phonemes;
193        *phoned_word = zend_string_alloc(sizeof(char) * max_phonemes + 1, 0);
194    }
195
196
197/*-- The first phoneme has to be processed specially. --*/
198    /* Find our first letter */
199    for (; !isalpha(Curr_Letter); w_idx++) {
200        /* On the off chance we were given nothing but crap... */
201        if (Curr_Letter == '\0') {
202            End_Phoned_Word
203                return SUCCESS; /* For testing */
204        }
205    }
206
207    switch (Curr_Letter) {
208        /* AE becomes E */
209    case 'A':
210        if (Next_Letter == 'E') {
211            Phonize('E');
212            w_idx += 2;
213        }
214        /* Remember, preserve vowels at the beginning */
215        else {
216            Phonize('A');
217            w_idx++;
218        }
219        break;
220        /* [GKP]N becomes N */
221    case 'G':
222    case 'K':
223    case 'P':
224        if (Next_Letter == 'N') {
225            Phonize('N');
226            w_idx += 2;
227        }
228        break;
229        /* WH becomes W,
230           WR becomes R
231           W if followed by a vowel */
232    case 'W':
233        if (Next_Letter == 'R') {
234            Phonize(Next_Letter);
235            w_idx += 2;
236        } else if (Next_Letter == 'H' || isvowel(Next_Letter)) {
237            Phonize('W');
238            w_idx += 2;
239        }
240        /* else ignore */
241        break;
242        /* X becomes S */
243    case 'X':
244        Phonize('S');
245        w_idx++;
246        break;
247        /* Vowels are kept */
248        /* We did A already
249           case 'A':
250           case 'a':
251         */
252    case 'E':
253    case 'I':
254    case 'O':
255    case 'U':
256        Phonize(Curr_Letter);
257        w_idx++;
258        break;
259    default:
260        /* do nothing */
261        break;
262    }
263
264
265
266    /* On to the metaphoning */
267    for (; Curr_Letter != '\0' &&
268         (max_phonemes == 0 || Phone_Len < max_phonemes);
269         w_idx++) {
270        /* How many letters to skip because an eariler encoding handled
271         * multiple letters */
272        unsigned short int skip_letter = 0;
273
274
275        /* THOUGHT:  It would be nice if, rather than having things like...
276         * well, SCI.  For SCI you encode the S, then have to remember
277         * to skip the C.  So the phonome SCI invades both S and C.  It would
278         * be better, IMHO, to skip the C from the S part of the encoding.
279         * Hell, I'm trying it.
280         */
281
282        /* Ignore non-alphas */
283        if (!isalpha(Curr_Letter))
284            continue;
285
286        /* Drop duplicates, except CC */
287        if (Curr_Letter == Prev_Letter &&
288            Curr_Letter != 'C')
289            continue;
290
291        switch (Curr_Letter) {
292            /* B -> B unless in MB */
293        case 'B':
294            if (Prev_Letter != 'M')
295                Phonize('B');
296            break;
297            /* 'sh' if -CIA- or -CH, but not SCH, except SCHW.
298             * (SCHW is handled in S)
299             *  S if -CI-, -CE- or -CY-
300             *  dropped if -SCI-, SCE-, -SCY- (handed in S)
301             *  else K
302             */
303        case 'C':
304            if (MAKESOFT(Next_Letter)) {    /* C[IEY] */
305                if (After_Next_Letter == 'A' &&
306                    Next_Letter == 'I') {   /* CIA */
307                    Phonize(SH);
308                }
309                /* SC[IEY] */
310                else if (Prev_Letter == 'S') {
311                    /* Dropped */
312                } else {
313                    Phonize('S');
314                }
315            } else if (Next_Letter == 'H') {
316                if ((!traditional) && (After_Next_Letter == 'R' || Prev_Letter == 'S')) {   /* Christ, School */
317                    Phonize('K');
318                } else {
319                    Phonize(SH);
320                }
321                skip_letter++;
322            } else {
323                Phonize('K');
324            }
325            break;
326            /* J if in -DGE-, -DGI- or -DGY-
327             * else T
328             */
329        case 'D':
330            if (Next_Letter == 'G' &&
331                MAKESOFT(After_Next_Letter)) {
332                Phonize('J');
333                skip_letter++;
334            } else
335                Phonize('T');
336            break;
337            /* F if in -GH and not B--GH, D--GH, -H--GH, -H---GH
338             * else dropped if -GNED, -GN,
339             * else dropped if -DGE-, -DGI- or -DGY- (handled in D)
340             * else J if in -GE-, -GI, -GY and not GG
341             * else K
342             */
343        case 'G':
344            if (Next_Letter == 'H') {
345                if (!(NOGHTOF(Look_Back_Letter(3)) ||
346                      Look_Back_Letter(4) == 'H')) {
347                    Phonize('F');
348                    skip_letter++;
349                } else {
350                    /* silent */
351                }
352            } else if (Next_Letter == 'N') {
353                if (Isbreak(After_Next_Letter) ||
354                    (After_Next_Letter == 'E' &&
355                     Look_Ahead_Letter(3) == 'D')) {
356                    /* dropped */
357                } else
358                    Phonize('K');
359            } else if (MAKESOFT(Next_Letter) &&
360                       Prev_Letter != 'G') {
361                Phonize('J');
362            } else {
363                Phonize('K');
364            }
365            break;
366            /* H if before a vowel and not after C,G,P,S,T */
367        case 'H':
368            if (isvowel(Next_Letter) &&
369                !AFFECTH(Prev_Letter))
370                Phonize('H');
371            break;
372            /* dropped if after C
373             * else K
374             */
375        case 'K':
376            if (Prev_Letter != 'C')
377                Phonize('K');
378            break;
379            /* F if before H
380             * else P
381             */
382        case 'P':
383            if (Next_Letter == 'H') {
384                Phonize('F');
385            } else {
386                Phonize('P');
387            }
388            break;
389            /* K
390             */
391        case 'Q':
392            Phonize('K');
393            break;
394            /* 'sh' in -SH-, -SIO- or -SIA- or -SCHW-
395             * else S
396             */
397        case 'S':
398            if (Next_Letter == 'I' &&
399                (After_Next_Letter == 'O' ||
400                 After_Next_Letter == 'A')) {
401                Phonize(SH);
402            } else if (Next_Letter == 'H') {
403                Phonize(SH);
404                skip_letter++;
405            } else if ((!traditional) && (Next_Letter == 'C' && Look_Ahead_Letter(2) == 'H' && Look_Ahead_Letter(3) == 'W')) {
406                Phonize(SH);
407                skip_letter += 2;
408            } else {
409                Phonize('S');
410            }
411            break;
412            /* 'sh' in -TIA- or -TIO-
413             * else 'th' before H
414             * else T
415             */
416        case 'T':
417            if (Next_Letter == 'I' &&
418                (After_Next_Letter == 'O' ||
419                 After_Next_Letter == 'A')) {
420                Phonize(SH);
421            } else if (Next_Letter == 'H') {
422                Phonize(TH);
423                skip_letter++;
424            } else if (!(Next_Letter == 'C' && After_Next_Letter == 'H')) {
425                Phonize('T');
426            }
427            break;
428            /* F */
429        case 'V':
430            Phonize('F');
431            break;
432            /* W before a vowel, else dropped */
433        case 'W':
434            if (isvowel(Next_Letter))
435                Phonize('W');
436            break;
437            /* KS */
438        case 'X':
439            Phonize('K');
440            Phonize('S');
441            break;
442            /* Y if followed by a vowel */
443        case 'Y':
444            if (isvowel(Next_Letter))
445                Phonize('Y');
446            break;
447            /* S */
448        case 'Z':
449            Phonize('S');
450            break;
451            /* No transformation */
452        case 'F':
453        case 'J':
454        case 'L':
455        case 'M':
456        case 'N':
457        case 'R':
458            Phonize(Curr_Letter);
459            break;
460        default:
461            /* nothing */
462            break;
463        }                       /* END SWITCH */
464
465        w_idx += skip_letter;
466    }                           /* END FOR */
467
468    End_Phoned_Word;
469
470    return 0;
471}                               /* END metaphone */
472/* }}} */
473
474/*
475 * Local variables:
476 * tab-width: 4
477 * c-basic-offset: 4
478 * End:
479 * vim600: sw=4 ts=4 fdm=marker
480 * vim<600: sw=4 ts=4
481 */
482