1/* 2 +----------------------------------------------------------------------+ 3 | PHP Version 5 | 4 +----------------------------------------------------------------------+ 5 | Copyright (c) 1997-2013 The PHP Group | 6 +----------------------------------------------------------------------+ 7 | This source file is subject to version 3.01 of the PHP license, | 8 | that is bundled with this package in the file LICENSE, and is | 9 | available through the world-wide-web at the following url: | 10 | http://www.php.net/license/3_01.txt | 11 | If you did not receive a copy of the PHP license and are unable to | 12 | obtain it through the world-wide-web, please send a note to | 13 | license@php.net so we can mail you a copy immediately. | 14 +----------------------------------------------------------------------+ 15 | Author: Thies C. Arntzen <thies@thieso.net> | 16 +----------------------------------------------------------------------+ 17*/ 18 19/* $Id$ */ 20 21/* 22 Based on CPANs "Text-Metaphone-1.96" by Michael G Schwern <schwern@pobox.com> 23*/ 24 25#include "php.h" 26#include "php_metaphone.h" 27 28static int metaphone(unsigned char *word, int word_len, long max_phonemes, char **phoned_word, int traditional); 29 30/* {{{ proto string metaphone(string text[, int phones]) 31 Break english phrases down into their phonemes */ 32PHP_FUNCTION(metaphone) 33{ 34 char *str; 35 char *result = 0; 36 int str_len; 37 long phones = 0; 38 39 if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "s|l", &str, &str_len, 40 &phones) == FAILURE) { 41 return; 42 } 43 44 if (metaphone((unsigned char *)str, str_len, phones, &result, 1) == 0) { 45 RETVAL_STRING(result, 0); 46 } else { 47 if (result) { 48 efree(result); 49 } 50 RETURN_FALSE; 51 } 52} 53/* }}} */ 54 55/* 56 this is now the original code by Michael G Schwern: 57 i've changed it just a slightly bit (use emalloc, 58 get rid of includes etc) 59 - thies - 13.09.1999 60*/ 61 62/*----------------------------- */ 63/* this used to be "metaphone.h" */ 64/*----------------------------- */ 65 66/* Special encodings */ 67#define SH 'X' 68#define TH '0' 69 70/*----------------------------- */ 71/* end of "metaphone.h" */ 72/*----------------------------- */ 73 74/*----------------------------- */ 75/* this used to be "metachar.h" */ 76/*----------------------------- */ 77 78/* Metachar.h ... little bits about characters for metaphone */ 79/*-- Character encoding array & accessing macros --*/ 80/* Stolen directly out of the book... */ 81char _codes[26] = 82{ 83 1, 16, 4, 16, 9, 2, 4, 16, 9, 2, 0, 2, 2, 2, 1, 4, 0, 2, 4, 4, 1, 0, 0, 0, 8, 0 84/* a b c d e f g h i j k l m n o p q r s t u v w x y z */ 85}; 86 87 88#define ENCODE(c) (isalpha(c) ? _codes[((toupper(c)) - 'A')] : 0) 89 90#define isvowel(c) (ENCODE(c) & 1) /* AEIOU */ 91 92/* These letters are passed through unchanged */ 93#define NOCHANGE(c) (ENCODE(c) & 2) /* FJMNR */ 94 95/* These form dipthongs when preceding H */ 96#define AFFECTH(c) (ENCODE(c) & 4) /* CGPST */ 97 98/* These make C and G soft */ 99#define MAKESOFT(c) (ENCODE(c) & 8) /* EIY */ 100 101/* These prevent GH from becoming F */ 102#define NOGHTOF(c) (ENCODE(c) & 16) /* BDH */ 103 104/*----------------------------- */ 105/* end of "metachar.h" */ 106/*----------------------------- */ 107 108/* I suppose I could have been using a character pointer instead of 109 * accesssing the array directly... */ 110 111/* Look at the next letter in the word */ 112#define Next_Letter (toupper(word[w_idx+1])) 113/* Look at the current letter in the word */ 114#define Curr_Letter (toupper(word[w_idx])) 115/* Go N letters back. */ 116#define Look_Back_Letter(n) (w_idx >= n ? toupper(word[w_idx-n]) : '\0') 117/* Previous letter. I dunno, should this return null on failure? */ 118#define Prev_Letter (Look_Back_Letter(1)) 119/* Look two letters down. It makes sure you don't walk off the string. */ 120#define After_Next_Letter (Next_Letter != '\0' ? toupper(word[w_idx+2]) \ 121 : '\0') 122#define Look_Ahead_Letter(n) (toupper(Lookahead(word+w_idx, n))) 123 124 125/* Allows us to safely look ahead an arbitrary # of letters */ 126/* I probably could have just used strlen... */ 127static char Lookahead(char *word, int how_far) 128{ 129 char letter_ahead = '\0'; /* null by default */ 130 int idx; 131 for (idx = 0; word[idx] != '\0' && idx < how_far; idx++); 132 /* Edge forward in the string... */ 133 134 letter_ahead = word[idx]; /* idx will be either == to how_far or 135 * at the end of the string 136 */ 137 return letter_ahead; 138} 139 140 141/* phonize one letter 142 * We don't know the buffers size in advance. On way to solve this is to just 143 * re-allocate the buffer size. We're using an extra of 2 characters (this 144 * could be one though; or more too). */ 145#define Phonize(c) { \ 146 if (p_idx >= max_buffer_len) { \ 147 *phoned_word = safe_erealloc(*phoned_word, 2, sizeof(char), max_buffer_len); \ 148 max_buffer_len += 2; \ 149 } \ 150 (*phoned_word)[p_idx++] = c; \ 151 } 152/* Slap a null character on the end of the phoned word */ 153#define End_Phoned_Word { \ 154 if (p_idx == max_buffer_len) { \ 155 *phoned_word = safe_erealloc(*phoned_word, 1, sizeof(char), max_buffer_len); \ 156 } \ 157 (*phoned_word)[p_idx] = '\0'; \ 158 } 159/* How long is the phoned word? */ 160#define Phone_Len (p_idx) 161 162/* Note is a letter is a 'break' in the word */ 163#define Isbreak(c) (!isalpha(c)) 164 165/* {{{ metaphone 166 */ 167static int metaphone(unsigned char *word, int word_len, long max_phonemes, char **phoned_word, int traditional) 168{ 169 int w_idx = 0; /* point in the phonization we're at. */ 170 int p_idx = 0; /* end of the phoned phrase */ 171 int max_buffer_len = 0; /* maximum length of the destination buffer */ 172 173/*-- Parameter checks --*/ 174 /* Negative phoneme length is meaningless */ 175 176 if (max_phonemes < 0) 177 return -1; 178 179 /* Empty/null string is meaningless */ 180 /* Overly paranoid */ 181 /* assert(word != NULL && word[0] != '\0'); */ 182 183 if (word == NULL) 184 return -1; 185 186/*-- Allocate memory for our phoned_phrase --*/ 187 if (max_phonemes == 0) { /* Assume largest possible */ 188 max_buffer_len = word_len; 189 *phoned_word = safe_emalloc(sizeof(char), word_len, 1); 190 } else { 191 max_buffer_len = max_phonemes; 192 *phoned_word = safe_emalloc(sizeof(char), max_phonemes, 1); 193 } 194 195 196/*-- The first phoneme has to be processed specially. --*/ 197 /* Find our first letter */ 198 for (; !isalpha(Curr_Letter); w_idx++) { 199 /* On the off chance we were given nothing but crap... */ 200 if (Curr_Letter == '\0') { 201 End_Phoned_Word 202 return SUCCESS; /* For testing */ 203 } 204 } 205 206 switch (Curr_Letter) { 207 /* AE becomes E */ 208 case 'A': 209 if (Next_Letter == 'E') { 210 Phonize('E'); 211 w_idx += 2; 212 } 213 /* Remember, preserve vowels at the beginning */ 214 else { 215 Phonize('A'); 216 w_idx++; 217 } 218 break; 219 /* [GKP]N becomes N */ 220 case 'G': 221 case 'K': 222 case 'P': 223 if (Next_Letter == 'N') { 224 Phonize('N'); 225 w_idx += 2; 226 } 227 break; 228 /* WH becomes W, 229 WR becomes R 230 W if followed by a vowel */ 231 case 'W': 232 if (Next_Letter == 'R') { 233 Phonize(Next_Letter); 234 w_idx += 2; 235 } else if (Next_Letter == 'H' || isvowel(Next_Letter)) { 236 Phonize('W'); 237 w_idx += 2; 238 } 239 /* else ignore */ 240 break; 241 /* X becomes S */ 242 case 'X': 243 Phonize('S'); 244 w_idx++; 245 break; 246 /* Vowels are kept */ 247 /* We did A already 248 case 'A': 249 case 'a': 250 */ 251 case 'E': 252 case 'I': 253 case 'O': 254 case 'U': 255 Phonize(Curr_Letter); 256 w_idx++; 257 break; 258 default: 259 /* do nothing */ 260 break; 261 } 262 263 264 265 /* On to the metaphoning */ 266 for (; Curr_Letter != '\0' && 267 (max_phonemes == 0 || Phone_Len < max_phonemes); 268 w_idx++) { 269 /* How many letters to skip because an eariler encoding handled 270 * multiple letters */ 271 unsigned short int skip_letter = 0; 272 273 274 /* THOUGHT: It would be nice if, rather than having things like... 275 * well, SCI. For SCI you encode the S, then have to remember 276 * to skip the C. So the phonome SCI invades both S and C. It would 277 * be better, IMHO, to skip the C from the S part of the encoding. 278 * Hell, I'm trying it. 279 */ 280 281 /* Ignore non-alphas */ 282 if (!isalpha(Curr_Letter)) 283 continue; 284 285 /* Drop duplicates, except CC */ 286 if (Curr_Letter == Prev_Letter && 287 Curr_Letter != 'C') 288 continue; 289 290 switch (Curr_Letter) { 291 /* B -> B unless in MB */ 292 case 'B': 293 if (Prev_Letter != 'M') 294 Phonize('B'); 295 break; 296 /* 'sh' if -CIA- or -CH, but not SCH, except SCHW. 297 * (SCHW is handled in S) 298 * S if -CI-, -CE- or -CY- 299 * dropped if -SCI-, SCE-, -SCY- (handed in S) 300 * else K 301 */ 302 case 'C': 303 if (MAKESOFT(Next_Letter)) { /* C[IEY] */ 304 if (After_Next_Letter == 'A' && 305 Next_Letter == 'I') { /* CIA */ 306 Phonize(SH); 307 } 308 /* SC[IEY] */ 309 else if (Prev_Letter == 'S') { 310 /* Dropped */ 311 } else { 312 Phonize('S'); 313 } 314 } else if (Next_Letter == 'H') { 315 if ((!traditional) && (After_Next_Letter == 'R' || Prev_Letter == 'S')) { /* Christ, School */ 316 Phonize('K'); 317 } else { 318 Phonize(SH); 319 } 320 skip_letter++; 321 } else { 322 Phonize('K'); 323 } 324 break; 325 /* J if in -DGE-, -DGI- or -DGY- 326 * else T 327 */ 328 case 'D': 329 if (Next_Letter == 'G' && 330 MAKESOFT(After_Next_Letter)) { 331 Phonize('J'); 332 skip_letter++; 333 } else 334 Phonize('T'); 335 break; 336 /* F if in -GH and not B--GH, D--GH, -H--GH, -H---GH 337 * else dropped if -GNED, -GN, 338 * else dropped if -DGE-, -DGI- or -DGY- (handled in D) 339 * else J if in -GE-, -GI, -GY and not GG 340 * else K 341 */ 342 case 'G': 343 if (Next_Letter == 'H') { 344 if (!(NOGHTOF(Look_Back_Letter(3)) || 345 Look_Back_Letter(4) == 'H')) { 346 Phonize('F'); 347 skip_letter++; 348 } else { 349 /* silent */ 350 } 351 } else if (Next_Letter == 'N') { 352 if (Isbreak(After_Next_Letter) || 353 (After_Next_Letter == 'E' && 354 Look_Ahead_Letter(3) == 'D')) { 355 /* dropped */ 356 } else 357 Phonize('K'); 358 } else if (MAKESOFT(Next_Letter) && 359 Prev_Letter != 'G') { 360 Phonize('J'); 361 } else { 362 Phonize('K'); 363 } 364 break; 365 /* H if before a vowel and not after C,G,P,S,T */ 366 case 'H': 367 if (isvowel(Next_Letter) && 368 !AFFECTH(Prev_Letter)) 369 Phonize('H'); 370 break; 371 /* dropped if after C 372 * else K 373 */ 374 case 'K': 375 if (Prev_Letter != 'C') 376 Phonize('K'); 377 break; 378 /* F if before H 379 * else P 380 */ 381 case 'P': 382 if (Next_Letter == 'H') { 383 Phonize('F'); 384 } else { 385 Phonize('P'); 386 } 387 break; 388 /* K 389 */ 390 case 'Q': 391 Phonize('K'); 392 break; 393 /* 'sh' in -SH-, -SIO- or -SIA- or -SCHW- 394 * else S 395 */ 396 case 'S': 397 if (Next_Letter == 'I' && 398 (After_Next_Letter == 'O' || 399 After_Next_Letter == 'A')) { 400 Phonize(SH); 401 } else if (Next_Letter == 'H') { 402 Phonize(SH); 403 skip_letter++; 404 } else if ((!traditional) && (Next_Letter == 'C' && Look_Ahead_Letter(2) == 'H' && Look_Ahead_Letter(3) == 'W')) { 405 Phonize(SH); 406 skip_letter += 2; 407 } else { 408 Phonize('S'); 409 } 410 break; 411 /* 'sh' in -TIA- or -TIO- 412 * else 'th' before H 413 * else T 414 */ 415 case 'T': 416 if (Next_Letter == 'I' && 417 (After_Next_Letter == 'O' || 418 After_Next_Letter == 'A')) { 419 Phonize(SH); 420 } else if (Next_Letter == 'H') { 421 Phonize(TH); 422 skip_letter++; 423 } else if (!(Next_Letter == 'C' && After_Next_Letter == 'H')) { 424 Phonize('T'); 425 } 426 break; 427 /* F */ 428 case 'V': 429 Phonize('F'); 430 break; 431 /* W before a vowel, else dropped */ 432 case 'W': 433 if (isvowel(Next_Letter)) 434 Phonize('W'); 435 break; 436 /* KS */ 437 case 'X': 438 Phonize('K'); 439 Phonize('S'); 440 break; 441 /* Y if followed by a vowel */ 442 case 'Y': 443 if (isvowel(Next_Letter)) 444 Phonize('Y'); 445 break; 446 /* S */ 447 case 'Z': 448 Phonize('S'); 449 break; 450 /* No transformation */ 451 case 'F': 452 case 'J': 453 case 'L': 454 case 'M': 455 case 'N': 456 case 'R': 457 Phonize(Curr_Letter); 458 break; 459 default: 460 /* nothing */ 461 break; 462 } /* END SWITCH */ 463 464 w_idx += skip_letter; 465 } /* END FOR */ 466 467 End_Phoned_Word; 468 469 return 0; 470} /* END metaphone */ 471/* }}} */ 472 473/* 474 * Local variables: 475 * tab-width: 4 476 * c-basic-offset: 4 477 * End: 478 * vim600: sw=4 ts=4 fdm=marker 479 * vim<600: sw=4 ts=4 480 */ 481