1/* 2 +----------------------------------------------------------------------+ 3 | PHP Version 5 | 4 +----------------------------------------------------------------------+ 5 | This source file is subject to version 3.01 of the PHP license, | 6 | that is bundled with this package in the file LICENSE, and is | 7 | available through the world-wide-web at the following url: | 8 | http://www.php.net/license/3_01.txt | 9 | If you did not receive a copy of the PHP license and are unable to | 10 | obtain it through the world-wide-web, please send a note to | 11 | license@php.net so we can mail you a copy immediately. | 12 +----------------------------------------------------------------------+ 13 | Author: Ed Batutis <ed@batutis.com> | 14 +----------------------------------------------------------------------+ 15 */ 16 17/* {{{ includes */ 18#ifdef HAVE_CONFIG_H 19#include "config.h" 20#endif 21 22#include <php.h> 23#include "grapheme.h" 24#include "grapheme_util.h" 25 26#include <unicode/utypes.h> 27#include <unicode/ucol.h> 28#include <unicode/ustring.h> 29#include <unicode/ubrk.h> 30 31#include "ext/standard/php_string.h" 32 33/* }}} */ 34 35#define GRAPHEME_EXTRACT_TYPE_COUNT 0 36#define GRAPHEME_EXTRACT_TYPE_MAXBYTES 1 37#define GRAPHEME_EXTRACT_TYPE_MAXCHARS 2 38#define GRAPHEME_EXTRACT_TYPE_MIN GRAPHEME_EXTRACT_TYPE_COUNT 39#define GRAPHEME_EXTRACT_TYPE_MAX GRAPHEME_EXTRACT_TYPE_MAXCHARS 40 41 42/* {{{ grapheme_register_constants 43 * Register API constants 44 */ 45void grapheme_register_constants( INIT_FUNC_ARGS ) 46{ 47 REGISTER_LONG_CONSTANT("GRAPHEME_EXTR_COUNT", GRAPHEME_EXTRACT_TYPE_COUNT, CONST_CS | CONST_PERSISTENT); 48 REGISTER_LONG_CONSTANT("GRAPHEME_EXTR_MAXBYTES", GRAPHEME_EXTRACT_TYPE_MAXBYTES, CONST_CS | CONST_PERSISTENT); 49 REGISTER_LONG_CONSTANT("GRAPHEME_EXTR_MAXCHARS", GRAPHEME_EXTRACT_TYPE_MAXCHARS, CONST_CS | CONST_PERSISTENT); 50} 51/* }}} */ 52 53/* {{{ proto int grapheme_strlen(string str) 54 Get number of graphemes in a string */ 55PHP_FUNCTION(grapheme_strlen) 56{ 57 unsigned char* string; 58 int string_len; 59 UChar* ustring = NULL; 60 int ustring_len = 0; 61 int ret_len; 62 UErrorCode status; 63 64 if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "s", (char **)&string, &string_len) == FAILURE) { 65 66 intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, 67 "grapheme_strlen: unable to parse input param", 0 TSRMLS_CC ); 68 69 RETURN_FALSE; 70 } 71 72 ret_len = grapheme_ascii_check(string, string_len); 73 74 if ( ret_len >= 0 ) 75 RETURN_LONG(ret_len); 76 77 /* convert the string to UTF-16. */ 78 status = U_ZERO_ERROR; 79 intl_convert_utf8_to_utf16(&ustring, &ustring_len, (char*) string, string_len, &status ); 80 81 if ( U_FAILURE( status ) ) { 82 /* Set global error code. */ 83 intl_error_set_code( NULL, status TSRMLS_CC ); 84 85 /* Set error messages. */ 86 intl_error_set_custom_msg( NULL, "Error converting input string to UTF-16", 0 TSRMLS_CC ); 87 if (ustring) { 88 efree( ustring ); 89 } 90 RETURN_NULL(); 91 } 92 93 ret_len = grapheme_split_string(ustring, ustring_len, NULL, 0 TSRMLS_CC ); 94 95 if (ustring) { 96 efree( ustring ); 97 } 98 99 if (ret_len >= 0) { 100 RETVAL_LONG(ret_len); 101 } else { 102 RETVAL_FALSE; 103 } 104} 105/* }}} */ 106 107/* {{{ proto int grapheme_strpos(string haystack, string needle [, int offset ]) 108 Find position of first occurrence of a string within another */ 109PHP_FUNCTION(grapheme_strpos) 110{ 111 unsigned char *haystack, *needle; 112 int haystack_len, needle_len; 113 unsigned char *found; 114 long loffset = 0; 115 int32_t offset = 0; 116 int ret_pos, uchar_pos; 117 118 if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "ss|l", (char **)&haystack, &haystack_len, (char **)&needle, &needle_len, &loffset) == FAILURE) { 119 120 intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, 121 "grapheme_strpos: unable to parse input param", 0 TSRMLS_CC ); 122 123 RETURN_FALSE; 124 } 125 126 if ( OUTSIDE_STRING(loffset, haystack_len) ) { 127 128 intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_strpos: Offset not contained in string", 1 TSRMLS_CC ); 129 130 RETURN_FALSE; 131 } 132 133 /* we checked that it will fit: */ 134 offset = (int32_t) loffset; 135 136 /* the offset is 'grapheme count offset' so it still might be invalid - we'll check it later */ 137 138 if (needle_len == 0) { 139 140 intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_strpos: Empty delimiter", 1 TSRMLS_CC ); 141 142 RETURN_FALSE; 143 } 144 145 146 /* quick check to see if the string might be there 147 * I realize that 'offset' is 'grapheme count offset' but will work in spite of that 148 */ 149 found = (unsigned char *)php_memnstr((char *)haystack + offset, (char *)needle, needle_len, (char *)haystack + haystack_len); 150 151 /* if it isn't there the we are done */ 152 if (!found) { 153 RETURN_FALSE; 154 } 155 156 /* if it is there, and if the haystack is ascii, we are all done */ 157 if ( grapheme_ascii_check(haystack, haystack_len) >= 0 ) { 158 159 RETURN_LONG(found - haystack); 160 } 161 162 /* do utf16 part of the strpos */ 163 ret_pos = grapheme_strpos_utf16(haystack, haystack_len, needle, needle_len, offset, &uchar_pos, 0 /* fIgnoreCase */ TSRMLS_CC ); 164 165 if ( ret_pos >= 0 ) { 166 RETURN_LONG(ret_pos + offset); 167 } else { 168 RETURN_FALSE; 169 } 170 171} 172/* }}} */ 173 174/* {{{ proto int grapheme_stripos(string haystack, string needle [, int offset ]) 175 Find position of first occurrence of a string within another, ignoring case differences */ 176PHP_FUNCTION(grapheme_stripos) 177{ 178 unsigned char *haystack, *needle, *haystack_dup, *needle_dup; 179 int haystack_len, needle_len; 180 unsigned char *found; 181 long loffset = 0; 182 int32_t offset = 0; 183 int ret_pos, uchar_pos; 184 int is_ascii; 185 186 if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "ss|l", (char **)&haystack, &haystack_len, (char **)&needle, &needle_len, &loffset) == FAILURE) { 187 188 intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, 189 "grapheme_stripos: unable to parse input param", 0 TSRMLS_CC ); 190 191 RETURN_FALSE; 192 } 193 194 if ( OUTSIDE_STRING(loffset, haystack_len) ) { 195 196 intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_stripos: Offset not contained in string", 1 TSRMLS_CC ); 197 198 RETURN_FALSE; 199 } 200 201 /* we checked that it will fit: */ 202 offset = (int32_t) loffset; 203 204 /* the offset is 'grapheme count offset' so it still might be invalid - we'll check it later */ 205 206 if (needle_len == 0) { 207 208 intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_stripos: Empty delimiter", 1 TSRMLS_CC ); 209 210 RETURN_FALSE; 211 } 212 213 214 is_ascii = ( grapheme_ascii_check(haystack, haystack_len) >= 0 ); 215 216 if ( is_ascii ) { 217 needle_dup = (unsigned char *)estrndup((char *)needle, needle_len); 218 php_strtolower((char *)needle_dup, needle_len); 219 haystack_dup = (unsigned char *)estrndup((char *)haystack, haystack_len); 220 php_strtolower((char *)haystack_dup, haystack_len); 221 222 found = (unsigned char*) php_memnstr((char *)haystack_dup + offset, (char *)needle_dup, needle_len, (char *)haystack_dup + haystack_len); 223 224 efree(haystack_dup); 225 efree(needle_dup); 226 227 if (found) { 228 RETURN_LONG(found - haystack_dup); 229 } 230 231 /* if needle was ascii too, we are all done, otherwise we need to try using Unicode to see what we get */ 232 if ( grapheme_ascii_check(needle, needle_len) >= 0 ) { 233 RETURN_FALSE; 234 } 235 } 236 237 /* do utf16 part of the strpos */ 238 ret_pos = grapheme_strpos_utf16(haystack, haystack_len, needle, needle_len, offset, &uchar_pos, 1 /* fIgnoreCase */ TSRMLS_CC ); 239 240 if ( ret_pos >= 0 ) { 241 RETURN_LONG(ret_pos + offset); 242 } else { 243 RETURN_FALSE; 244 } 245 246} 247/* }}} */ 248 249/* {{{ proto int grapheme_strrpos(string haystack, string needle [, int offset]) 250 Find position of last occurrence of a string within another */ 251PHP_FUNCTION(grapheme_strrpos) 252{ 253 unsigned char *haystack, *needle; 254 int haystack_len, needle_len; 255 long loffset = 0; 256 int32_t offset = 0; 257 int32_t ret_pos; 258 int is_ascii; 259 260 if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "ss|l", (char **)&haystack, &haystack_len, (char **)&needle, &needle_len, &loffset) == FAILURE) { 261 262 intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, 263 "grapheme_strrpos: unable to parse input param", 0 TSRMLS_CC ); 264 265 RETURN_FALSE; 266 } 267 268 if ( OUTSIDE_STRING(loffset, haystack_len) ) { 269 270 intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_strpos: Offset not contained in string", 1 TSRMLS_CC ); 271 272 RETURN_FALSE; 273 } 274 275 /* we checked that it will fit: */ 276 offset = (int32_t) loffset; 277 278 /* the offset is 'grapheme count offset' so it still might be invalid - we'll check it later */ 279 280 if (needle_len == 0) { 281 282 intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_strpos: Empty delimiter", 1 TSRMLS_CC ); 283 284 RETURN_FALSE; 285 } 286 287 is_ascii = grapheme_ascii_check(haystack, haystack_len) >= 0; 288 289 if ( is_ascii ) { 290 291 ret_pos = grapheme_strrpos_ascii(haystack, haystack_len, needle, needle_len, offset); 292 293 294 if ( ret_pos >= 0 ) { 295 RETURN_LONG(ret_pos); 296 } 297 298 /* if the needle was ascii too, we are done */ 299 300 if ( grapheme_ascii_check(needle, needle_len) >= 0 ) { 301 RETURN_FALSE; 302 } 303 304 /* else we need to continue via utf16 */ 305 } 306 307 ret_pos = grapheme_strrpos_utf16(haystack, haystack_len, needle, needle_len, offset, 0 /* f_ignore_case */ TSRMLS_CC); 308 309 if ( ret_pos >= 0 ) { 310 RETURN_LONG(ret_pos); 311 } else { 312 RETURN_FALSE; 313 } 314 315 316} 317/* }}} */ 318 319/* {{{ proto int grapheme_strripos(string haystack, string needle [, int offset]) 320 Find position of last occurrence of a string within another, ignoring case */ 321PHP_FUNCTION(grapheme_strripos) 322{ 323 unsigned char *haystack, *needle; 324 int haystack_len, needle_len; 325 long loffset = 0; 326 int32_t offset = 0; 327 int32_t ret_pos; 328 int is_ascii; 329 330 if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "ss|l", (char **)&haystack, &haystack_len, (char **)&needle, &needle_len, &loffset) == FAILURE) { 331 332 intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, 333 "grapheme_strrpos: unable to parse input param", 0 TSRMLS_CC ); 334 335 RETURN_FALSE; 336 } 337 338 if ( OUTSIDE_STRING(loffset, haystack_len) ) { 339 340 intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_strpos: Offset not contained in string", 1 TSRMLS_CC ); 341 342 RETURN_FALSE; 343 } 344 345 /* we checked that it will fit: */ 346 offset = (int32_t) loffset; 347 348 /* the offset is 'grapheme count offset' so it still might be invalid - we'll check it later */ 349 350 if (needle_len == 0) { 351 352 intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_strpos: Empty delimiter", 1 TSRMLS_CC ); 353 354 RETURN_FALSE; 355 } 356 357 is_ascii = grapheme_ascii_check(haystack, haystack_len) >= 0; 358 359 if ( is_ascii ) { 360 unsigned char *needle_dup, *haystack_dup; 361 362 needle_dup = (unsigned char *)estrndup((char *)needle, needle_len); 363 php_strtolower((char *)needle_dup, needle_len); 364 haystack_dup = (unsigned char *)estrndup((char *)haystack, haystack_len); 365 php_strtolower((char *)haystack_dup, haystack_len); 366 367 ret_pos = grapheme_strrpos_ascii(haystack_dup, haystack_len, needle_dup, needle_len, offset); 368 369 efree(haystack_dup); 370 efree(needle_dup); 371 372 if ( ret_pos >= 0 ) { 373 RETURN_LONG(ret_pos); 374 } 375 376 /* if the needle was ascii too, we are done */ 377 378 if ( grapheme_ascii_check(needle, needle_len) >= 0 ) { 379 RETURN_FALSE; 380 } 381 382 /* else we need to continue via utf16 */ 383 } 384 385 ret_pos = grapheme_strrpos_utf16(haystack, haystack_len, needle, needle_len, offset, 1 /* f_ignore_case */ TSRMLS_CC); 386 387 if ( ret_pos >= 0 ) { 388 RETURN_LONG(ret_pos); 389 } else { 390 RETURN_FALSE; 391 } 392 393 394} 395/* }}} */ 396 397/* {{{ proto string grapheme_substr(string str, int start [, int length]) 398 Returns part of a string */ 399PHP_FUNCTION(grapheme_substr) 400{ 401 unsigned char *str, *sub_str; 402 UChar *ustr; 403 int str_len, sub_str_len, ustr_len; 404 long lstart = 0, length = 0; 405 int32_t start = 0; 406 int iter_val; 407 UErrorCode status; 408 unsigned char u_break_iterator_buffer[U_BRK_SAFECLONE_BUFFERSIZE]; 409 UBreakIterator* bi = NULL; 410 int sub_str_start_pos, sub_str_end_pos; 411 int32_t (*iter_func)(UBreakIterator *); 412 413 if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "sl|l", (char **)&str, &str_len, &lstart, &length) == FAILURE) { 414 415 intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, 416 "grapheme_substr: unable to parse input param", 0 TSRMLS_CC ); 417 418 RETURN_FALSE; 419 } 420 421 if ( OUTSIDE_STRING(lstart, str_len) ) { 422 423 intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_substr: start not contained in string", 1 TSRMLS_CC ); 424 425 RETURN_FALSE; 426 } 427 428 /* we checked that it will fit: */ 429 start = (int32_t) lstart; 430 431 /* the offset is 'grapheme count offset' so it still might be invalid - we'll check it later */ 432 433 if ( grapheme_ascii_check(str, str_len) >= 0 ) { 434 grapheme_substr_ascii((char *)str, str_len, start, length, ZEND_NUM_ARGS(), (char **) &sub_str, &sub_str_len); 435 436 if ( NULL == sub_str ) { 437 RETURN_FALSE; 438 } 439 440 RETURN_STRINGL(((char *)sub_str), sub_str_len, 1); 441 } 442 443 ustr = NULL; 444 ustr_len = 0; 445 status = U_ZERO_ERROR; 446 intl_convert_utf8_to_utf16(&ustr, &ustr_len, (char *)str, str_len, &status); 447 448 if ( U_FAILURE( status ) ) { 449 /* Set global error code. */ 450 intl_error_set_code( NULL, status TSRMLS_CC ); 451 452 /* Set error messages. */ 453 intl_error_set_custom_msg( NULL, "Error converting input string to UTF-16", 0 TSRMLS_CC ); 454 if (ustr) { 455 efree( ustr ); 456 } 457 RETURN_FALSE; 458 } 459 460 bi = grapheme_get_break_iterator((void*)u_break_iterator_buffer, &status TSRMLS_CC ); 461 462 if( U_FAILURE(status) ) { 463 RETURN_FALSE; 464 } 465 466 ubrk_setText(bi, ustr, ustr_len, &status); 467 468 if ( start < 0 ) { 469 iter_func = ubrk_previous; 470 ubrk_last(bi); 471 iter_val = 1; 472 } 473 else { 474 iter_func = ubrk_next; 475 iter_val = -1; 476 } 477 478 sub_str_start_pos = 0; 479 480 while ( start ) { 481 sub_str_start_pos = iter_func(bi); 482 483 if ( UBRK_DONE == sub_str_start_pos ) { 484 break; 485 } 486 487 start += iter_val; 488 } 489 490 if ( 0 != start || sub_str_start_pos >= ustr_len ) { 491 492 intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_substr: start not contained in string", 1 TSRMLS_CC ); 493 494 if (ustr) { 495 efree(ustr); 496 } 497 ubrk_close(bi); 498 RETURN_FALSE; 499 } 500 501 if (ZEND_NUM_ARGS() <= 2) { 502 503 /* no length supplied, return the rest of the string */ 504 505 sub_str = NULL; 506 sub_str_len = 0; 507 status = U_ZERO_ERROR; 508 intl_convert_utf16_to_utf8((char **)&sub_str, &sub_str_len, ustr + sub_str_start_pos, ustr_len - sub_str_start_pos, &status); 509 510 if (ustr) { 511 efree( ustr ); 512 } 513 ubrk_close( bi ); 514 515 if ( U_FAILURE( status ) ) { 516 /* Set global error code. */ 517 intl_error_set_code( NULL, status TSRMLS_CC ); 518 519 /* Set error messages. */ 520 intl_error_set_custom_msg( NULL, "Error converting output string to UTF-8", 0 TSRMLS_CC ); 521 522 if (sub_str) { 523 efree( sub_str ); 524 } 525 526 RETURN_FALSE; 527 } 528 529 /* return the allocated string, not a duplicate */ 530 RETURN_STRINGL(((char *)sub_str), sub_str_len, 0); 531 } 532 533 /* find the end point of the string to return */ 534 535 if ( length < 0 ) { 536 iter_func = ubrk_previous; 537 ubrk_last(bi); 538 iter_val = 1; 539 } 540 else { 541 iter_func = ubrk_next; 542 iter_val = -1; 543 } 544 545 sub_str_end_pos = 0; 546 547 while ( length ) { 548 sub_str_end_pos = iter_func(bi); 549 550 if ( UBRK_DONE == sub_str_end_pos ) { 551 break; 552 } 553 554 length += iter_val; 555 } 556 557 if ( UBRK_DONE == sub_str_end_pos) { 558 if(length < 0) { 559 560 intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_substr: length not contained in string", 1 TSRMLS_CC ); 561 562 efree(ustr); 563 ubrk_close(bi); 564 RETURN_FALSE; 565 } else { 566 sub_str_end_pos = ustr_len; 567 } 568 } 569 570 sub_str = NULL; 571 status = U_ZERO_ERROR; 572 intl_convert_utf16_to_utf8((char **)&sub_str, &sub_str_len, ustr + sub_str_start_pos, ( sub_str_end_pos - sub_str_start_pos ), &status); 573 574 efree( ustr ); 575 ubrk_close( bi ); 576 577 if ( U_FAILURE( status ) ) { 578 /* Set global error code. */ 579 intl_error_set_code( NULL, status TSRMLS_CC ); 580 581 /* Set error messages. */ 582 intl_error_set_custom_msg( NULL, "Error converting output string to UTF-8", 0 TSRMLS_CC ); 583 584 if ( NULL != sub_str ) 585 efree( sub_str ); 586 587 RETURN_FALSE; 588 } 589 590 /* return the allocated string, not a duplicate */ 591 RETURN_STRINGL(((char *)sub_str), sub_str_len, 0); 592 593} 594/* }}} */ 595 596/* {{{ strstr_common_handler */ 597static void strstr_common_handler(INTERNAL_FUNCTION_PARAMETERS, int f_ignore_case) 598{ 599 unsigned char *haystack, *needle, *found; 600 int haystack_len, needle_len; 601 int ret_pos, uchar_pos; 602 zend_bool part = 0; 603 604 if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "ss|b", (char **)&haystack, &haystack_len, (char **)&needle, &needle_len, &part) == FAILURE) { 605 606 intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, 607 "grapheme_strstr: unable to parse input param", 0 TSRMLS_CC ); 608 609 RETURN_FALSE; 610 } 611 612 if (needle_len == 0) { 613 614 intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_strpos: Empty delimiter", 1 TSRMLS_CC ); 615 616 RETURN_FALSE; 617 } 618 619 620 if ( !f_ignore_case ) { 621 622 /* ASCII optimization: quick check to see if the string might be there 623 * I realize that 'offset' is 'grapheme count offset' but will work in spite of that 624 */ 625 found = (unsigned char *)php_memnstr((char *)haystack, (char *)needle, needle_len, (char *)haystack + haystack_len); 626 627 /* if it isn't there the we are done */ 628 if ( !found ) { 629 RETURN_FALSE; 630 } 631 632 /* if it is there, and if the haystack is ascii, we are all done */ 633 if ( grapheme_ascii_check(haystack, haystack_len) >= 0 ) { 634 size_t found_offset = found - haystack; 635 636 if (part) { 637 RETURN_STRINGL(((char *)haystack) , found_offset, 1); 638 } else { 639 RETURN_STRINGL(((char *)found), haystack_len - found_offset, 1); 640 } 641 } 642 643 } 644 645 /* need to work in utf16 */ 646 ret_pos = grapheme_strpos_utf16(haystack, haystack_len, needle, needle_len, 0, &uchar_pos, f_ignore_case TSRMLS_CC ); 647 648 if ( ret_pos < 0 ) { 649 RETURN_FALSE; 650 } 651 652 /* uchar_pos is the 'nth' Unicode character position of the needle */ 653 654 ret_pos = 0; 655 U8_FWD_N(haystack, ret_pos, haystack_len, uchar_pos); 656 657 if (part) { 658 RETURN_STRINGL(((char *)haystack), ret_pos, 1); 659 } 660 else { 661 RETURN_STRINGL(((char *)haystack) + ret_pos, haystack_len - ret_pos, 1); 662 } 663 664} 665/* }}} */ 666 667/* {{{ proto string grapheme_strstr(string haystack, string needle[, bool part]) 668 Finds first occurrence of a string within another */ 669PHP_FUNCTION(grapheme_strstr) 670{ 671 strstr_common_handler(INTERNAL_FUNCTION_PARAM_PASSTHRU, 0 /* f_ignore_case */); 672} 673/* }}} */ 674 675/* {{{ proto string grapheme_stristr(string haystack, string needle[, bool part]) 676 Finds first occurrence of a string within another */ 677PHP_FUNCTION(grapheme_stristr) 678{ 679 strstr_common_handler(INTERNAL_FUNCTION_PARAM_PASSTHRU, 1 /* f_ignore_case */); 680} 681/* }}} */ 682 683/* {{{ grapheme_extract_charcount_iter - grapheme iterator for grapheme_extract MAXCHARS */ 684static inline int32_t 685grapheme_extract_charcount_iter(UBreakIterator *bi, int32_t csize, unsigned char *pstr, int32_t str_len) 686{ 687 int pos = 0, prev_pos = 0; 688 int ret_pos = 0, prev_ret_pos = 0; 689 690 while ( 1 ) { 691 pos = ubrk_next(bi); 692 693 if ( UBRK_DONE == pos ) { 694 break; 695 } 696 697 /* if we are beyond our limit, then the loop is done */ 698 if ( pos > csize ) { 699 break; 700 } 701 702 /* update our pointer in the original UTF-8 buffer by as many characters 703 as ubrk_next iterated over */ 704 705 prev_ret_pos = ret_pos; 706 U8_FWD_N(pstr, ret_pos, str_len, pos - prev_pos); 707 708 if ( prev_ret_pos == ret_pos ) { 709 /* something wrong - malformed utf8? */ 710 break; 711 } 712 713 prev_pos = pos; 714 } 715 716 return ret_pos; 717} 718/* }}} */ 719 720/* {{{ grapheme_extract_bytecount_iter - grapheme iterator for grapheme_extract MAXBYTES */ 721static inline int32_t 722grapheme_extract_bytecount_iter(UBreakIterator *bi, int32_t bsize, unsigned char *pstr, int32_t str_len) 723{ 724 int pos = 0, prev_pos = 0; 725 int ret_pos = 0, prev_ret_pos = 0; 726 727 while ( 1 ) { 728 pos = ubrk_next(bi); 729 730 if ( UBRK_DONE == pos ) { 731 break; 732 } 733 734 prev_ret_pos = ret_pos; 735 U8_FWD_N(pstr, ret_pos, str_len, pos - prev_pos); 736 737 if ( ret_pos > bsize ) { 738 ret_pos = prev_ret_pos; 739 break; 740 } 741 742 if ( prev_ret_pos == ret_pos ) { 743 /* something wrong - malformed utf8? */ 744 break; 745 } 746 747 prev_pos = pos; 748 } 749 750 return ret_pos; 751} 752/* }}} */ 753 754/* {{{ grapheme_extract_count_iter - grapheme iterator for grapheme_extract COUNT */ 755static inline int32_t 756grapheme_extract_count_iter(UBreakIterator *bi, int32_t size, unsigned char *pstr, int32_t str_len) 757{ 758 int pos = 0, next_pos = 0; 759 int ret_pos = 0; 760 761 while ( size ) { 762 next_pos = ubrk_next(bi); 763 764 if ( UBRK_DONE == next_pos ) { 765 break; 766 } 767 pos = next_pos; 768 size--; 769 } 770 771 /* pos is one past the last UChar - and represent the number of code units to 772 advance in the utf-8 buffer 773 */ 774 775 U8_FWD_N(pstr, ret_pos, str_len, pos); 776 777 return ret_pos; 778} 779/* }}} */ 780 781/* {{{ grapheme extract iter function pointer array */ 782typedef int32_t (*grapheme_extract_iter)(UBreakIterator * /*bi*/, int32_t /*size*/, unsigned char * /*pstr*/, int32_t /*str_len*/); 783 784static grapheme_extract_iter grapheme_extract_iters[] = { 785 &grapheme_extract_count_iter, 786 &grapheme_extract_bytecount_iter, 787 &grapheme_extract_charcount_iter, 788}; 789/* }}} */ 790 791/* {{{ proto string grapheme_extract(string str, int size[, int extract_type[, int start[, int next]]]) 792 Function to extract a sequence of default grapheme clusters */ 793PHP_FUNCTION(grapheme_extract) 794{ 795 unsigned char *str, *pstr; 796 UChar *ustr; 797 int str_len, ustr_len; 798 long size; /* maximum number of grapheme clusters, bytes, or characters (based on extract_type) to return */ 799 long lstart = 0; /* starting position in str in bytes */ 800 int32_t start = 0; 801 long extract_type = GRAPHEME_EXTRACT_TYPE_COUNT; 802 UErrorCode status; 803 unsigned char u_break_iterator_buffer[U_BRK_SAFECLONE_BUFFERSIZE]; 804 UBreakIterator* bi = NULL; 805 int ret_pos; 806 zval *next = NULL; /* return offset of next part of the string */ 807 808 if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "sl|llz", (char **)&str, &str_len, &size, &extract_type, &lstart, &next) == FAILURE) { 809 810 intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, 811 "grapheme_extract: unable to parse input param", 0 TSRMLS_CC ); 812 813 RETURN_FALSE; 814 } 815 816 if ( NULL != next ) { 817 if ( !PZVAL_IS_REF(next) ) { 818 intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, 819 "grapheme_extract: 'next' was not passed by reference", 0 TSRMLS_CC ); 820 821 RETURN_FALSE; 822 } 823 else { 824 /* initialize next */ 825 zval_dtor(next); 826 ZVAL_LONG(next, lstart); 827 } 828 } 829 830 if ( extract_type < GRAPHEME_EXTRACT_TYPE_MIN || extract_type > GRAPHEME_EXTRACT_TYPE_MAX ) { 831 832 intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, 833 "grapheme_extract: unknown extract type param", 0 TSRMLS_CC ); 834 835 RETURN_FALSE; 836 } 837 838 if ( lstart > INT32_MAX || lstart < 0 || lstart >= str_len ) { 839 intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_extract: start not contained in string", 0 TSRMLS_CC ); 840 RETURN_FALSE; 841 } 842 843 if ( size > INT32_MAX || size < 0) { 844 intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_extract: size is invalid", 0 TSRMLS_CC ); 845 RETURN_FALSE; 846 } 847 if (size == 0) { 848 RETURN_EMPTY_STRING(); 849 } 850 851 /* we checked that it will fit: */ 852 start = (int32_t) lstart; 853 854 pstr = str + start; 855 856 /* just in case pstr points in the middle of a character, move forward to the start of the next char */ 857 if ( !UTF8_IS_SINGLE(*pstr) && !U8_IS_LEAD(*pstr) ) { 858 unsigned char *str_end = str + str_len; 859 860 while ( !UTF8_IS_SINGLE(*pstr) && !U8_IS_LEAD(*pstr) ) { 861 pstr++; 862 if ( pstr >= str_end ) { 863 intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, 864 "grapheme_extract: invalid input string", 0 TSRMLS_CC ); 865 866 RETURN_FALSE; 867 } 868 } 869 } 870 871 str_len -= (pstr - str); 872 873 /* if the string is all ASCII up to size+1 - or str_len whichever is first - then we are done. 874 (size + 1 because the size-th character might be the beginning of a grapheme cluster) 875 */ 876 877 if ( -1 != grapheme_ascii_check(pstr, size + 1 < str_len ? size + 1 : str_len ) ) { 878 long nsize = ( size < str_len ? size : str_len ); 879 if ( NULL != next ) { 880 ZVAL_LONG(next, start+nsize); 881 } 882 RETURN_STRINGL(((char *)pstr), nsize, 1); 883 } 884 885 /* convert the strings to UTF-16. */ 886 ustr = NULL; 887 ustr_len = 0; 888 status = U_ZERO_ERROR; 889 intl_convert_utf8_to_utf16(&ustr, &ustr_len, (char *)pstr, str_len, &status ); 890 891 if ( U_FAILURE( status ) ) { 892 /* Set global error code. */ 893 intl_error_set_code( NULL, status TSRMLS_CC ); 894 895 /* Set error messages. */ 896 intl_error_set_custom_msg( NULL, "Error converting input string to UTF-16", 0 TSRMLS_CC ); 897 898 if ( NULL != ustr ) 899 efree( ustr ); 900 901 RETURN_FALSE; 902 } 903 904 bi = NULL; 905 status = U_ZERO_ERROR; 906 bi = grapheme_get_break_iterator(u_break_iterator_buffer, &status TSRMLS_CC ); 907 908 ubrk_setText(bi, ustr, ustr_len, &status); 909 910 /* if the caller put us in the middle of a grapheme, we can't detect it in all cases since we 911 can't back up. So, we will not do anything. */ 912 913 /* now we need to find the end of the chunk the user wants us to return */ 914 915 ret_pos = (*grapheme_extract_iters[extract_type])(bi, size, pstr, str_len); 916 917 if (ustr) { 918 efree(ustr); 919 } 920 ubrk_close(bi); 921 922 if ( NULL != next ) { 923 ZVAL_LONG(next, start+ret_pos); 924 } 925 926 RETURN_STRINGL(((char *)pstr), ret_pos, 1); 927} 928 929/* }}} */ 930 931/* 932 * Local variables: 933 * tab-width: 4 934 * c-basic-offset: 4 935 * End: 936 * vim600: fdm=marker 937 * vim: noet sw=4 ts=4 938 */ 939 940