1/* 2 +----------------------------------------------------------------------+ 3 | PHP Version 5 | 4 +----------------------------------------------------------------------+ 5 | This source file is subject to version 3.01 of the PHP license, | 6 | that is bundled with this package in the file LICENSE, and is | 7 | available through the world-wide-web at the following url: | 8 | http://www.php.net/license/3_01.txt | 9 | If you did not receive a copy of the PHP license and are unable to | 10 | obtain it through the world-wide-web, please send a note to | 11 | license@php.net so we can mail you a copy immediately. | 12 +----------------------------------------------------------------------+ 13 | Author: Ed Batutis <ed@batutis.com> | 14 +----------------------------------------------------------------------+ 15 */ 16 17/* {{{ includes */ 18#ifdef HAVE_CONFIG_H 19#include "config.h" 20#endif 21 22#include <php.h> 23#include "grapheme.h" 24#include "grapheme_util.h" 25 26#include <unicode/utypes.h> 27#include <unicode/ucol.h> 28#include <unicode/ustring.h> 29#include <unicode/ubrk.h> 30 31#include "ext/standard/php_string.h" 32 33/* }}} */ 34 35#define GRAPHEME_EXTRACT_TYPE_COUNT 0 36#define GRAPHEME_EXTRACT_TYPE_MAXBYTES 1 37#define GRAPHEME_EXTRACT_TYPE_MAXCHARS 2 38#define GRAPHEME_EXTRACT_TYPE_MIN GRAPHEME_EXTRACT_TYPE_COUNT 39#define GRAPHEME_EXTRACT_TYPE_MAX GRAPHEME_EXTRACT_TYPE_MAXCHARS 40 41 42/* {{{ grapheme_register_constants 43 * Register API constants 44 */ 45void grapheme_register_constants( INIT_FUNC_ARGS ) 46{ 47 REGISTER_LONG_CONSTANT("GRAPHEME_EXTR_COUNT", GRAPHEME_EXTRACT_TYPE_COUNT, CONST_CS | CONST_PERSISTENT); 48 REGISTER_LONG_CONSTANT("GRAPHEME_EXTR_MAXBYTES", GRAPHEME_EXTRACT_TYPE_MAXBYTES, CONST_CS | CONST_PERSISTENT); 49 REGISTER_LONG_CONSTANT("GRAPHEME_EXTR_MAXCHARS", GRAPHEME_EXTRACT_TYPE_MAXCHARS, CONST_CS | CONST_PERSISTENT); 50} 51/* }}} */ 52 53/* {{{ proto int grapheme_strlen(string str) 54 Get number of graphemes in a string */ 
55PHP_FUNCTION(grapheme_strlen) 56{ 57 unsigned char* string; 58 int string_len; 59 UChar* ustring = NULL; 60 int ustring_len = 0; 61 int ret_len; 62 UErrorCode status; 63 64 if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "s", (char **)&string, &string_len) == FAILURE) { 65 66 intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, 67 "grapheme_strlen: unable to parse input param", 0 TSRMLS_CC ); 68 69 RETURN_FALSE; 70 } 71 72 ret_len = grapheme_ascii_check(string, string_len); 73 74 if ( ret_len >= 0 ) 75 RETURN_LONG(ret_len); 76 77 /* convert the string to UTF-16. */ 78 status = U_ZERO_ERROR; 79 intl_convert_utf8_to_utf16(&ustring, &ustring_len, (char*) string, string_len, &status ); 80 81 if ( U_FAILURE( status ) ) { 82 /* Set global error code. */ 83 intl_error_set_code( NULL, status TSRMLS_CC ); 84 85 /* Set error messages. */ 86 intl_error_set_custom_msg( NULL, "Error converting input string to UTF-16", 0 TSRMLS_CC ); 87 if (ustring) { 88 efree( ustring ); 89 } 90 RETURN_NULL(); 91 } 92 93 ret_len = grapheme_split_string(ustring, ustring_len, NULL, 0 TSRMLS_CC ); 94 95 if (ustring) { 96 efree( ustring ); 97 } 98 99 if (ret_len >= 0) { 100 RETVAL_LONG(ret_len); 101 } else { 102 RETVAL_FALSE; 103 } 104} 105/* }}} */ 106 107/* {{{ proto int grapheme_strpos(string haystack, string needle [, int offset ]) 108 Find position of first occurrence of a string within another */ 109PHP_FUNCTION(grapheme_strpos) 110{ 111 unsigned char *haystack, *needle; 112 int haystack_len, needle_len; 113 unsigned char *found; 114 long loffset = 0; 115 int32_t offset = 0; 116 int ret_pos, uchar_pos; 117 118 if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "ss|l", (char **)&haystack, &haystack_len, (char **)&needle, &needle_len, &loffset) == FAILURE) { 119 120 intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, 121 "grapheme_strpos: unable to parse input param", 0 TSRMLS_CC ); 122 123 RETURN_FALSE; 124 } 125 126 if ( OUTSIDE_STRING(loffset, haystack_len) ) { 127 128 intl_error_set( NULL, 
U_ILLEGAL_ARGUMENT_ERROR, "grapheme_strpos: Offset not contained in string", 1 TSRMLS_CC ); 129 130 RETURN_FALSE; 131 } 132 133 /* we checked that it will fit: */ 134 offset = (int32_t) loffset; 135 136 /* the offset is 'grapheme count offset' so it still might be invalid - we'll check it later */ 137 138 if (needle_len == 0) { 139 140 intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_strpos: Empty delimiter", 1 TSRMLS_CC ); 141 142 RETURN_FALSE; 143 } 144 145 146 /* quick check to see if the string might be there 147 * I realize that 'offset' is 'grapheme count offset' but will work in spite of that 148 */ 149 found = (unsigned char *)php_memnstr((char *)haystack + offset, (char *)needle, needle_len, (char *)haystack + haystack_len); 150 151 /* if it isn't there the we are done */ 152 if (!found) { 153 RETURN_FALSE; 154 } 155 156 /* if it is there, and if the haystack is ascii, we are all done */ 157 if ( grapheme_ascii_check(haystack, haystack_len) >= 0 ) { 158 159 RETURN_LONG(found - haystack); 160 } 161 162 /* do utf16 part of the strpos */ 163 ret_pos = grapheme_strpos_utf16(haystack, haystack_len, needle, needle_len, offset, &uchar_pos, 0 /* fIgnoreCase */ TSRMLS_CC ); 164 165 if ( ret_pos >= 0 ) { 166 RETURN_LONG(ret_pos + offset); 167 } else { 168 RETURN_FALSE; 169 } 170 171} 172/* }}} */ 173 174/* {{{ proto int grapheme_stripos(string haystack, string needle [, int offset ]) 175 Find position of first occurrence of a string within another, ignoring case differences */ 176PHP_FUNCTION(grapheme_stripos) 177{ 178 unsigned char *haystack, *needle, *haystack_dup, *needle_dup; 179 int haystack_len, needle_len; 180 unsigned char *found; 181 long loffset = 0; 182 int32_t offset = 0; 183 int ret_pos, uchar_pos; 184 int is_ascii; 185 186 if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "ss|l", (char **)&haystack, &haystack_len, (char **)&needle, &needle_len, &loffset) == FAILURE) { 187 188 intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, 189 
"grapheme_stripos: unable to parse input param", 0 TSRMLS_CC ); 190 191 RETURN_FALSE; 192 } 193 194 if ( OUTSIDE_STRING(loffset, haystack_len) ) { 195 196 intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_stripos: Offset not contained in string", 1 TSRMLS_CC ); 197 198 RETURN_FALSE; 199 } 200 201 /* we checked that it will fit: */ 202 offset = (int32_t) loffset; 203 204 /* the offset is 'grapheme count offset' so it still might be invalid - we'll check it later */ 205 206 if (needle_len == 0) { 207 208 intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_stripos: Empty delimiter", 1 TSRMLS_CC ); 209 210 RETURN_FALSE; 211 } 212 213 214 is_ascii = ( grapheme_ascii_check(haystack, haystack_len) >= 0 ); 215 216 if ( is_ascii ) { 217 needle_dup = (unsigned char *)estrndup((char *)needle, needle_len); 218 php_strtolower((char *)needle_dup, needle_len); 219 haystack_dup = (unsigned char *)estrndup((char *)haystack, haystack_len); 220 php_strtolower((char *)haystack_dup, haystack_len); 221 222 found = (unsigned char*) php_memnstr((char *)haystack_dup + offset, (char *)needle_dup, needle_len, (char *)haystack_dup + haystack_len); 223 224 efree(haystack_dup); 225 efree(needle_dup); 226 227 if (found) { 228 RETURN_LONG(found - haystack_dup); 229 } 230 231 /* if needle was ascii too, we are all done, otherwise we need to try using Unicode to see what we get */ 232 if ( grapheme_ascii_check(needle, needle_len) >= 0 ) { 233 RETURN_FALSE; 234 } 235 } 236 237 /* do utf16 part of the strpos */ 238 ret_pos = grapheme_strpos_utf16(haystack, haystack_len, needle, needle_len, offset, &uchar_pos, 1 /* fIgnoreCase */ TSRMLS_CC ); 239 240 if ( ret_pos >= 0 ) { 241 RETURN_LONG(ret_pos + offset); 242 } else { 243 RETURN_FALSE; 244 } 245 246} 247/* }}} */ 248 249/* {{{ proto int grapheme_strrpos(string haystack, string needle [, int offset]) 250 Find position of last occurrence of a string within another */ 251PHP_FUNCTION(grapheme_strrpos) 252{ 253 unsigned char *haystack, 
*needle; 254 int haystack_len, needle_len; 255 long loffset = 0; 256 int32_t offset = 0; 257 int32_t ret_pos; 258 int is_ascii; 259 260 if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "ss|l", (char **)&haystack, &haystack_len, (char **)&needle, &needle_len, &loffset) == FAILURE) { 261 262 intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, 263 "grapheme_strrpos: unable to parse input param", 0 TSRMLS_CC ); 264 265 RETURN_FALSE; 266 } 267 268 if ( OUTSIDE_STRING(loffset, haystack_len) ) { 269 270 intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_strpos: Offset not contained in string", 1 TSRMLS_CC ); 271 272 RETURN_FALSE; 273 } 274 275 /* we checked that it will fit: */ 276 offset = (int32_t) loffset; 277 278 /* the offset is 'grapheme count offset' so it still might be invalid - we'll check it later */ 279 280 if (needle_len == 0) { 281 282 intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_strpos: Empty delimiter", 1 TSRMLS_CC ); 283 284 RETURN_FALSE; 285 } 286 287 is_ascii = grapheme_ascii_check(haystack, haystack_len) >= 0; 288 289 if ( is_ascii ) { 290 291 ret_pos = grapheme_strrpos_ascii(haystack, haystack_len, needle, needle_len, offset); 292 293 294 if ( ret_pos >= 0 ) { 295 RETURN_LONG(ret_pos); 296 } 297 298 /* if the needle was ascii too, we are done */ 299 300 if ( grapheme_ascii_check(needle, needle_len) >= 0 ) { 301 RETURN_FALSE; 302 } 303 304 /* else we need to continue via utf16 */ 305 } 306 307 ret_pos = grapheme_strrpos_utf16(haystack, haystack_len, needle, needle_len, offset, 0 /* f_ignore_case */ TSRMLS_CC); 308 309 if ( ret_pos >= 0 ) { 310 RETURN_LONG(ret_pos); 311 } else { 312 RETURN_FALSE; 313 } 314 315 316} 317/* }}} */ 318 319/* {{{ proto int grapheme_strripos(string haystack, string needle [, int offset]) 320 Find position of last occurrence of a string within another, ignoring case */ 321PHP_FUNCTION(grapheme_strripos) 322{ 323 unsigned char *haystack, *needle; 324 int haystack_len, needle_len; 325 long loffset = 0; 326 
int32_t offset = 0; 327 int32_t ret_pos; 328 int is_ascii; 329 330 if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "ss|l", (char **)&haystack, &haystack_len, (char **)&needle, &needle_len, &loffset) == FAILURE) { 331 332 intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, 333 "grapheme_strrpos: unable to parse input param", 0 TSRMLS_CC ); 334 335 RETURN_FALSE; 336 } 337 338 if ( OUTSIDE_STRING(loffset, haystack_len) ) { 339 340 intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_strpos: Offset not contained in string", 1 TSRMLS_CC ); 341 342 RETURN_FALSE; 343 } 344 345 /* we checked that it will fit: */ 346 offset = (int32_t) loffset; 347 348 /* the offset is 'grapheme count offset' so it still might be invalid - we'll check it later */ 349 350 if (needle_len == 0) { 351 352 intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_strpos: Empty delimiter", 1 TSRMLS_CC ); 353 354 RETURN_FALSE; 355 } 356 357 is_ascii = grapheme_ascii_check(haystack, haystack_len) >= 0; 358 359 if ( is_ascii ) { 360 unsigned char *needle_dup, *haystack_dup; 361 362 needle_dup = (unsigned char *)estrndup((char *)needle, needle_len); 363 php_strtolower((char *)needle_dup, needle_len); 364 haystack_dup = (unsigned char *)estrndup((char *)haystack, haystack_len); 365 php_strtolower((char *)haystack_dup, haystack_len); 366 367 ret_pos = grapheme_strrpos_ascii(haystack_dup, haystack_len, needle_dup, needle_len, offset); 368 369 efree(haystack_dup); 370 efree(needle_dup); 371 372 if ( ret_pos >= 0 ) { 373 RETURN_LONG(ret_pos); 374 } 375 376 /* if the needle was ascii too, we are done */ 377 378 if ( grapheme_ascii_check(needle, needle_len) >= 0 ) { 379 RETURN_FALSE; 380 } 381 382 /* else we need to continue via utf16 */ 383 } 384 385 ret_pos = grapheme_strrpos_utf16(haystack, haystack_len, needle, needle_len, offset, 1 /* f_ignore_case */ TSRMLS_CC); 386 387 if ( ret_pos >= 0 ) { 388 RETURN_LONG(ret_pos); 389 } else { 390 RETURN_FALSE; 391 } 392 393 394} 395/* }}} */ 396 397/* {{{ 
proto string grapheme_substr(string str, int start [, int length]) 398 Returns part of a string */ 399PHP_FUNCTION(grapheme_substr) 400{ 401 unsigned char *str, *sub_str; 402 UChar *ustr; 403 int str_len, sub_str_len, ustr_len; 404 long lstart = 0, length = 0; 405 int32_t start = 0; 406 int iter_val; 407 UErrorCode status; 408 unsigned char u_break_iterator_buffer[U_BRK_SAFECLONE_BUFFERSIZE]; 409 UBreakIterator* bi = NULL; 410 int sub_str_start_pos, sub_str_end_pos; 411 int32_t (*iter_func)(UBreakIterator *); 412 413 if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "sl|l", (char **)&str, &str_len, &lstart, &length) == FAILURE) { 414 415 intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, 416 "grapheme_substr: unable to parse input param", 0 TSRMLS_CC ); 417 418 RETURN_FALSE; 419 } 420 421 if ( OUTSIDE_STRING(lstart, str_len) ) { 422 423 intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_substr: start not contained in string", 1 TSRMLS_CC ); 424 425 RETURN_FALSE; 426 } 427 428 /* we checked that it will fit: */ 429 start = (int32_t) lstart; 430 431 /* the offset is 'grapheme count offset' so it still might be invalid - we'll check it later */ 432 433 if ( grapheme_ascii_check(str, str_len) >= 0 ) { 434 grapheme_substr_ascii((char *)str, str_len, start, length, ZEND_NUM_ARGS(), (char **) &sub_str, &sub_str_len); 435 436 if ( NULL == sub_str ) { 437 RETURN_FALSE; 438 } 439 440 RETURN_STRINGL(((char *)sub_str), sub_str_len, 1); 441 } 442 443 ustr = NULL; 444 ustr_len = 0; 445 status = U_ZERO_ERROR; 446 intl_convert_utf8_to_utf16(&ustr, &ustr_len, (char *)str, str_len, &status); 447 448 if ( U_FAILURE( status ) ) { 449 /* Set global error code. */ 450 intl_error_set_code( NULL, status TSRMLS_CC ); 451 452 /* Set error messages. 
*/ 453 intl_error_set_custom_msg( NULL, "Error converting input string to UTF-16", 0 TSRMLS_CC ); 454 if (ustr) { 455 efree( ustr ); 456 } 457 RETURN_FALSE; 458 } 459 460 bi = grapheme_get_break_iterator((void*)u_break_iterator_buffer, &status TSRMLS_CC ); 461 462 if( U_FAILURE(status) ) { 463 RETURN_FALSE; 464 } 465 466 ubrk_setText(bi, ustr, ustr_len, &status); 467 468 if ( start < 0 ) { 469 iter_func = ubrk_previous; 470 ubrk_last(bi); 471 iter_val = 1; 472 } 473 else { 474 iter_func = ubrk_next; 475 iter_val = -1; 476 } 477 478 sub_str_start_pos = 0; 479 480 while ( start ) { 481 sub_str_start_pos = iter_func(bi); 482 483 if ( UBRK_DONE == sub_str_start_pos ) { 484 break; 485 } 486 487 start += iter_val; 488 } 489 490 if ( 0 != start || sub_str_start_pos >= ustr_len ) { 491 492 intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_substr: start not contained in string", 1 TSRMLS_CC ); 493 494 if (ustr) { 495 efree(ustr); 496 } 497 ubrk_close(bi); 498 RETURN_FALSE; 499 } 500 501 if (ZEND_NUM_ARGS() <= 2) { 502 503 /* no length supplied, return the rest of the string */ 504 505 sub_str = NULL; 506 sub_str_len = 0; 507 status = U_ZERO_ERROR; 508 intl_convert_utf16_to_utf8((char **)&sub_str, &sub_str_len, ustr + sub_str_start_pos, ustr_len - sub_str_start_pos, &status); 509 510 if (ustr) { 511 efree( ustr ); 512 } 513 ubrk_close( bi ); 514 515 if ( U_FAILURE( status ) ) { 516 /* Set global error code. */ 517 intl_error_set_code( NULL, status TSRMLS_CC ); 518 519 /* Set error messages. 
*/ 520 intl_error_set_custom_msg( NULL, "Error converting output string to UTF-8", 0 TSRMLS_CC ); 521 522 if (sub_str) { 523 efree( sub_str ); 524 } 525 526 RETURN_FALSE; 527 } 528 529 /* return the allocated string, not a duplicate */ 530 RETURN_STRINGL(((char *)sub_str), sub_str_len, 0); 531 } 532 533 /* find the end point of the string to return */ 534 535 if ( length < 0 ) { 536 iter_func = ubrk_previous; 537 ubrk_last(bi); 538 iter_val = 1; 539 } 540 else { 541 iter_func = ubrk_next; 542 iter_val = -1; 543 } 544 545 sub_str_end_pos = 0; 546 547 while ( length ) { 548 sub_str_end_pos = iter_func(bi); 549 550 if ( UBRK_DONE == sub_str_end_pos ) { 551 break; 552 } 553 554 length += iter_val; 555 } 556 557 if ( UBRK_DONE == sub_str_end_pos && length < 0) { 558 559 intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_substr: length not contained in string", 1 TSRMLS_CC ); 560 561 efree(ustr); 562 ubrk_close(bi); 563 RETURN_FALSE; 564 } 565 566 sub_str = NULL; 567 status = U_ZERO_ERROR; 568 intl_convert_utf16_to_utf8((char **)&sub_str, &sub_str_len, ustr + sub_str_start_pos, ( sub_str_end_pos - sub_str_start_pos ), &status); 569 570 efree( ustr ); 571 ubrk_close( bi ); 572 573 if ( U_FAILURE( status ) ) { 574 /* Set global error code. */ 575 intl_error_set_code( NULL, status TSRMLS_CC ); 576 577 /* Set error messages. 
*/ 578 intl_error_set_custom_msg( NULL, "Error converting output string to UTF-8", 0 TSRMLS_CC ); 579 580 if ( NULL != sub_str ) 581 efree( sub_str ); 582 583 RETURN_FALSE; 584 } 585 586 /* return the allocated string, not a duplicate */ 587 RETURN_STRINGL(((char *)sub_str), sub_str_len, 0); 588 589} 590/* }}} */ 591 592/* {{{ strstr_common_handler */ 593static void strstr_common_handler(INTERNAL_FUNCTION_PARAMETERS, int f_ignore_case) 594{ 595 unsigned char *haystack, *needle, *found; 596 int haystack_len, needle_len; 597 int ret_pos, uchar_pos; 598 zend_bool part = 0; 599 600 if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "ss|b", (char **)&haystack, &haystack_len, (char **)&needle, &needle_len, &part) == FAILURE) { 601 602 intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, 603 "grapheme_strstr: unable to parse input param", 0 TSRMLS_CC ); 604 605 RETURN_FALSE; 606 } 607 608 if (needle_len == 0) { 609 610 intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_strpos: Empty delimiter", 1 TSRMLS_CC ); 611 612 RETURN_FALSE; 613 } 614 615 616 if ( !f_ignore_case ) { 617 618 /* ASCII optimization: quick check to see if the string might be there 619 * I realize that 'offset' is 'grapheme count offset' but will work in spite of that 620 */ 621 found = (unsigned char *)php_memnstr((char *)haystack, (char *)needle, needle_len, (char *)haystack + haystack_len); 622 623 /* if it isn't there the we are done */ 624 if ( !found ) { 625 RETURN_FALSE; 626 } 627 628 /* if it is there, and if the haystack is ascii, we are all done */ 629 if ( grapheme_ascii_check(haystack, haystack_len) >= 0 ) { 630 size_t found_offset = found - haystack; 631 632 if (part) { 633 RETURN_STRINGL(((char *)haystack) , found_offset, 1); 634 } else { 635 RETURN_STRINGL(((char *)found), haystack_len - found_offset, 1); 636 } 637 } 638 639 } 640 641 /* need to work in utf16 */ 642 ret_pos = grapheme_strpos_utf16(haystack, haystack_len, needle, needle_len, 0, &uchar_pos, f_ignore_case TSRMLS_CC ); 643 
644 if ( ret_pos < 0 ) { 645 RETURN_FALSE; 646 } 647 648 /* uchar_pos is the 'nth' Unicode character position of the needle */ 649 650 ret_pos = 0; 651 U8_FWD_N(haystack, ret_pos, haystack_len, uchar_pos); 652 653 if (part) { 654 RETURN_STRINGL(((char *)haystack), ret_pos, 1); 655 } 656 else { 657 RETURN_STRINGL(((char *)haystack) + ret_pos, haystack_len - ret_pos, 1); 658 } 659 660} 661/* }}} */ 662 663/* {{{ proto string grapheme_strstr(string haystack, string needle[, bool part]) 664 Finds first occurrence of a string within another */ 665PHP_FUNCTION(grapheme_strstr) 666{ 667 strstr_common_handler(INTERNAL_FUNCTION_PARAM_PASSTHRU, 0 /* f_ignore_case */); 668} 669/* }}} */ 670 671/* {{{ proto string grapheme_stristr(string haystack, string needle[, bool part]) 672 Finds first occurrence of a string within another */ 673PHP_FUNCTION(grapheme_stristr) 674{ 675 strstr_common_handler(INTERNAL_FUNCTION_PARAM_PASSTHRU, 1 /* f_ignore_case */); 676} 677/* }}} */ 678 679/* {{{ grapheme_extract_charcount_iter - grapheme iterator for grapheme_extract MAXCHARS */ 680static inline int32_t 681grapheme_extract_charcount_iter(UBreakIterator *bi, int32_t csize, unsigned char *pstr, int32_t str_len) 682{ 683 int pos = 0, prev_pos = 0; 684 int ret_pos = 0, prev_ret_pos = 0; 685 686 while ( 1 ) { 687 pos = ubrk_next(bi); 688 689 if ( UBRK_DONE == pos ) { 690 break; 691 } 692 693 /* if we are beyond our limit, then the loop is done */ 694 if ( pos > csize ) { 695 break; 696 } 697 698 /* update our pointer in the original UTF-8 buffer by as many characters 699 as ubrk_next iterated over */ 700 701 prev_ret_pos = ret_pos; 702 U8_FWD_N(pstr, ret_pos, str_len, pos - prev_pos); 703 704 if ( prev_ret_pos == ret_pos ) { 705 /* something wrong - malformed utf8? 
*/ 706 break; 707 } 708 709 prev_pos = pos; 710 } 711 712 return ret_pos; 713} 714/* }}} */ 715 716/* {{{ grapheme_extract_bytecount_iter - grapheme iterator for grapheme_extract MAXBYTES */ 717static inline int32_t 718grapheme_extract_bytecount_iter(UBreakIterator *bi, int32_t bsize, unsigned char *pstr, int32_t str_len) 719{ 720 int pos = 0, prev_pos = 0; 721 int ret_pos = 0, prev_ret_pos = 0; 722 723 while ( 1 ) { 724 pos = ubrk_next(bi); 725 726 if ( UBRK_DONE == pos ) { 727 break; 728 } 729 730 prev_ret_pos = ret_pos; 731 U8_FWD_N(pstr, ret_pos, str_len, pos - prev_pos); 732 733 if ( ret_pos > bsize ) { 734 ret_pos = prev_ret_pos; 735 break; 736 } 737 738 if ( prev_ret_pos == ret_pos ) { 739 /* something wrong - malformed utf8? */ 740 break; 741 } 742 743 prev_pos = pos; 744 } 745 746 return ret_pos; 747} 748/* }}} */ 749 750/* {{{ grapheme_extract_count_iter - grapheme iterator for grapheme_extract COUNT */ 751static inline int32_t 752grapheme_extract_count_iter(UBreakIterator *bi, int32_t size, unsigned char *pstr, int32_t str_len) 753{ 754 int pos = 0, next_pos = 0; 755 int ret_pos = 0; 756 757 while ( size ) { 758 next_pos = ubrk_next(bi); 759 760 if ( UBRK_DONE == next_pos ) { 761 break; 762 } 763 pos = next_pos; 764 size--; 765 } 766 767 /* pos is one past the last UChar - and represent the number of code units to 768 advance in the utf-8 buffer 769 */ 770 771 U8_FWD_N(pstr, ret_pos, str_len, pos); 772 773 return ret_pos; 774} 775/* }}} */ 776 777/* {{{ grapheme extract iter function pointer array */ 778typedef int32_t (*grapheme_extract_iter)(UBreakIterator * /*bi*/, int32_t /*size*/, unsigned char * /*pstr*/, int32_t /*str_len*/); 779 780static grapheme_extract_iter grapheme_extract_iters[] = { 781 &grapheme_extract_count_iter, 782 &grapheme_extract_bytecount_iter, 783 &grapheme_extract_charcount_iter, 784}; 785/* }}} */ 786 787/* {{{ proto string grapheme_extract(string str, int size[, int extract_type[, int start[, int next]]]) 788 Function to 
extract a sequence of default grapheme clusters */
/* size is a limit in grapheme clusters, bytes, or characters depending on
 * extract_type (GRAPHEME_EXTR_COUNT, GRAPHEME_EXTR_MAXBYTES,
 * GRAPHEME_EXTR_MAXCHARS). start is a byte offset into str. When the optional
 * by-reference argument next is given, it receives the byte offset in str
 * just past the returned chunk. Returns FALSE on invalid arguments or when a
 * conversion fails.
 */
PHP_FUNCTION(grapheme_extract)
{
	unsigned char *str, *pstr;
	UChar *ustr;
	int str_len, ustr_len;
	long size; /* maximum number of grapheme clusters, bytes, or characters (based on extract_type) to return */
	long lstart = 0; /* starting position in str in bytes */
	int32_t start = 0;
	long extract_type = GRAPHEME_EXTRACT_TYPE_COUNT;
	UErrorCode status;
	unsigned char u_break_iterator_buffer[U_BRK_SAFECLONE_BUFFERSIZE];
	UBreakIterator* bi = NULL;
	int ret_pos;
	zval *next = NULL; /* return offset of next part of the string */

	if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "sl|llz", (char **)&str, &str_len, &size, &extract_type, &lstart, &next) == FAILURE) {

		intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
			 "grapheme_extract: unable to parse input param", 0 TSRMLS_CC );

		RETURN_FALSE;
	}

	if ( NULL != next ) {
		if ( !PZVAL_IS_REF(next) ) {
			intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
				 "grapheme_extract: 'next' was not passed by reference", 0 TSRMLS_CC );

			RETURN_FALSE;
		}
		else {
			/* initialize next */
			zval_dtor(next);
			ZVAL_LONG(next, lstart);
		}
	}

	if ( extract_type < GRAPHEME_EXTRACT_TYPE_MIN || extract_type > GRAPHEME_EXTRACT_TYPE_MAX ) {

		intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
			 "grapheme_extract: unknown extract type param", 0 TSRMLS_CC );

		RETURN_FALSE;
	}

	if ( lstart > INT32_MAX || lstart < 0 || lstart >= str_len ) {
		intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_extract: start not contained in string", 0 TSRMLS_CC );
		RETURN_FALSE;
	}

	if ( size > INT32_MAX || size < 0) {
		intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_extract: size is invalid", 0 TSRMLS_CC );
		RETURN_FALSE;
	}
	if (size == 0) {
		RETURN_EMPTY_STRING();
	}

	/* we checked that it will fit: */
	start = (int32_t) lstart;

	pstr = str + start;

	/* just in case pstr points in the middle of a character, move forward to the start of the next char */
	if ( !UTF8_IS_SINGLE(*pstr) && !U8_IS_LEAD(*pstr) ) {
		unsigned char *str_end = str + str_len;

		while ( !UTF8_IS_SINGLE(*pstr) && !U8_IS_LEAD(*pstr) ) {
			pstr++;
			/* keep start in step with pstr so that 'next' (start + bytes
			   consumed) stays a correct byte offset into str */
			start++;
			if ( pstr >= str_end ) {
				intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
								"grapheme_extract: invalid input string", 0 TSRMLS_CC );

				RETURN_FALSE;
			}
		}
	}

	/* from here on str_len is the number of bytes remaining from pstr */
	str_len -= (pstr - str);

	/* if the string is all ASCII up to size+1 - or str_len whichever is first - then we are done.
		(size + 1 because the size-th character might be the beginning of a grapheme cluster)
		The bound is written as "size < str_len ? size + 1 : str_len", which selects the
		same value as min(size + 1, str_len) but cannot overflow when size == INT32_MAX
		and long is 32 bits wide.
	 */

	if ( -1 != grapheme_ascii_check(pstr, size < str_len ? size + 1 : str_len ) ) {
		long nsize = ( size < str_len ? size : str_len );
		if ( NULL != next ) {
			ZVAL_LONG(next, start+nsize);
		}
		RETURN_STRINGL(((char *)pstr), nsize, 1);
	}

	/* convert the strings to UTF-16. */
	ustr = NULL;
	ustr_len = 0;
	status = U_ZERO_ERROR;
	intl_convert_utf8_to_utf16(&ustr, &ustr_len, (char *)pstr, str_len, &status );

	if ( U_FAILURE( status ) ) {
		/* Set global error code. */
		intl_error_set_code( NULL, status TSRMLS_CC );

		/* Set error messages. */
		intl_error_set_custom_msg( NULL, "Error converting input string to UTF-16", 0 TSRMLS_CC );

		if ( NULL != ustr ) {
			efree( ustr );
		}

		RETURN_FALSE;
	}

	bi = NULL;
	status = U_ZERO_ERROR;
	bi = grapheme_get_break_iterator(u_break_iterator_buffer, &status TSRMLS_CC );

	if ( U_FAILURE( status ) ) {
		/* no usable iterator: release the UTF-16 copy and give up rather
		   than handing a bad iterator to the ubrk_* calls below */
		if (ustr) {
			efree(ustr);
		}
		RETURN_FALSE;
	}

	ubrk_setText(bi, ustr, ustr_len, &status);

	/* if the caller put us in the middle of a grapheme, we can't detect it in all cases since we
		can't back up. So, we will not do anything. */

	/* now we need to find the end of the chunk the user wants us to return */

	/* extract_type was range-checked above, so it is a valid index into the iterator table */
	ret_pos = (*grapheme_extract_iters[extract_type])(bi, size, pstr, str_len);

	if (ustr) {
		efree(ustr);
	}
	ubrk_close(bi);

	if ( NULL != next ) {
		ZVAL_LONG(next, start+ret_pos);
	}

	RETURN_STRINGL(((char *)pstr), ret_pos, 1);
}

/* }}} */

/*
 * Local variables:
 * tab-width: 4
 * c-basic-offset: 4
 * End:
 * vim600: fdm=marker
 * vim: noet sw=4 ts=4
 */
