1/*
2   +----------------------------------------------------------------------+
3   | PHP Version 5                                                        |
4   +----------------------------------------------------------------------+
5   | This source file is subject to version 3.01 of the PHP license,      |
6   | that is bundled with this package in the file LICENSE, and is        |
7   | available through the world-wide-web at the following url:           |
8   | http://www.php.net/license/3_01.txt                                  |
9   | If you did not receive a copy of the PHP license and are unable to   |
10   | obtain it through the world-wide-web, please send a note to          |
11   | license@php.net so we can mail you a copy immediately.               |
12   +----------------------------------------------------------------------+
13   | Author: Ed Batutis <ed@batutis.com>                                  |
14   +----------------------------------------------------------------------+
15 */
16
17/* {{{ includes */
18#ifdef HAVE_CONFIG_H
19#include "config.h"
20#endif
21
22#include <php.h>
23#include "grapheme.h"
24#include "grapheme_util.h"
25
26#include <unicode/utypes.h>
27#include <unicode/ucol.h>
28#include <unicode/ustring.h>
29#include <unicode/ubrk.h>
30
31#include "ext/standard/php_string.h"
32
33/* }}} */
34
35#define GRAPHEME_EXTRACT_TYPE_COUNT     0
36#define GRAPHEME_EXTRACT_TYPE_MAXBYTES  1
37#define GRAPHEME_EXTRACT_TYPE_MAXCHARS  2
38#define GRAPHEME_EXTRACT_TYPE_MIN   GRAPHEME_EXTRACT_TYPE_COUNT
39#define GRAPHEME_EXTRACT_TYPE_MAX   GRAPHEME_EXTRACT_TYPE_MAXCHARS
40
41
42/* {{{ grapheme_register_constants
43 * Register API constants
44 */
45void grapheme_register_constants( INIT_FUNC_ARGS )
46{
47    REGISTER_LONG_CONSTANT("GRAPHEME_EXTR_COUNT", GRAPHEME_EXTRACT_TYPE_COUNT, CONST_CS | CONST_PERSISTENT);
48    REGISTER_LONG_CONSTANT("GRAPHEME_EXTR_MAXBYTES", GRAPHEME_EXTRACT_TYPE_MAXBYTES, CONST_CS | CONST_PERSISTENT);
49    REGISTER_LONG_CONSTANT("GRAPHEME_EXTR_MAXCHARS", GRAPHEME_EXTRACT_TYPE_MAXCHARS, CONST_CS | CONST_PERSISTENT);
50}
51/* }}} */
52
53/* {{{ proto int grapheme_strlen(string str)
54   Get number of graphemes in a string */
55PHP_FUNCTION(grapheme_strlen)
56{
57    unsigned char* string;
58    int string_len;
59    UChar* ustring = NULL;
60    int ustring_len = 0;
61    int ret_len;
62    UErrorCode status;
63
64    if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "s", (char **)&string, &string_len) == FAILURE) {
65
66        intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
67             "grapheme_strlen: unable to parse input param", 0 TSRMLS_CC );
68
69        RETURN_FALSE;
70    }
71
72    ret_len = grapheme_ascii_check(string, string_len);
73
74    if ( ret_len >= 0 )
75        RETURN_LONG(ret_len);
76
77    /* convert the string to UTF-16. */
78    status = U_ZERO_ERROR;
79    intl_convert_utf8_to_utf16(&ustring, &ustring_len, (char*) string, string_len, &status );
80
81    if ( U_FAILURE( status ) ) {
82        /* Set global error code. */
83        intl_error_set_code( NULL, status TSRMLS_CC );
84
85        /* Set error messages. */
86        intl_error_set_custom_msg( NULL, "Error converting input string to UTF-16", 0 TSRMLS_CC );
87        if (ustring) {
88            efree( ustring );
89        }
90        RETURN_NULL();
91    }
92
93    ret_len = grapheme_split_string(ustring, ustring_len, NULL, 0 TSRMLS_CC );
94
95    if (ustring) {
96        efree( ustring );
97    }
98
99    if (ret_len >= 0) {
100        RETVAL_LONG(ret_len);
101    } else {
102        RETVAL_FALSE;
103    }
104}
105/* }}} */
106
107/* {{{ proto int grapheme_strpos(string haystack, string needle [, int offset ])
108   Find position of first occurrence of a string within another */
109PHP_FUNCTION(grapheme_strpos)
110{
111    unsigned char *haystack, *needle;
112    int haystack_len, needle_len;
113    unsigned char *found;
114    long loffset = 0;
115    int32_t offset = 0, noffset = 0;
116    int ret_pos;
117
118    if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "ss|l", (char **)&haystack, &haystack_len, (char **)&needle, &needle_len, &loffset) == FAILURE) {
119
120        intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
121             "grapheme_strpos: unable to parse input param", 0 TSRMLS_CC );
122
123        RETURN_FALSE;
124    }
125
126    if ( OUTSIDE_STRING(loffset, haystack_len) ) {
127
128        intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_strpos: Offset not contained in string", 1 TSRMLS_CC );
129
130        RETURN_FALSE;
131    }
132
133    /* we checked that it will fit: */
134    offset = (int32_t) loffset;
135    noffset = offset >= 0 ? offset : haystack_len + offset;
136
137    /* the offset is 'grapheme count offset' so it still might be invalid - we'll check it later */
138
139    if (needle_len == 0) {
140
141        intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_strpos: Empty delimiter", 1 TSRMLS_CC );
142
143        RETURN_FALSE;
144    }
145
146
147    /* quick check to see if the string might be there
148     * I realize that 'offset' is 'grapheme count offset' but will work in spite of that
149    */
150    found = (unsigned char *)php_memnstr((char *)haystack + noffset, (char *)needle, needle_len, (char *)haystack + haystack_len);
151
152    /* if it isn't there the we are done */
153    if (!found) {
154        RETURN_FALSE;
155    }
156
157    /* if it is there, and if the haystack is ascii, we are all done */
158    if ( grapheme_ascii_check(haystack, haystack_len) >= 0 ) {
159
160        RETURN_LONG(found - haystack);
161    }
162
163    /* do utf16 part of the strpos */
164    ret_pos = grapheme_strpos_utf16(haystack, haystack_len, needle, needle_len, offset, NULL, 0 /* fIgnoreCase */, 0 /* last */ TSRMLS_CC );
165
166    if ( ret_pos >= 0 ) {
167        RETURN_LONG(ret_pos);
168    } else {
169        RETURN_FALSE;
170    }
171
172}
173/* }}} */
174
175/* {{{ proto int grapheme_stripos(string haystack, string needle [, int offset ])
176   Find position of first occurrence of a string within another, ignoring case differences */
177PHP_FUNCTION(grapheme_stripos)
178{
179    unsigned char *haystack, *needle, *haystack_dup, *needle_dup;
180    int haystack_len, needle_len;
181    unsigned char *found;
182    long loffset = 0;
183    int32_t offset = 0;
184    int ret_pos;
185    int is_ascii;
186
187    if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "ss|l", (char **)&haystack, &haystack_len, (char **)&needle, &needle_len, &loffset) == FAILURE) {
188
189        intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
190             "grapheme_stripos: unable to parse input param", 0 TSRMLS_CC );
191
192        RETURN_FALSE;
193    }
194
195    if ( OUTSIDE_STRING(loffset, haystack_len) ) {
196
197        intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_stripos: Offset not contained in string", 1 TSRMLS_CC );
198
199        RETURN_FALSE;
200    }
201
202    /* we checked that it will fit: */
203    offset = (int32_t) loffset;
204
205    /* the offset is 'grapheme count offset' so it still might be invalid - we'll check it later */
206
207    if (needle_len == 0) {
208
209        intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_stripos: Empty delimiter", 1 TSRMLS_CC );
210
211        RETURN_FALSE;
212    }
213
214
215    is_ascii = ( grapheme_ascii_check(haystack, haystack_len) >= 0 );
216
217    if ( is_ascii ) {
218        int32_t noffset = offset >= 0 ? offset : haystack_len + offset;
219        needle_dup = (unsigned char *)estrndup((char *)needle, needle_len);
220        php_strtolower((char *)needle_dup, needle_len);
221        haystack_dup = (unsigned char *)estrndup((char *)haystack, haystack_len);
222        php_strtolower((char *)haystack_dup, haystack_len);
223
224        found = (unsigned char*) php_memnstr((char *)haystack_dup + noffset, (char *)needle_dup, needle_len, (char *)haystack_dup + haystack_len);
225
226        efree(haystack_dup);
227        efree(needle_dup);
228
229        if (found) {
230            RETURN_LONG(found - haystack_dup);
231        }
232
233        /* if needle was ascii too, we are all done, otherwise we need to try using Unicode to see what we get */
234        if ( grapheme_ascii_check(needle, needle_len) >= 0 ) {
235            RETURN_FALSE;
236        }
237    }
238
239    /* do utf16 part of the strpos */
240    ret_pos = grapheme_strpos_utf16(haystack, haystack_len, needle, needle_len, offset, NULL, 1 /* fIgnoreCase */, 0 /*last */ TSRMLS_CC );
241
242    if ( ret_pos >= 0 ) {
243        RETURN_LONG(ret_pos);
244    } else {
245        RETURN_FALSE;
246    }
247
248}
249/* }}} */
250
251/* {{{ proto int grapheme_strrpos(string haystack, string needle [, int offset])
252   Find position of last occurrence of a string within another */
253PHP_FUNCTION(grapheme_strrpos)
254{
255    unsigned char *haystack, *needle;
256    int haystack_len, needle_len;
257    long loffset = 0;
258    int32_t offset = 0;
259    int32_t ret_pos;
260    int is_ascii;
261
262    if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "ss|l", (char **)&haystack, &haystack_len, (char **)&needle, &needle_len, &loffset) == FAILURE) {
263
264        intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
265             "grapheme_strrpos: unable to parse input param", 0 TSRMLS_CC );
266
267        RETURN_FALSE;
268    }
269
270    if ( OUTSIDE_STRING(loffset, haystack_len) ) {
271
272        intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_strpos: Offset not contained in string", 1 TSRMLS_CC );
273
274        RETURN_FALSE;
275    }
276
277    /* we checked that it will fit: */
278    offset = (int32_t) loffset;
279
280    /* the offset is 'grapheme count offset' so it still might be invalid - we'll check it later */
281
282    if (needle_len == 0) {
283
284        intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_strpos: Empty delimiter", 1 TSRMLS_CC );
285
286        RETURN_FALSE;
287    }
288
289    is_ascii = grapheme_ascii_check(haystack, haystack_len) >= 0;
290
291    if ( is_ascii ) {
292
293        ret_pos = grapheme_strrpos_ascii(haystack, haystack_len, needle, needle_len, offset);
294
295
296        if ( ret_pos >= 0 ) {
297            RETURN_LONG(ret_pos);
298        }
299
300        /* if the needle was ascii too, we are done */
301
302        if (  grapheme_ascii_check(needle, needle_len) >= 0 ) {
303            RETURN_FALSE;
304        }
305
306        /* else we need to continue via utf16 */
307    }
308
309    ret_pos = grapheme_strpos_utf16(haystack, haystack_len, needle, needle_len, offset, NULL, 0 /* f_ignore_case */, 1/* last */ TSRMLS_CC);
310
311    if ( ret_pos >= 0 ) {
312        RETURN_LONG(ret_pos);
313    } else {
314        RETURN_FALSE;
315    }
316
317
318}
319/* }}} */
320
321/* {{{ proto int grapheme_strripos(string haystack, string needle [, int offset])
322   Find position of last occurrence of a string within another, ignoring case */
323PHP_FUNCTION(grapheme_strripos)
324{
325    unsigned char *haystack, *needle;
326    int haystack_len, needle_len;
327    long loffset = 0;
328    int32_t offset = 0;
329    int32_t ret_pos;
330    int is_ascii;
331
332    if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "ss|l", (char **)&haystack, &haystack_len, (char **)&needle, &needle_len, &loffset) == FAILURE) {
333
334        intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
335             "grapheme_strrpos: unable to parse input param", 0 TSRMLS_CC );
336
337        RETURN_FALSE;
338    }
339
340    if ( OUTSIDE_STRING(loffset, haystack_len) ) {
341
342        intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_strpos: Offset not contained in string", 1 TSRMLS_CC );
343
344        RETURN_FALSE;
345    }
346
347    /* we checked that it will fit: */
348    offset = (int32_t) loffset;
349
350    /* the offset is 'grapheme count offset' so it still might be invalid - we'll check it later */
351
352    if (needle_len == 0) {
353
354        intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_strpos: Empty delimiter", 1 TSRMLS_CC );
355
356        RETURN_FALSE;
357    }
358
359    is_ascii = grapheme_ascii_check(haystack, haystack_len) >= 0;
360
361    if ( is_ascii ) {
362        unsigned char *needle_dup, *haystack_dup;
363
364        needle_dup = (unsigned char *)estrndup((char *)needle, needle_len);
365        php_strtolower((char *)needle_dup, needle_len);
366        haystack_dup = (unsigned char *)estrndup((char *)haystack, haystack_len);
367        php_strtolower((char *)haystack_dup, haystack_len);
368
369        ret_pos = grapheme_strrpos_ascii(haystack_dup, haystack_len, needle_dup, needle_len, offset);
370
371        efree(haystack_dup);
372        efree(needle_dup);
373
374        if ( ret_pos >= 0 ) {
375            RETURN_LONG(ret_pos);
376        }
377
378        /* if the needle was ascii too, we are done */
379
380        if (  grapheme_ascii_check(needle, needle_len) >= 0 ) {
381            RETURN_FALSE;
382        }
383
384        /* else we need to continue via utf16 */
385    }
386
387    ret_pos = grapheme_strpos_utf16(haystack, haystack_len, needle, needle_len, offset, NULL,  1 /* f_ignore_case */, 1 /*last */ TSRMLS_CC);
388
389    if ( ret_pos >= 0 ) {
390        RETURN_LONG(ret_pos);
391    } else {
392        RETURN_FALSE;
393    }
394
395
396}
397/* }}} */
398
399/* {{{ proto string grapheme_substr(string str, int start [, int length])
400   Returns part of a string */
401PHP_FUNCTION(grapheme_substr)
402{
403    unsigned char *str, *sub_str;
404    UChar *ustr;
405    int str_len, sub_str_len, ustr_len;
406    long lstart = 0, length = 0;
407    int32_t start = 0;
408    int iter_val;
409    UErrorCode status;
410    unsigned char u_break_iterator_buffer[U_BRK_SAFECLONE_BUFFERSIZE];
411    UBreakIterator* bi = NULL;
412    int sub_str_start_pos, sub_str_end_pos;
413    int32_t (*iter_func)(UBreakIterator *);
414
415    if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "sl|l", (char **)&str, &str_len, &lstart, &length) == FAILURE) {
416
417        intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
418             "grapheme_substr: unable to parse input param", 0 TSRMLS_CC );
419
420        RETURN_FALSE;
421    }
422
423    if ( OUTSIDE_STRING(lstart, str_len) ) {
424
425        intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_substr: start not contained in string", 1 TSRMLS_CC );
426
427        RETURN_FALSE;
428    }
429
430    /* we checked that it will fit: */
431    start = (int32_t) lstart;
432
433    /* the offset is 'grapheme count offset' so it still might be invalid - we'll check it later */
434
435    if ( grapheme_ascii_check(str, str_len) >= 0 ) {
436        grapheme_substr_ascii((char *)str, str_len, start, length, ZEND_NUM_ARGS(), (char **) &sub_str, &sub_str_len);
437
438        if ( NULL == sub_str ) {
439            intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_substr: invalid parameters", 1 TSRMLS_CC );
440            RETURN_FALSE;
441        }
442
443        RETURN_STRINGL(((char *)sub_str), sub_str_len, 1);
444    }
445
446    ustr = NULL;
447    ustr_len = 0;
448    status = U_ZERO_ERROR;
449    intl_convert_utf8_to_utf16(&ustr, &ustr_len, (char *)str, str_len, &status);
450
451    if ( U_FAILURE( status ) ) {
452        /* Set global error code. */
453        intl_error_set_code( NULL, status TSRMLS_CC );
454
455        /* Set error messages. */
456        intl_error_set_custom_msg( NULL, "Error converting input string to UTF-16", 0 TSRMLS_CC );
457        if (ustr) {
458            efree( ustr );
459        }
460        RETURN_FALSE;
461    }
462
463    bi = grapheme_get_break_iterator((void*)u_break_iterator_buffer, &status TSRMLS_CC );
464
465    if( U_FAILURE(status) ) {
466        RETURN_FALSE;
467    }
468
469    ubrk_setText(bi, ustr, ustr_len,    &status);
470
471    if ( start < 0 ) {
472        iter_func = ubrk_previous;
473        ubrk_last(bi);
474        iter_val = 1;
475    }
476    else {
477        iter_func = ubrk_next;
478        iter_val = -1;
479    }
480
481    sub_str_start_pos = 0;
482
483    while ( start ) {
484        sub_str_start_pos = iter_func(bi);
485
486        if ( UBRK_DONE == sub_str_start_pos ) {
487            break;
488        }
489
490        start += iter_val;
491    }
492
493    if ( 0 != start || sub_str_start_pos >= ustr_len ) {
494
495        intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_substr: start not contained in string", 1 TSRMLS_CC );
496
497        if (ustr) {
498            efree(ustr);
499        }
500        ubrk_close(bi);
501        RETURN_FALSE;
502    }
503
504    if (ZEND_NUM_ARGS() <= 2) {
505
506        /* no length supplied, return the rest of the string */
507
508        sub_str = NULL;
509        sub_str_len = 0;
510        status = U_ZERO_ERROR;
511        intl_convert_utf16_to_utf8((char **)&sub_str, &sub_str_len, ustr + sub_str_start_pos, ustr_len - sub_str_start_pos, &status);
512
513        if (ustr) {
514            efree( ustr );
515        }
516        ubrk_close( bi );
517
518        if ( U_FAILURE( status ) ) {
519            /* Set global error code. */
520            intl_error_set_code( NULL, status TSRMLS_CC );
521
522            /* Set error messages. */
523            intl_error_set_custom_msg( NULL, "Error converting output string to UTF-8", 0 TSRMLS_CC );
524
525            if (sub_str) {
526                efree( sub_str );
527            }
528
529            RETURN_FALSE;
530        }
531
532        /* return the allocated string, not a duplicate */
533        RETURN_STRINGL(((char *)sub_str), sub_str_len, 0);
534    }
535
536    if(length == 0) {
537        /* empty length - we've validated start, we can return "" now */
538        if (ustr) {
539            efree(ustr);
540        }
541        ubrk_close(bi);
542        RETURN_EMPTY_STRING();
543    }
544
545    /* find the end point of the string to return */
546
547    if ( length < 0 ) {
548        iter_func = ubrk_previous;
549        ubrk_last(bi);
550        iter_val = 1;
551    }
552    else {
553        iter_func = ubrk_next;
554        iter_val = -1;
555    }
556
557    sub_str_end_pos = 0;
558
559    while ( length ) {
560        sub_str_end_pos = iter_func(bi);
561
562        if ( UBRK_DONE == sub_str_end_pos ) {
563            break;
564        }
565
566        length += iter_val;
567    }
568
569    ubrk_close(bi);
570
571    if ( UBRK_DONE == sub_str_end_pos) {
572        if(length < 0) {
573            intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_substr: length not contained in string", 1 TSRMLS_CC );
574
575            efree(ustr);
576            RETURN_FALSE;
577        } else {
578            sub_str_end_pos = ustr_len;
579        }
580    }
581
582    if(sub_str_start_pos > sub_str_end_pos) {
583        intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_substr: length is beyond start", 1 TSRMLS_CC );
584
585        efree(ustr);
586        RETURN_FALSE;
587    }
588
589    sub_str = NULL;
590    status = U_ZERO_ERROR;
591    intl_convert_utf16_to_utf8((char **)&sub_str, &sub_str_len, ustr + sub_str_start_pos, ( sub_str_end_pos - sub_str_start_pos ), &status);
592
593    efree( ustr );
594
595    if ( U_FAILURE( status ) ) {
596        /* Set global error code. */
597        intl_error_set_code( NULL, status TSRMLS_CC );
598
599        /* Set error messages. */
600        intl_error_set_custom_msg( NULL, "Error converting output string to UTF-8", 0 TSRMLS_CC );
601
602        if ( NULL != sub_str )
603            efree( sub_str );
604
605        RETURN_FALSE;
606    }
607
608     /* return the allocated string, not a duplicate */
609    RETURN_STRINGL(((char *)sub_str), sub_str_len, 0);
610
611}
612/* }}} */
613
614/* {{{  strstr_common_handler */
615static void strstr_common_handler(INTERNAL_FUNCTION_PARAMETERS, int f_ignore_case)
616{
617    unsigned char *haystack, *needle, *found;
618    int haystack_len, needle_len;
619    int ret_pos, uchar_pos;
620    zend_bool part = 0;
621
622    if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "ss|b", (char **)&haystack, &haystack_len, (char **)&needle, &needle_len, &part) == FAILURE) {
623
624        intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
625             "grapheme_strstr: unable to parse input param", 0 TSRMLS_CC );
626
627        RETURN_FALSE;
628    }
629
630    if (needle_len == 0) {
631
632        intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_strpos: Empty delimiter", 1 TSRMLS_CC );
633
634        RETURN_FALSE;
635    }
636
637
638    if ( !f_ignore_case ) {
639
640        /* ASCII optimization: quick check to see if the string might be there
641         * I realize that 'offset' is 'grapheme count offset' but will work in spite of that
642        */
643        found = (unsigned char *)php_memnstr((char *)haystack, (char *)needle, needle_len, (char *)haystack + haystack_len);
644
645        /* if it isn't there the we are done */
646        if ( !found ) {
647            RETURN_FALSE;
648        }
649
650        /* if it is there, and if the haystack is ascii, we are all done */
651        if ( grapheme_ascii_check(haystack, haystack_len) >= 0 ) {
652            size_t found_offset = found - haystack;
653
654            if (part) {
655                RETURN_STRINGL(((char *)haystack) , found_offset, 1);
656            } else {
657                RETURN_STRINGL(((char *)found), haystack_len - found_offset, 1);
658            }
659        }
660
661    }
662
663    /* need to work in utf16 */
664    ret_pos = grapheme_strpos_utf16(haystack, haystack_len, needle, needle_len, 0, &uchar_pos, f_ignore_case, 0 /*last */ TSRMLS_CC );
665
666    if ( ret_pos < 0 ) {
667        RETURN_FALSE;
668    }
669
670    /* uchar_pos is the 'nth' Unicode character position of the needle */
671
672    ret_pos = 0;
673    U8_FWD_N(haystack, ret_pos, haystack_len, uchar_pos);
674
675    if (part) {
676        RETURN_STRINGL(((char *)haystack), ret_pos, 1);
677    }
678    else {
679        RETURN_STRINGL(((char *)haystack) + ret_pos, haystack_len - ret_pos, 1);
680    }
681
682}
683/* }}} */
684
685/* {{{ proto string grapheme_strstr(string haystack, string needle[, bool part])
686   Finds first occurrence of a string within another */
687PHP_FUNCTION(grapheme_strstr)
688{
689    strstr_common_handler(INTERNAL_FUNCTION_PARAM_PASSTHRU, 0 /* f_ignore_case */);
690}
691/* }}} */
692
693/* {{{ proto string grapheme_stristr(string haystack, string needle[, bool part])
694   Finds first occurrence of a string within another */
695PHP_FUNCTION(grapheme_stristr)
696{
697    strstr_common_handler(INTERNAL_FUNCTION_PARAM_PASSTHRU, 1 /* f_ignore_case */);
698}
699/* }}} */
700
701/* {{{ grapheme_extract_charcount_iter - grapheme iterator for grapheme_extract MAXCHARS */
702static inline int32_t
703grapheme_extract_charcount_iter(UBreakIterator *bi, int32_t csize, unsigned char *pstr, int32_t str_len)
704{
705    int pos = 0, prev_pos = 0;
706    int ret_pos = 0, prev_ret_pos = 0;
707
708    while ( 1 ) {
709        pos = ubrk_next(bi);
710
711        if ( UBRK_DONE == pos ) {
712            break;
713        }
714
715        /* if we are beyond our limit, then the loop is done */
716        if ( pos > csize ) {
717            break;
718        }
719
720        /* update our pointer in the original UTF-8 buffer by as many characters
721           as ubrk_next iterated over */
722
723        prev_ret_pos = ret_pos;
724        U8_FWD_N(pstr, ret_pos, str_len, pos - prev_pos);
725
726        if ( prev_ret_pos == ret_pos ) {
727            /* something wrong - malformed utf8? */
728            break;
729        }
730
731        prev_pos = pos;
732    }
733
734    return ret_pos;
735}
736/* }}} */
737
738/* {{{ grapheme_extract_bytecount_iter - grapheme iterator for grapheme_extract MAXBYTES */
739static inline int32_t
740grapheme_extract_bytecount_iter(UBreakIterator *bi, int32_t bsize, unsigned char *pstr, int32_t str_len)
741{
742    int pos = 0, prev_pos = 0;
743    int ret_pos = 0, prev_ret_pos = 0;
744
745    while ( 1 ) {
746        pos = ubrk_next(bi);
747
748        if ( UBRK_DONE == pos ) {
749            break;
750        }
751
752        prev_ret_pos = ret_pos;
753        U8_FWD_N(pstr, ret_pos, str_len, pos - prev_pos);
754
755        if ( ret_pos > bsize ) {
756            ret_pos = prev_ret_pos;
757            break;
758        }
759
760        if ( prev_ret_pos == ret_pos ) {
761            /* something wrong - malformed utf8? */
762            break;
763        }
764
765        prev_pos = pos;
766    }
767
768    return ret_pos;
769}
770/* }}} */
771
772/* {{{ grapheme_extract_count_iter - grapheme iterator for grapheme_extract COUNT */
773static inline int32_t
774grapheme_extract_count_iter(UBreakIterator *bi, int32_t size, unsigned char *pstr, int32_t str_len)
775{
776    int pos = 0, next_pos = 0;
777    int ret_pos = 0;
778
779    while ( size ) {
780        next_pos = ubrk_next(bi);
781
782        if ( UBRK_DONE == next_pos ) {
783            break;
784        }
785        pos = next_pos;
786        size--;
787    }
788
789    /* pos is one past the last UChar - and represent the number of code units to
790        advance in the utf-8 buffer
791    */
792
793    U8_FWD_N(pstr, ret_pos, str_len, pos);
794
795    return ret_pos;
796}
797/* }}} */
798
799/* {{{ grapheme extract iter function pointer array */
800typedef int32_t (*grapheme_extract_iter)(UBreakIterator * /*bi*/, int32_t /*size*/, unsigned char * /*pstr*/, int32_t /*str_len*/);
801
802static grapheme_extract_iter grapheme_extract_iters[] = {
803    &grapheme_extract_count_iter,
804    &grapheme_extract_bytecount_iter,
805    &grapheme_extract_charcount_iter,
806};
807/* }}} */
808
809/* {{{ proto string grapheme_extract(string str, int size[, int extract_type[, int start[, int next]]])
810    Function to extract a sequence of default grapheme clusters */
811PHP_FUNCTION(grapheme_extract)
812{
813    unsigned char *str, *pstr;
814    UChar *ustr;
815    int str_len, ustr_len;
816    long size; /* maximum number of grapheme clusters, bytes, or characters (based on extract_type) to return */
817    long lstart = 0; /* starting position in str in bytes */
818    int32_t start = 0;
819    long extract_type = GRAPHEME_EXTRACT_TYPE_COUNT;
820    UErrorCode status;
821    unsigned char u_break_iterator_buffer[U_BRK_SAFECLONE_BUFFERSIZE];
822    UBreakIterator* bi = NULL;
823    int ret_pos;
824    zval *next = NULL; /* return offset of next part of the string */
825
826    if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "sl|llz", (char **)&str, &str_len, &size, &extract_type, &lstart, &next) == FAILURE) {
827
828        intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
829             "grapheme_extract: unable to parse input param", 0 TSRMLS_CC );
830
831        RETURN_FALSE;
832    }
833
834    if ( NULL != next ) {
835        if ( !PZVAL_IS_REF(next) ) {
836            intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
837                 "grapheme_extract: 'next' was not passed by reference", 0 TSRMLS_CC );
838
839            RETURN_FALSE;
840        }
841        else {
842            /* initialize next */
843            zval_dtor(next);
844            ZVAL_LONG(next, lstart);
845        }
846    }
847
848    if ( extract_type < GRAPHEME_EXTRACT_TYPE_MIN || extract_type > GRAPHEME_EXTRACT_TYPE_MAX ) {
849
850        intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
851             "grapheme_extract: unknown extract type param", 0 TSRMLS_CC );
852
853        RETURN_FALSE;
854    }
855
856    if ( lstart > INT32_MAX || lstart < 0 || lstart >= str_len ) {
857        intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_extract: start not contained in string", 0 TSRMLS_CC );
858        RETURN_FALSE;
859    }
860
861    if ( size > INT32_MAX || size < 0) {
862        intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_extract: size is invalid", 0 TSRMLS_CC );
863        RETURN_FALSE;
864    }
865    if (size == 0) {
866        RETURN_EMPTY_STRING();
867    }
868
869    /* we checked that it will fit: */
870    start = (int32_t) lstart;
871
872    pstr = str + start;
873
874    /* just in case pstr points in the middle of a character, move forward to the start of the next char */
875    if ( !UTF8_IS_SINGLE(*pstr) && !U8_IS_LEAD(*pstr) ) {
876        unsigned char *str_end = str + str_len;
877
878        while ( !UTF8_IS_SINGLE(*pstr) && !U8_IS_LEAD(*pstr) ) {
879            pstr++;
880            if ( pstr >= str_end ) {
881                intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
882                                "grapheme_extract: invalid input string", 0 TSRMLS_CC );
883
884                RETURN_FALSE;
885            }
886        }
887    }
888
889    str_len -= (pstr - str);
890
891    /* if the string is all ASCII up to size+1 - or str_len whichever is first - then we are done.
892        (size + 1 because the size-th character might be the beginning of a grapheme cluster)
893     */
894
895    if ( -1 != grapheme_ascii_check(pstr, size + 1 < str_len ? size + 1 : str_len ) ) {
896        long nsize = ( size < str_len ? size : str_len );
897        if ( NULL != next ) {
898            ZVAL_LONG(next, start+nsize);
899        }
900        RETURN_STRINGL(((char *)pstr), nsize, 1);
901    }
902
903    /* convert the strings to UTF-16. */
904    ustr = NULL;
905    ustr_len = 0;
906    status = U_ZERO_ERROR;
907    intl_convert_utf8_to_utf16(&ustr, &ustr_len, (char *)pstr, str_len, &status );
908
909    if ( U_FAILURE( status ) ) {
910        /* Set global error code. */
911        intl_error_set_code( NULL, status TSRMLS_CC );
912
913        /* Set error messages. */
914        intl_error_set_custom_msg( NULL, "Error converting input string to UTF-16", 0 TSRMLS_CC );
915
916        if ( NULL != ustr )
917            efree( ustr );
918
919        RETURN_FALSE;
920    }
921
922    bi = NULL;
923    status = U_ZERO_ERROR;
924    bi = grapheme_get_break_iterator(u_break_iterator_buffer, &status TSRMLS_CC );
925
926    ubrk_setText(bi, ustr, ustr_len, &status);
927
928    /* if the caller put us in the middle of a grapheme, we can't detect it in all cases since we
929        can't back up. So, we will not do anything. */
930
931    /* now we need to find the end of the chunk the user wants us to return */
932
933    ret_pos = (*grapheme_extract_iters[extract_type])(bi, size, pstr, str_len);
934
935    if (ustr) {
936        efree(ustr);
937    }
938    ubrk_close(bi);
939
940    if ( NULL != next ) {
941        ZVAL_LONG(next, start+ret_pos);
942    }
943
944    RETURN_STRINGL(((char *)pstr), ret_pos, 1);
945}
946
947/* }}} */
948
949/*
950 * Local variables:
951 * tab-width: 4
952 * c-basic-offset: 4
953 * End:
954 * vim600: fdm=marker
955 * vim: noet sw=4 ts=4
956 */
957
958