1/*
2   +----------------------------------------------------------------------+
3   | PHP Version 7                                                        |
4   +----------------------------------------------------------------------+
5   | This source file is subject to version 3.01 of the PHP license,      |
6   | that is bundled with this package in the file LICENSE, and is        |
7   | available through the world-wide-web at the following url:           |
8   | http://www.php.net/license/3_01.txt                                  |
9   | If you did not receive a copy of the PHP license and are unable to   |
10   | obtain it through the world-wide-web, please send a note to          |
11   | license@php.net so we can mail you a copy immediately.               |
12   +----------------------------------------------------------------------+
13   | Author: Ed Batutis <ed@batutis.com>                                  |
14   +----------------------------------------------------------------------+
15 */
16
17/* {{{ includes */
18#ifdef HAVE_CONFIG_H
19#include "config.h"
20#endif
21
22#include <php.h>
23#include "grapheme.h"
24#include "grapheme_util.h"
25
26#include <unicode/utypes.h>
27#include <unicode/ucol.h>
28#include <unicode/ustring.h>
29#include <unicode/ubrk.h>
30
31#include "ext/standard/php_string.h"
32
33/* }}} */
34
/* Extraction modes accepted by grapheme_extract(); the numeric values are
 * exported to userland as the GRAPHEME_EXTR_* constants and are also used as
 * indices into the iterator dispatch table, so they must stay contiguous. */
enum {
    GRAPHEME_EXTRACT_TYPE_COUNT    = 0, /* size counts grapheme clusters */
    GRAPHEME_EXTRACT_TYPE_MAXBYTES = 1, /* size is a byte limit */
    GRAPHEME_EXTRACT_TYPE_MAXCHARS = 2, /* size is a character limit */

    /* inclusive bounds used to validate a caller-supplied extract type */
    GRAPHEME_EXTRACT_TYPE_MIN = GRAPHEME_EXTRACT_TYPE_COUNT,
    GRAPHEME_EXTRACT_TYPE_MAX = GRAPHEME_EXTRACT_TYPE_MAXCHARS
};
40
41
42/* {{{ grapheme_register_constants
43 * Register API constants
44 */
45void grapheme_register_constants( INIT_FUNC_ARGS )
46{
47    REGISTER_LONG_CONSTANT("GRAPHEME_EXTR_COUNT", GRAPHEME_EXTRACT_TYPE_COUNT, CONST_CS | CONST_PERSISTENT);
48    REGISTER_LONG_CONSTANT("GRAPHEME_EXTR_MAXBYTES", GRAPHEME_EXTRACT_TYPE_MAXBYTES, CONST_CS | CONST_PERSISTENT);
49    REGISTER_LONG_CONSTANT("GRAPHEME_EXTR_MAXCHARS", GRAPHEME_EXTRACT_TYPE_MAXCHARS, CONST_CS | CONST_PERSISTENT);
50}
51/* }}} */
52
53/* {{{ proto size_t grapheme_strlen(string str)
54   Get number of graphemes in a string */
55PHP_FUNCTION(grapheme_strlen)
56{
57    char* string;
58    size_t string_len;
59    UChar* ustring = NULL;
60    int ustring_len = 0;
61    zend_long ret_len;
62    UErrorCode status;
63
64    if (zend_parse_parameters(ZEND_NUM_ARGS(), "s", &string, &string_len) == FAILURE) {
65        intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
66             "grapheme_strlen: unable to parse input param", 0 );
67        RETURN_FALSE;
68    }
69
70    ret_len = grapheme_ascii_check((unsigned char *)string, string_len);
71
72    if ( ret_len >= 0 )
73        RETURN_LONG(string_len);
74
75    /* convert the string to UTF-16. */
76    status = U_ZERO_ERROR;
77    intl_convert_utf8_to_utf16(&ustring, &ustring_len, string, string_len, &status );
78
79    if ( U_FAILURE( status ) ) {
80        /* Set global error code. */
81        intl_error_set_code( NULL, status );
82
83        /* Set error messages. */
84        intl_error_set_custom_msg( NULL, "Error converting input string to UTF-16", 0 );
85        if (ustring) {
86            efree( ustring );
87        }
88        RETURN_NULL();
89    }
90
91    ret_len = grapheme_split_string(ustring, ustring_len, NULL, 0 );
92
93    if (ustring) {
94        efree( ustring );
95    }
96
97    if (ret_len >= 0) {
98        RETVAL_LONG(ret_len);
99    } else {
100        RETVAL_FALSE;
101    }
102}
103/* }}} */
104
105/* {{{ proto int grapheme_strpos(string haystack, string needle [, int offset ])
106   Find position of first occurrence of a string within another */
107PHP_FUNCTION(grapheme_strpos)
108{
109    char *haystack, *needle;
110    size_t haystack_len, needle_len;
111    const char *found;
112    zend_long loffset = 0;
113    int32_t offset = 0;
114    zend_long ret_pos;
115
116    if (zend_parse_parameters(ZEND_NUM_ARGS(), "ss|l", &haystack, &haystack_len, &needle, &needle_len, &loffset) == FAILURE) {
117        intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
118             "grapheme_strpos: unable to parse input param", 0 );
119        RETURN_FALSE;
120    }
121
122    if ( OUTSIDE_STRING(loffset, haystack_len) ) {
123        intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_strpos: Offset not contained in string", 1 );
124        RETURN_FALSE;
125    }
126
127    /* we checked that it will fit: */
128    offset = (int32_t) loffset;
129
130    /* the offset is 'grapheme count offset' so it still might be invalid - we'll check it later */
131
132    if (needle_len == 0) {
133        intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_strpos: Empty delimiter", 1 );
134        RETURN_FALSE;
135    }
136
137
138    /* quick check to see if the string might be there
139     * I realize that 'offset' is 'grapheme count offset' but will work in spite of that
140    */
141    found = php_memnstr(haystack + offset, needle, needle_len, haystack + haystack_len);
142
143    /* if it isn't there the we are done */
144    if (!found) {
145        RETURN_FALSE;
146    }
147
148    /* if it is there, and if the haystack is ascii, we are all done */
149    if ( grapheme_ascii_check((unsigned char *)haystack, haystack_len) >= 0 ) {
150        RETURN_LONG(found - haystack);
151    }
152
153    /* do utf16 part of the strpos */
154    ret_pos = grapheme_strpos_utf16(haystack, haystack_len, needle, needle_len, offset, NULL, 0 /* fIgnoreCase */, 0 /* last */ );
155
156    if ( ret_pos >= 0 ) {
157        RETURN_LONG(ret_pos);
158    } else {
159        RETURN_FALSE;
160    }
161
162}
163/* }}} */
164
165/* {{{ proto int grapheme_stripos(string haystack, string needle [, int offset ])
166   Find position of first occurrence of a string within another, ignoring case differences */
167PHP_FUNCTION(grapheme_stripos)
168{
169    char *haystack, *needle, *haystack_dup, *needle_dup;
170    size_t haystack_len, needle_len;
171    const char *found;
172    zend_long loffset = 0;
173    int32_t offset = 0;
174    zend_long ret_pos;
175    int is_ascii;
176
177    if (zend_parse_parameters(ZEND_NUM_ARGS(), "ss|l", &haystack, &haystack_len, &needle, &needle_len, &loffset) == FAILURE) {
178        intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
179             "grapheme_stripos: unable to parse input param", 0 );
180        RETURN_FALSE;
181    }
182
183    if ( OUTSIDE_STRING(loffset, haystack_len) ) {
184        intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_stripos: Offset not contained in string", 1 );
185        RETURN_FALSE;
186    }
187
188    /* we checked that it will fit: */
189    offset = (int32_t) loffset;
190
191    /* the offset is 'grapheme count offset' so it still might be invalid - we'll check it later */
192
193    if (needle_len == 0) {
194        intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_stripos: Empty delimiter", 1 );
195        RETURN_FALSE;
196    }
197
198
199    is_ascii = ( grapheme_ascii_check((unsigned char*)haystack, haystack_len) >= 0 );
200
201    if ( is_ascii ) {
202        needle_dup = estrndup(needle, needle_len);
203        php_strtolower(needle_dup, needle_len);
204        haystack_dup = estrndup(haystack, haystack_len);
205        php_strtolower(haystack_dup, haystack_len);
206
207        found = php_memnstr(haystack_dup + offset, needle_dup, needle_len, haystack_dup + haystack_len);
208
209        efree(haystack_dup);
210        efree(needle_dup);
211
212        if (found) {
213            RETURN_LONG(found - haystack_dup);
214        }
215
216        /* if needle was ascii too, we are all done, otherwise we need to try using Unicode to see what we get */
217        if ( grapheme_ascii_check((unsigned char *)needle, needle_len) >= 0 ) {
218            RETURN_FALSE;
219        }
220    }
221
222    /* do utf16 part of the strpos */
223    ret_pos = grapheme_strpos_utf16(haystack, haystack_len, needle, needle_len, offset, NULL, 1 /* fIgnoreCase */, 0 /*last */ );
224
225    if ( ret_pos >= 0 ) {
226        RETURN_LONG(ret_pos);
227    } else {
228        RETURN_FALSE;
229    }
230
231}
232/* }}} */
233
234/* {{{ proto int grapheme_strrpos(string haystack, string needle [, int offset])
235   Find position of last occurrence of a string within another */
236PHP_FUNCTION(grapheme_strrpos)
237{
238    char *haystack, *needle;
239    size_t haystack_len, needle_len;
240    zend_long loffset = 0;
241    int32_t offset = 0;
242    zend_long ret_pos;
243    int is_ascii;
244
245    if (zend_parse_parameters(ZEND_NUM_ARGS(), "ss|l", &haystack, &haystack_len, &needle, &needle_len, &loffset) == FAILURE) {
246        intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
247             "grapheme_strrpos: unable to parse input param", 0 );
248        RETURN_FALSE;
249    }
250
251    if ( OUTSIDE_STRING(loffset, haystack_len) ) {
252        intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_strpos: Offset not contained in string", 1 );
253        RETURN_FALSE;
254    }
255
256    /* we checked that it will fit: */
257    offset = (int32_t) loffset;
258
259    /* the offset is 'grapheme count offset' so it still might be invalid - we'll check it later */
260
261    if (needle_len == 0) {
262        intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_strpos: Empty delimiter", 1 );
263        RETURN_FALSE;
264    }
265
266    is_ascii = grapheme_ascii_check((unsigned char *)haystack, haystack_len) >= 0;
267
268    if ( is_ascii ) {
269
270        ret_pos = grapheme_strrpos_ascii(haystack, haystack_len, needle, needle_len, offset);
271
272        if ( ret_pos >= 0 ) {
273            RETURN_LONG(ret_pos);
274        }
275
276        /* if the needle was ascii too, we are done */
277
278        if (  grapheme_ascii_check((unsigned char *)needle, needle_len) >= 0 ) {
279            RETURN_FALSE;
280        }
281
282        /* else we need to continue via utf16 */
283    }
284
285    ret_pos = grapheme_strpos_utf16(haystack, haystack_len, needle, needle_len, offset, NULL, 0 /* f_ignore_case */, 1/* last */);
286
287    if ( ret_pos >= 0 ) {
288        RETURN_LONG(ret_pos);
289    } else {
290        RETURN_FALSE;
291    }
292
293
294}
295/* }}} */
296
297/* {{{ proto int grapheme_strripos(string haystack, string needle [, int offset])
298   Find position of last occurrence of a string within another, ignoring case */
299PHP_FUNCTION(grapheme_strripos)
300{
301    char *haystack, *needle;
302    size_t haystack_len, needle_len;
303    zend_long loffset = 0;
304    int32_t offset = 0;
305    zend_long ret_pos;
306    int is_ascii;
307
308    if (zend_parse_parameters(ZEND_NUM_ARGS(), "ss|l", &haystack, &haystack_len, &needle, &needle_len, &loffset) == FAILURE) {
309        intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
310             "grapheme_strrpos: unable to parse input param", 0 );
311        RETURN_FALSE;
312    }
313
314    if ( OUTSIDE_STRING(loffset, haystack_len) ) {
315        intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_strpos: Offset not contained in string", 1 );
316        RETURN_FALSE;
317    }
318
319    /* we checked that it will fit: */
320    offset = (int32_t) loffset;
321
322    /* the offset is 'grapheme count offset' so it still might be invalid - we'll check it later */
323
324    if (needle_len == 0) {
325        intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_strpos: Empty delimiter", 1 );
326        RETURN_FALSE;
327    }
328
329    is_ascii = grapheme_ascii_check((unsigned char *)haystack, haystack_len) >= 0;
330
331    if ( is_ascii ) {
332        char *needle_dup, *haystack_dup;
333
334        needle_dup = estrndup(needle, needle_len);
335        php_strtolower(needle_dup, needle_len);
336        haystack_dup = estrndup(haystack, haystack_len);
337        php_strtolower(haystack_dup, haystack_len);
338
339        ret_pos = grapheme_strrpos_ascii(haystack_dup, haystack_len, needle_dup, needle_len, offset);
340
341        efree(haystack_dup);
342        efree(needle_dup);
343
344        if ( ret_pos >= 0 ) {
345            RETURN_LONG(ret_pos);
346        }
347
348        /* if the needle was ascii too, we are done */
349
350        if (  grapheme_ascii_check((unsigned char *)needle, needle_len) >= 0 ) {
351            RETURN_FALSE;
352        }
353
354        /* else we need to continue via utf16 */
355    }
356
357    ret_pos = grapheme_strpos_utf16(haystack, haystack_len, needle, needle_len, offset, NULL,  1 /* f_ignore_case */, 1 /*last */);
358
359    if ( ret_pos >= 0 ) {
360        RETURN_LONG(ret_pos);
361    } else {
362        RETURN_FALSE;
363    }
364
365
366}
367/* }}} */
368
369/* {{{ proto string grapheme_substr(string str, int start [, int length])
370   Returns part of a string */
371PHP_FUNCTION(grapheme_substr)
372{
373    char *str;
374    zend_string *u8_sub_str;
375    UChar *ustr;
376    size_t str_len;
377    int32_t ustr_len;
378    zend_long lstart = 0, length = 0;
379    int32_t start = 0;
380    int iter_val;
381    UErrorCode status;
382    unsigned char u_break_iterator_buffer[U_BRK_SAFECLONE_BUFFERSIZE];
383    UBreakIterator* bi = NULL;
384    int sub_str_start_pos, sub_str_end_pos;
385    int32_t (*iter_func)(UBreakIterator *);
386    int no_length = 1;
387
388    if (zend_parse_parameters(ZEND_NUM_ARGS(), "sl|l!", (char **)&str, &str_len, &lstart, &length, &no_length) == FAILURE) {
389        intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
390             "grapheme_substr: unable to parse input param", 0 );
391        RETURN_FALSE;
392    }
393
394    if ( OUTSIDE_STRING(lstart, str_len)) {
395        intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_substr: start not contained in string", 1 );
396        RETURN_FALSE;
397    }
398
399    /* we checked that it will fit: */
400    start = (int32_t) lstart;
401
402    if(no_length) {
403        length = str_len;
404    }
405
406    if(length < INT32_MIN) {
407        length = INT32_MIN;
408    } else if(length > INT32_MAX) {
409        length = INT32_MAX;
410    }
411
412    /* the offset is 'grapheme count offset' so it still might be invalid - we'll check it later */
413
414    if ( grapheme_ascii_check((unsigned char *)str, str_len) >= 0 ) {
415        int32_t asub_str_len;
416        char *sub_str;
417        grapheme_substr_ascii(str, str_len, start, (int32_t)length, &sub_str, &asub_str_len);
418
419        if ( NULL == sub_str ) {
420            intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_substr: invalid parameters", 1 );
421            RETURN_FALSE;
422        }
423
424        RETURN_STRINGL(sub_str, asub_str_len);
425    }
426
427    ustr = NULL;
428    ustr_len = 0;
429    status = U_ZERO_ERROR;
430    intl_convert_utf8_to_utf16(&ustr, &ustr_len, str, str_len, &status);
431
432    if ( U_FAILURE( status ) ) {
433        /* Set global error code. */
434        intl_error_set_code( NULL, status );
435
436        /* Set error messages. */
437        intl_error_set_custom_msg( NULL, "Error converting input string to UTF-16", 0 );
438        if (ustr) {
439            efree( ustr );
440        }
441        RETURN_FALSE;
442    }
443
444    bi = grapheme_get_break_iterator((void*)u_break_iterator_buffer, &status );
445
446    if( U_FAILURE(status) ) {
447        RETURN_FALSE;
448    }
449
450    ubrk_setText(bi, ustr, ustr_len,    &status);
451
452    if ( start < 0 ) {
453        iter_func = ubrk_previous;
454        ubrk_last(bi);
455        iter_val = 1;
456    }
457    else {
458        iter_func = ubrk_next;
459        iter_val = -1;
460    }
461
462    sub_str_start_pos = 0;
463
464    while ( start ) {
465        sub_str_start_pos = iter_func(bi);
466
467        if ( UBRK_DONE == sub_str_start_pos ) {
468            break;
469        }
470
471        start += iter_val;
472    }
473
474    if ( 0 != start || sub_str_start_pos >= ustr_len ) {
475
476        intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_substr: start not contained in string", 1 );
477
478        if (ustr) {
479            efree(ustr);
480        }
481        ubrk_close(bi);
482        RETURN_FALSE;
483    }
484
485    /* OK to convert here since if str_len were big, convert above would fail */
486    if (length >= (int32_t)str_len) {
487
488        /* no length supplied or length is too big, return the rest of the string */
489
490        status = U_ZERO_ERROR;
491        u8_sub_str = intl_convert_utf16_to_utf8(ustr + sub_str_start_pos, ustr_len - sub_str_start_pos, &status);
492
493        if (ustr) {
494            efree( ustr );
495        }
496        ubrk_close( bi );
497
498        if ( !u8_sub_str ) {
499            /* Set global error code. */
500            intl_error_set_code( NULL, status );
501
502            /* Set error messages. */
503            intl_error_set_custom_msg( NULL, "Error converting output string to UTF-8", 0 );
504
505            RETURN_FALSE;
506        }
507
508        /* return the allocated string, not a duplicate */
509        RETVAL_NEW_STR(u8_sub_str);
510        return;
511    }
512
513    if(length == 0) {
514        /* empty length - we've validated start, we can return "" now */
515        if (ustr) {
516            efree(ustr);
517        }
518        ubrk_close(bi);
519        RETURN_EMPTY_STRING();
520    }
521
522    /* find the end point of the string to return */
523
524    if ( length < 0 ) {
525        iter_func = ubrk_previous;
526        ubrk_last(bi);
527        iter_val = 1;
528    }
529    else {
530        iter_func = ubrk_next;
531        iter_val = -1;
532    }
533
534    sub_str_end_pos = 0;
535
536    while ( length ) {
537        sub_str_end_pos = iter_func(bi);
538
539        if ( UBRK_DONE == sub_str_end_pos ) {
540            break;
541        }
542
543        length += iter_val;
544    }
545
546    ubrk_close(bi);
547
548    if ( UBRK_DONE == sub_str_end_pos) {
549        if(length < 0) {
550            intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_substr: length not contained in string", 1 );
551
552            efree(ustr);
553            RETURN_FALSE;
554        } else {
555            sub_str_end_pos = ustr_len;
556        }
557    }
558
559    if(sub_str_start_pos > sub_str_end_pos) {
560        intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_substr: length is beyond start", 1 );
561
562        efree(ustr);
563        RETURN_FALSE;
564    }
565
566    status = U_ZERO_ERROR;
567    u8_sub_str = intl_convert_utf16_to_utf8(ustr + sub_str_start_pos, ( sub_str_end_pos - sub_str_start_pos ), &status);
568
569    efree( ustr );
570
571    if ( !u8_sub_str ) {
572        /* Set global error code. */
573        intl_error_set_code( NULL, status );
574
575        /* Set error messages. */
576        intl_error_set_custom_msg( NULL, "Error converting output string to UTF-8", 0 );
577
578        RETURN_FALSE;
579    }
580
581     /* return the allocated string, not a duplicate */
582    RETVAL_NEW_STR(u8_sub_str);
583}
584/* }}} */
585
586/* {{{  strstr_common_handler */
587static void strstr_common_handler(INTERNAL_FUNCTION_PARAMETERS, int f_ignore_case)
588{
589    char *haystack, *needle;
590    const char *found;
591    size_t haystack_len, needle_len;
592    int32_t ret_pos, uchar_pos;
593    zend_bool part = 0;
594
595    if (zend_parse_parameters(ZEND_NUM_ARGS(), "ss|b", &haystack, &haystack_len, &needle, &needle_len, &part) == FAILURE) {
596
597        intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
598             "grapheme_strstr: unable to parse input param", 0 );
599
600        RETURN_FALSE;
601    }
602
603    if (needle_len == 0) {
604
605        intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_strpos: Empty delimiter", 1 );
606
607        RETURN_FALSE;
608    }
609
610
611    if ( !f_ignore_case ) {
612
613        /* ASCII optimization: quick check to see if the string might be there
614         * I realize that 'offset' is 'grapheme count offset' but will work in spite of that
615        */
616        found = php_memnstr(haystack, needle, needle_len, haystack + haystack_len);
617
618        /* if it isn't there the we are done */
619        if ( !found ) {
620            RETURN_FALSE;
621        }
622
623        /* if it is there, and if the haystack is ascii, we are all done */
624        if ( grapheme_ascii_check((unsigned char *)haystack, haystack_len) >= 0 ) {
625            size_t found_offset = found - haystack;
626
627            if (part) {
628                RETURN_STRINGL(haystack, found_offset);
629            } else {
630                RETURN_STRINGL(found, haystack_len - found_offset);
631            }
632        }
633
634    }
635
636    /* need to work in utf16 */
637    ret_pos = grapheme_strpos_utf16(haystack, haystack_len, needle, needle_len, 0, &uchar_pos, f_ignore_case, 0 /*last */ );
638
639    if ( ret_pos < 0 ) {
640        RETURN_FALSE;
641    }
642
643    /* uchar_pos is the 'nth' Unicode character position of the needle */
644
645    ret_pos = 0;
646    U8_FWD_N(haystack, ret_pos, haystack_len, uchar_pos);
647
648    if (part) {
649        RETURN_STRINGL(haystack, ret_pos);
650    } else {
651        RETURN_STRINGL(haystack + ret_pos, haystack_len - ret_pos);
652    }
653
654}
655/* }}} */
656
657/* {{{ proto string grapheme_strstr(string haystack, string needle[, bool part])
658   Finds first occurrence of a string within another */
659PHP_FUNCTION(grapheme_strstr)
660{
661    strstr_common_handler(INTERNAL_FUNCTION_PARAM_PASSTHRU, 0 /* f_ignore_case */);
662}
663/* }}} */
664
665/* {{{ proto string grapheme_stristr(string haystack, string needle[, bool part])
666   Finds first occurrence of a string within another */
667PHP_FUNCTION(grapheme_stristr)
668{
669    strstr_common_handler(INTERNAL_FUNCTION_PARAM_PASSTHRU, 1 /* f_ignore_case */);
670}
671/* }}} */
672
673/* {{{ grapheme_extract_charcount_iter - grapheme iterator for grapheme_extract MAXCHARS */
674static inline int32_t
675grapheme_extract_charcount_iter(UBreakIterator *bi, int32_t csize, unsigned char *pstr, int32_t str_len)
676{
677    int pos = 0, prev_pos = 0;
678    int ret_pos = 0, prev_ret_pos = 0;
679
680    while ( 1 ) {
681        pos = ubrk_next(bi);
682
683        if ( UBRK_DONE == pos ) {
684            break;
685        }
686
687        /* if we are beyond our limit, then the loop is done */
688        if ( pos > csize ) {
689            break;
690        }
691
692        /* update our pointer in the original UTF-8 buffer by as many characters
693           as ubrk_next iterated over */
694
695        prev_ret_pos = ret_pos;
696        U8_FWD_N(pstr, ret_pos, str_len, pos - prev_pos);
697
698        if ( prev_ret_pos == ret_pos ) {
699            /* something wrong - malformed utf8? */
700            break;
701        }
702
703        prev_pos = pos;
704    }
705
706    return ret_pos;
707}
708/* }}} */
709
710/* {{{ grapheme_extract_bytecount_iter - grapheme iterator for grapheme_extract MAXBYTES */
711static inline int32_t
712grapheme_extract_bytecount_iter(UBreakIterator *bi, int32_t bsize, unsigned char *pstr, int32_t str_len)
713{
714    int pos = 0, prev_pos = 0;
715    int ret_pos = 0, prev_ret_pos = 0;
716
717    while ( 1 ) {
718        pos = ubrk_next(bi);
719
720        if ( UBRK_DONE == pos ) {
721            break;
722        }
723
724        prev_ret_pos = ret_pos;
725        U8_FWD_N(pstr, ret_pos, str_len, pos - prev_pos);
726
727        if ( ret_pos > bsize ) {
728            ret_pos = prev_ret_pos;
729            break;
730        }
731
732        if ( prev_ret_pos == ret_pos ) {
733            /* something wrong - malformed utf8? */
734            break;
735        }
736
737        prev_pos = pos;
738    }
739
740    return ret_pos;
741}
742/* }}} */
743
744/* {{{ grapheme_extract_count_iter - grapheme iterator for grapheme_extract COUNT */
745static inline int32_t
746grapheme_extract_count_iter(UBreakIterator *bi, int32_t size, unsigned char *pstr, int32_t str_len)
747{
748    int pos = 0, next_pos = 0;
749    int ret_pos = 0;
750
751    while ( size ) {
752        next_pos = ubrk_next(bi);
753
754        if ( UBRK_DONE == next_pos ) {
755            break;
756        }
757        pos = next_pos;
758        size--;
759    }
760
761    /* pos is one past the last UChar - and represent the number of code units to
762        advance in the utf-8 buffer
763    */
764
765    U8_FWD_N(pstr, ret_pos, str_len, pos);
766
767    return ret_pos;
768}
769/* }}} */
770
771/* {{{ grapheme extract iter function pointer array */
772typedef int32_t (*grapheme_extract_iter)(UBreakIterator * /*bi*/, int32_t /*size*/, unsigned char * /*pstr*/, int32_t /*str_len*/);
773
774static grapheme_extract_iter grapheme_extract_iters[] = {
775    &grapheme_extract_count_iter,
776    &grapheme_extract_bytecount_iter,
777    &grapheme_extract_charcount_iter,
778};
779/* }}} */
780
781/* {{{ proto string grapheme_extract(string str, int size[, int extract_type[, int start[, int next]]])
782    Function to extract a sequence of default grapheme clusters */
783PHP_FUNCTION(grapheme_extract)
784{
785    char *str, *pstr;
786    UChar *ustr;
787    size_t str_len;
788    int32_t ustr_len;
789    zend_long size; /* maximum number of grapheme clusters, bytes, or characters (based on extract_type) to return */
790    zend_long lstart = 0; /* starting position in str in bytes */
791    int32_t start = 0;
792    zend_long extract_type = GRAPHEME_EXTRACT_TYPE_COUNT;
793    UErrorCode status;
794    unsigned char u_break_iterator_buffer[U_BRK_SAFECLONE_BUFFERSIZE];
795    UBreakIterator* bi = NULL;
796    int ret_pos;
797    zval *next = NULL; /* return offset of next part of the string */
798
799    if (zend_parse_parameters(ZEND_NUM_ARGS(), "sl|llz", &str, &str_len, &size, &extract_type, &lstart, &next) == FAILURE) {
800        intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
801             "grapheme_extract: unable to parse input param", 0 );
802        RETURN_FALSE;
803    }
804
805    if ( NULL != next ) {
806        if ( !Z_ISREF_P(next) ) {
807            intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
808                 "grapheme_extract: 'next' was not passed by reference", 0 );
809            RETURN_FALSE;
810        } else {
811            ZVAL_DEREF(next);
812            /* initialize next */
813            SEPARATE_ZVAL(next);
814            zval_dtor(next);
815            ZVAL_LONG(next, lstart);
816        }
817    }
818
819    if ( extract_type < GRAPHEME_EXTRACT_TYPE_MIN || extract_type > GRAPHEME_EXTRACT_TYPE_MAX ) {
820        intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
821             "grapheme_extract: unknown extract type param", 0 );
822        RETURN_FALSE;
823    }
824
825    if ( lstart > INT32_MAX || lstart < 0 || lstart >= str_len ) {
826        intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_extract: start not contained in string", 0 );
827        RETURN_FALSE;
828    }
829
830    if ( size > INT32_MAX || size < 0) {
831        intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_extract: size is invalid", 0 );
832        RETURN_FALSE;
833    }
834    if (size == 0) {
835        RETURN_EMPTY_STRING();
836    }
837
838    /* we checked that it will fit: */
839    start = (int32_t) lstart;
840
841    pstr = str + start;
842
843    /* just in case pstr points in the middle of a character, move forward to the start of the next char */
844    if ( !UTF8_IS_SINGLE(*pstr) && !U8_IS_LEAD(*pstr) ) {
845        char *str_end = str + str_len;
846
847        while ( !UTF8_IS_SINGLE(*pstr) && !U8_IS_LEAD(*pstr) ) {
848            pstr++;
849            if ( pstr >= str_end ) {
850                intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
851                                "grapheme_extract: invalid input string", 0 );
852
853                RETURN_FALSE;
854            }
855        }
856    }
857
858    str_len -= (pstr - str);
859
860    /* if the string is all ASCII up to size+1 - or str_len whichever is first - then we are done.
861        (size + 1 because the size-th character might be the beginning of a grapheme cluster)
862     */
863
864    if ( -1 != grapheme_ascii_check((unsigned char *)pstr, MIN(size + 1, str_len)) ) {
865        size_t nsize = MIN(size, str_len);
866        if ( NULL != next ) {
867            ZVAL_LONG(next, start+nsize);
868        }
869        RETURN_STRINGL(pstr, nsize);
870    }
871
872    /* convert the strings to UTF-16. */
873    ustr = NULL;
874    ustr_len = 0;
875    status = U_ZERO_ERROR;
876    intl_convert_utf8_to_utf16(&ustr, &ustr_len, pstr, str_len, &status );
877
878    if ( U_FAILURE( status ) ) {
879        /* Set global error code. */
880        intl_error_set_code( NULL, status );
881
882        /* Set error messages. */
883        intl_error_set_custom_msg( NULL, "Error converting input string to UTF-16", 0 );
884
885        if ( NULL != ustr )
886            efree( ustr );
887
888        RETURN_FALSE;
889    }
890
891    bi = NULL;
892    status = U_ZERO_ERROR;
893    bi = grapheme_get_break_iterator(u_break_iterator_buffer, &status );
894
895    ubrk_setText(bi, ustr, ustr_len, &status);
896
897    /* if the caller put us in the middle of a grapheme, we can't detect it in all cases since we
898        can't back up. So, we will not do anything. */
899
900    /* now we need to find the end of the chunk the user wants us to return */
901    /* it's ok to convert str_len to in32_t since if it were too big intl_convert_utf8_to_utf16 above would fail */
902    ret_pos = (*grapheme_extract_iters[extract_type])(bi, size, (unsigned char *)pstr, (int32_t)str_len);
903
904    if (ustr) {
905        efree(ustr);
906    }
907    ubrk_close(bi);
908
909    if ( NULL != next ) {
910        ZVAL_LONG(next, start+ret_pos);
911    }
912
913    RETURN_STRINGL(((char *)pstr), ret_pos);
914}
915
916/* }}} */
917
918/*
919 * Local variables:
920 * tab-width: 4
921 * c-basic-offset: 4
922 * End:
923 * vim600: fdm=marker
924 * vim: noet sw=4 ts=4
925 */
926
927