1/*
2   +----------------------------------------------------------------------+
3   | PHP Version 5                                                        |
4   +----------------------------------------------------------------------+
5   | This source file is subject to version 3.01 of the PHP license,      |
6   | that is bundled with this package in the file LICENSE, and is        |
7   | available through the world-wide-web at the following url:           |
8   | http://www.php.net/license/3_01.txt                                  |
9   | If you did not receive a copy of the PHP license and are unable to   |
10   | obtain it through the world-wide-web, please send a note to          |
11   | license@php.net so we can mail you a copy immediately.               |
12   +----------------------------------------------------------------------+
13   | Author: Ed Batutis <ed@batutis.com>                                  |
14   +----------------------------------------------------------------------+
15 */
16
17/* {{{ includes */
18#ifdef HAVE_CONFIG_H
19#include "config.h"
20#endif
21
22#include <php.h>
23#include "grapheme.h"
24#include "grapheme_util.h"
25
26#include <unicode/utypes.h>
27#include <unicode/ucol.h>
28#include <unicode/ustring.h>
29#include <unicode/ubrk.h>
30
31#include "ext/standard/php_string.h"
32
33/* }}} */
34
/* Extraction modes understood by grapheme_extract(). The numeric values are
   also used to index the grapheme_extract_iters[] dispatch table, so they
   must stay contiguous and start at zero. */
#define GRAPHEME_EXTRACT_TYPE_COUNT     (0)
#define GRAPHEME_EXTRACT_TYPE_MAXBYTES  (1)
#define GRAPHEME_EXTRACT_TYPE_MAXCHARS  (2)
/* Inclusive bounds used to validate a caller-supplied extract type. */
#define GRAPHEME_EXTRACT_TYPE_MIN   GRAPHEME_EXTRACT_TYPE_COUNT
#define GRAPHEME_EXTRACT_TYPE_MAX   GRAPHEME_EXTRACT_TYPE_MAXCHARS
40
41
42/* {{{ grapheme_register_constants
43 * Register API constants
44 */
45void grapheme_register_constants( INIT_FUNC_ARGS )
46{
47    REGISTER_LONG_CONSTANT("GRAPHEME_EXTR_COUNT", GRAPHEME_EXTRACT_TYPE_COUNT, CONST_CS | CONST_PERSISTENT);
48    REGISTER_LONG_CONSTANT("GRAPHEME_EXTR_MAXBYTES", GRAPHEME_EXTRACT_TYPE_MAXBYTES, CONST_CS | CONST_PERSISTENT);
49    REGISTER_LONG_CONSTANT("GRAPHEME_EXTR_MAXCHARS", GRAPHEME_EXTRACT_TYPE_MAXCHARS, CONST_CS | CONST_PERSISTENT);
50}
51/* }}} */
52
53/* {{{ proto size_t grapheme_strlen(string str)
54   Get number of graphemes in a string */
55PHP_FUNCTION(grapheme_strlen)
56{
57    char* string;
58    size_t string_len;
59    UChar* ustring = NULL;
60    int ustring_len = 0;
61    zend_long ret_len;
62    UErrorCode status;
63
64    if (zend_parse_parameters(ZEND_NUM_ARGS(), "s", &string, &string_len) == FAILURE) {
65        intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
66             "grapheme_strlen: unable to parse input param", 0 );
67        RETURN_FALSE;
68    }
69
70    ret_len = grapheme_ascii_check((unsigned char *)string, string_len);
71
72    if ( ret_len >= 0 )
73        RETURN_LONG(string_len);
74
75    /* convert the string to UTF-16. */
76    status = U_ZERO_ERROR;
77    intl_convert_utf8_to_utf16(&ustring, &ustring_len, string, string_len, &status );
78
79    if ( U_FAILURE( status ) ) {
80        /* Set global error code. */
81        intl_error_set_code( NULL, status );
82
83        /* Set error messages. */
84        intl_error_set_custom_msg( NULL, "Error converting input string to UTF-16", 0 );
85        if (ustring) {
86            efree( ustring );
87        }
88        RETURN_NULL();
89    }
90
91    ret_len = grapheme_split_string(ustring, ustring_len, NULL, 0 );
92
93    if (ustring) {
94        efree( ustring );
95    }
96
97    if (ret_len >= 0) {
98        RETVAL_LONG(ret_len);
99    } else {
100        RETVAL_FALSE;
101    }
102}
103/* }}} */
104
105/* {{{ proto int grapheme_strpos(string haystack, string needle [, int offset ])
106   Find position of first occurrence of a string within another */
107PHP_FUNCTION(grapheme_strpos)
108{
109    char *haystack, *needle;
110    size_t haystack_len, needle_len;
111    const char *found;
112    zend_long loffset = 0;
113    int32_t offset = 0;
114    zend_long ret_pos;
115
116    if (zend_parse_parameters(ZEND_NUM_ARGS(), "ss|l", &haystack, &haystack_len, &needle, &needle_len, &loffset) == FAILURE) {
117        intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
118             "grapheme_strpos: unable to parse input param", 0 );
119        RETURN_FALSE;
120    }
121
122    if ( OUTSIDE_STRING(loffset, haystack_len) ) {
123        intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_strpos: Offset not contained in string", 1 );
124        RETURN_FALSE;
125    }
126
127    /* we checked that it will fit: */
128    offset = (int32_t) loffset;
129
130    /* the offset is 'grapheme count offset' so it still might be invalid - we'll check it later */
131
132    if (needle_len == 0) {
133        intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_strpos: Empty delimiter", 1 );
134        RETURN_FALSE;
135    }
136
137
138    /* quick check to see if the string might be there
139     * I realize that 'offset' is 'grapheme count offset' but will work in spite of that
140    */
141    found = php_memnstr(haystack + offset, needle, needle_len, haystack + haystack_len);
142
143    /* if it isn't there the we are done */
144    if (!found) {
145        RETURN_FALSE;
146    }
147
148    /* if it is there, and if the haystack is ascii, we are all done */
149    if ( grapheme_ascii_check((unsigned char *)haystack, haystack_len) >= 0 ) {
150        RETURN_LONG(found - haystack);
151    }
152
153    /* do utf16 part of the strpos */
154    ret_pos = grapheme_strpos_utf16(haystack, haystack_len, needle, needle_len, offset, NULL, 0 /* fIgnoreCase */, 0 /* last */ );
155
156    if ( ret_pos >= 0 ) {
157        RETURN_LONG(ret_pos);
158    } else {
159        RETURN_FALSE;
160    }
161
162}
163/* }}} */
164
165/* {{{ proto int grapheme_stripos(string haystack, string needle [, int offset ])
166   Find position of first occurrence of a string within another, ignoring case differences */
167PHP_FUNCTION(grapheme_stripos)
168{
169    char *haystack, *needle, *haystack_dup, *needle_dup;
170    size_t haystack_len, needle_len;
171    const char *found;
172    zend_long loffset = 0;
173    int32_t offset = 0;
174    zend_long ret_pos;
175    int is_ascii;
176
177    if (zend_parse_parameters(ZEND_NUM_ARGS(), "ss|l", &haystack, &haystack_len, &needle, &needle_len, &loffset) == FAILURE) {
178        intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
179             "grapheme_stripos: unable to parse input param", 0 );
180        RETURN_FALSE;
181    }
182
183    if ( OUTSIDE_STRING(loffset, haystack_len) ) {
184        intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_stripos: Offset not contained in string", 1 );
185        RETURN_FALSE;
186    }
187
188    /* we checked that it will fit: */
189    offset = (int32_t) loffset;
190
191    /* the offset is 'grapheme count offset' so it still might be invalid - we'll check it later */
192
193    if (needle_len == 0) {
194        intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_stripos: Empty delimiter", 1 );
195        RETURN_FALSE;
196    }
197
198
199    is_ascii = ( grapheme_ascii_check((unsigned char*)haystack, haystack_len) >= 0 );
200
201    if ( is_ascii ) {
202        needle_dup = estrndup(needle, needle_len);
203        php_strtolower(needle_dup, needle_len);
204        haystack_dup = estrndup(haystack, haystack_len);
205        php_strtolower(haystack_dup, haystack_len);
206
207        found = php_memnstr(haystack_dup + offset, needle_dup, needle_len, haystack_dup + haystack_len);
208
209        efree(haystack_dup);
210        efree(needle_dup);
211
212        if (found) {
213            RETURN_LONG(found - haystack_dup);
214        }
215
216        /* if needle was ascii too, we are all done, otherwise we need to try using Unicode to see what we get */
217        if ( grapheme_ascii_check((unsigned char *)needle, needle_len) >= 0 ) {
218            RETURN_FALSE;
219        }
220    }
221
222    /* do utf16 part of the strpos */
223    ret_pos = grapheme_strpos_utf16(haystack, haystack_len, needle, needle_len, offset, NULL, 1 /* fIgnoreCase */, 0 /*last */ );
224
225    if ( ret_pos >= 0 ) {
226        RETURN_LONG(ret_pos);
227    } else {
228        RETURN_FALSE;
229    }
230
231}
232/* }}} */
233
234/* {{{ proto int grapheme_strrpos(string haystack, string needle [, int offset])
235   Find position of last occurrence of a string within another */
236PHP_FUNCTION(grapheme_strrpos)
237{
238    char *haystack, *needle;
239    size_t haystack_len, needle_len;
240    zend_long loffset = 0;
241    int32_t offset = 0;
242    zend_long ret_pos;
243    int is_ascii;
244
245    if (zend_parse_parameters(ZEND_NUM_ARGS(), "ss|l", &haystack, &haystack_len, &needle, &needle_len, &loffset) == FAILURE) {
246        intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
247             "grapheme_strrpos: unable to parse input param", 0 );
248        RETURN_FALSE;
249    }
250
251    if ( OUTSIDE_STRING(loffset, haystack_len) ) {
252        intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_strpos: Offset not contained in string", 1 );
253        RETURN_FALSE;
254    }
255
256    /* we checked that it will fit: */
257    offset = (int32_t) loffset;
258
259    /* the offset is 'grapheme count offset' so it still might be invalid - we'll check it later */
260
261    if (needle_len == 0) {
262        intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_strpos: Empty delimiter", 1 );
263        RETURN_FALSE;
264    }
265
266    is_ascii = grapheme_ascii_check((unsigned char *)haystack, haystack_len) >= 0;
267
268    if ( is_ascii ) {
269
270        ret_pos = grapheme_strrpos_ascii(haystack, haystack_len, needle, needle_len, offset);
271
272        if ( ret_pos >= 0 ) {
273            RETURN_LONG(ret_pos);
274        }
275
276        /* if the needle was ascii too, we are done */
277
278        if (  grapheme_ascii_check((unsigned char *)needle, needle_len) >= 0 ) {
279            RETURN_FALSE;
280        }
281
282        /* else we need to continue via utf16 */
283    }
284
285    ret_pos = grapheme_strpos_utf16(haystack, haystack_len, needle, needle_len, offset, NULL, 0 /* f_ignore_case */, 1/* last */);
286
287    if ( ret_pos >= 0 ) {
288        RETURN_LONG(ret_pos);
289    } else {
290        RETURN_FALSE;
291    }
292
293
294}
295/* }}} */
296
297/* {{{ proto int grapheme_strripos(string haystack, string needle [, int offset])
298   Find position of last occurrence of a string within another, ignoring case */
299PHP_FUNCTION(grapheme_strripos)
300{
301    char *haystack, *needle;
302    size_t haystack_len, needle_len;
303    zend_long loffset = 0;
304    int32_t offset = 0;
305    zend_long ret_pos;
306    int is_ascii;
307
308    if (zend_parse_parameters(ZEND_NUM_ARGS(), "ss|l", &haystack, &haystack_len, &needle, &needle_len, &loffset) == FAILURE) {
309        intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
310             "grapheme_strrpos: unable to parse input param", 0 );
311        RETURN_FALSE;
312    }
313
314    if ( OUTSIDE_STRING(loffset, haystack_len) ) {
315        intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_strpos: Offset not contained in string", 1 );
316        RETURN_FALSE;
317    }
318
319    /* we checked that it will fit: */
320    offset = (int32_t) loffset;
321
322    /* the offset is 'grapheme count offset' so it still might be invalid - we'll check it later */
323
324    if (needle_len == 0) {
325        intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_strpos: Empty delimiter", 1 );
326        RETURN_FALSE;
327    }
328
329    is_ascii = grapheme_ascii_check((unsigned char *)haystack, haystack_len) >= 0;
330
331    if ( is_ascii ) {
332        char *needle_dup, *haystack_dup;
333
334        needle_dup = estrndup(needle, needle_len);
335        php_strtolower(needle_dup, needle_len);
336        haystack_dup = estrndup(haystack, haystack_len);
337        php_strtolower(haystack_dup, haystack_len);
338
339        ret_pos = grapheme_strrpos_ascii(haystack_dup, haystack_len, needle_dup, needle_len, offset);
340
341        efree(haystack_dup);
342        efree(needle_dup);
343
344        if ( ret_pos >= 0 ) {
345            RETURN_LONG(ret_pos);
346        }
347
348        /* if the needle was ascii too, we are done */
349
350        if (  grapheme_ascii_check((unsigned char *)needle, needle_len) >= 0 ) {
351            RETURN_FALSE;
352        }
353
354        /* else we need to continue via utf16 */
355    }
356
357    ret_pos = grapheme_strpos_utf16(haystack, haystack_len, needle, needle_len, offset, NULL,  1 /* f_ignore_case */, 1 /*last */);
358
359    if ( ret_pos >= 0 ) {
360        RETURN_LONG(ret_pos);
361    } else {
362        RETURN_FALSE;
363    }
364
365
366}
367/* }}} */
368
369/* {{{ proto string grapheme_substr(string str, int start [, int length])
370   Returns part of a string */
371PHP_FUNCTION(grapheme_substr)
372{
373    char *str, *sub_str;
374    UChar *ustr;
375    size_t str_len;
376    int32_t ustr_len;
377    size_t sub_str_len;
378    zend_long lstart = 0, length = 0;
379    int32_t start = 0;
380    int iter_val;
381    UErrorCode status;
382    unsigned char u_break_iterator_buffer[U_BRK_SAFECLONE_BUFFERSIZE];
383    UBreakIterator* bi = NULL;
384    int sub_str_start_pos, sub_str_end_pos;
385    int32_t (*iter_func)(UBreakIterator *);
386    int no_length = 1;
387
388    if (zend_parse_parameters(ZEND_NUM_ARGS(), "sl|l!", (char **)&str, &str_len, &lstart, &length, &no_length) == FAILURE) {
389        intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
390             "grapheme_substr: unable to parse input param", 0 );
391        RETURN_FALSE;
392    }
393
394    if ( OUTSIDE_STRING(lstart, str_len)) {
395        intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_substr: start not contained in string", 1 );
396        RETURN_FALSE;
397    }
398
399    /* we checked that it will fit: */
400    start = (int32_t) lstart;
401
402    if(no_length) {
403        length = str_len;
404    }
405
406    if(length < INT32_MIN) {
407        length = INT32_MIN;
408    } else if(length > INT32_MAX) {
409        length = INT32_MAX;
410    }
411
412    /* the offset is 'grapheme count offset' so it still might be invalid - we'll check it later */
413
414    if ( grapheme_ascii_check((unsigned char *)str, str_len) >= 0 ) {
415        int32_t asub_str_len;
416        grapheme_substr_ascii(str, str_len, start, (int32_t)length, &sub_str, &asub_str_len);
417
418        if ( NULL == sub_str ) {
419            intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_substr: invalid parameters", 1 );
420            RETURN_FALSE;
421        }
422
423        RETURN_STRINGL(sub_str, asub_str_len);
424    }
425
426    ustr = NULL;
427    ustr_len = 0;
428    status = U_ZERO_ERROR;
429    intl_convert_utf8_to_utf16(&ustr, &ustr_len, str, str_len, &status);
430
431    if ( U_FAILURE( status ) ) {
432        /* Set global error code. */
433        intl_error_set_code( NULL, status );
434
435        /* Set error messages. */
436        intl_error_set_custom_msg( NULL, "Error converting input string to UTF-16", 0 );
437        if (ustr) {
438            efree( ustr );
439        }
440        RETURN_FALSE;
441    }
442
443    bi = grapheme_get_break_iterator((void*)u_break_iterator_buffer, &status );
444
445    if( U_FAILURE(status) ) {
446        RETURN_FALSE;
447    }
448
449    ubrk_setText(bi, ustr, ustr_len,    &status);
450
451    if ( start < 0 ) {
452        iter_func = ubrk_previous;
453        ubrk_last(bi);
454        iter_val = 1;
455    }
456    else {
457        iter_func = ubrk_next;
458        iter_val = -1;
459    }
460
461    sub_str_start_pos = 0;
462
463    while ( start ) {
464        sub_str_start_pos = iter_func(bi);
465
466        if ( UBRK_DONE == sub_str_start_pos ) {
467            break;
468        }
469
470        start += iter_val;
471    }
472
473    if ( 0 != start || sub_str_start_pos >= ustr_len ) {
474
475        intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_substr: start not contained in string", 1 );
476
477        if (ustr) {
478            efree(ustr);
479        }
480        ubrk_close(bi);
481        RETURN_FALSE;
482    }
483
484    /* OK to convert here since if str_len were big, convert above would fail */
485    if (length >= (int32_t)str_len) {
486
487        /* no length supplied or length is too big, return the rest of the string */
488
489        sub_str = NULL;
490        sub_str_len = 0;
491        status = U_ZERO_ERROR;
492        intl_convert_utf16_to_utf8(&sub_str, &sub_str_len, ustr + sub_str_start_pos, ustr_len - sub_str_start_pos, &status);
493
494        if (ustr) {
495            efree( ustr );
496        }
497        ubrk_close( bi );
498
499        if ( U_FAILURE( status ) ) {
500            /* Set global error code. */
501            intl_error_set_code( NULL, status );
502
503            /* Set error messages. */
504            intl_error_set_custom_msg( NULL, "Error converting output string to UTF-8", 0 );
505
506            if (sub_str) {
507                efree( sub_str );
508            }
509
510            RETURN_FALSE;
511        }
512
513        /* return the allocated string, not a duplicate */
514        RETVAL_STRINGL(sub_str, sub_str_len);
515        //???
516        efree(sub_str);
517        return;
518    }
519
520    if(length == 0) {
521        /* empty length - we've validated start, we can return "" now */
522        if (ustr) {
523            efree(ustr);
524        }
525        ubrk_close(bi);
526        RETURN_EMPTY_STRING();
527    }
528
529    /* find the end point of the string to return */
530
531    if ( length < 0 ) {
532        iter_func = ubrk_previous;
533        ubrk_last(bi);
534        iter_val = 1;
535    }
536    else {
537        iter_func = ubrk_next;
538        iter_val = -1;
539    }
540
541    sub_str_end_pos = 0;
542
543    while ( length ) {
544        sub_str_end_pos = iter_func(bi);
545
546        if ( UBRK_DONE == sub_str_end_pos ) {
547            break;
548        }
549
550        length += iter_val;
551    }
552
553    ubrk_close(bi);
554
555    if ( UBRK_DONE == sub_str_end_pos) {
556        if(length < 0) {
557            intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_substr: length not contained in string", 1 );
558
559            efree(ustr);
560            RETURN_FALSE;
561        } else {
562            sub_str_end_pos = ustr_len;
563        }
564    }
565
566    if(sub_str_start_pos > sub_str_end_pos) {
567        intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_substr: length is beyond start", 1 );
568
569        efree(ustr);
570        RETURN_FALSE;
571    }
572
573    sub_str = NULL;
574    status = U_ZERO_ERROR;
575    intl_convert_utf16_to_utf8(&sub_str, &sub_str_len, ustr + sub_str_start_pos, ( sub_str_end_pos - sub_str_start_pos ), &status);
576
577    efree( ustr );
578
579    if ( U_FAILURE( status ) ) {
580        /* Set global error code. */
581        intl_error_set_code( NULL, status );
582
583        /* Set error messages. */
584        intl_error_set_custom_msg( NULL, "Error converting output string to UTF-8", 0 );
585
586        if ( NULL != sub_str )
587            efree( sub_str );
588
589        RETURN_FALSE;
590    }
591
592     /* return the allocated string, not a duplicate */
593    RETVAL_STRINGL(sub_str, sub_str_len);
594    //????
595    efree(sub_str);
596
597}
598/* }}} */
599
600/* {{{  strstr_common_handler */
601static void strstr_common_handler(INTERNAL_FUNCTION_PARAMETERS, int f_ignore_case)
602{
603    char *haystack, *needle;
604    const char *found;
605    size_t haystack_len, needle_len;
606    int32_t ret_pos, uchar_pos;
607    zend_bool part = 0;
608
609    if (zend_parse_parameters(ZEND_NUM_ARGS(), "ss|b", &haystack, &haystack_len, &needle, &needle_len, &part) == FAILURE) {
610
611        intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
612             "grapheme_strstr: unable to parse input param", 0 );
613
614        RETURN_FALSE;
615    }
616
617    if (needle_len == 0) {
618
619        intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_strpos: Empty delimiter", 1 );
620
621        RETURN_FALSE;
622    }
623
624
625    if ( !f_ignore_case ) {
626
627        /* ASCII optimization: quick check to see if the string might be there
628         * I realize that 'offset' is 'grapheme count offset' but will work in spite of that
629        */
630        found = php_memnstr(haystack, needle, needle_len, haystack + haystack_len);
631
632        /* if it isn't there the we are done */
633        if ( !found ) {
634            RETURN_FALSE;
635        }
636
637        /* if it is there, and if the haystack is ascii, we are all done */
638        if ( grapheme_ascii_check((unsigned char *)haystack, haystack_len) >= 0 ) {
639            size_t found_offset = found - haystack;
640
641            if (part) {
642                RETURN_STRINGL(haystack, found_offset);
643            } else {
644                RETURN_STRINGL(found, haystack_len - found_offset);
645            }
646        }
647
648    }
649
650    /* need to work in utf16 */
651    ret_pos = grapheme_strpos_utf16(haystack, haystack_len, needle, needle_len, 0, &uchar_pos, f_ignore_case, 0 /*last */ );
652
653    if ( ret_pos < 0 ) {
654        RETURN_FALSE;
655    }
656
657    /* uchar_pos is the 'nth' Unicode character position of the needle */
658
659    ret_pos = 0;
660    U8_FWD_N(haystack, ret_pos, haystack_len, uchar_pos);
661
662    if (part) {
663        RETURN_STRINGL(haystack, ret_pos);
664    } else {
665        RETURN_STRINGL(haystack + ret_pos, haystack_len - ret_pos);
666    }
667
668}
669/* }}} */
670
671/* {{{ proto string grapheme_strstr(string haystack, string needle[, bool part])
672   Finds first occurrence of a string within another */
673PHP_FUNCTION(grapheme_strstr)
674{
675    strstr_common_handler(INTERNAL_FUNCTION_PARAM_PASSTHRU, 0 /* f_ignore_case */);
676}
677/* }}} */
678
679/* {{{ proto string grapheme_stristr(string haystack, string needle[, bool part])
680   Finds first occurrence of a string within another */
681PHP_FUNCTION(grapheme_stristr)
682{
683    strstr_common_handler(INTERNAL_FUNCTION_PARAM_PASSTHRU, 1 /* f_ignore_case */);
684}
685/* }}} */
686
687/* {{{ grapheme_extract_charcount_iter - grapheme iterator for grapheme_extract MAXCHARS */
688static inline int32_t
689grapheme_extract_charcount_iter(UBreakIterator *bi, int32_t csize, unsigned char *pstr, int32_t str_len)
690{
691    int pos = 0, prev_pos = 0;
692    int ret_pos = 0, prev_ret_pos = 0;
693
694    while ( 1 ) {
695        pos = ubrk_next(bi);
696
697        if ( UBRK_DONE == pos ) {
698            break;
699        }
700
701        /* if we are beyond our limit, then the loop is done */
702        if ( pos > csize ) {
703            break;
704        }
705
706        /* update our pointer in the original UTF-8 buffer by as many characters
707           as ubrk_next iterated over */
708
709        prev_ret_pos = ret_pos;
710        U8_FWD_N(pstr, ret_pos, str_len, pos - prev_pos);
711
712        if ( prev_ret_pos == ret_pos ) {
713            /* something wrong - malformed utf8? */
714            break;
715        }
716
717        prev_pos = pos;
718    }
719
720    return ret_pos;
721}
722/* }}} */
723
724/* {{{ grapheme_extract_bytecount_iter - grapheme iterator for grapheme_extract MAXBYTES */
725static inline int32_t
726grapheme_extract_bytecount_iter(UBreakIterator *bi, int32_t bsize, unsigned char *pstr, int32_t str_len)
727{
728    int pos = 0, prev_pos = 0;
729    int ret_pos = 0, prev_ret_pos = 0;
730
731    while ( 1 ) {
732        pos = ubrk_next(bi);
733
734        if ( UBRK_DONE == pos ) {
735            break;
736        }
737
738        prev_ret_pos = ret_pos;
739        U8_FWD_N(pstr, ret_pos, str_len, pos - prev_pos);
740
741        if ( ret_pos > bsize ) {
742            ret_pos = prev_ret_pos;
743            break;
744        }
745
746        if ( prev_ret_pos == ret_pos ) {
747            /* something wrong - malformed utf8? */
748            break;
749        }
750
751        prev_pos = pos;
752    }
753
754    return ret_pos;
755}
756/* }}} */
757
758/* {{{ grapheme_extract_count_iter - grapheme iterator for grapheme_extract COUNT */
759static inline int32_t
760grapheme_extract_count_iter(UBreakIterator *bi, int32_t size, unsigned char *pstr, int32_t str_len)
761{
762    int pos = 0, next_pos = 0;
763    int ret_pos = 0;
764
765    while ( size ) {
766        next_pos = ubrk_next(bi);
767
768        if ( UBRK_DONE == next_pos ) {
769            break;
770        }
771        pos = next_pos;
772        size--;
773    }
774
775    /* pos is one past the last UChar - and represent the number of code units to
776        advance in the utf-8 buffer
777    */
778
779    U8_FWD_N(pstr, ret_pos, str_len, pos);
780
781    return ret_pos;
782}
783/* }}} */
784
785/* {{{ grapheme extract iter function pointer array */
786typedef int32_t (*grapheme_extract_iter)(UBreakIterator * /*bi*/, int32_t /*size*/, unsigned char * /*pstr*/, int32_t /*str_len*/);
787
788static grapheme_extract_iter grapheme_extract_iters[] = {
789    &grapheme_extract_count_iter,
790    &grapheme_extract_bytecount_iter,
791    &grapheme_extract_charcount_iter,
792};
793/* }}} */
794
795/* {{{ proto string grapheme_extract(string str, int size[, int extract_type[, int start[, int next]]])
796    Function to extract a sequence of default grapheme clusters */
797PHP_FUNCTION(grapheme_extract)
798{
799    char *str, *pstr;
800    UChar *ustr;
801    size_t str_len;
802    int32_t ustr_len;
803    zend_long size; /* maximum number of grapheme clusters, bytes, or characters (based on extract_type) to return */
804    zend_long lstart = 0; /* starting position in str in bytes */
805    int32_t start = 0;
806    zend_long extract_type = GRAPHEME_EXTRACT_TYPE_COUNT;
807    UErrorCode status;
808    unsigned char u_break_iterator_buffer[U_BRK_SAFECLONE_BUFFERSIZE];
809    UBreakIterator* bi = NULL;
810    int ret_pos;
811    zval *next = NULL; /* return offset of next part of the string */
812
813    if (zend_parse_parameters(ZEND_NUM_ARGS(), "sl|llz", &str, &str_len, &size, &extract_type, &lstart, &next) == FAILURE) {
814        intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
815             "grapheme_extract: unable to parse input param", 0 );
816        RETURN_FALSE;
817    }
818
819    if ( NULL != next ) {
820        if ( !Z_ISREF_P(next) ) {
821            intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
822                 "grapheme_extract: 'next' was not passed by reference", 0 );
823            RETURN_FALSE;
824        } else {
825            ZVAL_DEREF(next);
826            /* initialize next */
827            SEPARATE_ZVAL(next);
828            zval_dtor(next);
829            ZVAL_LONG(next, lstart);
830        }
831    }
832
833    if ( extract_type < GRAPHEME_EXTRACT_TYPE_MIN || extract_type > GRAPHEME_EXTRACT_TYPE_MAX ) {
834        intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
835             "grapheme_extract: unknown extract type param", 0 );
836        RETURN_FALSE;
837    }
838
839    if ( lstart > INT32_MAX || lstart < 0 || lstart >= str_len ) {
840        intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_extract: start not contained in string", 0 );
841        RETURN_FALSE;
842    }
843
844    if ( size > INT32_MAX || size < 0) {
845        intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_extract: size is invalid", 0 );
846        RETURN_FALSE;
847    }
848    if (size == 0) {
849        RETURN_EMPTY_STRING();
850    }
851
852    /* we checked that it will fit: */
853    start = (int32_t) lstart;
854
855    pstr = str + start;
856
857    /* just in case pstr points in the middle of a character, move forward to the start of the next char */
858    if ( !UTF8_IS_SINGLE(*pstr) && !U8_IS_LEAD(*pstr) ) {
859        char *str_end = str + str_len;
860
861        while ( !UTF8_IS_SINGLE(*pstr) && !U8_IS_LEAD(*pstr) ) {
862            pstr++;
863            if ( pstr >= str_end ) {
864                intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
865                                "grapheme_extract: invalid input string", 0 );
866
867                RETURN_FALSE;
868            }
869        }
870    }
871
872    str_len -= (pstr - str);
873
874    /* if the string is all ASCII up to size+1 - or str_len whichever is first - then we are done.
875        (size + 1 because the size-th character might be the beginning of a grapheme cluster)
876     */
877
878    if ( -1 != grapheme_ascii_check((unsigned char *)pstr, MIN(size + 1, str_len)) ) {
879        size_t nsize = MIN(size, str_len);
880        if ( NULL != next ) {
881            ZVAL_LONG(next, start+nsize);
882        }
883        RETURN_STRINGL(pstr, nsize);
884    }
885
886    /* convert the strings to UTF-16. */
887    ustr = NULL;
888    ustr_len = 0;
889    status = U_ZERO_ERROR;
890    intl_convert_utf8_to_utf16(&ustr, &ustr_len, pstr, str_len, &status );
891
892    if ( U_FAILURE( status ) ) {
893        /* Set global error code. */
894        intl_error_set_code( NULL, status );
895
896        /* Set error messages. */
897        intl_error_set_custom_msg( NULL, "Error converting input string to UTF-16", 0 );
898
899        if ( NULL != ustr )
900            efree( ustr );
901
902        RETURN_FALSE;
903    }
904
905    bi = NULL;
906    status = U_ZERO_ERROR;
907    bi = grapheme_get_break_iterator(u_break_iterator_buffer, &status );
908
909    ubrk_setText(bi, ustr, ustr_len, &status);
910
911    /* if the caller put us in the middle of a grapheme, we can't detect it in all cases since we
912        can't back up. So, we will not do anything. */
913
914    /* now we need to find the end of the chunk the user wants us to return */
915    /* it's ok to convert str_len to in32_t since if it were too big intl_convert_utf8_to_utf16 above would fail */
916    ret_pos = (*grapheme_extract_iters[extract_type])(bi, size, (unsigned char *)pstr, (int32_t)str_len);
917
918    if (ustr) {
919        efree(ustr);
920    }
921    ubrk_close(bi);
922
923    if ( NULL != next ) {
924        ZVAL_LONG(next, start+ret_pos);
925    }
926
927    RETURN_STRINGL(((char *)pstr), ret_pos);
928}
929
930/* }}} */
931
932/*
933 * Local variables:
934 * tab-width: 4
935 * c-basic-offset: 4
936 * End:
937 * vim600: fdm=marker
938 * vim: noet sw=4 ts=4
939 */
940
941