1/*
2   +----------------------------------------------------------------------+
3   | PHP Version 7														  |
4   +----------------------------------------------------------------------+
5   | This source file is subject to version 3.01 of the PHP license,	  |
6   | that is bundled with this package in the file LICENSE, and is		  |
7   | available through the world-wide-web at the following url:			  |
8   | http://www.php.net/license/3_01.txt								  |
9   | If you did not receive a copy of the PHP license and are unable to   |
10   | obtain it through the world-wide-web, please send a note to		  |
11   | license@php.net so we can mail you a copy immediately.				  |
12   +----------------------------------------------------------------------+
13   | Author: Ed Batutis <ed@batutis.com>								  |
14   +----------------------------------------------------------------------+
15 */
16
17/* {{{ includes */
18#ifdef HAVE_CONFIG_H
19#include "config.h"
20#endif
21
22#include <php.h>
23#include "grapheme.h"
24#include "grapheme_util.h"
25
26#include <unicode/utypes.h>
27#include <unicode/ucol.h>
28#include <unicode/ustring.h>
29#include <unicode/ubrk.h>
30
31#include "ext/standard/php_string.h"
32
33/* }}} */
34
35#define GRAPHEME_EXTRACT_TYPE_COUNT		0
36#define GRAPHEME_EXTRACT_TYPE_MAXBYTES	1
37#define GRAPHEME_EXTRACT_TYPE_MAXCHARS	2
38#define GRAPHEME_EXTRACT_TYPE_MIN	GRAPHEME_EXTRACT_TYPE_COUNT
39#define GRAPHEME_EXTRACT_TYPE_MAX	GRAPHEME_EXTRACT_TYPE_MAXCHARS
40
41
/* {{{ grapheme_register_constants
 * Register API constants
 *
 * Exposes the three GRAPHEME_EXTR_* values that grapheme_extract() accepts
 * as its extract_type argument. Each one is registered as a long constant
 * with the CONST_CS | CONST_PERSISTENT flags.
 */
void grapheme_register_constants( INIT_FUNC_ARGS )
{
	/* size is a number of grapheme clusters */
	REGISTER_LONG_CONSTANT("GRAPHEME_EXTR_COUNT", GRAPHEME_EXTRACT_TYPE_COUNT, CONST_CS | CONST_PERSISTENT);
	/* size is a maximum number of bytes */
	REGISTER_LONG_CONSTANT("GRAPHEME_EXTR_MAXBYTES", GRAPHEME_EXTRACT_TYPE_MAXBYTES, CONST_CS | CONST_PERSISTENT);
	/* size is a maximum number of characters */
	REGISTER_LONG_CONSTANT("GRAPHEME_EXTR_MAXCHARS", GRAPHEME_EXTRACT_TYPE_MAXCHARS, CONST_CS | CONST_PERSISTENT);
}
/* }}} */
52
53/* {{{ proto size_t grapheme_strlen(string str)
54   Get number of graphemes in a string */
55PHP_FUNCTION(grapheme_strlen)
56{
57	char* string;
58	size_t string_len;
59	UChar* ustring = NULL;
60	int ustring_len = 0;
61	zend_long ret_len;
62	UErrorCode status;
63
64	if (zend_parse_parameters(ZEND_NUM_ARGS(), "s", &string, &string_len) == FAILURE) {
65		intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
66			 "grapheme_strlen: unable to parse input param", 0 );
67		RETURN_FALSE;
68	}
69
70	ret_len = grapheme_ascii_check((unsigned char *)string, string_len);
71
72	if ( ret_len >= 0 )
73		RETURN_LONG(string_len);
74
75	/* convert the string to UTF-16. */
76	status = U_ZERO_ERROR;
77	intl_convert_utf8_to_utf16(&ustring, &ustring_len, string, string_len, &status );
78
79	if ( U_FAILURE( status ) ) {
80		/* Set global error code. */
81		intl_error_set_code( NULL, status );
82
83		/* Set error messages. */
84		intl_error_set_custom_msg( NULL, "Error converting input string to UTF-16", 0 );
85		if (ustring) {
86			efree( ustring );
87		}
88		RETURN_NULL();
89	}
90
91	ret_len = grapheme_split_string(ustring, ustring_len, NULL, 0 );
92
93	if (ustring) {
94		efree( ustring );
95	}
96
97	if (ret_len >= 0) {
98		RETVAL_LONG(ret_len);
99	} else {
100		RETVAL_FALSE;
101	}
102}
103/* }}} */
104
105/* {{{ proto int grapheme_strpos(string haystack, string needle [, int offset ])
106   Find position of first occurrence of a string within another */
107PHP_FUNCTION(grapheme_strpos)
108{
109	char *haystack, *needle;
110	size_t haystack_len, needle_len;
111	const char *found;
112	zend_long loffset = 0;
113	int32_t offset = 0, noffset = 0;
114	zend_long ret_pos;
115
116	if (zend_parse_parameters(ZEND_NUM_ARGS(), "ss|l", &haystack, &haystack_len, &needle, &needle_len, &loffset) == FAILURE) {
117		intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
118			 "grapheme_strpos: unable to parse input param", 0 );
119		RETURN_FALSE;
120	}
121
122	if ( OUTSIDE_STRING(loffset, haystack_len) ) {
123		intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_strpos: Offset not contained in string", 1 );
124		RETURN_FALSE;
125	}
126
127	/* we checked that it will fit: */
128	offset = (int32_t) loffset;
129	noffset = offset >= 0 ? offset : haystack_len + offset;
130
131	/* the offset is 'grapheme count offset' so it still might be invalid - we'll check it later */
132
133	if (needle_len == 0) {
134		intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_strpos: Empty delimiter", 1 );
135		RETURN_FALSE;
136	}
137
138	if (offset >= 0) {
139		/* quick check to see if the string might be there
140		 * I realize that 'offset' is 'grapheme count offset' but will work in spite of that
141		*/
142		found = php_memnstr(haystack + noffset, needle, needle_len, haystack + haystack_len);
143
144		/* if it isn't there the we are done */
145		if (!found) {
146			RETURN_FALSE;
147		}
148
149		/* if it is there, and if the haystack is ascii, we are all done */
150		if ( grapheme_ascii_check((unsigned char *)haystack, haystack_len) >= 0 ) {
151			RETURN_LONG(found - haystack);
152		}
153	}
154
155	/* do utf16 part of the strpos */
156	ret_pos = grapheme_strpos_utf16(haystack, haystack_len, needle, needle_len, offset, NULL, 0 /* fIgnoreCase */, 0 /* last */ );
157
158	if ( ret_pos >= 0 ) {
159		RETURN_LONG(ret_pos);
160	} else {
161		RETURN_FALSE;
162	}
163
164}
165/* }}} */
166
167/* {{{ proto int grapheme_stripos(string haystack, string needle [, int offset ])
168   Find position of first occurrence of a string within another, ignoring case differences */
169PHP_FUNCTION(grapheme_stripos)
170{
171	char *haystack, *needle, *haystack_dup, *needle_dup;
172	size_t haystack_len, needle_len;
173	const char *found;
174	zend_long loffset = 0;
175	int32_t offset = 0;
176	zend_long ret_pos;
177	int is_ascii;
178
179	if (zend_parse_parameters(ZEND_NUM_ARGS(), "ss|l", &haystack, &haystack_len, &needle, &needle_len, &loffset) == FAILURE) {
180		intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
181			 "grapheme_stripos: unable to parse input param", 0 );
182		RETURN_FALSE;
183	}
184
185	if ( OUTSIDE_STRING(loffset, haystack_len) ) {
186		intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_stripos: Offset not contained in string", 1 );
187		RETURN_FALSE;
188	}
189
190	/* we checked that it will fit: */
191	offset = (int32_t) loffset;
192
193	/* the offset is 'grapheme count offset' so it still might be invalid - we'll check it later */
194
195	if (needle_len == 0) {
196		intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_stripos: Empty delimiter", 1 );
197		RETURN_FALSE;
198	}
199
200	is_ascii = ( grapheme_ascii_check((unsigned char*)haystack, haystack_len) >= 0 );
201
202	if ( is_ascii ) {
203		int32_t noffset = offset >= 0 ? offset : haystack_len + offset;
204		needle_dup = estrndup(needle, needle_len);
205		php_strtolower(needle_dup, needle_len);
206		haystack_dup = estrndup(haystack, haystack_len);
207		php_strtolower(haystack_dup, haystack_len);
208
209		found = php_memnstr(haystack_dup + noffset, needle_dup, needle_len, haystack_dup + haystack_len);
210
211		efree(haystack_dup);
212		efree(needle_dup);
213
214		if (found) {
215			RETURN_LONG(found - haystack_dup);
216		}
217
218		/* if needle was ascii too, we are all done, otherwise we need to try using Unicode to see what we get */
219		if ( grapheme_ascii_check((unsigned char *)needle, needle_len) >= 0 ) {
220			RETURN_FALSE;
221		}
222	}
223
224	/* do utf16 part of the strpos */
225	ret_pos = grapheme_strpos_utf16(haystack, haystack_len, needle, needle_len, offset, NULL, 1 /* fIgnoreCase */, 0 /*last */ );
226
227	if ( ret_pos >= 0 ) {
228		RETURN_LONG(ret_pos);
229	} else {
230		RETURN_FALSE;
231	}
232
233}
234/* }}} */
235
236/* {{{ proto int grapheme_strrpos(string haystack, string needle [, int offset])
237   Find position of last occurrence of a string within another */
238PHP_FUNCTION(grapheme_strrpos)
239{
240	char *haystack, *needle;
241	size_t haystack_len, needle_len;
242	zend_long loffset = 0;
243	int32_t offset = 0;
244	zend_long ret_pos;
245	int is_ascii;
246
247	if (zend_parse_parameters(ZEND_NUM_ARGS(), "ss|l", &haystack, &haystack_len, &needle, &needle_len, &loffset) == FAILURE) {
248		intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
249			 "grapheme_strrpos: unable to parse input param", 0 );
250		RETURN_FALSE;
251	}
252
253	if ( OUTSIDE_STRING(loffset, haystack_len) ) {
254		intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_strpos: Offset not contained in string", 1 );
255		RETURN_FALSE;
256	}
257
258	/* we checked that it will fit: */
259	offset = (int32_t) loffset;
260
261	/* the offset is 'grapheme count offset' so it still might be invalid - we'll check it later */
262
263	if (needle_len == 0) {
264		intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_strpos: Empty delimiter", 1 );
265		RETURN_FALSE;
266	}
267
268	is_ascii = grapheme_ascii_check((unsigned char *)haystack, haystack_len) >= 0;
269
270	if ( is_ascii ) {
271
272		ret_pos = grapheme_strrpos_ascii(haystack, haystack_len, needle, needle_len, offset);
273
274		if ( ret_pos >= 0 ) {
275			RETURN_LONG(ret_pos);
276		}
277
278		/* if the needle was ascii too, we are done */
279
280		if (  grapheme_ascii_check((unsigned char *)needle, needle_len) >= 0 ) {
281			RETURN_FALSE;
282		}
283
284		/* else we need to continue via utf16 */
285	}
286
287	ret_pos = grapheme_strpos_utf16(haystack, haystack_len, needle, needle_len, offset, NULL, 0 /* f_ignore_case */, 1/* last */);
288
289	if ( ret_pos >= 0 ) {
290		RETURN_LONG(ret_pos);
291	} else {
292		RETURN_FALSE;
293	}
294
295
296}
297/* }}} */
298
299/* {{{ proto int grapheme_strripos(string haystack, string needle [, int offset])
300   Find position of last occurrence of a string within another, ignoring case */
301PHP_FUNCTION(grapheme_strripos)
302{
303	char *haystack, *needle;
304	size_t haystack_len, needle_len;
305	zend_long loffset = 0;
306	int32_t offset = 0;
307	zend_long ret_pos;
308	int is_ascii;
309
310	if (zend_parse_parameters(ZEND_NUM_ARGS(), "ss|l", &haystack, &haystack_len, &needle, &needle_len, &loffset) == FAILURE) {
311		intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
312			 "grapheme_strrpos: unable to parse input param", 0 );
313		RETURN_FALSE;
314	}
315
316	if ( OUTSIDE_STRING(loffset, haystack_len) ) {
317		intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_strpos: Offset not contained in string", 1 );
318		RETURN_FALSE;
319	}
320
321	/* we checked that it will fit: */
322	offset = (int32_t) loffset;
323
324	/* the offset is 'grapheme count offset' so it still might be invalid - we'll check it later */
325
326	if (needle_len == 0) {
327		intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_strpos: Empty delimiter", 1 );
328		RETURN_FALSE;
329	}
330
331	is_ascii = grapheme_ascii_check((unsigned char *)haystack, haystack_len) >= 0;
332
333	if ( is_ascii ) {
334		char *needle_dup, *haystack_dup;
335
336		needle_dup = estrndup(needle, needle_len);
337		php_strtolower(needle_dup, needle_len);
338		haystack_dup = estrndup(haystack, haystack_len);
339		php_strtolower(haystack_dup, haystack_len);
340
341		ret_pos = grapheme_strrpos_ascii(haystack_dup, haystack_len, needle_dup, needle_len, offset);
342
343		efree(haystack_dup);
344		efree(needle_dup);
345
346		if ( ret_pos >= 0 ) {
347			RETURN_LONG(ret_pos);
348		}
349
350		/* if the needle was ascii too, we are done */
351
352		if (  grapheme_ascii_check((unsigned char *)needle, needle_len) >= 0 ) {
353			RETURN_FALSE;
354		}
355
356		/* else we need to continue via utf16 */
357	}
358
359	ret_pos = grapheme_strpos_utf16(haystack, haystack_len, needle, needle_len, offset, NULL,  1 /* f_ignore_case */, 1 /*last */);
360
361	if ( ret_pos >= 0 ) {
362		RETURN_LONG(ret_pos);
363	} else {
364		RETURN_FALSE;
365	}
366
367
368}
369/* }}} */
370
371/* {{{ proto string grapheme_substr(string str, int start [, int length])
372   Returns part of a string */
373PHP_FUNCTION(grapheme_substr)
374{
375	char *str;
376	zend_string *u8_sub_str;
377	UChar *ustr;
378	size_t str_len;
379	int32_t ustr_len;
380	zend_long lstart = 0, length = 0;
381	int32_t start = 0;
382	int iter_val;
383	UErrorCode status;
384	unsigned char u_break_iterator_buffer[U_BRK_SAFECLONE_BUFFERSIZE];
385	UBreakIterator* bi = NULL;
386	int sub_str_start_pos, sub_str_end_pos;
387	int32_t (*iter_func)(UBreakIterator *);
388	zend_bool no_length = 1;
389
390	if (zend_parse_parameters(ZEND_NUM_ARGS(), "sl|l!", &str, &str_len, &lstart, &length, &no_length) == FAILURE) {
391		intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
392			 "grapheme_substr: unable to parse input param", 0 );
393		RETURN_FALSE;
394	}
395
396	if ( OUTSIDE_STRING(lstart, str_len)) {
397		intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_substr: start not contained in string", 1 );
398		RETURN_FALSE;
399	}
400
401	/* we checked that it will fit: */
402	start = (int32_t) lstart;
403
404	if(no_length) {
405		length = str_len;
406	}
407
408	if(length < INT32_MIN) {
409		length = INT32_MIN;
410	} else if(length > INT32_MAX) {
411		length = INT32_MAX;
412	}
413
414	/* the offset is 'grapheme count offset' so it still might be invalid - we'll check it later */
415
416	if ( grapheme_ascii_check((unsigned char *)str, str_len) >= 0 ) {
417		int32_t asub_str_len;
418		char *sub_str;
419		grapheme_substr_ascii(str, str_len, start, (int32_t)length, &sub_str, &asub_str_len);
420
421		if ( NULL == sub_str ) {
422			intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_substr: invalid parameters", 1 );
423			RETURN_FALSE;
424		}
425
426		RETURN_STRINGL(sub_str, asub_str_len);
427	}
428
429	ustr = NULL;
430	ustr_len = 0;
431	status = U_ZERO_ERROR;
432	intl_convert_utf8_to_utf16(&ustr, &ustr_len, str, str_len, &status);
433
434	if ( U_FAILURE( status ) ) {
435		/* Set global error code. */
436		intl_error_set_code( NULL, status );
437
438		/* Set error messages. */
439		intl_error_set_custom_msg( NULL, "Error converting input string to UTF-16", 0 );
440		if (ustr) {
441			efree( ustr );
442		}
443		RETURN_FALSE;
444	}
445
446	bi = grapheme_get_break_iterator((void*)u_break_iterator_buffer, &status );
447
448	if( U_FAILURE(status) ) {
449		RETURN_FALSE;
450	}
451
452	ubrk_setText(bi, ustr, ustr_len,	&status);
453
454	if ( start < 0 ) {
455		iter_func = ubrk_previous;
456		ubrk_last(bi);
457		iter_val = 1;
458	}
459	else {
460		iter_func = ubrk_next;
461		iter_val = -1;
462	}
463
464	sub_str_start_pos = 0;
465
466	while ( start ) {
467		sub_str_start_pos = iter_func(bi);
468
469		if ( UBRK_DONE == sub_str_start_pos ) {
470			break;
471		}
472
473		start += iter_val;
474	}
475
476	if ( 0 != start || sub_str_start_pos >= ustr_len ) {
477
478		intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_substr: start not contained in string", 1 );
479
480		if (ustr) {
481			efree(ustr);
482		}
483		ubrk_close(bi);
484		RETURN_FALSE;
485	}
486
487	/* OK to convert here since if str_len were big, convert above would fail */
488	if (length >= (int32_t)str_len) {
489
490		/* no length supplied or length is too big, return the rest of the string */
491
492		status = U_ZERO_ERROR;
493		u8_sub_str = intl_convert_utf16_to_utf8(ustr + sub_str_start_pos, ustr_len - sub_str_start_pos, &status);
494
495		if (ustr) {
496			efree( ustr );
497		}
498		ubrk_close( bi );
499
500		if ( !u8_sub_str ) {
501			/* Set global error code. */
502			intl_error_set_code( NULL, status );
503
504			/* Set error messages. */
505			intl_error_set_custom_msg( NULL, "Error converting output string to UTF-8", 0 );
506
507			RETURN_FALSE;
508		}
509
510		/* return the allocated string, not a duplicate */
511		RETVAL_NEW_STR(u8_sub_str);
512		return;
513	}
514
515	if(length == 0) {
516		/* empty length - we've validated start, we can return "" now */
517		if (ustr) {
518			efree(ustr);
519		}
520		ubrk_close(bi);
521		RETURN_EMPTY_STRING();
522	}
523
524	/* find the end point of the string to return */
525
526	if ( length < 0 ) {
527		iter_func = ubrk_previous;
528		ubrk_last(bi);
529		iter_val = 1;
530	}
531	else {
532		iter_func = ubrk_next;
533		iter_val = -1;
534	}
535
536	sub_str_end_pos = 0;
537
538	while ( length ) {
539		sub_str_end_pos = iter_func(bi);
540
541		if ( UBRK_DONE == sub_str_end_pos ) {
542			break;
543		}
544
545		length += iter_val;
546	}
547
548	ubrk_close(bi);
549
550	if ( UBRK_DONE == sub_str_end_pos) {
551		if(length < 0) {
552			intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_substr: length not contained in string", 1 );
553
554			efree(ustr);
555			RETURN_FALSE;
556		} else {
557			sub_str_end_pos = ustr_len;
558		}
559	}
560
561	if(sub_str_start_pos > sub_str_end_pos) {
562		intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_substr: length is beyond start", 1 );
563
564		efree(ustr);
565		RETURN_FALSE;
566	}
567
568	status = U_ZERO_ERROR;
569	u8_sub_str = intl_convert_utf16_to_utf8(ustr + sub_str_start_pos, ( sub_str_end_pos - sub_str_start_pos ), &status);
570
571	efree( ustr );
572
573	if ( !u8_sub_str ) {
574		/* Set global error code. */
575		intl_error_set_code( NULL, status );
576
577		/* Set error messages. */
578		intl_error_set_custom_msg( NULL, "Error converting output string to UTF-8", 0 );
579
580		RETURN_FALSE;
581	}
582
583	 /* return the allocated string, not a duplicate */
584	RETVAL_NEW_STR(u8_sub_str);
585}
586/* }}} */
587
588/* {{{	strstr_common_handler */
589static void strstr_common_handler(INTERNAL_FUNCTION_PARAMETERS, int f_ignore_case)
590{
591	char *haystack, *needle;
592	const char *found;
593	size_t haystack_len, needle_len;
594	int32_t ret_pos, uchar_pos;
595	zend_bool part = 0;
596
597	if (zend_parse_parameters(ZEND_NUM_ARGS(), "ss|b", &haystack, &haystack_len, &needle, &needle_len, &part) == FAILURE) {
598
599		intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
600			 "grapheme_strstr: unable to parse input param", 0 );
601
602		RETURN_FALSE;
603	}
604
605	if (needle_len == 0) {
606
607		intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_strpos: Empty delimiter", 1 );
608
609		RETURN_FALSE;
610	}
611
612
613	if ( !f_ignore_case ) {
614
615		/* ASCII optimization: quick check to see if the string might be there
616		 * I realize that 'offset' is 'grapheme count offset' but will work in spite of that
617		*/
618		found = php_memnstr(haystack, needle, needle_len, haystack + haystack_len);
619
620		/* if it isn't there the we are done */
621		if ( !found ) {
622			RETURN_FALSE;
623		}
624
625		/* if it is there, and if the haystack is ascii, we are all done */
626		if ( grapheme_ascii_check((unsigned char *)haystack, haystack_len) >= 0 ) {
627			size_t found_offset = found - haystack;
628
629			if (part) {
630				RETURN_STRINGL(haystack, found_offset);
631			} else {
632				RETURN_STRINGL(found, haystack_len - found_offset);
633			}
634		}
635
636	}
637
638	/* need to work in utf16 */
639	ret_pos = grapheme_strpos_utf16(haystack, haystack_len, needle, needle_len, 0, &uchar_pos, f_ignore_case, 0 /*last */ );
640
641	if ( ret_pos < 0 ) {
642		RETURN_FALSE;
643	}
644
645	/* uchar_pos is the 'nth' Unicode character position of the needle */
646
647	ret_pos = 0;
648	U8_FWD_N(haystack, ret_pos, haystack_len, uchar_pos);
649
650	if (part) {
651		RETURN_STRINGL(haystack, ret_pos);
652	} else {
653		RETURN_STRINGL(haystack + ret_pos, haystack_len - ret_pos);
654	}
655
656}
657/* }}} */
658
/* {{{ proto string grapheme_strstr(string haystack, string needle[, bool part])
   Finds first occurrence of a string within another.
   Delegates to strstr_common_handler() with case-sensitive matching. */
PHP_FUNCTION(grapheme_strstr)
{
	strstr_common_handler(INTERNAL_FUNCTION_PARAM_PASSTHRU, 0 /* f_ignore_case */);
}
/* }}} */
666
/* {{{ proto string grapheme_stristr(string haystack, string needle[, bool part])
   Finds first occurrence of a string within another, ignoring case.
   Delegates to strstr_common_handler() with case-insensitive matching. */
PHP_FUNCTION(grapheme_stristr)
{
	strstr_common_handler(INTERNAL_FUNCTION_PARAM_PASSTHRU, 1 /* f_ignore_case */);
}
/* }}} */
674
675/* {{{ grapheme_extract_charcount_iter - grapheme iterator for grapheme_extract MAXCHARS */
676static inline int32_t
677grapheme_extract_charcount_iter(UBreakIterator *bi, int32_t csize, unsigned char *pstr, int32_t str_len)
678{
679	int pos = 0, prev_pos = 0;
680	int ret_pos = 0, prev_ret_pos = 0;
681
682	while ( 1 ) {
683		pos = ubrk_next(bi);
684
685		if ( UBRK_DONE == pos ) {
686			break;
687		}
688
689		/* if we are beyond our limit, then the loop is done */
690		if ( pos > csize ) {
691			break;
692		}
693
694		/* update our pointer in the original UTF-8 buffer by as many characters
695		   as ubrk_next iterated over */
696
697		prev_ret_pos = ret_pos;
698		U8_FWD_N(pstr, ret_pos, str_len, pos - prev_pos);
699
700		if ( prev_ret_pos == ret_pos ) {
701			/* something wrong - malformed utf8? */
702			break;
703		}
704
705		prev_pos = pos;
706	}
707
708	return ret_pos;
709}
710/* }}} */
711
712/* {{{ grapheme_extract_bytecount_iter - grapheme iterator for grapheme_extract MAXBYTES */
713static inline int32_t
714grapheme_extract_bytecount_iter(UBreakIterator *bi, int32_t bsize, unsigned char *pstr, int32_t str_len)
715{
716	int pos = 0, prev_pos = 0;
717	int ret_pos = 0, prev_ret_pos = 0;
718
719	while ( 1 ) {
720		pos = ubrk_next(bi);
721
722		if ( UBRK_DONE == pos ) {
723			break;
724		}
725
726		prev_ret_pos = ret_pos;
727		U8_FWD_N(pstr, ret_pos, str_len, pos - prev_pos);
728
729		if ( ret_pos > bsize ) {
730			ret_pos = prev_ret_pos;
731			break;
732		}
733
734		if ( prev_ret_pos == ret_pos ) {
735			/* something wrong - malformed utf8? */
736			break;
737		}
738
739		prev_pos = pos;
740	}
741
742	return ret_pos;
743}
744/* }}} */
745
746/* {{{ grapheme_extract_count_iter - grapheme iterator for grapheme_extract COUNT */
747static inline int32_t
748grapheme_extract_count_iter(UBreakIterator *bi, int32_t size, unsigned char *pstr, int32_t str_len)
749{
750	int pos = 0, next_pos = 0;
751	int ret_pos = 0;
752
753	while ( size ) {
754		next_pos = ubrk_next(bi);
755
756		if ( UBRK_DONE == next_pos ) {
757			break;
758		}
759		pos = next_pos;
760		size--;
761	}
762
763	/* pos is one past the last UChar - and represent the number of code units to
764		advance in the utf-8 buffer
765	*/
766
767	U8_FWD_N(pstr, ret_pos, str_len, pos);
768
769	return ret_pos;
770}
771/* }}} */
772
/* {{{ grapheme extract iter function pointer array
 * grapheme_extract() indexes this table directly with its extract_type
 * argument, so the entry order must follow the GRAPHEME_EXTRACT_TYPE_*
 * values: COUNT (0), MAXBYTES (1), MAXCHARS (2).
 */
typedef int32_t (*grapheme_extract_iter)(UBreakIterator * /*bi*/, int32_t /*size*/, unsigned char * /*pstr*/, int32_t /*str_len*/);

static grapheme_extract_iter grapheme_extract_iters[] = {
	&grapheme_extract_count_iter,
	&grapheme_extract_bytecount_iter,
	&grapheme_extract_charcount_iter,
};
/* }}} */
782
783/* {{{ proto string grapheme_extract(string str, int size[, int extract_type[, int start[, int next]]])
784	Function to extract a sequence of default grapheme clusters */
785PHP_FUNCTION(grapheme_extract)
786{
787	char *str, *pstr;
788	UChar *ustr;
789	size_t str_len;
790	int32_t ustr_len;
791	zend_long size; /* maximum number of grapheme clusters, bytes, or characters (based on extract_type) to return */
792	zend_long lstart = 0; /* starting position in str in bytes */
793	int32_t start = 0;
794	zend_long extract_type = GRAPHEME_EXTRACT_TYPE_COUNT;
795	UErrorCode status;
796	unsigned char u_break_iterator_buffer[U_BRK_SAFECLONE_BUFFERSIZE];
797	UBreakIterator* bi = NULL;
798	int ret_pos;
799	zval *next = NULL; /* return offset of next part of the string */
800
801	if (zend_parse_parameters(ZEND_NUM_ARGS(), "sl|llz", &str, &str_len, &size, &extract_type, &lstart, &next) == FAILURE) {
802		intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
803			 "grapheme_extract: unable to parse input param", 0 );
804		RETURN_FALSE;
805	}
806
807	if (lstart < 0) {
808		lstart += str_len;
809	}
810
811	if ( NULL != next ) {
812		if ( !Z_ISREF_P(next) ) {
813			intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
814				 "grapheme_extract: 'next' was not passed by reference", 0 );
815			RETURN_FALSE;
816		} else {
817			ZVAL_DEREF(next);
818			/* initialize next */
819			SEPARATE_ZVAL_NOREF(next);
820			zval_dtor(next);
821            ZVAL_LONG(next, lstart);
822		}
823	}
824
825	if ( extract_type < GRAPHEME_EXTRACT_TYPE_MIN || extract_type > GRAPHEME_EXTRACT_TYPE_MAX ) {
826		intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
827			 "grapheme_extract: unknown extract type param", 0 );
828		RETURN_FALSE;
829	}
830
831	if ( lstart > INT32_MAX || lstart < 0 || lstart >= str_len ) {
832		intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_extract: start not contained in string", 0 );
833		RETURN_FALSE;
834	}
835
836	if ( size > INT32_MAX || size < 0) {
837		intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_extract: size is invalid", 0 );
838		RETURN_FALSE;
839	}
840	if (size == 0) {
841		RETURN_EMPTY_STRING();
842	}
843
844	/* we checked that it will fit: */
845	start = (int32_t) lstart;
846
847	pstr = str + start;
848
849	/* just in case pstr points in the middle of a character, move forward to the start of the next char */
850	if ( !UTF8_IS_SINGLE(*pstr) && !U8_IS_LEAD(*pstr) ) {
851		char *str_end = str + str_len;
852
853		while ( !UTF8_IS_SINGLE(*pstr) && !U8_IS_LEAD(*pstr) ) {
854			pstr++;
855			if ( pstr >= str_end ) {
856				intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
857								"grapheme_extract: invalid input string", 0 );
858
859				RETURN_FALSE;
860			}
861		}
862	}
863
864	str_len -= (pstr - str);
865
866	/* if the string is all ASCII up to size+1 - or str_len whichever is first - then we are done.
867		(size + 1 because the size-th character might be the beginning of a grapheme cluster)
868	 */
869
870	if ( -1 != grapheme_ascii_check((unsigned char *)pstr, MIN(size + 1, str_len)) ) {
871        size_t nsize = MIN(size, str_len);
872		if ( NULL != next ) {
873			ZVAL_LONG(next, start+nsize);
874		}
875		RETURN_STRINGL(pstr, nsize);
876	}
877
878	/* convert the strings to UTF-16. */
879	ustr = NULL;
880	ustr_len = 0;
881	status = U_ZERO_ERROR;
882	intl_convert_utf8_to_utf16(&ustr, &ustr_len, pstr, str_len, &status );
883
884	if ( U_FAILURE( status ) ) {
885		/* Set global error code. */
886		intl_error_set_code( NULL, status );
887
888		/* Set error messages. */
889		intl_error_set_custom_msg( NULL, "Error converting input string to UTF-16", 0 );
890
891		if ( NULL != ustr )
892			efree( ustr );
893
894		RETURN_FALSE;
895	}
896
897	bi = NULL;
898	status = U_ZERO_ERROR;
899	bi = grapheme_get_break_iterator(u_break_iterator_buffer, &status );
900
901	ubrk_setText(bi, ustr, ustr_len, &status);
902
903	/* if the caller put us in the middle of a grapheme, we can't detect it in all cases since we
904		can't back up. So, we will not do anything. */
905
906	/* now we need to find the end of the chunk the user wants us to return */
907	/* it's ok to convert str_len to in32_t since if it were too big intl_convert_utf8_to_utf16 above would fail */
908	ret_pos = (*grapheme_extract_iters[extract_type])(bi, size, (unsigned char *)pstr, (int32_t)str_len);
909
910	if (ustr) {
911		efree(ustr);
912	}
913	ubrk_close(bi);
914
915	if ( NULL != next ) {
916		ZVAL_LONG(next, start+ret_pos);
917	}
918
919	RETURN_STRINGL(((char *)pstr), ret_pos);
920}
921
922/* }}} */
923
924/*
925 * Local variables:
926 * tab-width: 4
927 * c-basic-offset: 4
928 * End:
929 * vim600: fdm=marker
930 * vim: noet sw=4 ts=4
931 */
932
933