1/*
2   +----------------------------------------------------------------------+
3   | PHP Version 5                                                        |
4   +----------------------------------------------------------------------+
5   | Copyright (c) 1997-2013 The PHP Group                                |
6   +----------------------------------------------------------------------+
7   | This source file is subject to version 3.01 of the PHP license,      |
8   | that is bundled with this package in the file LICENSE, and is        |
9   | available through the world-wide-web at the following url:           |
10   | http://www.php.net/license/3_01.txt                                  |
11   | If you did not receive a copy of the PHP license and are unable to   |
12   | obtain it through the world-wide-web, please send a note to          |
13   | license@php.net so we can mail you a copy immediately.               |
14   +----------------------------------------------------------------------+
15   | Authors: Rasmus Lerdorf <rasmus@php.net>                             |
   |          Jaakko Hyvätti <jaakko.hyvatti@iki.fi>                      |
17   |          Wez Furlong <wez@thebrainroom.com>                          |
18   +----------------------------------------------------------------------+
19*/
20
21/* $Id$ */
22
23/*
24 * HTML entity resources:
25 *
26 * http://msdn.microsoft.com/workshop/author/dhtml/reference/charsets/charset2.asp
27 * http://msdn.microsoft.com/workshop/author/dhtml/reference/charsets/charset3.asp
28 * http://www.unicode.org/Public/MAPPINGS/OBSOLETE/UNI2SGML.TXT
29 *
30 * http://www.w3.org/TR/2002/REC-xhtml1-20020801/dtds.html#h-A2
31 *
32 */
33
34#include "php.h"
35#if PHP_WIN32
36#include "config.w32.h"
37#else
38#include <php_config.h>
39#endif
40#include "html.h"
41#include "php_string.h"
42#include "SAPI.h"
43#if HAVE_LOCALE_H
44#include <locale.h>
45#endif
46#if HAVE_LANGINFO_H
47#include <langinfo.h>
48#endif
49
50#if HAVE_MBSTRING
51# include "ext/mbstring/mbstring.h"
52ZEND_EXTERN_MODULE_GLOBALS(mbstring)
53#endif
54
/* Identifiers for the character sets that have entity tables in this file.
 * cs_terminator comes first (value 0) and is used as the sentinel row of
 * entity_map[]. */
enum entity_charset {
    cs_terminator,
    cs_8859_1,
    cs_cp1252,
    cs_8859_15,
    cs_utf_8,
    cs_big5,
    cs_gb2312,
    cs_big5hkscs,
    cs_sjis,
    cs_eucjp,
    cs_koi8r,
    cs_cp1251,
    cs_8859_5,
    cs_cp866,
    cs_macroman
};

/* Element type of the entity tables: a read-only pointer to a read-only
 * entity name (written without the surrounding '&' and ';'), or NULL for a
 * slot that has no entity. */
typedef const char *const entity_table_t;
61
62/* codepage 1252 is a Windows extension to iso-8859-1. */
63static entity_table_t ent_cp_1252[] = {
64    "euro", NULL, "sbquo", "fnof", "bdquo", "hellip", "dagger",
65    "Dagger", "circ", "permil", "Scaron", "lsaquo", "OElig",
66    NULL, NULL, NULL, NULL, "lsquo", "rsquo", "ldquo", "rdquo",
67    "bull", "ndash", "mdash", "tilde", "trade", "scaron", "rsaquo",
68    "oelig", NULL, NULL, "Yuml"
69};
70
71static entity_table_t ent_iso_8859_1[] = {
72    "nbsp", "iexcl", "cent", "pound", "curren", "yen", "brvbar",
73    "sect", "uml", "copy", "ordf", "laquo", "not", "shy", "reg",
74    "macr", "deg", "plusmn", "sup2", "sup3", "acute", "micro",
75    "para", "middot", "cedil", "sup1", "ordm", "raquo", "frac14",
76    "frac12", "frac34", "iquest", "Agrave", "Aacute", "Acirc",
77    "Atilde", "Auml", "Aring", "AElig", "Ccedil", "Egrave",
78    "Eacute", "Ecirc", "Euml", "Igrave", "Iacute", "Icirc",
79    "Iuml", "ETH", "Ntilde", "Ograve", "Oacute", "Ocirc", "Otilde",
80    "Ouml", "times", "Oslash", "Ugrave", "Uacute", "Ucirc", "Uuml",
81    "Yacute", "THORN", "szlig", "agrave", "aacute", "acirc",
82    "atilde", "auml", "aring", "aelig", "ccedil", "egrave",
83    "eacute", "ecirc", "euml", "igrave", "iacute", "icirc",
84    "iuml", "eth", "ntilde", "ograve", "oacute", "ocirc", "otilde",
85    "ouml", "divide", "oslash", "ugrave", "uacute", "ucirc",
86    "uuml", "yacute", "thorn", "yuml"
87};
88
89static entity_table_t ent_iso_8859_15[] = {
90    "nbsp", "iexcl", "cent", "pound", "euro", "yen", "Scaron",
91    "sect", "scaron", "copy", "ordf", "laquo", "not", "shy", "reg",
92    "macr", "deg", "plusmn", "sup2", "sup3", NULL, /* Zcaron */
93    "micro", "para", "middot", NULL, /* zcaron */ "sup1", "ordm",
94    "raquo", "OElig", "oelig", "Yuml", "iquest", "Agrave", "Aacute",
95    "Acirc", "Atilde", "Auml", "Aring", "AElig", "Ccedil", "Egrave",
96    "Eacute", "Ecirc", "Euml", "Igrave", "Iacute", "Icirc",
97    "Iuml", "ETH", "Ntilde", "Ograve", "Oacute", "Ocirc", "Otilde",
98    "Ouml", "times", "Oslash", "Ugrave", "Uacute", "Ucirc", "Uuml",
99    "Yacute", "THORN", "szlig", "agrave", "aacute", "acirc",
100    "atilde", "auml", "aring", "aelig", "ccedil", "egrave",
101    "eacute", "ecirc", "euml", "igrave", "iacute", "icirc",
102    "iuml", "eth", "ntilde", "ograve", "oacute", "ocirc", "otilde",
103    "ouml", "divide", "oslash", "ugrave", "uacute", "ucirc",
104    "uuml", "yacute", "thorn", "yuml"
105};
106
107static entity_table_t ent_uni_338_402[] = {
108    /* 338 (0x0152) */
109    "OElig", "oelig", NULL, NULL, NULL, NULL,
110    NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
111    /* 352 (0x0160) */
112    "Scaron", "scaron", NULL, NULL, NULL, NULL, NULL, NULL,
113    NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
114    NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
115    /* 376 (0x0178) */
116    "Yuml", NULL, NULL, NULL, NULL, NULL, NULL, NULL,
117    NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
118    NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
119    /* 400 (0x0190) */
120    NULL, NULL, "fnof"
121};
122
123static entity_table_t ent_uni_spacing[] = {
124    /* 710 */
125    "circ",
126    /* 711 - 730 */
127    NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
128    NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
129    /* 731 - 732 */
130    NULL, "tilde"
131};
132
133static entity_table_t ent_uni_greek[] = {
134    /* 913 */
135    "Alpha", "Beta", "Gamma", "Delta", "Epsilon", "Zeta", "Eta", "Theta",
136    "Iota", "Kappa", "Lambda", "Mu", "Nu", "Xi", "Omicron", "Pi", "Rho",
137    NULL, "Sigma", "Tau", "Upsilon", "Phi", "Chi", "Psi", "Omega",
138    /* 938 - 944 are not mapped */
139    NULL, NULL, NULL, NULL, NULL, NULL, NULL,
140    "alpha", "beta", "gamma", "delta", "epsilon", "zeta", "eta", "theta",
141    "iota", "kappa", "lambda", "mu", "nu", "xi", "omicron", "pi", "rho",
142    "sigmaf", "sigma", "tau", "upsilon", "phi", "chi", "psi", "omega",
143    /* 970 - 976 are not mapped */
144    NULL, NULL, NULL, NULL, NULL, NULL, NULL,
145    "thetasym", "upsih",
146    NULL, NULL, NULL,
147    "piv"
148};
149
150static entity_table_t ent_uni_punct[] = {
151    /* 8194 */
152    "ensp", "emsp", NULL, NULL, NULL, NULL, NULL,
153    "thinsp", NULL, NULL, "zwnj", "zwj", "lrm", "rlm",
154    NULL, NULL, NULL, "ndash", "mdash", NULL, NULL, NULL,
155    /* 8216 */
156    "lsquo", "rsquo", "sbquo", NULL, "ldquo", "rdquo", "bdquo", NULL,
157    "dagger", "Dagger", "bull", NULL, NULL, NULL, "hellip",
158    NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, "permil", NULL,
159    /* 8242 */
160    "prime", "Prime", NULL, NULL, NULL, NULL, NULL, "lsaquo", "rsaquo", NULL,
161    NULL, NULL, "oline", NULL, NULL, NULL, NULL, NULL,
162    "frasl"
163};
164
165static entity_table_t ent_uni_euro[] = {
166    "euro"
167};
168
169static entity_table_t ent_uni_8465_8501[] = {
170    /* 8465 */
171    "image", NULL, NULL, NULL, NULL, NULL, NULL,
172    /* 8472 */
173    "weierp", NULL, NULL, NULL,
174    /* 8476 */
175    "real", NULL, NULL, NULL, NULL, NULL,
176    /* 8482 */
177    "trade", NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
178    NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
179    /* 8501 */
180    "alefsym",
181};
182
183static entity_table_t ent_uni_8592_9002[] = {
184    /* 8592 (0x2190) */
185    "larr", "uarr", "rarr", "darr", "harr", NULL, NULL, NULL,
186    NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
187    /* 8608 (0x21a0) */
188    NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
189    NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
190    /* 8624 (0x21b0) */
191    NULL, NULL, NULL, NULL, NULL, "crarr", NULL, NULL,
192    NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
193    /* 8640 (0x21c0) */
194    NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
195    NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
196    /* 8656 (0x21d0) */
197    "lArr", "uArr", "rArr", "dArr", "hArr", NULL, NULL, NULL,
198    NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
199    /* 8672 (0x21e0) */
200    NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
201    NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
202    NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
203    NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
204    /* 8704 (0x2200) */
205    "forall", NULL, "part", "exist", NULL, "empty", NULL, "nabla",
206    "isin", "notin", NULL, "ni", NULL, NULL, NULL, "prod",
207    /* 8720 (0x2210) */
208    NULL, "sum", "minus", NULL, NULL, NULL, NULL, "lowast",
209    NULL, NULL, "radic", NULL, NULL, "prop", "infin", NULL,
210    /* 8736 (0x2220) */
211    "ang", NULL, NULL, NULL, NULL, NULL, NULL, "and",
212    "or", "cap", "cup", "int", NULL, NULL, NULL, NULL,
213    /* 8752 (0x2230) */
214    NULL, NULL, NULL, NULL, "there4", NULL, NULL, NULL,
215    NULL, NULL, NULL, NULL, "sim", NULL, NULL, NULL,
216    /* 8768 (0x2240) */
217    NULL, NULL, NULL, NULL, NULL, "cong", NULL, NULL,
218    "asymp", NULL, NULL, NULL, NULL, NULL, NULL, NULL,
219    /* 8784 (0x2250) */
220    NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
221    NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
222    /* 8800 (0x2260) */
223    "ne", "equiv", NULL, NULL, "le", "ge", NULL, NULL,
224    NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
225    /* 8816 (0x2270) */
226    NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
227    NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
228    /* 8832 (0x2280) */
229    NULL, NULL, "sub", "sup", "nsub", NULL, "sube", "supe",
230    NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
231    /* 8848 (0x2290) */
232    NULL, NULL, NULL, NULL, NULL, "oplus", NULL, "otimes",
233    NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
234    /* 8864 (0x22a0) */
235    NULL, NULL, NULL, NULL, NULL, "perp", NULL, NULL,
236    NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
237    /* 8880 (0x22b0) */
238    NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
239    NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
240    /* 8896 (0x22c0) */
241    NULL, NULL, NULL, NULL, NULL, "sdot", NULL, NULL,
242    NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
243    /* 8912 (0x22d0) */
244    NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
245    NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
246    /* 8928 (0x22e0) */
247    NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
248    NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
249    /* 8944 (0x22f0) */
250    NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
251    NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
252    /* 8960 (0x2300) */
253    NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
254    "lceil", "rceil", "lfloor", "rfloor", NULL, NULL, NULL, NULL,
255    /* 8976 (0x2310) */
256    NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
257    NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
258    /* 8992 (0x2320) */
259    NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
260    NULL, "lang", "rang"
261};
262
263static entity_table_t ent_uni_9674[] = {
264    /* 9674 */
265    "loz"
266};
267
268static entity_table_t ent_uni_9824_9830[] = {
269    /* 9824 */
270    "spades", NULL, NULL, "clubs", NULL, "hearts", "diams"
271};
272
273static entity_table_t ent_koi8r[] = {
274    "#1105", /* "jo "*/
275    NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
276    NULL, NULL, NULL, NULL, NULL, "#1025", /* "JO" */
277    NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
278    "#1102", "#1072", "#1073", "#1094", "#1076", "#1077", "#1092",
279    "#1075", "#1093", "#1080", "#1081", "#1082", "#1083", "#1084",
280    "#1085", "#1086", "#1087", "#1103", "#1088", "#1089", "#1090",
281    "#1091", "#1078", "#1074", "#1100", "#1099", "#1079", "#1096",
282    "#1101", "#1097", "#1095", "#1098", "#1070", "#1040", "#1041",
283    "#1062", "#1044", "#1045", "#1060", "#1043", "#1061", "#1048",
284    "#1049", "#1050", "#1051", "#1052", "#1053", "#1054", "#1055",
285    "#1071", "#1056", "#1057", "#1058", "#1059", "#1046", "#1042",
286    "#1068", "#1067", "#1047", "#1064", "#1069", "#1065", "#1063",
287    "#1066"
288};
289
290static entity_table_t ent_cp_1251[] = {
291    "#1026", "#1027", "#8218", "#1107", "#8222", "hellip", "dagger",
292    "Dagger", "euro", "permil", "#1033", "#8249", "#1034", "#1036",
293    "#1035", "#1039", "#1106", "#8216", "#8217", "#8219", "#8220",
294    "bull", "ndash", "mdash", NULL, "trade", "#1113", "#8250",
295    "#1114", "#1116", "#1115", "#1119", "nbsp", "#1038", "#1118",
296    "#1032", "curren", "#1168", "brvbar", "sect", "#1025", "copy",
297    "#1028", "laquo", "not", "shy", "reg", "#1031", "deg", "plusmn",
298    "#1030", "#1110", "#1169", "micro", "para", "middot", "#1105",
299    "#8470", "#1108", "raquo", "#1112", "#1029", "#1109", "#1111",
300    "#1040", "#1041", "#1042", "#1043", "#1044", "#1045", "#1046",
301    "#1047", "#1048", "#1049", "#1050", "#1051", "#1052", "#1053",
302    "#1054", "#1055", "#1056", "#1057", "#1058", "#1059", "#1060",
303    "#1061", "#1062", "#1063", "#1064", "#1065", "#1066", "#1067",
304    "#1068", "#1069", "#1070", "#1071", "#1072", "#1073", "#1074",
305    "#1075", "#1076", "#1077", "#1078", "#1079", "#1080", "#1081",
306    "#1082", "#1083", "#1084", "#1085", "#1086", "#1087", "#1088",
307    "#1089", "#1090", "#1091", "#1092", "#1093", "#1094", "#1095",
308    "#1096", "#1097", "#1098", "#1099", "#1100", "#1101", "#1102",
309    "#1103"
310};
311
312static entity_table_t ent_iso_8859_5[] = {
313    "#1056", "#1057", "#1058", "#1059", "#1060", "#1061", "#1062",
314    "#1063", "#1064", "#1065", "#1066", "#1067", "#1068", "#1069",
315    "#1070", "#1071", "#1072", "#1073", "#1074", "#1075", "#1076",
316    "#1077", "#1078", "#1079", "#1080", "#1081", "#1082", "#1083",
317    "#1084", "#1085", "#1086", "#1087", "#1088", "#1089", "#1090",
318    "#1091", "#1092", "#1093", "#1094", "#1095", "#1096", "#1097",
319    "#1098", "#1099", "#1100", "#1101", "#1102", "#1103", "#1104",
320    "#1105", "#1106", "#1107", "#1108", "#1109", "#1110", "#1111",
321    "#1112", "#1113", "#1114", "#1115", "#1116", "#1117", "#1118",
322    "#1119"
323};
324
325static entity_table_t ent_cp_866[] = {
326
327    "#9492", "#9524", "#9516", "#9500", "#9472", "#9532", "#9566",
328    "#9567", "#9562", "#9556", "#9577", "#9574", "#9568", "#9552",
329    "#9580", "#9575", "#9576", "#9572", "#9573", "#9561", "#9560",
330    "#9554", "#9555", "#9579", "#9578", "#9496", "#9484", "#9608",
331    "#9604", "#9612", "#9616", "#9600", "#1088", "#1089", "#1090",
332    "#1091", "#1092", "#1093", "#1094", "#1095", "#1096", "#1097",
333    "#1098", "#1099", "#1100", "#1101", "#1102", "#1103", "#1025",
334    "#1105", "#1028", "#1108", "#1031", "#1111", "#1038", "#1118",
335    "#176", "#8729", "#183", "#8730", "#8470", "#164",  "#9632",
336    "#160"
337};
338
339/* MacRoman has a couple of low-ascii chars that need mapping too */
340/* Vertical tab (ASCII 11) is often used to store line breaks inside */
341/* DB exports, this mapping changes it to a space */
342static entity_table_t ent_macroman[] = {
343    "sp", NULL, NULL, NULL,
344    NULL, NULL, NULL, NULL, NULL, NULL, NULL,
345    NULL, NULL, NULL, NULL, NULL, NULL, NULL,
346    NULL, NULL, NULL, NULL, NULL, "quot", NULL,
347    NULL, NULL, "amp", NULL, NULL, NULL, NULL,
348    NULL, NULL, NULL, NULL, NULL, NULL, NULL,
349    NULL, NULL, NULL, NULL, NULL, NULL, NULL,
350    NULL, NULL, NULL, "lt", NULL, "gt", NULL,
351    NULL, NULL, NULL, NULL, NULL, NULL, NULL,
352    NULL, NULL, NULL, NULL, NULL, NULL, NULL,
353    NULL, NULL, NULL, NULL, NULL, NULL, NULL,
354    NULL, NULL, NULL, NULL, NULL, NULL, NULL,
355    NULL, NULL, NULL, NULL, NULL, NULL, NULL,
356    NULL, NULL, NULL, NULL, NULL, NULL, NULL,
357    NULL, NULL, NULL, NULL, NULL, NULL, NULL,
358    NULL, NULL, NULL, NULL, NULL, NULL, NULL,
359    NULL, NULL, NULL, NULL, NULL, NULL, NULL,
360    NULL, "Auml", "Aring", "Ccedil", "Eacute", "Ntilde", "Ouml",
361    "Uuml", "aacute", "agrave", "acirc", "auml", "atilde", "aring",
362    "ccedil", "eacute", "egrave", "ecirc", "euml", "iacute", "igrave",
363    "icirc", "iuml", "ntilde", "oacute", "ograve", "ocirc", "ouml",
364    "otilde", "uacute", "ugrave", "ucirc", "uuml", "dagger", "deg",
365    "cent", "pound", "sect", "bull", "para", "szlig", "reg",
366    "copy", "trade", "acute", "uml", "ne", "AElig", "Oslash",
367    "infin", "plusmn", "le", "ge", "yen", "micro", "part",
368    "sum", "prod", "pi", "int", "ordf", "ordm", "Omega",
369    "aelig", "oslash", "iquest", "iexcl", "not", "radic", "fnof",
370    "asymp", "#8710", "laquo", "raquo", "hellip", "nbsp", "Agrave",
371    "Atilde", "Otilde", "OElig", "oelig", "ndash", "mdash", "ldquo",
372    "rdquo", "lsquo", "rsquo", "divide", "loz", "yuml", "Yuml",
373    "frasl", "euro", "lsaquo", "rsaquo", "#xFB01", "#xFB02", "Dagger",
374    "middot", "sbquo", "bdquo", "permil", "Acirc", "Ecirc", "Aacute",
375    "Euml", "Egrave", "Iacute", "Icirc", "Iuml", "Igrave", "Oacute",
376    "Ocirc", "#xF8FF", "Ograve", "Uacute", "Ucirc", "Ugrave", "#305",
377    "circ", "tilde", "macr", "#728", "#729", "#730", "cedil",
378    "#733", "#731", "#711"
379};
380
381struct html_entity_map {
382    enum entity_charset charset;    /* charset identifier */
383    unsigned int basechar;          /* char code at start of table */
384    unsigned int endchar;           /* last char code in the table */
385    entity_table_t *table;          /* the table of mappings */
386};
387
388static const struct html_entity_map entity_map[] = {
389    { cs_cp1252,        0x80, 0x9f, ent_cp_1252 },
390    { cs_cp1252,        0xa0, 0xff, ent_iso_8859_1 },
391    { cs_8859_1,        0xa0, 0xff, ent_iso_8859_1 },
392    { cs_8859_15,       0xa0, 0xff, ent_iso_8859_15 },
393    { cs_utf_8,         0xa0, 0xff, ent_iso_8859_1 },
394    { cs_utf_8,         338,  402,  ent_uni_338_402 },
395    { cs_utf_8,         710,  732,  ent_uni_spacing },
396    { cs_utf_8,         913,  982,  ent_uni_greek },
397    { cs_utf_8,         8194, 8260, ent_uni_punct },
398    { cs_utf_8,         8364, 8364, ent_uni_euro },
399    { cs_utf_8,         8465, 8501, ent_uni_8465_8501 },
400    { cs_utf_8,         8592, 9002, ent_uni_8592_9002 },
401    { cs_utf_8,         9674, 9674, ent_uni_9674 },
402    { cs_utf_8,         9824, 9830, ent_uni_9824_9830 },
403    { cs_big5,          0xa0, 0xff, ent_iso_8859_1 },
404    { cs_gb2312,        0xa0, 0xff, ent_iso_8859_1 },
405    { cs_big5hkscs,     0xa0, 0xff, ent_iso_8859_1 },
406    { cs_sjis,          0xa0, 0xff, ent_iso_8859_1 },
407    { cs_eucjp,         0xa0, 0xff, ent_iso_8859_1 },
408    { cs_koi8r,         0xa3, 0xff, ent_koi8r },
409    { cs_cp1251,        0x80, 0xff, ent_cp_1251 },
410    { cs_8859_5,        0xc0, 0xff, ent_iso_8859_5 },
411    { cs_cp866,         0xc0, 0xff, ent_cp_866 },
412    { cs_macroman,      0x0b, 0xff, ent_macroman },
413    { cs_terminator }
414};
415
416static const struct {
417    const char *codeset;
418    enum entity_charset charset;
419} charset_map[] = {
420    { "ISO-8859-1",     cs_8859_1 },
421    { "ISO8859-1",      cs_8859_1 },
422    { "ISO-8859-15",    cs_8859_15 },
423    { "ISO8859-15",     cs_8859_15 },
424    { "utf-8",          cs_utf_8 },
425    { "cp1252",         cs_cp1252 },
426    { "Windows-1252",   cs_cp1252 },
427    { "1252",           cs_cp1252 },
428    { "BIG5",           cs_big5 },
429    { "950",            cs_big5 },
430    { "GB2312",         cs_gb2312 },
431    { "936",            cs_gb2312 },
432    { "BIG5-HKSCS",     cs_big5hkscs },
433    { "Shift_JIS",      cs_sjis },
434    { "SJIS",           cs_sjis },
435    { "932",            cs_sjis },
436    { "EUCJP",          cs_eucjp },
437    { "EUC-JP",         cs_eucjp },
438    { "KOI8-R",         cs_koi8r },
439    { "koi8-ru",        cs_koi8r },
440    { "koi8r",          cs_koi8r },
441    { "cp1251",         cs_cp1251 },
442    { "Windows-1251",   cs_cp1251 },
443    { "win-1251",       cs_cp1251 },
444    { "iso8859-5",      cs_8859_5 },
445    { "iso-8859-5",     cs_8859_5 },
446    { "cp866",          cs_cp866 },
447    { "866",            cs_cp866 },
448    { "ibm866",         cs_cp866 },
449    { "MacRoman",       cs_macroman },
450    { NULL }
451};
452
453static const struct {
454    unsigned short charcode;
455    char *entity;
456    int entitylen;
457    int flags;
458} basic_entities[] = {
459    { '"',  "&quot;",   6,  ENT_HTML_QUOTE_DOUBLE },
460    { '\'', "&#039;",   6,  ENT_HTML_QUOTE_SINGLE },
461    { '\'', "&#39;",    5,  ENT_HTML_QUOTE_SINGLE },
462    { '<',  "&lt;",     4,  0 },
463    { '>',  "&gt;",     4,  0 },
464    { 0, NULL, 0, 0 }
465};
466
/* A character code together with its entity text, with the text stored
 * inline in a fixed 8-byte buffer rather than behind a pointer. */
struct basic_entities_dec {
    /* the plain character */
    unsigned short charcode;
    /* entity text, held in place */
    char entity[8];
    /* length of the entity text */
    int entitylen;
};
472
/*
 * Helper macros for get_next_char(). They are not self-contained: each one
 * expands to code that uses that function's locals and out-parameters
 * (str_len, pos, newpos, status, mbseq, mbpos, mbspace, mbseqlen,
 * this_char) and may return from the enclosing function.
 *
 * All of them expand to a single statement (do { ... } while (0)), so they
 * must be followed by a semicolon and are safe inside an unbraced if/else.
 */

/* Normal exit: store the new position, NUL-terminate the collected byte
 * sequence, store its length and return the decoded character. */
#define MB_RETURN do { \
            *newpos = pos;       \
            mbseq[mbpos] = '\0'; \
            *mbseqlen = mbpos;   \
            return this_char;    \
        } while (0)

/* Append one byte to the collected sequence. One slot is used up first;
 * when that leaves no space the byte is NOT stored and the function returns
 * through MB_RETURN, which keeps room for the terminating NUL. */
#define MB_WRITE(mbchar) do { \
            mbspace--;               \
            if (mbspace == 0) {      \
                MB_RETURN;           \
            }                        \
            mbseq[mbpos++] = (mbchar); \
        } while (0)

/* skip one byte and return */
#define MB_FAILURE(pos) do { \
    *newpos = (pos) + 1; \
    *status = FAILURE; \
    return 0; \
} while (0)

/* Fail unless at least chars_need bytes remain at offset pos. On failure the
 * reported position is pos + 1, or pos itself when chars_need is below 1. */
#define CHECK_LEN(pos, chars_need) do {             \
    if ((chars_need) < 1) {                         \
        if ((str_len - (pos)) < (chars_need)) {     \
            *newpos = (pos);                        \
            *status = FAILURE;                      \
            return 0;                               \
        }                                           \
    } else {                                        \
        if ((str_len - (pos)) < (chars_need)) {     \
            *newpos = (pos) + 1;                    \
            *status = FAILURE;                      \
            return 0;                               \
        }                                           \
    }                                               \
} while (0)
507
508/* {{{ get_next_char
509 */
510inline static unsigned int get_next_char(enum entity_charset charset,
511        unsigned char * str,
512        int str_len,
513        int * newpos,
514        unsigned char * mbseq,
515        int * mbseqlen,
516        int *status)
517{
518    int pos = *newpos;
519    int mbpos = 0;
520    int mbspace = *mbseqlen;
521    unsigned int this_char = 0;
522    unsigned char next_char;
523
524    *status = SUCCESS;
525
526    if (mbspace <= 0) {
527        *mbseqlen = 0;
528        CHECK_LEN(pos, 1);
529        *newpos = pos + 1;
530        return str[pos];
531    }
532
533    switch (charset) {
534        case cs_utf_8:
535            {
536                unsigned char c;
537                CHECK_LEN(pos, 1);
538                c = str[pos];
539                if (c < 0x80) {
540                    MB_WRITE(c);
541                    this_char = c;
542                    pos++;
543                } else if (c < 0xc2) {
544                    MB_FAILURE(pos);
545                } else if (c < 0xe0) {
546                    CHECK_LEN(pos, 2);
547                    if (str[pos + 1] < 0x80 || str[pos + 1] > 0xbf) {
548                        MB_FAILURE(pos);
549                    }
550                    this_char = ((c & 0x1f) << 6) | (str[pos + 1] & 0x3f);
551                    if (this_char < 0x80) {
552                        MB_FAILURE(pos);
553                    }
554                    MB_WRITE((unsigned char)c);
555                    MB_WRITE((unsigned char)str[pos + 1]);
556                    pos += 2;
557                } else if (c < 0xf0) {
558                    CHECK_LEN(pos, 3);
559                    if (str[pos + 1] < 0x80 || str[pos + 1] > 0xbf) {
560                        MB_FAILURE(pos);
561                    }
562                    if (str[pos + 2] < 0x80 || str[pos + 2] > 0xbf) {
563                        MB_FAILURE(pos);
564                    }
565                    this_char = ((c & 0x0f) << 12) | ((str[pos + 1] & 0x3f) << 6) | (str[pos + 2] & 0x3f);
566                    if (this_char < 0x800) {
567                        MB_FAILURE(pos);
568                    } else if (this_char >= 0xd800 && this_char <= 0xdfff) {
569                        MB_FAILURE(pos);
570                    }
571                    MB_WRITE((unsigned char)c);
572                    MB_WRITE((unsigned char)str[pos + 1]);
573                    MB_WRITE((unsigned char)str[pos + 2]);
574                    pos += 3;
575                } else if (c < 0xf5) {
576                    CHECK_LEN(pos, 4);
577                    if (str[pos + 1] < 0x80 || str[pos + 1] > 0xbf) {
578                        MB_FAILURE(pos);
579                    }
580                    if (str[pos + 2] < 0x80 || str[pos + 2] > 0xbf) {
581                        MB_FAILURE(pos);
582                    }
583                    if (str[pos + 3] < 0x80 || str[pos + 3] > 0xbf) {
584                        MB_FAILURE(pos);
585                    }
586                    this_char = ((c & 0x07) << 18) | ((str[pos + 1] & 0x3f) << 12) | ((str[pos + 2] & 0x3f) << 6) | (str[pos + 3] & 0x3f);
587                    if (this_char < 0x10000 || this_char > 0x10FFFF) {
588                        MB_FAILURE(pos);
589                    }
590                    MB_WRITE((unsigned char)c);
591                    MB_WRITE((unsigned char)str[pos + 1]);
592                    MB_WRITE((unsigned char)str[pos + 2]);
593                    MB_WRITE((unsigned char)str[pos + 3]);
594                    pos += 4;
595                } else {
596                    MB_FAILURE(pos);
597                }
598            }
599            break;
600        case cs_big5:
601        case cs_gb2312:
602        case cs_big5hkscs:
603            {
604                CHECK_LEN(pos, 1);
605                this_char = str[pos++];
606                /* check if this is the first of a 2-byte sequence */
607                if (this_char >= 0x81 && this_char <= 0xfe) {
608                    /* peek at the next char */
609                    CHECK_LEN(pos, 1);
610                    next_char = str[pos++];
611                    if ((next_char >= 0x40 && next_char <= 0x7e) ||
612                            (next_char >= 0xa1 && next_char <= 0xfe)) {
613                        /* yes, this a wide char */
614                        MB_WRITE(this_char);
615                        MB_WRITE(next_char);
616                        this_char = (this_char << 8) | next_char;
617                    } else {
618                        MB_FAILURE(pos);
619                    }
620                } else {
621                    MB_WRITE(this_char);
622                }
623            }
624            break;
625        case cs_sjis:
626            {
627                CHECK_LEN(pos, 1);
628                this_char = str[pos++];
629                /* check if this is the first of a 2-byte sequence */
630                if ((this_char >= 0x81 && this_char <= 0x9f) ||
631                    (this_char >= 0xe0 && this_char <= 0xfc)) {
632                    /* peek at the next char */
633                    CHECK_LEN(pos, 1);
634                    next_char = str[pos++];
635                    if ((next_char >= 0x40 && next_char <= 0x7e) ||
636                        (next_char >= 0x80 && next_char <= 0xfc))
637                    {
638                        /* yes, this a wide char */
639                        MB_WRITE(this_char);
640                        MB_WRITE(next_char);
641                        this_char = (this_char << 8) | next_char;
642                    } else {
643                        MB_FAILURE(pos);
644                    }
645                } else {
646                    MB_WRITE(this_char);
647                }
648                break;
649            }
650        case cs_eucjp:
651            {
652                CHECK_LEN(pos, 1);
653                this_char = str[pos++];
654                /* check if this is the first of a multi-byte sequence */
655                if (this_char >= 0xa1 && this_char <= 0xfe) {
656                    /* peek at the next char */
657                    CHECK_LEN(pos, 1);
658                    next_char = str[pos++];
659                    if (next_char >= 0xa1 && next_char <= 0xfe) {
660                        /* yes, this a jis kanji char */
661                        MB_WRITE(this_char);
662                        MB_WRITE(next_char);
663                        this_char = (this_char << 8) | next_char;
664                    } else {
665                        MB_FAILURE(pos);
666                    }
667                } else if (this_char == 0x8e) {
668                    /* peek at the next char */
669                    CHECK_LEN(pos, 1);
670                    next_char = str[pos++];
671                    if (next_char >= 0xa1 && next_char <= 0xdf) {
672                        /* JIS X 0201 kana */
673                        MB_WRITE(this_char);
674                        MB_WRITE(next_char);
675                        this_char = (this_char << 8) | next_char;
676                    } else {
677                        MB_FAILURE(pos);
678                    }
679                } else if (this_char == 0x8f) {
680                    /* peek at the next two char */
681                    unsigned char next2_char;
682                    CHECK_LEN(pos, 2);
683                    next_char = str[pos];
684                    next2_char = str[pos + 1];
685                    pos += 2;
686                    if ((next_char >= 0xa1 && next_char <= 0xfe) &&
687                        (next2_char >= 0xa1 && next2_char <= 0xfe)) {
688                        /* JIS X 0212 hojo-kanji */
689                        MB_WRITE(this_char);
690                        MB_WRITE(next_char);
691                        MB_WRITE(next2_char);
692                        this_char = (this_char << 16) | (next_char << 8) | next2_char;
693                    } else {
694                        MB_FAILURE(pos);
695                    }
696                } else {
697                    MB_WRITE(this_char);
698                }
699                break;
700            }
701        default:
702            /* single-byte charsets */
703            CHECK_LEN(pos, 1);
704            this_char = str[pos++];
705            MB_WRITE(this_char);
706            break;
707    }
708    MB_RETURN;
709}
710/* }}} */
711
712/* {{{ entity_charset determine_charset
713 * returns the charset identifier based on current locale or a hint.
714 * defaults to iso-8859-1 */
715static enum entity_charset determine_charset(char *charset_hint TSRMLS_DC)
716{
717    int i;
718    enum entity_charset charset = cs_8859_1;
719    int len = 0;
720    zval *uf_result = NULL;
721
722    /* Guarantee default behaviour for backwards compatibility */
723    if (charset_hint == NULL)
724        return cs_8859_1;
725
726    if ((len = strlen(charset_hint)) != 0) {
727        goto det_charset;
728    }
729#if HAVE_MBSTRING
730#if !defined(COMPILE_DL_MBSTRING)
731    /* XXX: Ugly things. Why don't we look for a more sophisticated way? */
732    switch (MBSTRG(current_internal_encoding)) {
733        case mbfl_no_encoding_8859_1:
734            return cs_8859_1;
735
736        case mbfl_no_encoding_utf8:
737            return cs_utf_8;
738
739        case mbfl_no_encoding_euc_jp:
740        case mbfl_no_encoding_eucjp_win:
741            return cs_eucjp;
742
743        case mbfl_no_encoding_sjis:
744        case mbfl_no_encoding_sjis_open:
745        case mbfl_no_encoding_cp932:
746            return cs_sjis;
747
748        case mbfl_no_encoding_cp1252:
749            return cs_cp1252;
750
751        case mbfl_no_encoding_8859_15:
752            return cs_8859_15;
753
754        case mbfl_no_encoding_big5:
755            return cs_big5;
756
757        case mbfl_no_encoding_euc_cn:
758        case mbfl_no_encoding_hz:
759        case mbfl_no_encoding_cp936:
760            return cs_gb2312;
761
762        case mbfl_no_encoding_koi8r:
763            return cs_koi8r;
764
765        case mbfl_no_encoding_cp866:
766            return cs_cp866;
767
768        case mbfl_no_encoding_cp1251:
769            return cs_cp1251;
770
771        case mbfl_no_encoding_8859_5:
772            return cs_8859_5;
773
774        default:
775            ;
776    }
777#else
778    {
779        zval nm_mb_internal_encoding;
780
781        ZVAL_STRING(&nm_mb_internal_encoding, "mb_internal_encoding", 0);
782
783        if (call_user_function_ex(CG(function_table), NULL, &nm_mb_internal_encoding, &uf_result, 0, NULL, 1, NULL TSRMLS_CC) != FAILURE) {
784
785            charset_hint = Z_STRVAL_P(uf_result);
786            len = Z_STRLEN_P(uf_result);
787
788            if (charset_hint != NULL && len != 0) {
789                if (len == 4) { /* sizeof(none|auto|pass)-1 */
790                    if (!memcmp("pass", charset_hint, sizeof("pass") - 1) ||
791                        !memcmp("auto", charset_hint, sizeof("auto") - 1) ||
792                        !memcmp("none", charset_hint, sizeof("none") - 1)) {
793
794                        charset_hint = NULL;
795                        len = 0;
796                    }
797                } else {
798                    /* Jump to det_charset only if mbstring isn't one of above eq pass, auto, none.
799                       Otherwise try default_charset next */
800                    goto det_charset;
801                }
802            }
803        }
804    }
805#endif
806#endif
807
808    charset_hint = SG(default_charset);
809    if (charset_hint != NULL && (len=strlen(charset_hint)) != 0) {
810        goto det_charset;
811    }
812
813    /* try to detect the charset for the locale */
814#if HAVE_NL_LANGINFO && HAVE_LOCALE_H && defined(CODESET)
815    charset_hint = nl_langinfo(CODESET);
816    if (charset_hint != NULL && (len=strlen(charset_hint)) != 0) {
817        goto det_charset;
818    }
819#endif
820
821#if HAVE_LOCALE_H
822    /* try to figure out the charset from the locale */
823    {
824        char *localename;
825        char *dot, *at;
826
827        /* lang[_territory][.codeset][@modifier] */
828        localename = setlocale(LC_CTYPE, NULL);
829
830        dot = strchr(localename, '.');
831        if (dot) {
832            dot++;
833            /* locale specifies a codeset */
834            at = strchr(dot, '@');
835            if (at)
836                len = at - dot;
837            else
838                len = strlen(dot);
839            charset_hint = dot;
840        } else {
841            /* no explicit name; see if the name itself
842             * is the charset */
843            charset_hint = localename;
844            len = strlen(charset_hint);
845        }
846    }
847#endif
848
849det_charset:
850
851    if (charset_hint) {
852        int found = 0;
853
854        /* now walk the charset map and look for the codeset */
855        for (i = 0; charset_map[i].codeset; i++) {
856            if (len == strlen(charset_map[i].codeset) && strncasecmp(charset_hint, charset_map[i].codeset, len) == 0) {
857                charset = charset_map[i].charset;
858                found = 1;
859                break;
860            }
861        }
862        if (!found) {
863            php_error_docref(NULL TSRMLS_CC, E_WARNING, "charset `%s' not supported, assuming iso-8859-1",
864                    charset_hint);
865        }
866    }
867    if (uf_result != NULL) {
868        zval_ptr_dtor(&uf_result);
869    }
870    return charset;
871}
872/* }}} */
873
/* {{{ php_utf32_utf8
 * Encodes the code point k into buf as a UTF-8 style byte sequence,
 * appends a terminating NUL and returns the number of bytes written
 * before the NUL (1 to 6), so buf needs room for up to 7 bytes.
 * Values of 0x200000 and above produce the legacy 5- and 6-byte forms. */
size_t php_utf32_utf8(unsigned char *buf, unsigned k)
{
    /* marker bits of the lead byte, indexed by sequence length */
    static const unsigned char lead_bits[] = { 0x00, 0x00, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc };
    size_t nbytes;
    size_t pos;

    if (k < 0x80) {
        nbytes = 1;
    } else if (k < 0x800) {
        nbytes = 2;
    } else if (k < 0x10000) {
        nbytes = 3;
    } else if (k < 0x200000) {
        nbytes = 4;
    } else if (k < 0x4000000) {
        nbytes = 5;
    } else {
        nbytes = 6;
    }

    /* fill the continuation bytes from the last one backwards, six
     * payload bits at a time; what is left of k goes into the lead byte */
    for (pos = nbytes - 1; pos > 0; pos--) {
        buf[pos] = 0x80 | (k & 0x3f);
        k >>= 6;
    }
    buf[0] = lead_bits[nbytes] | k;
    buf[nbytes] = '\0';

    return nbytes;
}
/* }}} */
918
919/* {{{ php_unescape_html_entities
920 */
921PHPAPI char *php_unescape_html_entities(unsigned char *old, int oldlen, int *newlen, int all, int quote_style, char *hint_charset TSRMLS_DC)
922{
923    int retlen;
924    int j, k;
925    char *replaced, *ret, *p, *q, *lim, *next;
926    enum entity_charset charset = determine_charset(hint_charset TSRMLS_CC);
927    unsigned char replacement[15];
928    int replacement_len;
929
930    ret = estrndup(old, oldlen);
931    retlen = oldlen;
932    if (!retlen) {
933        goto empty_source;
934    }
935
936    if (all) {
937        /* look for a match in the maps for this charset */
938        for (j = 0; entity_map[j].charset != cs_terminator; j++) {
939            if (entity_map[j].charset != charset)
940                continue;
941
942            for (k = entity_map[j].basechar; k <= entity_map[j].endchar; k++) {
943                unsigned char entity[32];
944                int entity_length = 0;
945
946                if (entity_map[j].table[k - entity_map[j].basechar] == NULL)
947                    continue;
948
949                entity_length = slprintf(entity, sizeof(entity), "&%s;", entity_map[j].table[k - entity_map[j].basechar]);
950                if (entity_length >= sizeof(entity)) {
951                    continue;
952                }
953
954                /* When we have MBCS entities in the tables above, this will need to handle it */
955                replacement_len = 0;
956                switch (charset) {
957                    case cs_8859_1:
958                    case cs_cp1252:
959                    case cs_8859_15:
960                    case cs_cp1251:
961                    case cs_8859_5:
962                    case cs_cp866:
963                    case cs_koi8r:
964                        replacement[0] = k;
965                        replacement[1] = '\0';
966                        replacement_len = 1;
967                        break;
968
969                    case cs_big5:
970                    case cs_gb2312:
971                    case cs_big5hkscs:
972                    case cs_sjis:
973                    case cs_eucjp:
974                        /* we cannot properly handle those multibyte encodings
975                         * with php_str_to_str. skip it. */
976                        continue;
977
978                    case cs_utf_8:
979                        replacement_len = php_utf32_utf8(replacement, k);
980                        break;
981
982                    default:
983                        php_error_docref(NULL TSRMLS_CC, E_WARNING, "cannot yet handle MBCS!");
984                        efree(ret);
985                        return NULL;
986                }
987
988                if (php_memnstr(ret, entity, entity_length, ret+retlen)) {
989                    replaced = php_str_to_str(ret, retlen, entity, entity_length, replacement, replacement_len, &retlen);
990                    efree(ret);
991                    ret = replaced;
992                }
993            }
994        }
995    }
996
997    for (j = 0; basic_entities[j].charcode != 0; j++) {
998
999        if (basic_entities[j].flags && (quote_style & basic_entities[j].flags) == 0)
1000            continue;
1001
1002        replacement[0] = (unsigned char)basic_entities[j].charcode;
1003        replacement[1] = '\0';
1004
1005        if (php_memnstr(ret, basic_entities[j].entity, basic_entities[j].entitylen, ret+retlen)) {
1006            replaced = php_str_to_str(ret, retlen, basic_entities[j].entity, basic_entities[j].entitylen, replacement, 1, &retlen);
1007            efree(ret);
1008            ret = replaced;
1009        }
1010    }
1011
1012    /* replace numeric entities & "&amp;" */
1013    lim = ret + retlen;
1014    for (p = ret, q = ret; p < lim;) {
1015        int code;
1016
1017        if (p[0] == '&') {
1018            if (p + 2 < lim) {
1019                if (p[1] == '#') {
1020                    int invalid_code = 0;
1021
1022                    if (p[2] == 'x' || p[2] == 'X') {
1023                        code = strtol(p + 3, &next, 16);
1024                    } else {
1025                        code = strtol(p + 2, &next, 10);
1026                    }
1027
1028                    if ((code == '\'' && !(quote_style & ENT_HTML_QUOTE_SINGLE)) ||
1029                        (code == '"' && !(quote_style & ENT_HTML_QUOTE_DOUBLE))) {
1030                        invalid_code = 1;
1031                    }
1032
1033                    if (next != NULL && *next == ';' && !invalid_code) {
1034                        switch (charset) {
1035                            case cs_utf_8:
1036                                q += php_utf32_utf8(q, code);
1037                                break;
1038
1039                            case cs_8859_1:
1040                            case cs_8859_5:
1041                            case cs_8859_15:
1042                                if ((code >= 0x80 && code < 0xa0) || code > 0xff) {
1043                                    invalid_code = 1;
1044                                } else {
1045                                    *(q++) = code;
1046                                }
1047                                break;
1048
1049                            case cs_cp1252:
1050                                if (code > 0xff) {
1051                                    invalid_code = 1;
1052                                } else {
1053                                    *(q++) = code;
1054                                }
1055                                break;
1056
1057                            case cs_cp1251:
1058                            case cs_cp866:
1059                            case cs_big5:
1060                            case cs_big5hkscs:
1061                            case cs_sjis:
1062                            case cs_eucjp:
1063                                if (code >= 0x80) {
1064                                    invalid_code = 1;
1065                                } else {
1066                                    *(q++) = code;
1067                                }
1068                                break;
1069
1070                            case cs_gb2312:
1071                                if (code >= 0x81) {
1072                                    invalid_code = 1;
1073                                } else {
1074                                    *(q++) = code;
1075                                }
1076                                break;
1077
1078                            default:
1079                                /* for backwards compatilibity */
1080                                invalid_code = 1;
1081                                break;
1082                        }
1083                        if (invalid_code) {
1084                            for (; p <= next; p++) {
1085                                *(q++) = *p;
1086                            }
1087                        }
1088                        p = next + 1;
1089                    } else {
1090                        *(q++) = *(p++);
1091                        *(q++) = *(p++);
1092                    }
1093                } else if (p + 4 < lim &&
1094                            p[1] == 'a' && p[2] == 'm' &&p[3] == 'p' &&
1095                            p[4] == ';') {
1096                    *(q++) = '&';
1097                    p += 5;
1098                } else {
1099                    *(q++) = *(p++);
1100                    *(q++) = *(p++);
1101                }
1102            } else {
1103                *(q++) = *(p++);
1104            }
1105        } else {
1106            *(q++) = *(p++);
1107        }
1108    }
1109    *q = '\0';
1110    retlen = (size_t)(q - ret);
1111empty_source:
1112    *newlen = retlen;
1113    return ret;
1114}
1115/* }}} */
1116
1117PHPAPI char *php_escape_html_entities(unsigned char *old, int oldlen, int *newlen, int all, int quote_style, char *hint_charset TSRMLS_DC)
1118{
1119    return php_escape_html_entities_ex(old, oldlen, newlen, all, quote_style, hint_charset, 1 TSRMLS_CC);
1120}
1121
1122
1123/* {{{ php_escape_html_entities
1124 */
1125PHPAPI char *php_escape_html_entities_ex(unsigned char *old, int oldlen, int *newlen, int all, int quote_style, char *hint_charset, zend_bool double_encode TSRMLS_DC)
1126{
1127    int i, j, maxlen, len;
1128    char *replaced;
1129    enum entity_charset charset = determine_charset(hint_charset TSRMLS_CC);
1130    int matches_map;
1131
1132    maxlen = 2 * oldlen;
1133    if (maxlen < 128)
1134        maxlen = 128;
1135    replaced = emalloc (maxlen);
1136    len = 0;
1137    i = 0;
1138    while (i < oldlen) {
1139        unsigned char mbsequence[16];   /* allow up to 15 characters in a multibyte sequence */
1140        int mbseqlen = sizeof(mbsequence);
1141        int status = SUCCESS;
1142        unsigned int this_char = get_next_char(charset, old, oldlen, &i, mbsequence, &mbseqlen, &status);
1143
1144        if(status == FAILURE) {
1145            /* invalid MB sequence */
1146            if (quote_style & ENT_HTML_IGNORE_ERRORS) {
1147                continue;
1148            }
1149            efree(replaced);
1150            if(!PG(display_errors)) {
1151                php_error_docref(NULL TSRMLS_CC, E_WARNING, "Invalid multibyte sequence in argument");
1152            }
1153            *newlen = 0;
1154            return STR_EMPTY_ALLOC();
1155        }
1156        matches_map = 0;
1157
1158        if (len + 16 > maxlen)
1159            replaced = erealloc (replaced, maxlen += 128);
1160
1161        if (all) {
1162            /* look for a match in the maps for this charset */
1163            unsigned char *rep = NULL;
1164
1165
1166            for (j = 0; entity_map[j].charset != cs_terminator; j++) {
1167                if (entity_map[j].charset == charset
1168                        && this_char >= entity_map[j].basechar
1169                        && this_char <= entity_map[j].endchar) {
1170                    rep = (unsigned char*)entity_map[j].table[this_char - entity_map[j].basechar];
1171                    if (rep == NULL) {
1172                        /* there is no entity for this position; fall through and
1173                         * just output the character itself */
1174                        break;
1175                    }
1176
1177                    matches_map = 1;
1178                    break;
1179                }
1180            }
1181
1182            if (matches_map) {
1183                int l = strlen(rep);
1184                /* increase the buffer size */
1185                if (len + 2 + l >= maxlen) {
1186                    replaced = erealloc(replaced, maxlen += 128);
1187                }
1188
1189                replaced[len++] = '&';
1190                strlcpy(replaced + len, rep, maxlen);
1191                len += l;
1192                replaced[len++] = ';';
1193            }
1194        }
1195        if (!matches_map) {
1196            int is_basic = 0;
1197
1198            if (this_char == '&') {
1199                if (double_encode) {
1200encode_amp:
1201                    memcpy(replaced + len, "&amp;", sizeof("&amp;") - 1);
1202                    len += sizeof("&amp;") - 1;
1203                } else {
1204                    char *e = memchr(old + i, ';', oldlen - i);
1205                    char *s = old + i;
1206
1207                    if (!e || (e - s) > 10) { /* minor optimization to avoid "entities" over 10 chars in length */
1208                        goto encode_amp;
1209                    } else {
1210                        if (*s == '#') { /* numeric entities */
1211                            s++;
1212                            /* Hex (&#x5A;) */
1213                            if (*s == 'x' || *s == 'X') {
1214                                s++;
1215                                while (s < e) {
1216                                    if (!isxdigit((int)*(unsigned char *)s++)) {
1217                                        goto encode_amp;
1218                                    }
1219                                }
1220                            /* Dec (&#90;)*/
1221                            } else {
1222                                while (s < e) {
1223                                    if (!isdigit((int)*(unsigned char *)s++)) {
1224                                        goto encode_amp;
1225                                    }
1226                                }
1227                            }
1228                        } else { /* text entities */
1229                            while (s < e) {
1230                                if (!isalnum((int)*(unsigned char *)s++)) {
1231                                    goto encode_amp;
1232                                }
1233                            }
1234                        }
1235                        replaced[len++] = '&';
1236                    }
1237                }
1238                is_basic = 1;
1239            } else {
1240                for (j = 0; basic_entities[j].charcode != 0; j++) {
1241                    if ((basic_entities[j].charcode != this_char) ||
1242                            (basic_entities[j].flags &&
1243                            (quote_style & basic_entities[j].flags) == 0)) {
1244                        continue;
1245                    }
1246
1247                    memcpy(replaced + len, basic_entities[j].entity, basic_entities[j].entitylen);
1248                    len += basic_entities[j].entitylen;
1249
1250                    is_basic = 1;
1251                    break;
1252                }
1253            }
1254
1255            if (!is_basic) {
1256                /* a wide char without a named entity; pass through the original sequence */
1257                if (mbseqlen > 1) {
1258                    memcpy(replaced + len, mbsequence, mbseqlen);
1259                    len += mbseqlen;
1260                } else {
1261                    replaced[len++] = (unsigned char)this_char;
1262                }
1263            }
1264        }
1265    }
1266    replaced[len] = '\0';
1267    *newlen = len;
1268
1269    return replaced;
1270
1271
1272}
1273/* }}} */
1274
1275/* {{{ php_html_entities
1276 */
1277static void php_html_entities(INTERNAL_FUNCTION_PARAMETERS, int all)
1278{
1279    char *str, *hint_charset = NULL;
1280    int str_len, hint_charset_len = 0;
1281    int len;
1282    long quote_style = ENT_COMPAT;
1283    char *replaced;
1284    zend_bool double_encode = 1;
1285
1286    if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "s|ls!b", &str, &str_len, &quote_style, &hint_charset, &hint_charset_len, &double_encode) == FAILURE) {
1287        return;
1288    }
1289
1290    replaced = php_escape_html_entities_ex(str, str_len, &len, all, quote_style, hint_charset, double_encode TSRMLS_CC);
1291    RETVAL_STRINGL(replaced, len, 0);
1292}
1293/* }}} */
1294
1295#define HTML_SPECIALCHARS   0
1296#define HTML_ENTITIES       1
1297
1298/* {{{ register_html_constants
1299 */
1300void register_html_constants(INIT_FUNC_ARGS)
1301{
1302    REGISTER_LONG_CONSTANT("HTML_SPECIALCHARS", HTML_SPECIALCHARS, CONST_PERSISTENT|CONST_CS);
1303    REGISTER_LONG_CONSTANT("HTML_ENTITIES", HTML_ENTITIES, CONST_PERSISTENT|CONST_CS);
1304    REGISTER_LONG_CONSTANT("ENT_COMPAT", ENT_COMPAT, CONST_PERSISTENT|CONST_CS);
1305    REGISTER_LONG_CONSTANT("ENT_QUOTES", ENT_QUOTES, CONST_PERSISTENT|CONST_CS);
1306    REGISTER_LONG_CONSTANT("ENT_NOQUOTES", ENT_NOQUOTES, CONST_PERSISTENT|CONST_CS);
1307    REGISTER_LONG_CONSTANT("ENT_IGNORE", ENT_IGNORE, CONST_PERSISTENT|CONST_CS);
1308}
1309/* }}} */
1310
1311/* {{{ proto string htmlspecialchars(string string [, int quote_style[, string charset[, bool double_encode]]])
1312   Convert special characters to HTML entities */
1313PHP_FUNCTION(htmlspecialchars)
1314{
1315    php_html_entities(INTERNAL_FUNCTION_PARAM_PASSTHRU, 0);
1316}
1317/* }}} */
1318
1319/* {{{ proto string htmlspecialchars_decode(string string [, int quote_style])
1320   Convert special HTML entities back to characters */
1321PHP_FUNCTION(htmlspecialchars_decode)
1322{
1323    char *str, *new_str, *e, *p;
1324    int len, j, i, new_len;
1325    long quote_style = ENT_COMPAT;
1326    struct basic_entities_dec basic_entities_dec[8];
1327
1328    if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "s|l", &str, &len, &quote_style) == FAILURE) {
1329        return;
1330    }
1331
1332    new_str = estrndup(str, len);
1333    new_len = len;
1334    e = new_str + new_len;
1335
1336    if (!(p = memchr(new_str, '&', new_len))) {
1337        RETURN_STRINGL(new_str, new_len, 0);
1338    }
1339
1340    for (j = 0, i = 0; basic_entities[i].charcode != 0; i++) {
1341        if (basic_entities[i].flags && !(quote_style & basic_entities[i].flags)) {
1342            continue;
1343        }
1344        basic_entities_dec[j].charcode = basic_entities[i].charcode;
1345        memcpy(basic_entities_dec[j].entity, basic_entities[i].entity, basic_entities[i].entitylen + 1);
1346        basic_entities_dec[j].entitylen = basic_entities[i].entitylen;
1347        j++;
1348    }
1349    basic_entities_dec[j].charcode = '&';
1350    basic_entities_dec[j].entitylen = sizeof("&amp;") - 1;
1351    memcpy(basic_entities_dec[j].entity, "&amp;", sizeof("&amp;"));
1352    i = j + 1;
1353
1354    do {
1355        int l = e - p;
1356
1357        for (j = 0; j < i; j++) {
1358            if (basic_entities_dec[j].entitylen > l) {
1359                continue;
1360            }
1361            if (!memcmp(p, basic_entities_dec[j].entity, basic_entities_dec[j].entitylen)) {
1362                int e_len = basic_entities_dec[j].entitylen - 1;
1363
1364                *p++ = basic_entities_dec[j].charcode;
1365                memmove(p, p + e_len, (e - p - e_len));
1366                e -= e_len;
1367                goto done;
1368            }
1369        }
1370        p++;
1371
1372done:
1373        if (p >= e) {
1374            break;
1375        }
1376    } while ((p = memchr(p, '&', (e - p))));
1377
1378    new_len = e - new_str;
1379
1380    new_str[new_len] = '\0';
1381    RETURN_STRINGL(new_str, new_len, 0);
1382}
1383/* }}} */
1384
1385/* {{{ proto string html_entity_decode(string string [, int quote_style][, string charset])
1386   Convert all HTML entities to their applicable characters */
1387PHP_FUNCTION(html_entity_decode)
1388{
1389    char *str, *hint_charset = NULL;
1390    int str_len, hint_charset_len = 0, len;
1391    long quote_style = ENT_COMPAT;
1392    char *replaced;
1393
1394    if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "s|ls", &str, &str_len,
1395                              &quote_style, &hint_charset, &hint_charset_len) == FAILURE) {
1396        return;
1397    }
1398
1399    replaced = php_unescape_html_entities(str, str_len, &len, 1, quote_style, hint_charset TSRMLS_CC);
1400    if (replaced) {
1401        RETURN_STRINGL(replaced, len, 0);
1402    }
1403    RETURN_FALSE;
1404}
1405/* }}} */
1406
1407
1408/* {{{ proto string htmlentities(string string [, int quote_style[, string charset[, bool double_encode]]])
1409   Convert all applicable characters to HTML entities */
1410PHP_FUNCTION(htmlentities)
1411{
1412    php_html_entities(INTERNAL_FUNCTION_PARAM_PASSTHRU, 1);
1413}
1414/* }}} */
1415
1416/* {{{ proto array get_html_translation_table([int table [, int quote_style [, string charset_hint]]])
1417   Returns the internal translation table used by htmlspecialchars and htmlentities */
1418PHP_FUNCTION(get_html_translation_table)
1419{
1420    long which = HTML_SPECIALCHARS, quote_style = ENT_COMPAT;
1421    unsigned int i;
1422    int j;
1423    unsigned char ind[5]; /* max # of 8-bit code units (4; for UTF-8) + 1 for \0 */
1424    void *dummy;
1425    char *charset_hint = NULL;
1426    int charset_hint_len;
1427    enum entity_charset charset;
1428
1429    if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "|lls",
1430            &which, &quote_style, &charset_hint, &charset_hint_len) == FAILURE) {
1431        return;
1432    }
1433
1434    charset = determine_charset(charset_hint TSRMLS_CC);
1435
1436    array_init(return_value);
1437
1438    switch (which) {
1439    case HTML_ENTITIES:
1440        for (j = 0; entity_map[j].charset != cs_terminator; j++) {
1441            if (entity_map[j].charset != charset)
1442                continue;
1443            for (i = 0; i <= entity_map[j].endchar - entity_map[j].basechar; i++) {
1444                char buffer[16];
1445                unsigned k;
1446                size_t written;
1447
1448                if (entity_map[j].table[i] == NULL)
1449                    continue;
1450
1451                k = i + entity_map[j].basechar;
1452
1453                switch (charset) {
1454                case cs_utf_8:
1455                    written = php_utf32_utf8(ind, k);
1456                    ind[written] = '\0';
1457                    break;
1458                case cs_big5:
1459                case cs_gb2312:
1460                case cs_big5hkscs:
1461                case cs_sjis:
1462                    /* we have no mappings for these, but if we had... */
1463                    /* break through */
1464                default: /* one byte */
1465                    written = 1;
1466                    ind[0] = (unsigned char)k;
1467                    ind[1] = '\0';
1468                    break;
1469                }
1470
1471                snprintf(buffer, sizeof(buffer), "&%s;", entity_map[j].table[i]);
1472                if (zend_hash_find(Z_ARRVAL_P(return_value), (const char*)ind, written+1, &dummy) == FAILURE) {
1473                    /* in case of the single quote, which is repeated, the first one wins,
1474                        * so don't replace the existint mapping */
1475                    add_assoc_string(return_value, (const char*)ind, buffer, 1);
1476                }
1477            }
1478        }
1479        /* break thru */
1480
1481    case HTML_SPECIALCHARS:
1482        add_assoc_stringl(return_value, "&", "&amp;", sizeof("&amp;") - 1, 1);
1483        for (j = 0; basic_entities[j].charcode != 0; j++) {
1484            if (basic_entities[j].flags && (quote_style & basic_entities[j].flags) == 0)
1485                continue;
1486
1487            ind[0] = (unsigned char)basic_entities[j].charcode;
1488            ind[1] = '\0';
1489            if (zend_hash_find(Z_ARRVAL_P(return_value), (const char*)ind, 2, &dummy) == FAILURE) {
1490                add_assoc_stringl(return_value, ind, basic_entities[j].entity,
1491                    basic_entities[j].entitylen, 1);
1492            }
1493        }
1494
1495        break;
1496    }
1497}
1498/* }}} */
1499
1500/*
1501 * Local variables:
1502 * tab-width: 4
1503 * c-basic-offset: 4
1504 * End:
1505 * vim600: sw=4 ts=4 fdm=marker
1506 * vim<600: sw=4 ts=4
1507 */
1508