1 /*
2 +----------------------------------------------------------------------+
3 | PHP Version 5 |
4 +----------------------------------------------------------------------+
5 | Copyright (c) 1997-2008 The PHP Group |
6 +----------------------------------------------------------------------+
7 | This source file is subject to version 3.01 of the PHP license, |
8 | that is bundled with this package in the file LICENSE, and is |
9 | available through the world-wide-web at the following url: |
10 | http://www.php.net/license/3_01.txt |
11 | If you did not receive a copy of the PHP license and are unable to |
12 | obtain it through the world-wide-web, please send a note to |
13 | license@php.net so we can mail you a copy immediately. |
14 +----------------------------------------------------------------------+
15 | Authors: Rasmus Lerdorf <rasmus@php.net> |
16 | Jaakko Hyvätti <jaakko.hyvatti@iki.fi> |
17 | Wez Furlong <wez@thebrainroom.com> |
18 +----------------------------------------------------------------------+
19 */
20
21 /* $Id: html.c,v 1.135 2008/08/18 03:26:06 moriyoshi Exp $ */
22
23 /*
24 * HTML entity resources:
25 *
26 * http://msdn.microsoft.com/workshop/author/dhtml/reference/charsets/charset2.asp
27 * http://msdn.microsoft.com/workshop/author/dhtml/reference/charsets/charset3.asp
28 * http://www.unicode.org/Public/MAPPINGS/OBSOLETE/UNI2SGML.TXT
29 *
30 * http://www.w3.org/TR/2002/REC-xhtml1-20020801/dtds.html#h-A2
31 *
32 * UNICODE NOTE:
 * Unicode support (namely, IS_UNICODE support) is implemented by converting
 * the IS_UNICODE strings to UTF-8 and handing them off to the existing
 * implementation. This avoids redoing all the code that encodes and decodes
 * entities to support UChar*, at the cost of a slight performance loss.
 * Whoever wants to do this properly, go ahead.
38 */
39
40 #include "php.h"
41 #if PHP_WIN32
42 #include "config.w32.h"
43 #else
44 #include <php_config.h>
45 #endif
46 #include "html.h"
47 #include "php_string.h"
48 #include "SAPI.h"
49 #if HAVE_LOCALE_H
50 #include <locale.h>
51 #endif
52 #if HAVE_LANGINFO_H
53 #include <langinfo.h>
54 #endif
55
56 #if HAVE_MBSTRING
57 # include "ext/mbstring/mbstring.h"
58 ZEND_EXTERN_MODULE_GLOBALS(mbstring)
59 #endif
60
/* Identifiers for the character sets this file has entity tables for.
 * cs_terminator (value 0) is the sentinel entry that ends entity_map[]. */
enum entity_charset {
	cs_terminator,
	cs_8859_1,
	cs_cp1252,
	cs_8859_15,
	cs_utf_8,
	cs_big5,
	cs_gb2312,
	cs_big5hkscs,
	cs_sjis,
	cs_eucjp,
	cs_koi8r,
	cs_cp1251,
	cs_8859_5,
	cs_cp866,
	cs_macroman
};

/* Element type of every entity table: an immutable pointer to an immutable
 * entity name (stored without the surrounding '&' and ';'), or NULL. */
typedef const char *const entity_table_t;
67
68 /* codepage 1252 is a Windows extension to iso-8859-1. */
69 static entity_table_t ent_cp_1252[] = {
70 "euro", NULL, "sbquo", "fnof", "bdquo", "hellip", "dagger",
71 "Dagger", "circ", "permil", "Scaron", "lsaquo", "OElig",
72 NULL, NULL, NULL, NULL, "lsquo", "rsquo", "ldquo", "rdquo",
73 "bull", "ndash", "mdash", "tilde", "trade", "scaron", "rsaquo",
74 "oelig", NULL, NULL, "Yuml"
75 };
76
77 static entity_table_t ent_iso_8859_1[] = {
78 "nbsp", "iexcl", "cent", "pound", "curren", "yen", "brvbar",
79 "sect", "uml", "copy", "ordf", "laquo", "not", "shy", "reg",
80 "macr", "deg", "plusmn", "sup2", "sup3", "acute", "micro",
81 "para", "middot", "cedil", "sup1", "ordm", "raquo", "frac14",
82 "frac12", "frac34", "iquest", "Agrave", "Aacute", "Acirc",
83 "Atilde", "Auml", "Aring", "AElig", "Ccedil", "Egrave",
84 "Eacute", "Ecirc", "Euml", "Igrave", "Iacute", "Icirc",
85 "Iuml", "ETH", "Ntilde", "Ograve", "Oacute", "Ocirc", "Otilde",
86 "Ouml", "times", "Oslash", "Ugrave", "Uacute", "Ucirc", "Uuml",
87 "Yacute", "THORN", "szlig", "agrave", "aacute", "acirc",
88 "atilde", "auml", "aring", "aelig", "ccedil", "egrave",
89 "eacute", "ecirc", "euml", "igrave", "iacute", "icirc",
90 "iuml", "eth", "ntilde", "ograve", "oacute", "ocirc", "otilde",
91 "ouml", "divide", "oslash", "ugrave", "uacute", "ucirc",
92 "uuml", "yacute", "thorn", "yuml"
93 };
94
95 static entity_table_t ent_iso_8859_15[] = {
96 "nbsp", "iexcl", "cent", "pound", "euro", "yen", "Scaron",
97 "sect", "scaron", "copy", "ordf", "laquo", "not", "shy", "reg",
98 "macr", "deg", "plusmn", "sup2", "sup3", NULL, /* Zcaron */
99 "micro", "para", "middot", NULL, /* zcaron */ "sup1", "ordm",
100 "raquo", "OElig", "oelig", "Yuml", "iquest", "Agrave", "Aacute",
101 "Acirc", "Atilde", "Auml", "Aring", "AElig", "Ccedil", "Egrave",
102 "Eacute", "Ecirc", "Euml", "Igrave", "Iacute", "Icirc",
103 "Iuml", "ETH", "Ntilde", "Ograve", "Oacute", "Ocirc", "Otilde",
104 "Ouml", "times", "Oslash", "Ugrave", "Uacute", "Ucirc", "Uuml",
105 "Yacute", "THORN", "szlig", "agrave", "aacute", "acirc",
106 "atilde", "auml", "aring", "aelig", "ccedil", "egrave",
107 "eacute", "ecirc", "euml", "igrave", "iacute", "icirc",
108 "iuml", "eth", "ntilde", "ograve", "oacute", "ocirc", "otilde",
109 "ouml", "divide", "oslash", "ugrave", "uacute", "ucirc",
110 "uuml", "yacute", "thorn", "yuml"
111 };
112
113 static entity_table_t ent_uni_338_402[] = {
114 /* 338 (0x0152) */
115 "OElig", "oelig", NULL, NULL, NULL, NULL,
116 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
117 /* 352 (0x0160) */
118 "Scaron", "scaron", NULL, NULL, NULL, NULL, NULL, NULL,
119 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
120 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
121 /* 376 (0x0178) */
122 "Yuml", NULL, NULL, NULL, NULL, NULL, NULL, NULL,
123 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
124 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
125 /* 400 (0x0190) */
126 NULL, NULL, "fnof"
127 };
128
129 static entity_table_t ent_uni_spacing[] = {
130 /* 710 */
131 "circ",
132 /* 711 - 730 */
133 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
134 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
135 /* 731 - 732 */
136 NULL, "tilde"
137 };
138
139 static entity_table_t ent_uni_greek[] = {
140 /* 913 */
141 "Alpha", "Beta", "Gamma", "Delta", "Epsilon", "Zeta", "Eta", "Theta",
142 "Iota", "Kappa", "Lambda", "Mu", "Nu", "Xi", "Omicron", "Pi", "Rho",
143 NULL, "Sigma", "Tau", "Upsilon", "Phi", "Chi", "Psi", "Omega",
144 /* 938 - 944 are not mapped */
145 NULL, NULL, NULL, NULL, NULL, NULL, NULL,
146 "alpha", "beta", "gamma", "delta", "epsilon", "zeta", "eta", "theta",
147 "iota", "kappa", "lambda", "mu", "nu", "xi", "omicron", "pi", "rho",
148 "sigmaf", "sigma", "tau", "upsilon", "phi", "chi", "psi", "omega",
149 /* 970 - 976 are not mapped */
150 NULL, NULL, NULL, NULL, NULL, NULL, NULL,
151 "thetasym", "upsih",
152 NULL, NULL, NULL,
153 "piv"
154 };
155
156 static entity_table_t ent_uni_punct[] = {
157 /* 8194 */
158 "ensp", "emsp", NULL, NULL, NULL, NULL, NULL,
159 "thinsp", NULL, NULL, "zwnj", "zwj", "lrm", "rlm",
160 NULL, NULL, NULL, "ndash", "mdash", NULL, NULL, NULL,
161 /* 8216 */
162 "lsquo", "rsquo", "sbquo", NULL, "ldquo", "rdquo", "bdquo", NULL,
163 "dagger", "Dagger", "bull", NULL, NULL, NULL, "hellip",
164 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, "permil", NULL,
165 /* 8242 */
166 "prime", "Prime", NULL, NULL, NULL, NULL, NULL, "lsaquo", "rsaquo", NULL,
167 NULL, NULL, "oline", NULL, NULL, NULL, NULL, NULL,
168 "frasl"
169 };
170
171 static entity_table_t ent_uni_euro[] = {
172 "euro"
173 };
174
175 static entity_table_t ent_uni_8465_8501[] = {
176 /* 8465 */
177 "image", NULL, NULL, NULL, NULL, NULL, NULL,
178 /* 8472 */
179 "weierp", NULL, NULL, NULL,
180 /* 8476 */
181 "real", NULL, NULL, NULL, NULL, NULL,
182 /* 8482 */
183 "trade", NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
184 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
185 /* 8501 */
186 "alefsym",
187 };
188
/* Entity names for code points 8592 (0x2190) through 9002 (0x232a);
 * index = code point - 8592, NULL = no entity name in this table.
 * Each comment below gives the code point of the first slot that follows it;
 * slots are written eight per line. */
static entity_table_t ent_uni_8592_9002[] = {
	/* 8592 (0x2190) */
	"larr", "uarr", "rarr", "darr", "harr", NULL, NULL, NULL,
	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
	/* 8608 (0x21a0) */
	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
	/* 8624 (0x21b0) */
	NULL, NULL, NULL, NULL, NULL, "crarr", NULL, NULL,
	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
	/* 8640 (0x21c0) */
	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
	/* 8656 (0x21d0) */
	"lArr", "uArr", "rArr", "dArr", "hArr", "vArr", NULL, NULL,
	NULL, NULL, "lAarr", "rAarr", NULL, "rarrw", NULL, NULL,
	/* 8672 (0x21e0) */
	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
	/* 8704 (0x2200) */
	"forall", "comp", "part", "exist", "nexist", "empty", NULL, "nabla",
	"isin", "notin", "epsis", "ni", "notni", "bepsi", NULL, "prod",
	/* 8720 (0x2210) */
	"coprod", "sum", "minus", "mnplus", "plusdo", NULL, "setmn", "lowast",
	"compfn", NULL, "radic", NULL, NULL, "prop", "infin", "ang90",
	/* 8736 (0x2220) */
	"ang", "angmsd", "angsph", "mid", "nmid", "par", "npar", "and",
	"or", "cap", "cup", "int", NULL, NULL, "conint", NULL,
	/* 8752 (0x2230) */
	NULL, NULL, NULL, NULL, "there4", "becaus", NULL, NULL,
	NULL, NULL, NULL, NULL, "sim", "bsim", NULL, NULL,
	/* 8768 (0x2240) */
	"wreath", "nsim", NULL, "sime", "nsime", "cong", NULL, "ncong",
	"asymp", "nap", "ape", NULL, "bcong", "asymp", "bump", "bumpe",
	/* 8784 (0x2250) */
	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
	/* 8800 (0x2260) */
	"ne", "equiv", NULL, NULL, "le", "ge", "lE", "gE",
	"lnE", "gnE", "Lt", "Gt", "twixt", NULL, "nlt", "ngt",
	/* 8816 (0x2270) */
	"nles", "nges", "lsim", "gsim", NULL, NULL, "lg", "gl",
	NULL, NULL, "pr", "sc", "cupre", "sscue", "prsim", "scsim",
	/* 8832 (0x2280) */
	"npr", "nsc", "sub", "sup", "nsub", "nsup", "sube", "supe",
	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
	/* 8848 (0x2290) */
	NULL, NULL, NULL, NULL, NULL, "oplus", NULL, "otimes",
	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
	/* 8864 (0x22a0) */
	NULL, NULL, NULL, NULL, NULL, "perp", NULL, NULL,
	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
	/* 8880 (0x22b0) */
	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
	/* 8896 (0x22c0) */
	NULL, NULL, NULL, NULL, NULL, "sdot", NULL, NULL,
	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
	/* 8912 (0x22d0) */
	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
	/* 8928 (0x22e0) */
	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
	/* 8944 (0x22f0) */
	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
	/* 8960 (0x2300) */
	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
	"lceil", "rceil", "lfloor", "rfloor", NULL, NULL, NULL, NULL,
	/* 8976 (0x2310) */
	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
	/* 8992 (0x2320) */
	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
	NULL, "lang", "rang"
};
268
269 static entity_table_t ent_uni_9674[] = {
270 /* 9674 */
271 "loz"
272 };
273
274 static entity_table_t ent_uni_9824_9830[] = {
275 /* 9824 */
276 "spades", NULL, NULL, "clubs", NULL, "hearts", "diams"
277 };
278
279 static entity_table_t ent_koi8r[] = {
280 "#1105", /* "jo "*/
281 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
282 NULL, NULL, NULL, NULL, NULL, "#1025", /* "JO" */
283 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
284 "#1102", "#1072", "#1073", "#1094", "#1076", "#1077", "#1092",
285 "#1075", "#1093", "#1080", "#1081", "#1082", "#1083", "#1084",
286 "#1085", "#1086", "#1087", "#1103", "#1088", "#1089", "#1090",
287 "#1091", "#1078", "#1074", "#1100", "#1099", "#1079", "#1096",
288 "#1101", "#1097", "#1095", "#1098", "#1070", "#1040", "#1041",
289 "#1062", "#1044", "#1045", "#1060", "#1043", "#1061", "#1048",
290 "#1049", "#1050", "#1051", "#1052", "#1053", "#1054", "#1055",
291 "#1071", "#1056", "#1057", "#1058", "#1059", "#1046", "#1042",
292 "#1068", "#1067", "#1047", "#1064", "#1069", "#1065", "#1063",
293 "#1066"
294 };
295
296 static entity_table_t ent_cp_1251[] = {
297 "#1026", "#1027", "#8218", "#1107", "#8222", "hellip", "dagger",
298 "Dagger", "euro", "permil", "#1033", "#8249", "#1034", "#1036",
299 "#1035", "#1039", "#1106", "#8216", "#8217", "#8219", "#8220",
300 "bull", "ndash", "mdash", NULL, "trade", "#1113", "#8250",
301 "#1114", "#1116", "#1115", "#1119", "nbsp", "#1038", "#1118",
302 "#1032", "curren", "#1168", "brvbar", "sect", "#1025", "copy",
303 "#1028", "laquo", "not", "shy", "reg", "#1031", "deg", "plusmn",
304 "#1030", "#1110", "#1169", "micro", "para", "middot", "#1105",
305 "#8470", "#1108", "raquo", "#1112", "#1029", "#1109", "#1111",
306 "#1040", "#1041", "#1042", "#1043", "#1044", "#1045", "#1046",
307 "#1047", "#1048", "#1049", "#1050", "#1051", "#1052", "#1053",
308 "#1054", "#1055", "#1056", "#1057", "#1058", "#1059", "#1060",
309 "#1061", "#1062", "#1063", "#1064", "#1065", "#1066", "#1067",
310 "#1068", "#1069", "#1070", "#1071", "#1072", "#1073", "#1074",
311 "#1075", "#1076", "#1077", "#1078", "#1079", "#1080", "#1081",
312 "#1082", "#1083", "#1084", "#1085", "#1086", "#1087", "#1088",
313 "#1089", "#1090", "#1091", "#1092", "#1093", "#1094", "#1095",
314 "#1096", "#1097", "#1098", "#1099", "#1100", "#1101", "#1102",
315 "#1103"
316 };
317
318 static entity_table_t ent_iso_8859_5[] = {
319 "#1056", "#1057", "#1058", "#1059", "#1060", "#1061", "#1062",
320 "#1063", "#1064", "#1065", "#1066", "#1067", "#1068", "#1069",
321 "#1070", "#1071", "#1072", "#1073", "#1074", "#1075", "#1076",
322 "#1077", "#1078", "#1079", "#1080", "#1081", "#1082", "#1083",
323 "#1084", "#1085", "#1086", "#1087", "#1088", "#1089", "#1090",
324 "#1091", "#1092", "#1093", "#1094", "#1095", "#1096", "#1097",
325 "#1098", "#1099", "#1100", "#1101", "#1102", "#1103", "#1104",
326 "#1105", "#1106", "#1107", "#1108", "#1109", "#1110", "#1111",
327 "#1112", "#1113", "#1114", "#1115", "#1116", "#1117", "#1118",
328 "#1119"
329 };
330
331 static entity_table_t ent_cp_866[] = {
332
333 "#9492", "#9524", "#9516", "#9500", "#9472", "#9532", "#9566",
334 "#9567", "#9562", "#9556", "#9577", "#9574", "#9568", "#9552",
335 "#9580", "#9575", "#9576", "#9572", "#9573", "#9561", "#9560",
336 "#9554", "#9555", "#9579", "#9578", "#9496", "#9484", "#9608",
337 "#9604", "#9612", "#9616", "#9600", "#1088", "#1089", "#1090",
338 "#1091", "#1092", "#1093", "#1094", "#1095", "#1096", "#1097",
339 "#1098", "#1099", "#1100", "#1101", "#1102", "#1103", "#1025",
340 "#1105", "#1028", "#1108", "#1031", "#1111", "#1038", "#1118",
341 "#176", "#8729", "#183", "#8730", "#8470", "#164", "#9632",
342 "#160"
343 };
344
345 /* MacRoman has a couple of low-ascii chars that need mapping too */
346 /* Vertical tab (ASCII 11) is often used to store line breaks inside */
347 /* DB exports, this mapping changes it to a space */
348 static entity_table_t ent_macroman[] = {
349 "sp", NULL, NULL, NULL,
350 NULL, NULL, NULL, NULL, NULL, NULL, NULL,
351 NULL, NULL, NULL, NULL, NULL, NULL, NULL,
352 NULL, NULL, NULL, NULL, NULL, "quot", NULL,
353 NULL, NULL, "amp", NULL, NULL, NULL, NULL,
354 NULL, NULL, NULL, NULL, NULL, NULL, NULL,
355 NULL, NULL, NULL, NULL, NULL, NULL, NULL,
356 NULL, NULL, NULL, "lt", NULL, "gt", NULL,
357 NULL, NULL, NULL, NULL, NULL, NULL, NULL,
358 NULL, NULL, NULL, NULL, NULL, NULL, NULL,
359 NULL, NULL, NULL, NULL, NULL, NULL, NULL,
360 NULL, NULL, NULL, NULL, NULL, NULL, NULL,
361 NULL, NULL, NULL, NULL, NULL, NULL, NULL,
362 NULL, NULL, NULL, NULL, NULL, NULL, NULL,
363 NULL, NULL, NULL, NULL, NULL, NULL, NULL,
364 NULL, NULL, NULL, NULL, NULL, NULL, NULL,
365 NULL, NULL, NULL, NULL, NULL, NULL, NULL,
366 NULL, "Auml", "Aring", "Ccedil", "Eacute", "Ntilde", "Ouml",
367 "Uuml", "aacute", "agrave", "acirc", "auml", "atilde", "aring",
368 "ccedil", "eacute", "egrave", "ecirc", "euml", "iacute", "igrave",
369 "icirc", "iuml", "ntilde", "oacute", "ograve", "ocirc", "ouml",
370 "otilde", "uacute", "ugrave", "ucirc", "uuml", "dagger", "deg",
371 "cent", "pound", "sect", "bull", "para", "szlig", "reg",
372 "copy", "trade", "acute", "uml", "ne", "AElig", "Oslash",
373 "infin", "plusmn", "le", "ge", "yen", "micro", "part",
374 "sum", "prod", "pi", "int", "ordf", "ordm", "Omega",
375 "aelig", "oslash", "iquest", "iexcl", "not", "radic", "fnof",
376 "asymp", "#8710", "laquo", "raquo", "hellip", "nbsp", "Agrave",
377 "Atilde", "Otilde", "OElig", "oelig", "ndash", "mdash", "ldquo",
378 "rdquo", "lsquo", "rsquo", "divide", "loz", "yuml", "Yuml",
379 "frasl", "euro", "lsaquo", "rsaquo", "#xFB01", "#xFB02", "Dagger",
380 "middot", "sbquo", "bdquo", "permil", "Acirc", "Ecirc", "Aacute",
381 "Euml", "Egrave", "Iacute", "Icirc", "Iuml", "Igrave", "Oacute",
382 "Ocirc", "#xF8FF", "Ograve", "Uacute", "Ucirc", "Ugrave", "#305",
383 "circ", "tilde", "macr", "#728", "#729", "#730", "cedil",
384 "#733", "#731", "#711"
385 };
386
387 struct html_entity_map {
388 enum entity_charset charset; /* charset identifier */
389 unsigned short basechar; /* char code at start of table */
390 unsigned short endchar; /* last char code in the table */
391 entity_table_t *table; /* the table of mappings */
392 };
393
394 static const struct html_entity_map entity_map[] = {
395 { cs_cp1252, 0x80, 0x9f, ent_cp_1252 },
396 { cs_cp1252, 0xa0, 0xff, ent_iso_8859_1 },
397 { cs_8859_1, 0xa0, 0xff, ent_iso_8859_1 },
398 { cs_8859_15, 0xa0, 0xff, ent_iso_8859_15 },
399 { cs_utf_8, 0xa0, 0xff, ent_iso_8859_1 },
400 { cs_utf_8, 338, 402, ent_uni_338_402 },
401 { cs_utf_8, 710, 732, ent_uni_spacing },
402 { cs_utf_8, 913, 982, ent_uni_greek },
403 { cs_utf_8, 8194, 8260, ent_uni_punct },
404 { cs_utf_8, 8364, 8364, ent_uni_euro },
405 { cs_utf_8, 8465, 8501, ent_uni_8465_8501 },
406 { cs_utf_8, 8592, 9002, ent_uni_8592_9002 },
407 { cs_utf_8, 9674, 9674, ent_uni_9674 },
408 { cs_utf_8, 9824, 9830, ent_uni_9824_9830 },
409 { cs_big5, 0xa0, 0xff, ent_iso_8859_1 },
410 { cs_gb2312, 0xa0, 0xff, ent_iso_8859_1 },
411 { cs_big5hkscs, 0xa0, 0xff, ent_iso_8859_1 },
412 { cs_sjis, 0xa0, 0xff, ent_iso_8859_1 },
413 { cs_eucjp, 0xa0, 0xff, ent_iso_8859_1 },
414 { cs_koi8r, 0xa3, 0xff, ent_koi8r },
415 { cs_cp1251, 0x80, 0xff, ent_cp_1251 },
416 { cs_8859_5, 0xc0, 0xff, ent_iso_8859_5 },
417 { cs_cp866, 0xc0, 0xff, ent_cp_866 },
418 { cs_macroman, 0x0b, 0xff, ent_macroman },
419 { cs_terminator }
420 };
421
422 static const struct {
423 const char *codeset;
424 enum entity_charset charset;
425 } charset_map[] = {
426 { "ISO-8859-1", cs_8859_1 },
427 { "ISO8859-1", cs_8859_1 },
428 { "ISO-8859-15", cs_8859_15 },
429 { "ISO8859-15", cs_8859_15 },
430 { "utf-8", cs_utf_8 },
431 { "cp1252", cs_cp1252 },
432 { "Windows-1252", cs_cp1252 },
433 { "1252", cs_cp1252 },
434 { "BIG5", cs_big5 },
435 { "950", cs_big5 },
436 { "GB2312", cs_gb2312 },
437 { "936", cs_gb2312 },
438 { "BIG5-HKSCS", cs_big5hkscs },
439 { "Shift_JIS", cs_sjis },
440 { "SJIS", cs_sjis },
441 { "932", cs_sjis },
442 { "EUCJP", cs_eucjp },
443 { "EUC-JP", cs_eucjp },
444 { "KOI8-R", cs_koi8r },
445 { "koi8-ru", cs_koi8r },
446 { "koi8r", cs_koi8r },
447 { "cp1251", cs_cp1251 },
448 { "Windows-1251", cs_cp1251 },
449 { "win-1251", cs_cp1251 },
450 { "iso8859-5", cs_8859_5 },
451 { "iso-8859-5", cs_8859_5 },
452 { "cp866", cs_cp866 },
453 { "866", cs_cp866 },
454 { "ibm866", cs_cp866 },
455 { "MacRoman", cs_macroman },
456 { NULL }
457 };
458
459 static const struct {
460 unsigned short charcode;
461 char *entity;
462 int entitylen;
463 int flags;
464 } basic_entities[] = {
465 { '"', """, 6, ENT_HTML_QUOTE_DOUBLE },
466 { '\'', "'", 6, ENT_HTML_QUOTE_SINGLE },
467 { '\'', "'", 5, ENT_HTML_QUOTE_SINGLE },
468 { '<', "<", 4, 0 },
469 { '>', ">", 4, 0 },
470 { 0, NULL, 0, 0 }
471 };
472
/* A character code together with an entity string stored inline. */
struct basic_entities_dec {
	/* the character the entity stands for */
	unsigned short charcode;
	/* entity text; fixed 8-byte buffer */
	char entity[8];
	/* length of the text held in entity */
	int entitylen;
};
478
/*
 * Helper macros for get_next_char().  They are not self-contained: they use
 * that function's locals and parameters by name (pos, newpos, mbseq, mbpos,
 * mbspace, mbseqlen, this_char, str_len, status) and contain `return`
 * statements that leave the enclosing function.
 *
 * Each one expands to a single do { ... } while (0) statement, so it must be
 * followed by a semicolon and is safe inside an unbraced if/else.
 */

/* Publish the new position and the collected byte sequence (NUL-terminated),
 * then return the decoded character. */
#define MB_RETURN do { \
		*newpos = pos; \
		mbseq[mbpos] = '\0'; \
		*mbseqlen = mbpos; \
		return this_char; \
	} while (0)

/* Append one byte to mbseq.  One slot is always kept free for the NUL that
 * MB_RETURN writes: when the remaining space drops to zero the byte is NOT
 * stored and the function returns right away via MB_RETURN. */
#define MB_WRITE(mbchar) do { \
		mbspace--; \
		if (mbspace == 0) { \
			MB_RETURN; \
		} \
		mbseq[mbpos++] = (mbchar); \
	} while (0)

/* Require at least chars_need bytes from offset pos to the end of the input;
 * otherwise set *status to FAILURE and return 0 from the enclosing function
 * (without touching *newpos or *mbseqlen). */
#define CHECK_LEN(pos, chars_need) do { \
		if ((str_len - (pos)) < (chars_need)) { \
			*status = FAILURE; \
			return 0; \
		} \
	} while (0)
497
498 /* {{{ get_next_char
499 */
500 inline static unsigned short get_next_char(enum entity_charset charset,
501 unsigned char * str,
502 int str_len,
503 int * newpos,
504 unsigned char * mbseq,
505 int * mbseqlen,
506 int *status)
507 {
508 int pos = *newpos;
509 int mbpos = 0;
510 int mbspace = *mbseqlen;
511 unsigned short this_char = str[pos++];
512 unsigned char next_char;
513
514 *status = SUCCESS;
515
516 if (mbspace <= 0) {
517 *mbseqlen = 0;
518 return this_char;
519 }
520
521 MB_WRITE((unsigned char)this_char);
522
523 switch (charset) {
524 case cs_utf_8:
525 {
526 unsigned long utf = 0;
527 int stat = 0;
528 int more = 1;
529
530 /* unpack utf-8 encoding into a wide char.
531 * Code stolen from the mbstring extension */
532
533 do {
534 if (this_char < 0x80) {
535 more = 0;
536 break;
537 } else if (this_char < 0xc0) {
538 switch (stat) {
539 case 0x10: /* 2, 2nd */
540 case 0x21: /* 3, 3rd */
541 case 0x32: /* 4, 4th */
542 case 0x43: /* 5, 5th */
543 case 0x54: /* 6, 6th */
544 /* last byte in sequence */
545 more = 0;
546 utf |= (this_char & 0x3f);
547 this_char = (unsigned short)utf;
548 break;
549 case 0x20: /* 3, 2nd */
550 case 0x31: /* 4, 3rd */
551 case 0x42: /* 5, 4th */
552 case 0x53: /* 6, 5th */
553 /* penultimate char */
554 utf |= ((this_char & 0x3f) << 6);
555 stat++;
556 break;
557 case 0x30: /* 4, 2nd */
558 case 0x41: /* 5, 3rd */
559 case 0x52: /* 6, 4th */
560 utf |= ((this_char & 0x3f) << 12);
561 stat++;
562 break;
563 case 0x40: /* 5, 2nd */
564 case 0x51:
565 utf |= ((this_char & 0x3f) << 18);
566 stat++;
567 break;
568 case 0x50: /* 6, 2nd */
569 utf |= ((this_char & 0x3f) << 24);
570 stat++;
571 break;
572 default:
573 /* invalid */
574 *status = FAILURE;
575 more = 0;
576 }
577 }
578 /* lead byte */
579 else if (this_char < 0xe0) {
580 stat = 0x10; /* 2 byte */
581 utf = (this_char & 0x1f) << 6;
582 CHECK_LEN(pos, 1);
583 } else if (this_char < 0xf0) {
584 stat = 0x20; /* 3 byte */
585 utf = (this_char & 0xf) << 12;
586 CHECK_LEN(pos, 2);
587 } else if (this_char < 0xf8) {
588 stat = 0x30; /* 4 byte */
589 utf = (this_char & 0x7) << 18;
590 CHECK_LEN(pos, 3);
591 } else if (this_char < 0xfc) {
592 stat = 0x40; /* 5 byte */
593 utf = (this_char & 0x3) << 24;
594 CHECK_LEN(pos, 4);
595 } else if (this_char < 0xfe) {
596 stat = 0x50; /* 6 byte */
597 utf = (this_char & 0x1) << 30;
598 CHECK_LEN(pos, 5);
599 } else {
600 /* invalid; bail */
601 more = 0;
602 *status = FAILURE;
603 break;
604 }
605
606 if (more) {
607 this_char = str[pos++];
608 MB_WRITE((unsigned char)this_char);
609 }
610 } while (more);
611 }
612 break;
613 case cs_big5:
614 case cs_gb2312:
615 case cs_big5hkscs:
616 {
617 /* check if this is the first of a 2-byte sequence */
618 if (this_char >= 0xa1 && this_char <= 0xfe) {
619 /* peek at the next char */
620 CHECK_LEN(pos, 1);
621 next_char = str[pos];
622 if ((next_char >= 0x40 && next_char <= 0x7e) ||
623 (next_char >= 0xa1 && next_char <= 0xfe)) {
624 /* yes, this a wide char */
625 this_char <<= 8;
626 MB_WRITE(next_char);
627 this_char |= next_char;
628 pos++;
629 }
630
631 }
632 break;
633 }
634 case cs_sjis:
635 {
636 /* check if this is the first of a 2-byte sequence */
637 if ( (this_char >= 0x81 && this_char <= 0x9f) ||
638 (this_char >= 0xe0 && this_char <= 0xef)
639 ) {
640 /* peek at the next char */
641 CHECK_LEN(pos, 1);
642 next_char = str[pos];
643 if ((next_char >= 0x40 && next_char <= 0x7e) ||
644 (next_char >= 0x80 && next_char <= 0xfc))
645 {
646 /* yes, this a wide char */
647 this_char <<= 8;
648 MB_WRITE(next_char);
649 this_char |= next_char;
650 pos++;
651 }
652
653 }
654 break;
655 }
656 case cs_eucjp:
657 {
658 /* check if this is the first of a multi-byte sequence */
659 if (this_char >= 0xa1 && this_char <= 0xfe) {
660 /* peek at the next char */
661 CHECK_LEN(pos, 1);
662 next_char = str[pos];
663 if (next_char >= 0xa1 && next_char <= 0xfe) {
664 /* yes, this a jis kanji char */
665 this_char <<= 8;
666 MB_WRITE(next_char);
667 this_char |= next_char;
668 pos++;
669 }
670
671 } else if (this_char == 0x8e) {
672 /* peek at the next char */
673 CHECK_LEN(pos, 1);
674 next_char = str[pos];
675 if (next_char >= 0xa1 && next_char <= 0xdf) {
676 /* JIS X 0201 kana */
677 this_char <<= 8;
678 MB_WRITE(next_char);
679 this_char |= next_char;
680 pos++;
681 }
682
683 } else if (this_char == 0x8f) {
684 /* peek at the next two char */
685 unsigned char next2_char;
686 CHECK_LEN(pos, 2);
687 next_char = str[pos];
688 next2_char = str[pos+1];
689 if ((next_char >= 0xa1 && next_char <= 0xfe) &&
690 (next2_char >= 0xa1 && next2_char <= 0xfe)) {
691 /* JIS X 0212 hojo-kanji */
692 this_char <<= 8;
693 MB_WRITE(next_char);
694 this_char |= next_char;
695 pos++;
696 this_char <<= 8;
697 MB_WRITE(next2_char);
698 this_char |= next2_char;
699 pos++;
700 }
701
702 }
703 break;
704 }
705 default:
706 break;
707 }
708 MB_RETURN;
709 }
710 /* }}} */
711
712 /* {{{ entity_charset determine_charset
713 * returns the charset identifier based on current locale or a hint.
714 * defaults to iso-8859-1 */
715 static enum entity_charset determine_charset(char *charset_hint TSRMLS_DC)
716 {
717 int i;
718 enum entity_charset charset = cs_8859_1;
719 int len = 0;
720 zval *uf_result = NULL;
721
722 /* Guarantee default behaviour for backwards compatibility */
723 if (charset_hint == NULL)
724 return cs_8859_1;
725
726 if ((len = strlen(charset_hint)) != 0) {
727 goto det_charset;
728 }
729 #if HAVE_MBSTRING
730 #if !defined(COMPILE_DL_MBSTRING)
731 /* XXX: Ugly things. Why don't we look for a more sophisticated way? */
732 switch (MBSTRG(current_internal_encoding)) {
733 case mbfl_no_encoding_8859_1:
734 return cs_8859_1;
735
736 case mbfl_no_encoding_utf8:
737 return cs_utf_8;
738
739 case mbfl_no_encoding_euc_jp:
740 case mbfl_no_encoding_eucjp_win:
741 return cs_eucjp;
742
743 case mbfl_no_encoding_sjis:
744 case mbfl_no_encoding_sjis_win:
745 case mbfl_no_encoding_sjis_mac:
746 return cs_sjis;
747
748 case mbfl_no_encoding_cp1252:
749 return cs_cp1252;
750
751 case mbfl_no_encoding_8859_15:
752 return cs_8859_15;
753
754 case mbfl_no_encoding_big5:
755 return cs_big5;
756
757 case mbfl_no_encoding_euc_cn:
758 case mbfl_no_encoding_hz:
759 case mbfl_no_encoding_cp936:
760 return cs_gb2312;
761
762 case mbfl_no_encoding_koi8r:
763 return cs_koi8r;
764
765 case mbfl_no_encoding_cp866:
766 return cs_cp866;
767
768 case mbfl_no_encoding_cp1251:
769 return cs_cp1251;
770
771 case mbfl_no_encoding_8859_5:
772 return cs_8859_5;
773
774 default:
775 ;
776 }
777 #else
778 {
779 zval nm_mb_internal_encoding;
780
781 ZVAL_STRING(&nm_mb_internal_encoding, "mb_internal_encoding", 0);
782
783 if (call_user_function_ex(CG(function_table), NULL, &nm_mb_internal_encoding, &uf_result, 0, NULL, 1, NULL TSRMLS_CC) != FAILURE) {
784
785 charset_hint = Z_STRVAL_P(uf_result);
786 len = Z_STRLEN_P(uf_result);
787
788 if (len == 4) { /* sizeof(none|auto|pass)-1 */
789 if (!memcmp("pass", charset_hint, sizeof("pass") - 1) ||
790 !memcmp("auto", charset_hint, sizeof("auto") - 1) ||
791 !memcmp("none", charset_hint, sizeof("none") - 1)) {
792
793 charset_hint = NULL;
794 len = 0;
795 }
796 }
797 goto det_charset;
798 }
799 }
800 #endif
801 #endif
802
803 charset_hint = SG(default_charset);
804 if (charset_hint != NULL && (len=strlen(charset_hint)) != 0) {
805 goto det_charset;
806 }
807
808 /* try to detect the charset for the locale */
809 #if HAVE_NL_LANGINFO && HAVE_LOCALE_H && defined(CODESET)
810 charset_hint = nl_langinfo(CODESET);
811 if (charset_hint != NULL && (len=strlen(charset_hint)) != 0) {
812 goto det_charset;
813 }
814 #endif
815
816 #if HAVE_LOCALE_H
817 /* try to figure out the charset from the locale */
818 {
819 char *localename;
820 char *dot, *at;
821
822 /* lang[_territory][.codeset][@modifier] */
823 localename = setlocale(LC_CTYPE, NULL);
824
825 dot = strchr(localename, '.');