PHP Cross Reference
Main Menu
/repository/ php-src/ ext/ standard/ html.c
CVS Log
CVS Blame

changes to
this file in
the last:
day
week
month
  1 /*
  2    +----------------------------------------------------------------------+
  3    | PHP Version 5                                                        |
  4    +----------------------------------------------------------------------+
  5    | Copyright (c) 1997-2008 The PHP Group                                |
  6    +----------------------------------------------------------------------+
  7    | This source file is subject to version 3.01 of the PHP license,      |
  8    | that is bundled with this package in the file LICENSE, and is        |
  9    | available through the world-wide-web at the following url:           |
 10    | http://www.php.net/license/3_01.txt                                  |
 11    | If you did not receive a copy of the PHP license and are unable to   |
 12    | obtain it through the world-wide-web, please send a note to          |
 13    | license@php.net so we can mail you a copy immediately.               |
 14    +----------------------------------------------------------------------+
 15    | Authors: Rasmus Lerdorf <rasmus@php.net>                             |
 16    |          Jaakko Hyvätti <jaakko.hyvatti@iki.fi>                      |
 17    |          Wez Furlong <wez@thebrainroom.com>                          |
 18    +----------------------------------------------------------------------+
 19 */
 20 
 21 /* $Id: html.c,v 1.135 2008/08/18 03:26:06 moriyoshi Exp $ */
 22 
 23 /*
 24  * HTML entity resources:
 25  *
 26  * http://msdn.microsoft.com/workshop/author/dhtml/reference/charsets/charset2.asp
 27  * http://msdn.microsoft.com/workshop/author/dhtml/reference/charsets/charset3.asp
 28  * http://www.unicode.org/Public/MAPPINGS/OBSOLETE/UNI2SGML.TXT
 29  *
 30  * http://www.w3.org/TR/2002/REC-xhtml1-20020801/dtds.html#h-A2
 31  * 
 32  * UNICODE NOTE:
 33  *  The way Unicode support is implemented (namely, IS_UNICODE support) is by
 34  *  converting the IS_UNICODE strings to UTF-8 and handing them off to existing
 35  *  implementation. This saves on redoing all the code that encodes and decodes
 36  *  entities to support UChar*, but it does result in slight performance loss.
 37  *  Whoever wants to do this properly, go ahead.
 38  */
 39 
 40 #include "php.h"
 41 #if PHP_WIN32
 42 #include "config.w32.h"
 43 #else
 44 #include <php_config.h>
 45 #endif
 46 #include "html.h"
 47 #include "php_string.h"
 48 #include "SAPI.h"
 49 #if HAVE_LOCALE_H
 50 #include <locale.h>
 51 #endif
 52 #if HAVE_LANGINFO_H
 53 #include <langinfo.h>
 54 #endif
 55 
 56 #if HAVE_MBSTRING
 57 # include "ext/mbstring/mbstring.h"
 58 ZEND_EXTERN_MODULE_GLOBALS(mbstring)
 59 #endif
 60 
 61 enum entity_charset { cs_terminator, cs_8859_1, cs_cp1252,
 62                                           cs_8859_15, cs_utf_8, cs_big5, cs_gb2312, 
 63                                           cs_big5hkscs, cs_sjis, cs_eucjp, cs_koi8r,
 64                                           cs_cp1251, cs_8859_5, cs_cp866, cs_macroman
 65                                         };
 66 typedef const char *const entity_table_t;
 67 
 68 /* codepage 1252 is a Windows extension to iso-8859-1. */
 69 static entity_table_t ent_cp_1252[] = {
 70         "euro", NULL, "sbquo", "fnof", "bdquo", "hellip", "dagger",
 71         "Dagger", "circ", "permil", "Scaron", "lsaquo", "OElig",
 72         NULL, NULL, NULL, NULL, "lsquo", "rsquo", "ldquo", "rdquo",
 73         "bull", "ndash", "mdash", "tilde", "trade", "scaron", "rsaquo",
 74         "oelig", NULL, NULL, "Yuml" 
 75 };
 76 
 77 static entity_table_t ent_iso_8859_1[] = {
 78         "nbsp", "iexcl", "cent", "pound", "curren", "yen", "brvbar",
 79         "sect", "uml", "copy", "ordf", "laquo", "not", "shy", "reg",
 80         "macr", "deg", "plusmn", "sup2", "sup3", "acute", "micro",
 81         "para", "middot", "cedil", "sup1", "ordm", "raquo", "frac14",
 82         "frac12", "frac34", "iquest", "Agrave", "Aacute", "Acirc",
 83         "Atilde", "Auml", "Aring", "AElig", "Ccedil", "Egrave",
 84         "Eacute", "Ecirc", "Euml", "Igrave", "Iacute", "Icirc",
 85         "Iuml", "ETH", "Ntilde", "Ograve", "Oacute", "Ocirc", "Otilde",
 86         "Ouml", "times", "Oslash", "Ugrave", "Uacute", "Ucirc", "Uuml",
 87         "Yacute", "THORN", "szlig", "agrave", "aacute", "acirc",
 88         "atilde", "auml", "aring", "aelig", "ccedil", "egrave",
 89         "eacute", "ecirc", "euml", "igrave", "iacute", "icirc",
 90         "iuml", "eth", "ntilde", "ograve", "oacute", "ocirc", "otilde",
 91         "ouml", "divide", "oslash", "ugrave", "uacute", "ucirc",
 92         "uuml", "yacute", "thorn", "yuml"
 93 };
 94 
 95 static entity_table_t ent_iso_8859_15[] = {
 96         "nbsp", "iexcl", "cent", "pound", "euro", "yen", "Scaron",
 97         "sect", "scaron", "copy", "ordf", "laquo", "not", "shy", "reg",
 98         "macr", "deg", "plusmn", "sup2", "sup3", NULL, /* Zcaron */
 99         "micro", "para", "middot", NULL, /* zcaron */ "sup1", "ordm",
100         "raquo", "OElig", "oelig", "Yuml", "iquest", "Agrave", "Aacute",
101         "Acirc", "Atilde", "Auml", "Aring", "AElig", "Ccedil", "Egrave",
102         "Eacute", "Ecirc", "Euml", "Igrave", "Iacute", "Icirc",
103         "Iuml", "ETH", "Ntilde", "Ograve", "Oacute", "Ocirc", "Otilde",
104         "Ouml", "times", "Oslash", "Ugrave", "Uacute", "Ucirc", "Uuml",
105         "Yacute", "THORN", "szlig", "agrave", "aacute", "acirc",
106         "atilde", "auml", "aring", "aelig", "ccedil", "egrave",
107         "eacute", "ecirc", "euml", "igrave", "iacute", "icirc",
108         "iuml", "eth", "ntilde", "ograve", "oacute", "ocirc", "otilde",
109         "ouml", "divide", "oslash", "ugrave", "uacute", "ucirc",
110         "uuml", "yacute", "thorn", "yuml"
111 };
112 
113 static entity_table_t ent_uni_338_402[] = {
114         /* 338 (0x0152) */
115         "OElig", "oelig", NULL, NULL, NULL, NULL,
116         NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
117         /* 352 (0x0160) */
118         "Scaron", "scaron", NULL, NULL, NULL, NULL, NULL, NULL,
119         NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
120         NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
121         /* 376 (0x0178) */
122         "Yuml", NULL, NULL, NULL, NULL, NULL, NULL, NULL,
123         NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
124         NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
125         /* 400 (0x0190) */
126         NULL, NULL, "fnof"
127 };
128 
129 static entity_table_t ent_uni_spacing[] = {
130         /* 710 */
131         "circ",
132         /* 711 - 730 */
133         NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
134         NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
135         /* 731 - 732 */
136         NULL, "tilde"
137 };
138 
139 static entity_table_t ent_uni_greek[] = {
140         /* 913 */
141         "Alpha", "Beta", "Gamma", "Delta", "Epsilon", "Zeta", "Eta", "Theta",
142         "Iota", "Kappa", "Lambda", "Mu", "Nu", "Xi", "Omicron", "Pi", "Rho",
143         NULL, "Sigma", "Tau", "Upsilon", "Phi", "Chi", "Psi", "Omega",
144         /* 938 - 944 are not mapped */
145         NULL, NULL, NULL, NULL, NULL, NULL, NULL,
146         "alpha", "beta", "gamma", "delta", "epsilon", "zeta", "eta", "theta",
147         "iota", "kappa", "lambda", "mu", "nu", "xi", "omicron", "pi", "rho",
148         "sigmaf", "sigma", "tau", "upsilon", "phi", "chi", "psi", "omega",
149         /* 970 - 976 are not mapped */
150         NULL, NULL, NULL, NULL, NULL, NULL, NULL,
151         "thetasym", "upsih",
152         NULL, NULL, NULL,
153         "piv" 
154 };
155 
156 static entity_table_t ent_uni_punct[] = {
157         /* 8194 */
158         "ensp", "emsp", NULL, NULL, NULL, NULL, NULL,
159         "thinsp", NULL, NULL, "zwnj", "zwj", "lrm", "rlm",
160         NULL, NULL, NULL, "ndash", "mdash", NULL, NULL, NULL,
161         /* 8216 */
162         "lsquo", "rsquo", "sbquo", NULL, "ldquo", "rdquo", "bdquo", NULL,
163         "dagger", "Dagger",     "bull", NULL, NULL, NULL, "hellip",
164         NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, "permil", NULL,
165         /* 8242 */
166         "prime", "Prime", NULL, NULL, NULL, NULL, NULL, "lsaquo", "rsaquo", NULL,
167         NULL, NULL, "oline", NULL, NULL, NULL, NULL, NULL,
168         "frasl"
169 };
170 
171 static entity_table_t ent_uni_euro[] = {
172         "euro"
173 };
174 
175 static entity_table_t ent_uni_8465_8501[] = {
176         /* 8465 */
177         "image", NULL, NULL, NULL, NULL, NULL, NULL,
178         /* 8472 */
179         "weierp", NULL, NULL, NULL,
180         /* 8476 */
181         "real", NULL, NULL, NULL, NULL, NULL,
182         /* 8482 */
183         "trade", NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
184         NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
185         /* 8501 */
186         "alefsym",
187 };
188 
189 static entity_table_t ent_uni_8592_9002[] = {
190         /* 8592 (0x2190) */
191         "larr", "uarr", "rarr", "darr", "harr", NULL, NULL, NULL,
192         NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
193         /* 8608 (0x21a0) */
194         NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
195         NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
196         /* 8624 (0x21b0) */
197         NULL, NULL, NULL, NULL, NULL, "crarr", NULL, NULL,
198         NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
199         /* 8640 (0x21c0) */
200         NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
201         NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
202         /* 8656 (0x21d0) */
203         "lArr", "uArr", "rArr", "dArr", "hArr", "vArr", NULL, NULL,
204         NULL, NULL, "lAarr", "rAarr", NULL, "rarrw", NULL, NULL,
205         /* 8672 (0x21e0) */
206         NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
207         NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
208         NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
209         NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
210         /* 8704 (0x2200) */
211         "forall", "comp", "part", "exist", "nexist", "empty", NULL, "nabla",
212         "isin", "notin", "epsis", "ni", "notni", "bepsi", NULL, "prod",
213         /* 8720 (0x2210) */
214         "coprod", "sum", "minus", "mnplus", "plusdo", NULL, "setmn", "lowast",
215         "compfn", NULL, "radic", NULL, NULL, "prop", "infin", "ang90",
216         /* 8736 (0x2220) */
217         "ang", "angmsd", "angsph", "mid", "nmid", "par", "npar", "and",
218         "or", "cap", "cup", "int", NULL, NULL, "conint", NULL,
219         /* 8752 (0x2230) */
220         NULL, NULL, NULL, NULL, "there4", "becaus", NULL, NULL,
221         NULL, NULL, NULL, NULL, "sim", "bsim", NULL, NULL,
222         /* 8768 (0x2240) */
223         "wreath", "nsim", NULL, "sime", "nsime", "cong", NULL, "ncong",
224         "asymp", "nap", "ape", NULL, "bcong", "asymp", "bump", "bumpe",
225         /* 8784 (0x2250) */
226         NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
227         NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
228         /* 8800 (0x2260) */
229         "ne", "equiv", NULL, NULL, "le", "ge", "lE", "gE",
230         "lnE", "gnE", "Lt", "Gt", "twixt", NULL, "nlt", "ngt",
231         /* 8816 (0x2270) */
232         "nles", "nges", "lsim", "gsim", NULL, NULL, "lg", "gl",
233         NULL, NULL, "pr", "sc", "cupre", "sscue", "prsim", "scsim",
234         /* 8832 (0x2280) */
235         "npr", "nsc", "sub", "sup", "nsub", "nsup", "sube", "supe",
236         NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
237         /* 8848 (0x2290) */
238         NULL, NULL, NULL, NULL, NULL, "oplus", NULL, "otimes",
239         NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
240         /* 8864 (0x22a0) */
241         NULL, NULL, NULL, NULL, NULL, "perp", NULL, NULL,
242         NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
243         /* 8880 (0x22b0) */
244         NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
245         NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
246         /* 8896 (0x22c0) */
247         NULL, NULL, NULL, NULL, NULL, "sdot", NULL, NULL,
248         NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
249         /* 8912 (0x22d0) */
250         NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
251         NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
252         /* 8928 (0x22e0) */
253         NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
254         NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
255         /* 8944 (0x22f0) */
256         NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
257         NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
258         /* 8960 (0x2300) */
259         NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
260         "lceil", "rceil", "lfloor", "rfloor", NULL, NULL, NULL, NULL,
261         /* 8976 (0x2310) */
262         NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
263         NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
264         /* 8992 (0x2320) */
265         NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
266         NULL, "lang", "rang"
267 };
268 
269 static entity_table_t ent_uni_9674[] = {
270         /* 9674 */
271         "loz"
272 };
273 
274 static entity_table_t ent_uni_9824_9830[] = {
275         /* 9824 */
276         "spades", NULL, NULL, "clubs", NULL, "hearts", "diams"
277 };
278 
279 static entity_table_t ent_koi8r[] = {
280         "#1105", /* "jo "*/
281         NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, 
282         NULL, NULL, NULL, NULL, NULL, "#1025", /* "JO" */
283         NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, 
284         "#1102", "#1072", "#1073", "#1094", "#1076", "#1077", "#1092", 
285         "#1075", "#1093", "#1080", "#1081", "#1082", "#1083", "#1084", 
286         "#1085", "#1086", "#1087", "#1103", "#1088", "#1089", "#1090", 
287         "#1091", "#1078", "#1074", "#1100", "#1099", "#1079", "#1096", 
288         "#1101", "#1097", "#1095", "#1098", "#1070", "#1040", "#1041", 
289         "#1062", "#1044", "#1045", "#1060", "#1043", "#1061", "#1048", 
290         "#1049", "#1050", "#1051", "#1052", "#1053", "#1054", "#1055", 
291         "#1071", "#1056", "#1057", "#1058", "#1059", "#1046", "#1042",
292         "#1068", "#1067", "#1047", "#1064", "#1069", "#1065", "#1063", 
293         "#1066"
294 };
295 
296 static entity_table_t ent_cp_1251[] = {
297         "#1026", "#1027", "#8218", "#1107", "#8222", "hellip", "dagger",
298         "Dagger", "euro", "permil", "#1033", "#8249", "#1034", "#1036",
299         "#1035", "#1039", "#1106", "#8216", "#8217", "#8219", "#8220",
300         "bull", "ndash", "mdash", NULL, "trade", "#1113", "#8250",
301         "#1114", "#1116", "#1115", "#1119", "nbsp", "#1038", "#1118",
302         "#1032", "curren", "#1168", "brvbar", "sect", "#1025", "copy",
303         "#1028", "laquo", "not", "shy", "reg", "#1031", "deg", "plusmn",
304         "#1030", "#1110", "#1169", "micro", "para", "middot", "#1105",
305         "#8470", "#1108", "raquo", "#1112", "#1029", "#1109", "#1111",
306         "#1040", "#1041", "#1042", "#1043", "#1044", "#1045", "#1046",
307         "#1047", "#1048", "#1049", "#1050", "#1051", "#1052", "#1053",
308         "#1054", "#1055", "#1056", "#1057", "#1058", "#1059", "#1060",
309         "#1061", "#1062", "#1063", "#1064", "#1065", "#1066", "#1067",
310         "#1068", "#1069", "#1070", "#1071", "#1072", "#1073", "#1074",
311         "#1075", "#1076", "#1077", "#1078", "#1079", "#1080", "#1081",
312         "#1082", "#1083", "#1084", "#1085", "#1086", "#1087", "#1088",
313         "#1089", "#1090", "#1091", "#1092", "#1093", "#1094", "#1095",
314         "#1096", "#1097", "#1098", "#1099", "#1100", "#1101", "#1102",
315         "#1103"
316 };
317 
318 static entity_table_t ent_iso_8859_5[] = {
319         "#1056", "#1057", "#1058", "#1059", "#1060", "#1061", "#1062",
320         "#1063", "#1064", "#1065", "#1066", "#1067", "#1068", "#1069",
321         "#1070", "#1071", "#1072", "#1073", "#1074", "#1075", "#1076",
322         "#1077", "#1078", "#1079", "#1080", "#1081", "#1082", "#1083",
323         "#1084", "#1085", "#1086", "#1087", "#1088", "#1089", "#1090",
324         "#1091", "#1092", "#1093", "#1094", "#1095", "#1096", "#1097",
325         "#1098", "#1099", "#1100", "#1101", "#1102", "#1103", "#1104",
326         "#1105", "#1106", "#1107", "#1108", "#1109", "#1110", "#1111",
327         "#1112", "#1113", "#1114", "#1115", "#1116", "#1117", "#1118",
328         "#1119"
329 };
330 
331 static entity_table_t ent_cp_866[] = {
332 
333         "#9492", "#9524", "#9516", "#9500", "#9472", "#9532", "#9566", 
334         "#9567", "#9562", "#9556", "#9577", "#9574", "#9568", "#9552", 
335         "#9580", "#9575", "#9576", "#9572", "#9573", "#9561", "#9560", 
336         "#9554", "#9555", "#9579", "#9578", "#9496", "#9484", "#9608", 
337         "#9604", "#9612", "#9616", "#9600", "#1088", "#1089", "#1090", 
338         "#1091", "#1092", "#1093", "#1094", "#1095", "#1096", "#1097", 
339         "#1098", "#1099", "#1100", "#1101", "#1102", "#1103", "#1025", 
340         "#1105", "#1028", "#1108", "#1031", "#1111", "#1038", "#1118", 
341         "#176", "#8729", "#183", "#8730", "#8470", "#164",  "#9632", 
342         "#160"
343 };
344 
345 /* MacRoman has a couple of low-ascii chars that need mapping too */
346 /* Vertical tab (ASCII 11) is often used to store line breaks inside */
347 /* DB exports, this mapping changes it to a space */
348 static entity_table_t ent_macroman[] = {
349         "sp", NULL, NULL, NULL,
350         NULL, NULL, NULL, NULL, NULL, NULL, NULL,
351         NULL, NULL, NULL, NULL, NULL, NULL, NULL,
352         NULL, NULL, NULL, NULL, NULL, "quot", NULL,
353         NULL, NULL, "amp", NULL, NULL, NULL, NULL,
354         NULL, NULL, NULL, NULL, NULL, NULL, NULL,
355         NULL, NULL, NULL, NULL, NULL, NULL, NULL,
356         NULL, NULL, NULL, "lt", NULL, "gt", NULL,
357         NULL, NULL, NULL, NULL, NULL, NULL, NULL,
358         NULL, NULL, NULL, NULL, NULL, NULL, NULL,
359         NULL, NULL, NULL, NULL, NULL, NULL, NULL,
360         NULL, NULL, NULL, NULL, NULL, NULL, NULL,
361         NULL, NULL, NULL, NULL, NULL, NULL, NULL,
362         NULL, NULL, NULL, NULL, NULL, NULL, NULL,
363         NULL, NULL, NULL, NULL, NULL, NULL, NULL,
364         NULL, NULL, NULL, NULL, NULL, NULL, NULL,
365         NULL, NULL, NULL, NULL, NULL, NULL, NULL,
366         NULL, "Auml", "Aring", "Ccedil", "Eacute", "Ntilde", "Ouml",
367         "Uuml", "aacute", "agrave", "acirc", "auml", "atilde", "aring",
368         "ccedil", "eacute", "egrave", "ecirc", "euml", "iacute", "igrave",
369         "icirc", "iuml", "ntilde", "oacute", "ograve", "ocirc", "ouml",
370         "otilde", "uacute", "ugrave", "ucirc", "uuml", "dagger", "deg",
371         "cent", "pound", "sect", "bull", "para", "szlig", "reg",
372         "copy", "trade", "acute", "uml", "ne", "AElig", "Oslash",
373         "infin", "plusmn", "le", "ge", "yen", "micro", "part",
374         "sum", "prod", "pi", "int", "ordf", "ordm", "Omega",
375         "aelig", "oslash", "iquest", "iexcl", "not", "radic", "fnof",
376         "asymp", "#8710", "laquo", "raquo", "hellip", "nbsp", "Agrave",
377         "Atilde", "Otilde", "OElig", "oelig", "ndash", "mdash", "ldquo",
378         "rdquo", "lsquo", "rsquo", "divide", "loz", "yuml", "Yuml",
379         "frasl", "euro", "lsaquo", "rsaquo", "#xFB01", "#xFB02", "Dagger",
380         "middot", "sbquo", "bdquo", "permil", "Acirc", "Ecirc", "Aacute",
381         "Euml", "Egrave", "Iacute", "Icirc", "Iuml", "Igrave", "Oacute",
382         "Ocirc", "#xF8FF", "Ograve", "Uacute", "Ucirc", "Ugrave", "#305",
383         "circ", "tilde", "macr", "#728", "#729", "#730", "cedil",
384         "#733", "#731", "#711"
385 };
386 
387 struct html_entity_map {
388         enum entity_charset charset;    /* charset identifier */
389         unsigned short basechar;                        /* char code at start of table */
390         unsigned short endchar;                 /* last char code in the table */
391         entity_table_t *table;                  /* the table of mappings */
392 };
393 
394 static const struct html_entity_map entity_map[] = {
395         { cs_cp1252,            0x80, 0x9f, ent_cp_1252 },
396         { cs_cp1252,            0xa0, 0xff, ent_iso_8859_1 },
397         { cs_8859_1,            0xa0, 0xff, ent_iso_8859_1 },
398         { cs_8859_15,           0xa0, 0xff, ent_iso_8859_15 },
399         { cs_utf_8,             0xa0, 0xff, ent_iso_8859_1 },
400         { cs_utf_8,             338,  402,  ent_uni_338_402 },
401         { cs_utf_8,             710,  732,  ent_uni_spacing },
402         { cs_utf_8,             913,  982,  ent_uni_greek },
403         { cs_utf_8,             8194, 8260, ent_uni_punct },
404         { cs_utf_8,             8364, 8364, ent_uni_euro }, 
405         { cs_utf_8,             8465, 8501, ent_uni_8465_8501 },
406         { cs_utf_8,             8592, 9002, ent_uni_8592_9002 },
407         { cs_utf_8,             9674, 9674, ent_uni_9674 },
408         { cs_utf_8,             9824, 9830, ent_uni_9824_9830 },
409         { cs_big5,                      0xa0, 0xff, ent_iso_8859_1 },
410         { cs_gb2312,            0xa0, 0xff, ent_iso_8859_1 },
411         { cs_big5hkscs,         0xa0, 0xff, ent_iso_8859_1 },
412         { cs_sjis,                      0xa0, 0xff, ent_iso_8859_1 },
413         { cs_eucjp,                     0xa0, 0xff, ent_iso_8859_1 },
414         { cs_koi8r,                 0xa3, 0xff, ent_koi8r },
415         { cs_cp1251,            0x80, 0xff, ent_cp_1251 },
416         { cs_8859_5,            0xc0, 0xff, ent_iso_8859_5 },
417         { cs_cp866,                 0xc0, 0xff, ent_cp_866 },
418         { cs_macroman,          0x0b, 0xff, ent_macroman },
419         { cs_terminator }
420 };
421 
422 static const struct {
423         const char *codeset;
424         enum entity_charset charset;
425 } charset_map[] = {
426         { "ISO-8859-1",         cs_8859_1 },
427         { "ISO8859-1",          cs_8859_1 },
428         { "ISO-8859-15",        cs_8859_15 },
429         { "ISO8859-15",         cs_8859_15 },
430         { "utf-8",                      cs_utf_8 },
431         { "cp1252",             cs_cp1252 },
432         { "Windows-1252",       cs_cp1252 },
433         { "1252",           cs_cp1252 }, 
434         { "BIG5",                       cs_big5 },
435         { "950",            cs_big5 },
436         { "GB2312",                     cs_gb2312 },
437         { "936",            cs_gb2312 },
438         { "BIG5-HKSCS",         cs_big5hkscs },
439         { "Shift_JIS",          cs_sjis },
440         { "SJIS",               cs_sjis },
441         { "932",            cs_sjis },
442         { "EUCJP",              cs_eucjp },
443         { "EUC-JP",             cs_eucjp },
444         { "KOI8-R",         cs_koi8r },
445         { "koi8-ru",        cs_koi8r },
446         { "koi8r",          cs_koi8r },
447         { "cp1251",         cs_cp1251 },
448         { "Windows-1251",   cs_cp1251 },
449         { "win-1251",       cs_cp1251 },
450         { "iso8859-5",      cs_8859_5 },
451         { "iso-8859-5",     cs_8859_5 },
452         { "cp866",          cs_cp866 },
453         { "866",            cs_cp866 },    
454         { "ibm866",         cs_cp866 },
455         { "MacRoman",       cs_macroman },
456         { NULL }
457 };
458 
459 static const struct {
460         unsigned short charcode;
461         char *entity;
462         int entitylen;
463         int flags;
464 } basic_entities[] = {
465         { '"',  "&quot;",       6,      ENT_HTML_QUOTE_DOUBLE },
466         { '\'', "&#039;",       6,      ENT_HTML_QUOTE_SINGLE },
467         { '\'', "&#39;",        5,      ENT_HTML_QUOTE_SINGLE },
468         { '<',  "&lt;",         4,      0 },
469         { '>',  "&gt;",         4,      0 },
470         { 0, NULL, 0, 0 }
471 };
472         
/* A character code together with its entity text, stored inline in a
 * fixed 8-byte buffer, and the length of that text. */
struct basic_entities_dec {
	/* character code */
	unsigned short charcode;
	/* entity text, held in the struct itself */
	char entity[8];
	/* length of the entity text */
	int entitylen;
};
478         
/* Common exit path for get_next_char(): stores the current read position in
 * *newpos, NUL-terminates the bytes collected in mbseq, stores their count
 * in *mbseqlen and returns this_char from the enclosing function.
 * Relies on pos, newpos, mbseq, mbpos, mbseqlen and this_char being in scope.
 * Wrapped in do { } while (0) so that "MB_RETURN;" is a single statement and
 * stays correct as the body of an if/else. */
#define MB_RETURN do { \
		*newpos = pos; \
		mbseq[mbpos] = '\0'; \
		*mbseqlen = mbpos; \
		return this_char; \
	} while (0)
484                                         
/* Appends one byte to mbseq. The remaining-space counter mbspace is
 * decremented first; if that leaves no room, MB_RETURN is taken and the byte
 * is not stored (MB_RETURN itself writes a terminating NUL at mbseq[mbpos]).
 * Relies on mbspace, mbseq and mbpos being in scope, plus whatever MB_RETURN
 * needs.
 * Wrapped in do { } while (0) so that "MB_WRITE(x);" is a single statement
 * and stays correct as the body of an if/else. */
#define MB_WRITE(mbchar) do { \
		mbspace--; \
		if (mbspace == 0) { \
			MB_RETURN; \
		} \
		mbseq[mbpos++] = (mbchar); \
	} while (0)
491 
/* Bounds check used while decoding: if fewer than chars_need bytes remain in
 * the input after position pos, sets *status to FAILURE and returns 0 from
 * the enclosing function. Relies on str_len and status being in scope.
 * Both arguments are parenthesized so expressions can be passed safely, and
 * the body is wrapped in do { } while (0) so the macro cannot capture a
 * following else. */
#define CHECK_LEN(pos, chars_need) do { \
		if ((str_len - (pos)) < (chars_need)) { \
			*status = FAILURE; \
			return 0; \
		} \
	} while (0)
497 
498 /* {{{ get_next_char
499  */
500 inline static unsigned short get_next_char(enum entity_charset charset,
501                 unsigned char * str,
502                 int str_len,
503                 int * newpos,
504                 unsigned char * mbseq,
505                 int * mbseqlen, 
506                 int *status)
507 {
508         int pos = *newpos;
509         int mbpos = 0;
510         int mbspace = *mbseqlen;
511         unsigned short this_char = str[pos++];
512         unsigned char next_char;
513 
514         *status = SUCCESS;
515         
516         if (mbspace <= 0) {
517                 *mbseqlen = 0;
518                 return this_char;
519         }
520         
521         MB_WRITE((unsigned char)this_char);
522         
523         switch (charset) {
524                 case cs_utf_8:
525                         {
526                                 unsigned long utf = 0;
527                                 int stat = 0;
528                                 int more = 1;
529 
530                                 /* unpack utf-8 encoding into a wide char.
531                                  * Code stolen from the mbstring extension */
532 
533                                 do {
534                                         if (this_char < 0x80) {
535                                                 more = 0;
536                                                 break;
537                                         } else if (this_char < 0xc0) {
538                                                 switch (stat) {
539                                                         case 0x10:      /* 2, 2nd */
540                                                         case 0x21:      /* 3, 3rd */
541                                                         case 0x32:      /* 4, 4th */
542                                                         case 0x43:      /* 5, 5th */
543                                                         case 0x54:      /* 6, 6th */
544                                                                 /* last byte in sequence */
545                                                                 more = 0;
546                                                                 utf |= (this_char & 0x3f);
547                                                                 this_char = (unsigned short)utf;
548                                                                 break;
549                                                         case 0x20:      /* 3, 2nd */
550                                                         case 0x31:      /* 4, 3rd */
551                                                         case 0x42:      /* 5, 4th */
552                                                         case 0x53:      /* 6, 5th */
553                                                                 /* penultimate char */
554                                                                 utf |= ((this_char & 0x3f) << 6);
555                                                                 stat++;
556                                                                 break;
557                                                         case 0x30:      /* 4, 2nd */
558                                                         case 0x41:      /* 5, 3rd */
559                                                         case 0x52:      /* 6, 4th */
560                                                                 utf |= ((this_char & 0x3f) << 12);
561                                                                 stat++;
562                                                                 break;
563                                                         case 0x40:      /* 5, 2nd */
564                                                         case 0x51:
565                                                                 utf |= ((this_char & 0x3f) << 18);
566                                                                 stat++;
567                                                                 break;
568                                                         case 0x50:      /* 6, 2nd */
569                                                                 utf |= ((this_char & 0x3f) << 24);
570                                                                 stat++;
571                                                                 break;
572                                                         default:
573                                                                 /* invalid */
574                                                                 *status = FAILURE;
575                                                                 more = 0;
576                                                 }
577                                         }
578                                         /* lead byte */
579                                         else if (this_char < 0xe0) {
580                                                 stat = 0x10;    /* 2 byte */
581                                                 utf = (this_char & 0x1f) << 6;
582                                                 CHECK_LEN(pos, 1);
583                                         } else if (this_char < 0xf0) {
584                                                 stat = 0x20;    /* 3 byte */
585                                                 utf = (this_char & 0xf) << 12;
586                                                 CHECK_LEN(pos, 2);
587                                         } else if (this_char < 0xf8) {
588                                                 stat = 0x30;    /* 4 byte */
589                                                 utf = (this_char & 0x7) << 18;
590                                                 CHECK_LEN(pos, 3);
591                                         } else if (this_char < 0xfc) {
592                                                 stat = 0x40;    /* 5 byte */
593                                                 utf = (this_char & 0x3) << 24;
594                                                 CHECK_LEN(pos, 4);
595                                         } else if (this_char < 0xfe) {
596                                                 stat = 0x50;    /* 6 byte */
597                                                 utf = (this_char & 0x1) << 30;
598                                                 CHECK_LEN(pos, 5);
599                                         } else {
600                                                 /* invalid; bail */
601                                                 more = 0;
602                                                 *status = FAILURE;
603                                                 break;
604                                         }
605 
606                                         if (more) {
607                                                 this_char = str[pos++];
608                                                 MB_WRITE((unsigned char)this_char);
609                                         }
610                                 } while (more);
611                         }
612                         break;
613                 case cs_big5:
614                 case cs_gb2312:
615                 case cs_big5hkscs:
616                         {
617                                 /* check if this is the first of a 2-byte sequence */
618                                 if (this_char >= 0xa1 && this_char <= 0xfe) {
619                                         /* peek at the next char */
620                                         CHECK_LEN(pos, 1);
621                                         next_char = str[pos];
622                                         if ((next_char >= 0x40 && next_char <= 0x7e) ||
623                                                         (next_char >= 0xa1 && next_char <= 0xfe)) {
624                                                 /* yes, this a wide char */
625                                                 this_char <<= 8;
626                                                 MB_WRITE(next_char);
627                                                 this_char |= next_char;
628                                                 pos++;
629                                         }
630                                         
631                                 }
632                                 break;
633                         }
634                 case cs_sjis:
635                         {
636                                 /* check if this is the first of a 2-byte sequence */
637                                 if ( (this_char >= 0x81 && this_char <= 0x9f) ||
638                                          (this_char >= 0xe0 && this_char <= 0xef)
639                                         ) {
640                                         /* peek at the next char */
641                                         CHECK_LEN(pos, 1);
642                                         next_char = str[pos];
643                                         if ((next_char >= 0x40 && next_char <= 0x7e) ||
644                                                 (next_char >= 0x80 && next_char <= 0xfc))
645                                         {
646                                                 /* yes, this a wide char */
647                                                 this_char <<= 8;
648                                                 MB_WRITE(next_char);
649                                                 this_char |= next_char;
650                                                 pos++;
651                                         }
652                                         
653                                 }
654                                 break;
655                         }
656                 case cs_eucjp:
657                         {
658                                 /* check if this is the first of a multi-byte sequence */
659                                 if (this_char >= 0xa1 && this_char <= 0xfe) {
660                                         /* peek at the next char */
661                                         CHECK_LEN(pos, 1);
662                                         next_char = str[pos];
663                                         if (next_char >= 0xa1 && next_char <= 0xfe) {
664                                                 /* yes, this a jis kanji char */
665                                                 this_char <<= 8;
666                                                 MB_WRITE(next_char);
667                                                 this_char |= next_char;
668                                                 pos++;
669                                         }
670                                         
671                                 } else if (this_char == 0x8e) {
672                                         /* peek at the next char */
673                                         CHECK_LEN(pos, 1);
674                                         next_char = str[pos];
675                                         if (next_char >= 0xa1 && next_char <= 0xdf) {
676                                                 /* JIS X 0201 kana */
677                                                 this_char <<= 8;
678                                                 MB_WRITE(next_char);
679                                                 this_char |= next_char;
680                                                 pos++;
681                                         }
682                                         
683                                 } else if (this_char == 0x8f) {
684                                         /* peek at the next two char */
685                                         unsigned char next2_char;
686                                         CHECK_LEN(pos, 2);
687                                         next_char = str[pos];
688                                         next2_char = str[pos+1];
689                                         if ((next_char >= 0xa1 && next_char <= 0xfe) &&
690                                                 (next2_char >= 0xa1 && next2_char <= 0xfe)) {
691                                                 /* JIS X 0212 hojo-kanji */
692                                                 this_char <<= 8;
693                                                 MB_WRITE(next_char);
694                                                 this_char |= next_char;
695                                                 pos++;
696                                                 this_char <<= 8;
697                                                 MB_WRITE(next2_char);
698                                                 this_char |= next2_char;
699                                                 pos++;
700                                         }
701                                         
702                                 }
703                                 break;
704                         }
705                 default:
706                         break;
707         }
708         MB_RETURN;
709 }
710 /* }}} */
711 
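712 /* {{{ entity_charset determine_charset
713  * Returns the charset identifier for charset_hint; if the hint is empty, falls back to the mbstring internal encoding (when available), default_charset, then the current locale.
714  * Defaults to iso-8859-1, which is also returned immediately when charset_hint is NULL. */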
715 static enum entity_charset determine_charset(char *charset_hint TSRMLS_DC)
716 {
717         int i;
718         enum entity_charset charset = cs_8859_1;
719         int len = 0;
720         zval *uf_result = NULL;
721 
722         /* Guarantee default behaviour for backwards compatibility */
723         if (charset_hint == NULL)
724                 return cs_8859_1;
725 
726         if ((len = strlen(charset_hint)) != 0) {
727                 goto det_charset;
728         }
729 #if HAVE_MBSTRING
730 #if !defined(COMPILE_DL_MBSTRING)
731         /* XXX: Ugly things. Why don't we look for a more sophisticated way? */
732         switch (MBSTRG(current_internal_encoding)) {
733                 case mbfl_no_encoding_8859_1:
734                         return cs_8859_1;
735 
736                 case mbfl_no_encoding_utf8:
737                         return cs_utf_8;
738 
739                 case mbfl_no_encoding_euc_jp:
740                 case mbfl_no_encoding_eucjp_win:
741                         return cs_eucjp;
742 
743                 case mbfl_no_encoding_sjis:
744                 case mbfl_no_encoding_sjis_win:
745                 case mbfl_no_encoding_sjis_mac:
746                         return cs_sjis;
747 
748                 case mbfl_no_encoding_cp1252:
749                         return cs_cp1252;
750 
751                 case mbfl_no_encoding_8859_15:
752                         return cs_8859_15;
753 
754                 case mbfl_no_encoding_big5:
755                         return cs_big5;
756 
757                 case mbfl_no_encoding_euc_cn:
758                 case mbfl_no_encoding_hz:
759                 case mbfl_no_encoding_cp936:
760                         return cs_gb2312;
761 
762                 case mbfl_no_encoding_koi8r:
763                         return cs_koi8r;
764 
765                 case mbfl_no_encoding_cp866:
766                         return cs_cp866;
767 
768                 case mbfl_no_encoding_cp1251:
769                         return cs_cp1251;
770 
771                 case mbfl_no_encoding_8859_5:
772                         return cs_8859_5;
773 
774                 default:
775                         ;
776         }
777 #else
778         {
779                 zval nm_mb_internal_encoding;
780 
781                 ZVAL_STRING(&nm_mb_internal_encoding, "mb_internal_encoding", 0);
782 
783                 if (call_user_function_ex(CG(function_table), NULL, &nm_mb_internal_encoding, &uf_result, 0, NULL, 1, NULL TSRMLS_CC) != FAILURE) {
784 
785                         charset_hint = Z_STRVAL_P(uf_result);
786                         len = Z_STRLEN_P(uf_result);
787                         
788                         if (len == 4) { /* sizeof(none|auto|pass)-1 */
789                                 if (!memcmp("pass", charset_hint, sizeof("pass") - 1) || 
790                                     !memcmp("auto", charset_hint, sizeof("auto") - 1) || 
791                                     !memcmp("none", charset_hint, sizeof("none") - 1)) {
792                                         
793                                         charset_hint = NULL;
794                                         len = 0;
795                                 }
796                         }
797                         goto det_charset;
798                 }
799         }
800 #endif
801 #endif
802 
803         charset_hint = SG(default_charset);
804         if (charset_hint != NULL && (len=strlen(charset_hint)) != 0) {
805                 goto det_charset;
806         }
807 
808         /* try to detect the charset for the locale */
809 #if HAVE_NL_LANGINFO && HAVE_LOCALE_H && defined(CODESET)
810         charset_hint = nl_langinfo(CODESET);
811         if (charset_hint != NULL && (len=strlen(charset_hint)) != 0) {
812                 goto det_charset;
813         }
814 #endif
815 
816 #if HAVE_LOCALE_H
817         /* try to figure out the charset from the locale */
818         {
819                 char *localename;
820                 char *dot, *at;
821 
822                 /* lang[_territory][.codeset][@modifier] */
823                 localename = setlocale(LC_CTYPE, NULL);
824 
825                 dot = strchr(localename, '.');