1/* 2 +----------------------------------------------------------------------+ 3 | PHP Version 5 | 4 +----------------------------------------------------------------------+ 5 | Copyright (c) 1997-2013 The PHP Group | 6 +----------------------------------------------------------------------+ 7 | This source file is subject to version 3.01 of the PHP license, | 8 | that is bundled with this package in the file LICENSE, and is | 9 | available through the world-wide-web at the following url: | 10 | http://www.php.net/license/3_01.txt | 11 | If you did not receive a copy of the PHP license and are unable to | 12 | obtain it through the world-wide-web, please send a note to | 13 | license@php.net so we can mail you a copy immediately. | 14 +----------------------------------------------------------------------+ 15 | Authors: Rasmus Lerdorf <rasmus@php.net> | 16 | Jaakko Hyv�tti <jaakko.hyvatti@iki.fi> | 17 | Wez Furlong <wez@thebrainroom.com> | 18 +----------------------------------------------------------------------+ 19*/ 20 21/* $Id$ */ 22 23/* 24 * HTML entity resources: 25 * 26 * http://msdn.microsoft.com/workshop/author/dhtml/reference/charsets/charset2.asp 27 * http://msdn.microsoft.com/workshop/author/dhtml/reference/charsets/charset3.asp 28 * http://www.unicode.org/Public/MAPPINGS/OBSOLETE/UNI2SGML.TXT 29 * 30 * http://www.w3.org/TR/2002/REC-xhtml1-20020801/dtds.html#h-A2 31 * 32 */ 33 34#include "php.h" 35#if PHP_WIN32 36#include "config.w32.h" 37#else 38#include <php_config.h> 39#endif 40#include "html.h" 41#include "php_string.h" 42#include "SAPI.h" 43#if HAVE_LOCALE_H 44#include <locale.h> 45#endif 46#if HAVE_LANGINFO_H 47#include <langinfo.h> 48#endif 49 50#if HAVE_MBSTRING 51# include "ext/mbstring/mbstring.h" 52ZEND_EXTERN_MODULE_GLOBALS(mbstring) 53#endif 54 55enum entity_charset { cs_terminator, cs_8859_1, cs_cp1252, 56 cs_8859_15, cs_utf_8, cs_big5, cs_gb2312, 57 cs_big5hkscs, cs_sjis, cs_eucjp, cs_koi8r, 58 cs_cp1251, cs_8859_5, cs_cp866, cs_macroman 59 }; 60typedef const char *const entity_table_t; 61 62/* codepage 1252 is a Windows extension to iso-8859-1. */ 63static entity_table_t ent_cp_1252[] = { 64 "euro", NULL, "sbquo", "fnof", "bdquo", "hellip", "dagger", 65 "Dagger", "circ", "permil", "Scaron", "lsaquo", "OElig", 66 NULL, NULL, NULL, NULL, "lsquo", "rsquo", "ldquo", "rdquo", 67 "bull", "ndash", "mdash", "tilde", "trade", "scaron", "rsaquo", 68 "oelig", NULL, NULL, "Yuml" 69}; 70 71static entity_table_t ent_iso_8859_1[] = { 72 "nbsp", "iexcl", "cent", "pound", "curren", "yen", "brvbar", 73 "sect", "uml", "copy", "ordf", "laquo", "not", "shy", "reg", 74 "macr", "deg", "plusmn", "sup2", "sup3", "acute", "micro", 75 "para", "middot", "cedil", "sup1", "ordm", "raquo", "frac14", 76 "frac12", "frac34", "iquest", "Agrave", "Aacute", "Acirc", 77 "Atilde", "Auml", "Aring", "AElig", "Ccedil", "Egrave", 78 "Eacute", "Ecirc", "Euml", "Igrave", "Iacute", "Icirc", 79 "Iuml", "ETH", "Ntilde", "Ograve", "Oacute", "Ocirc", "Otilde", 80 "Ouml", "times", "Oslash", "Ugrave", "Uacute", "Ucirc", "Uuml", 81 "Yacute", "THORN", "szlig", "agrave", "aacute", "acirc", 82 "atilde", "auml", "aring", "aelig", "ccedil", "egrave", 83 "eacute", "ecirc", "euml", "igrave", "iacute", "icirc", 84 "iuml", "eth", "ntilde", "ograve", "oacute", "ocirc", "otilde", 85 "ouml", "divide", "oslash", "ugrave", "uacute", "ucirc", 86 "uuml", "yacute", "thorn", "yuml" 87}; 88 89static entity_table_t ent_iso_8859_15[] = { 90 "nbsp", "iexcl", "cent", "pound", "euro", "yen", "Scaron", 91 "sect", "scaron", "copy", "ordf", "laquo", "not", "shy", "reg", 92 "macr", "deg", "plusmn", "sup2", "sup3", NULL, /* Zcaron */ 93 "micro", "para", "middot", NULL, /* zcaron */ "sup1", "ordm", 94 "raquo", "OElig", "oelig", "Yuml", "iquest", "Agrave", "Aacute", 95 "Acirc", "Atilde", "Auml", "Aring", "AElig", "Ccedil", "Egrave", 96 "Eacute", "Ecirc", "Euml", "Igrave", "Iacute", "Icirc", 97 "Iuml", "ETH", "Ntilde", "Ograve", "Oacute", "Ocirc", "Otilde", 98 "Ouml", "times", "Oslash", "Ugrave", "Uacute", "Ucirc", "Uuml", 99 "Yacute", "THORN", "szlig", "agrave", "aacute", "acirc", 100 "atilde", "auml", "aring", "aelig", "ccedil", "egrave", 101 "eacute", "ecirc", "euml", "igrave", "iacute", "icirc", 102 "iuml", "eth", "ntilde", "ograve", "oacute", "ocirc", "otilde", 103 "ouml", "divide", "oslash", "ugrave", "uacute", "ucirc", 104 "uuml", "yacute", "thorn", "yuml" 105}; 106 107static entity_table_t ent_uni_338_402[] = { 108 /* 338 (0x0152) */ 109 "OElig", "oelig", NULL, NULL, NULL, NULL, 110 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, 111 /* 352 (0x0160) */ 112 "Scaron", "scaron", NULL, NULL, NULL, NULL, NULL, NULL, 113 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, 114 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, 115 /* 376 (0x0178) */ 116 "Yuml", NULL, NULL, NULL, NULL, NULL, NULL, NULL, 117 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, 118 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, 119 /* 400 (0x0190) */ 120 NULL, NULL, "fnof" 121}; 122 123static entity_table_t ent_uni_spacing[] = { 124 /* 710 */ 125 "circ", 126 /* 711 - 730 */ 127 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, 128 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, 129 /* 731 - 732 */ 130 NULL, "tilde" 131}; 132 133static entity_table_t ent_uni_greek[] = { 134 /* 913 */ 135 "Alpha", "Beta", "Gamma", "Delta", "Epsilon", "Zeta", "Eta", "Theta", 136 "Iota", "Kappa", "Lambda", "Mu", "Nu", "Xi", "Omicron", "Pi", "Rho", 137 NULL, "Sigma", "Tau", "Upsilon", "Phi", "Chi", "Psi", "Omega", 138 /* 938 - 944 are not mapped */ 139 NULL, NULL, NULL, NULL, NULL, NULL, NULL, 140 "alpha", "beta", "gamma", "delta", "epsilon", "zeta", "eta", "theta", 141 "iota", "kappa", "lambda", "mu", "nu", "xi", "omicron", "pi", "rho", 142 "sigmaf", "sigma", "tau", "upsilon", "phi", "chi", "psi", "omega", 143 /* 970 - 976 are not mapped */ 144 NULL, NULL, NULL, NULL, NULL, NULL, NULL, 145 "thetasym", "upsih", 146 NULL, NULL, NULL, 147 "piv" 148}; 149 150static entity_table_t ent_uni_punct[] = { 151 /* 8194 */ 152 "ensp", "emsp", NULL, NULL, NULL, NULL, NULL, 153 "thinsp", NULL, NULL, "zwnj", "zwj", "lrm", "rlm", 154 NULL, NULL, NULL, "ndash", "mdash", NULL, NULL, NULL, 155 /* 8216 */ 156 "lsquo", "rsquo", "sbquo", NULL, "ldquo", "rdquo", "bdquo", NULL, 157 "dagger", "Dagger", "bull", NULL, NULL, NULL, "hellip", 158 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, "permil", NULL, 159 /* 8242 */ 160 "prime", "Prime", NULL, NULL, NULL, NULL, NULL, "lsaquo", "rsaquo", NULL, 161 NULL, NULL, "oline", NULL, NULL, NULL, NULL, NULL, 162 "frasl" 163}; 164 165static entity_table_t ent_uni_euro[] = { 166 "euro" 167}; 168 169static entity_table_t ent_uni_8465_8501[] = { 170 /* 8465 */ 171 "image", NULL, NULL, NULL, NULL, NULL, NULL, 172 /* 8472 */ 173 "weierp", NULL, NULL, NULL, 174 /* 8476 */ 175 "real", NULL, NULL, NULL, NULL, NULL, 176 /* 8482 */ 177 "trade", NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, 178 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, 179 /* 8501 */ 180 "alefsym", 181}; 182 183static entity_table_t ent_uni_8592_9002[] = { 184 /* 8592 (0x2190) */ 185 "larr", "uarr", "rarr", "darr", "harr", NULL, NULL, NULL, 186 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, 187 /* 8608 (0x21a0) */ 188 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, 189 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, 190 /* 8624 (0x21b0) */ 191 NULL, NULL, NULL, NULL, NULL, "crarr", NULL, NULL, 192 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, 193 /* 8640 (0x21c0) */ 194 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, 195 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, 196 /* 8656 (0x21d0) */ 197 "lArr", "uArr", "rArr", "dArr", "hArr", NULL, NULL, NULL, 198 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, 199 /* 8672 (0x21e0) */ 200 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, 201 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, 202 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, 203 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, 204 /* 8704 (0x2200) */ 205 "forall", NULL, "part", "exist", NULL, "empty", NULL, "nabla", 206 "isin", "notin", NULL, "ni", NULL, NULL, NULL, "prod", 207 /* 8720 (0x2210) */ 208 NULL, "sum", "minus", NULL, NULL, NULL, NULL, "lowast", 209 NULL, NULL, "radic", NULL, NULL, "prop", "infin", NULL, 210 /* 8736 (0x2220) */ 211 "ang", NULL, NULL, NULL, NULL, NULL, NULL, "and", 212 "or", "cap", "cup", "int", NULL, NULL, NULL, NULL, 213 /* 8752 (0x2230) */ 214 NULL, NULL, NULL, NULL, "there4", NULL, NULL, NULL, 215 NULL, NULL, NULL, NULL, "sim", NULL, NULL, NULL, 216 /* 8768 (0x2240) */ 217 NULL, NULL, NULL, NULL, NULL, "cong", NULL, NULL, 218 "asymp", NULL, NULL, NULL, NULL, NULL, NULL, NULL, 219 /* 8784 (0x2250) */ 220 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, 221 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, 222 /* 8800 (0x2260) */ 223 "ne", "equiv", NULL, NULL, "le", "ge", NULL, NULL, 224 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, 225 /* 8816 (0x2270) */ 226 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, 227 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, 228 /* 8832 (0x2280) */ 229 NULL, NULL, "sub", "sup", "nsub", NULL, "sube", "supe", 230 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, 231 /* 8848 (0x2290) */ 232 NULL, NULL, NULL, NULL, NULL, "oplus", NULL, "otimes", 233 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, 234 /* 8864 (0x22a0) */ 235 NULL, NULL, NULL, NULL, NULL, "perp", NULL, NULL, 236 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, 237 /* 8880 (0x22b0) */ 238 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, 239 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, 240 /* 8896 (0x22c0) */ 241 NULL, NULL, NULL, NULL, NULL, "sdot", NULL, NULL, 242 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, 243 /* 8912 (0x22d0) */ 244 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, 245 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, 246 /* 8928 (0x22e0) */ 247 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, 248 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, 249 /* 8944 (0x22f0) */ 250 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, 251 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, 252 /* 8960 (0x2300) */ 253 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, 254 "lceil", "rceil", "lfloor", "rfloor", NULL, NULL, NULL, NULL, 255 /* 8976 (0x2310) */ 256 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, 257 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, 258 /* 8992 (0x2320) */ 259 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, 260 NULL, "lang", "rang" 261}; 262 263static entity_table_t ent_uni_9674[] = { 264 /* 9674 */ 265 "loz" 266}; 267 268static entity_table_t ent_uni_9824_9830[] = { 269 /* 9824 */ 270 "spades", NULL, NULL, "clubs", NULL, "hearts", "diams" 271}; 272 273static entity_table_t ent_koi8r[] = { 274 "#1105", /* "jo "*/ 275 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, 276 NULL, NULL, NULL, NULL, NULL, "#1025", /* "JO" */ 277 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, 278 "#1102", "#1072", "#1073", "#1094", "#1076", "#1077", "#1092", 279 "#1075", "#1093", "#1080", "#1081", "#1082", "#1083", "#1084", 280 "#1085", "#1086", "#1087", "#1103", "#1088", "#1089", "#1090", 281 "#1091", "#1078", "#1074", "#1100", "#1099", "#1079", "#1096", 282 "#1101", "#1097", "#1095", "#1098", "#1070", "#1040", "#1041", 283 "#1062", "#1044", "#1045", "#1060", "#1043", "#1061", "#1048", 284 "#1049", "#1050", "#1051", "#1052", "#1053", "#1054", "#1055", 285 "#1071", "#1056", "#1057", "#1058", "#1059", "#1046", "#1042", 286 "#1068", "#1067", "#1047", "#1064", "#1069", "#1065", "#1063", 287 "#1066" 288}; 289 290static entity_table_t ent_cp_1251[] = { 291 "#1026", "#1027", "#8218", "#1107", "#8222", "hellip", "dagger", 292 "Dagger", "euro", "permil", "#1033", "#8249", "#1034", "#1036", 293 "#1035", "#1039", "#1106", "#8216", "#8217", "#8219", "#8220", 294 "bull", "ndash", "mdash", NULL, "trade", "#1113", "#8250", 295 "#1114", "#1116", "#1115", "#1119", "nbsp", "#1038", "#1118", 296 "#1032", "curren", "#1168", "brvbar", "sect", "#1025", "copy", 297 "#1028", "laquo", "not", "shy", "reg", "#1031", "deg", "plusmn", 298 "#1030", "#1110", "#1169", "micro", "para", "middot", "#1105", 299 "#8470", "#1108", "raquo", "#1112", "#1029", "#1109", "#1111", 300 "#1040", "#1041", "#1042", "#1043", "#1044", "#1045", "#1046", 301 "#1047", "#1048", "#1049", "#1050", "#1051", "#1052", "#1053", 302 "#1054", "#1055", "#1056", "#1057", "#1058", "#1059", "#1060", 303 "#1061", "#1062", "#1063", "#1064", "#1065", "#1066", "#1067", 304 "#1068", "#1069", "#1070", "#1071", "#1072", "#1073", "#1074", 305 "#1075", "#1076", "#1077", "#1078", "#1079", "#1080", "#1081", 306 "#1082", "#1083", "#1084", "#1085", "#1086", "#1087", "#1088", 307 "#1089", "#1090", "#1091", "#1092", "#1093", "#1094", "#1095", 308 "#1096", "#1097", "#1098", "#1099", "#1100", "#1101", "#1102", 309 "#1103" 310}; 311 312static entity_table_t ent_iso_8859_5[] = { 313 "#1056", "#1057", "#1058", "#1059", "#1060", "#1061", "#1062", 314 "#1063", "#1064", "#1065", "#1066", "#1067", "#1068", "#1069", 315 "#1070", "#1071", "#1072", "#1073", "#1074", "#1075", "#1076", 316 "#1077", "#1078", "#1079", "#1080", "#1081", "#1082", "#1083", 317 "#1084", "#1085", "#1086", "#1087", "#1088", "#1089", "#1090", 318 "#1091", "#1092", "#1093", "#1094", "#1095", "#1096", "#1097", 319 "#1098", "#1099", "#1100", "#1101", "#1102", "#1103", "#1104", 320 "#1105", "#1106", "#1107", "#1108", "#1109", "#1110", "#1111", 321 "#1112", "#1113", "#1114", "#1115", "#1116", "#1117", "#1118", 322 "#1119" 323}; 324 325static entity_table_t ent_cp_866[] = { 326 327 "#9492", "#9524", "#9516", "#9500", "#9472", "#9532", "#9566", 328 "#9567", "#9562", "#9556", "#9577", "#9574", "#9568", "#9552", 329 "#9580", "#9575", "#9576", "#9572", "#9573", "#9561", "#9560", 330 "#9554", "#9555", "#9579", "#9578", "#9496", "#9484", "#9608", 331 "#9604", "#9612", "#9616", "#9600", "#1088", "#1089", "#1090", 332 "#1091", "#1092", "#1093", "#1094", "#1095", "#1096", "#1097", 333 "#1098", "#1099", "#1100", "#1101", "#1102", "#1103", "#1025", 334 "#1105", "#1028", "#1108", "#1031", "#1111", "#1038", "#1118", 335 "#176", "#8729", "#183", "#8730", "#8470", "#164", "#9632", 336 "#160" 337}; 338 339/* MacRoman has a couple of low-ascii chars that need mapping too */ 340/* Vertical tab (ASCII 11) is often used to store line breaks inside */ 341/* DB exports, this mapping changes it to a space */ 342static entity_table_t ent_macroman[] = { 343 "sp", NULL, NULL, NULL, 344 NULL, NULL, NULL, NULL, NULL, NULL, NULL, 345 NULL, NULL, NULL, NULL, NULL, NULL, NULL, 346 NULL, NULL, NULL, NULL, NULL, "quot", NULL, 347 NULL, NULL, "amp", NULL, NULL, NULL, NULL, 348 NULL, NULL, NULL, NULL, NULL, NULL, NULL, 349 NULL, NULL, NULL, NULL, NULL, NULL, NULL, 350 NULL, NULL, NULL, "lt", NULL, "gt", NULL, 351 NULL, NULL, NULL, NULL, NULL, NULL, NULL, 352 NULL, NULL, NULL, NULL, NULL, NULL, NULL, 353 NULL, NULL, NULL, NULL, NULL, NULL, NULL, 354 NULL, NULL, NULL, NULL, NULL, NULL, NULL, 355 NULL, NULL, NULL, NULL, NULL, NULL, NULL, 356 NULL, NULL, NULL, NULL, NULL, NULL, NULL, 357 NULL, NULL, NULL, NULL, NULL, NULL, NULL, 358 NULL, NULL, NULL, NULL, NULL, NULL, NULL, 359 NULL, NULL, NULL, NULL, NULL, NULL, NULL, 360 NULL, "Auml", "Aring", "Ccedil", "Eacute", "Ntilde", "Ouml", 361 "Uuml", "aacute", "agrave", "acirc", "auml", "atilde", "aring", 362 "ccedil", "eacute", "egrave", "ecirc", "euml", "iacute", "igrave", 363 "icirc", "iuml", "ntilde", "oacute", "ograve", "ocirc", "ouml", 364 "otilde", "uacute", "ugrave", "ucirc", "uuml", "dagger", "deg", 365 "cent", "pound", "sect", "bull", "para", "szlig", "reg", 366 "copy", "trade", "acute", "uml", "ne", "AElig", "Oslash", 367 "infin", "plusmn", "le", "ge", "yen", "micro", "part", 368 "sum", "prod", "pi", "int", "ordf", "ordm", "Omega", 369 "aelig", "oslash", "iquest", "iexcl", "not", "radic", "fnof", 370 "asymp", "#8710", "laquo", "raquo", "hellip", "nbsp", "Agrave", 371 "Atilde", "Otilde", "OElig", "oelig", "ndash", "mdash", "ldquo", 372 "rdquo", "lsquo", "rsquo", "divide", "loz", "yuml", "Yuml", 373 "frasl", "euro", "lsaquo", "rsaquo", "#xFB01", "#xFB02", "Dagger", 374 "middot", "sbquo", "bdquo", "permil", "Acirc", "Ecirc", "Aacute", 375 "Euml", "Egrave", "Iacute", "Icirc", "Iuml", "Igrave", "Oacute", 376 "Ocirc", "#xF8FF", "Ograve", "Uacute", "Ucirc", "Ugrave", "#305", 377 "circ", "tilde", "macr", "#728", "#729", "#730", "cedil", 378 "#733", "#731", "#711" 379}; 380 381struct html_entity_map { 382 enum entity_charset charset; /* charset identifier */ 383 unsigned int basechar; /* char code at start of table */ 384 unsigned int endchar; /* last char code in the table */ 385 entity_table_t *table; /* the table of mappings */ 386}; 387 388static const struct html_entity_map entity_map[] = { 389 { cs_cp1252, 0x80, 0x9f, ent_cp_1252 }, 390 { cs_cp1252, 0xa0, 0xff, ent_iso_8859_1 }, 391 { cs_8859_1, 0xa0, 0xff, ent_iso_8859_1 }, 392 { cs_8859_15, 0xa0, 0xff, ent_iso_8859_15 }, 393 { cs_utf_8, 0xa0, 0xff, ent_iso_8859_1 }, 394 { cs_utf_8, 338, 402, ent_uni_338_402 }, 395 { cs_utf_8, 710, 732, ent_uni_spacing }, 396 { cs_utf_8, 913, 982, ent_uni_greek }, 397 { cs_utf_8, 8194, 8260, ent_uni_punct }, 398 { cs_utf_8, 8364, 8364, ent_uni_euro }, 399 { cs_utf_8, 8465, 8501, ent_uni_8465_8501 }, 400 { cs_utf_8, 8592, 9002, ent_uni_8592_9002 }, 401 { cs_utf_8, 9674, 9674, ent_uni_9674 }, 402 { cs_utf_8, 9824, 9830, ent_uni_9824_9830 }, 403 { cs_big5, 0xa0, 0xff, ent_iso_8859_1 }, 404 { cs_gb2312, 0xa0, 0xff, ent_iso_8859_1 }, 405 { cs_big5hkscs, 0xa0, 0xff, ent_iso_8859_1 }, 406 { cs_sjis, 0xa0, 0xff, ent_iso_8859_1 }, 407 { cs_eucjp, 0xa0, 0xff, ent_iso_8859_1 }, 408 { cs_koi8r, 0xa3, 0xff, ent_koi8r }, 409 { cs_cp1251, 0x80, 0xff, ent_cp_1251 }, 410 { cs_8859_5, 0xc0, 0xff, ent_iso_8859_5 }, 411 { cs_cp866, 0xc0, 0xff, ent_cp_866 }, 412 { cs_macroman, 0x0b, 0xff, ent_macroman }, 413 { cs_terminator } 414}; 415 416static const struct { 417 const char *codeset; 418 enum entity_charset charset; 419} charset_map[] = { 420 { "ISO-8859-1", cs_8859_1 }, 421 { "ISO8859-1", cs_8859_1 }, 422 { "ISO-8859-15", cs_8859_15 }, 423 { "ISO8859-15", cs_8859_15 }, 424 { "utf-8", cs_utf_8 }, 425 { "cp1252", cs_cp1252 }, 426 { "Windows-1252", cs_cp1252 }, 427 { "1252", cs_cp1252 }, 428 { "BIG5", cs_big5 }, 429 { "950", cs_big5 }, 430 { "GB2312", cs_gb2312 }, 431 { "936", cs_gb2312 }, 432 { "BIG5-HKSCS", cs_big5hkscs }, 433 { "Shift_JIS", cs_sjis }, 434 { "SJIS", cs_sjis }, 435 { "932", cs_sjis }, 436 { "EUCJP", cs_eucjp }, 437 { "EUC-JP", cs_eucjp }, 438 { "KOI8-R", cs_koi8r }, 439 { "koi8-ru", cs_koi8r }, 440 { "koi8r", cs_koi8r }, 441 { "cp1251", cs_cp1251 }, 442 { "Windows-1251", cs_cp1251 }, 443 { "win-1251", cs_cp1251 }, 444 { "iso8859-5", cs_8859_5 }, 445 { "iso-8859-5", cs_8859_5 }, 446 { "cp866", cs_cp866 }, 447 { "866", cs_cp866 }, 448 { "ibm866", cs_cp866 }, 449 { "MacRoman", cs_macroman }, 450 { NULL } 451}; 452 453static const struct { 454 unsigned short charcode; 455 char *entity; 456 int entitylen; 457 int flags; 458} basic_entities[] = { 459 { '"', """, 6, ENT_HTML_QUOTE_DOUBLE }, 460 { '\'', "'", 6, ENT_HTML_QUOTE_SINGLE }, 461 { '\'', "'", 5, ENT_HTML_QUOTE_SINGLE }, 462 { '<', "<", 4, 0 }, 463 { '>', ">", 4, 0 }, 464 { 0, NULL, 0, 0 } 465}; 466 467struct basic_entities_dec { 468 unsigned short charcode; 469 char entity[8]; 470 int entitylen; 471}; 472 473#define MB_RETURN { \ 474 *newpos = pos; \ 475 mbseq[mbpos] = '\0'; \ 476 *mbseqlen = mbpos; \ 477 return this_char; } 478 479#define MB_WRITE(mbchar) { \ 480 mbspace--; \ 481 if (mbspace == 0) { \ 482 MB_RETURN; \ 483 } \ 484 mbseq[mbpos++] = (mbchar); } 485 486/* skip one byte and return */ 487#define MB_FAILURE(pos) do { \ 488 *newpos = pos + 1; \ 489 *status = FAILURE; \ 490 return 0; \ 491} while (0) 492 493#define CHECK_LEN(pos, chars_need) \ 494 if (chars_need < 1) { \ 495 if((str_len - (pos)) < chars_need) { \ 496 *newpos = pos; \ 497 *status = FAILURE; \ 498 return 0; \ 499 } \ 500 } else { \ 501 if((str_len - (pos)) < chars_need) { \ 502 *newpos = pos + 1; \ 503 *status = FAILURE; \ 504 return 0; \ 505 } \ 506 } 507 508/* {{{ get_next_char 509 */ 510inline static unsigned int get_next_char(enum entity_charset charset, 511 unsigned char * str, 512 int str_len, 513 int * newpos, 514 unsigned char * mbseq, 515 int * mbseqlen, 516 int *status) 517{ 518 int pos = *newpos; 519 int mbpos = 0; 520 int mbspace = *mbseqlen; 521 unsigned int this_char = 0; 522 unsigned char next_char; 523 524 *status = SUCCESS; 525 526 if (mbspace <= 0) { 527 *mbseqlen = 0; 528 CHECK_LEN(pos, 1); 529 *newpos = pos + 1; 530 return str[pos]; 531 } 532 533 switch (charset) { 534 case cs_utf_8: 535 { 536 unsigned char c; 537 CHECK_LEN(pos, 1); 538 c = str[pos]; 539 if (c < 0x80) { 540 MB_WRITE(c); 541 this_char = c; 542 pos++; 543 } else if (c < 0xc2) { 544 MB_FAILURE(pos); 545 } else if (c < 0xe0) { 546 CHECK_LEN(pos, 2); 547 if (str[pos + 1] < 0x80 || str[pos + 1] > 0xbf) { 548 MB_FAILURE(pos); 549 } 550 this_char = ((c & 0x1f) << 6) | (str[pos + 1] & 0x3f); 551 if (this_char < 0x80) { 552 MB_FAILURE(pos); 553 } 554 MB_WRITE((unsigned char)c); 555 MB_WRITE((unsigned char)str[pos + 1]); 556 pos += 2; 557 } else if (c < 0xf0) { 558 CHECK_LEN(pos, 3); 559 if (str[pos + 1] < 0x80 || str[pos + 1] > 0xbf) { 560 MB_FAILURE(pos); 561 } 562 if (str[pos + 2] < 0x80 || str[pos + 2] > 0xbf) { 563 MB_FAILURE(pos); 564 } 565 this_char = ((c & 0x0f) << 12) | ((str[pos + 1] & 0x3f) << 6) | (str[pos + 2] & 0x3f); 566 if (this_char < 0x800) { 567 MB_FAILURE(pos); 568 } else if (this_char >= 0xd800 && this_char <= 0xdfff) { 569 MB_FAILURE(pos); 570 } 571 MB_WRITE((unsigned char)c); 572 MB_WRITE((unsigned char)str[pos + 1]); 573 MB_WRITE((unsigned char)str[pos + 2]); 574 pos += 3; 575 } else if (c < 0xf5) { 576 CHECK_LEN(pos, 4); 577 if (str[pos + 1] < 0x80 || str[pos + 1] > 0xbf) { 578 MB_FAILURE(pos); 579 } 580 if (str[pos + 2] < 0x80 || str[pos + 2] > 0xbf) { 581 MB_FAILURE(pos); 582 } 583 if (str[pos + 3] < 0x80 || str[pos + 3] > 0xbf) { 584 MB_FAILURE(pos); 585 } 586 this_char = ((c & 0x07) << 18) | ((str[pos + 1] & 0x3f) << 12) | ((str[pos + 2] & 0x3f) << 6) | (str[pos + 3] & 0x3f); 587 if (this_char < 0x10000 || this_char > 0x10FFFF) { 588 MB_FAILURE(pos); 589 } 590 MB_WRITE((unsigned char)c); 591 MB_WRITE((unsigned char)str[pos + 1]); 592 MB_WRITE((unsigned char)str[pos + 2]); 593 MB_WRITE((unsigned char)str[pos + 3]); 594 pos += 4; 595 } else { 596 MB_FAILURE(pos); 597 } 598 } 599 break; 600 case cs_big5: 601 case cs_gb2312: 602 case cs_big5hkscs: 603 { 604 CHECK_LEN(pos, 1); 605 this_char = str[pos++]; 606 /* check if this is the first of a 2-byte sequence */ 607 if (this_char >= 0x81 && this_char <= 0xfe) { 608 /* peek at the next char */ 609 CHECK_LEN(pos, 1); 610 next_char = str[pos++]; 611 if ((next_char >= 0x40 && next_char <= 0x7e) || 612 (next_char >= 0xa1 && next_char <= 0xfe)) { 613 /* yes, this a wide char */ 614 MB_WRITE(this_char); 615 MB_WRITE(next_char); 616 this_char = (this_char << 8) | next_char; 617 } else { 618 MB_FAILURE(pos); 619 } 620 } else { 621 MB_WRITE(this_char); 622 } 623 } 624 break; 625 case cs_sjis: 626 { 627 CHECK_LEN(pos, 1); 628 this_char = str[pos++]; 629 /* check if this is the first of a 2-byte sequence */ 630 if ((this_char >= 0x81 && this_char <= 0x9f) || 631 (this_char >= 0xe0 && this_char <= 0xfc)) { 632 /* peek at the next char */ 633 CHECK_LEN(pos, 1); 634 next_char = str[pos++]; 635 if ((next_char >= 0x40 && next_char <= 0x7e) || 636 (next_char >= 0x80 && next_char <= 0xfc)) 637 { 638 /* yes, this a wide char */ 639 MB_WRITE(this_char); 640 MB_WRITE(next_char); 641 this_char = (this_char << 8) | next_char; 642 } else { 643 MB_FAILURE(pos); 644 } 645 } else { 646 MB_WRITE(this_char); 647 } 648 break; 649 } 650 case cs_eucjp: 651 { 652 CHECK_LEN(pos, 1); 653 this_char = str[pos++]; 654 /* check if this is the first of a multi-byte sequence */ 655 if (this_char >= 0xa1 && this_char <= 0xfe) { 656 /* peek at the next char */ 657 CHECK_LEN(pos, 1); 658 next_char = str[pos++]; 659 if (next_char >= 0xa1 && next_char <= 0xfe) { 660 /* yes, this a jis kanji char */ 661 MB_WRITE(this_char); 662 MB_WRITE(next_char); 663 this_char = (this_char << 8) | next_char; 664 } else { 665 MB_FAILURE(pos); 666 } 667 } else if (this_char == 0x8e) { 668 /* peek at the next char */ 669 CHECK_LEN(pos, 1); 670 next_char = str[pos++]; 671 if (next_char >= 0xa1 && next_char <= 0xdf) { 672 /* JIS X 0201 kana */ 673 MB_WRITE(this_char); 674 MB_WRITE(next_char); 675 this_char = (this_char << 8) | next_char; 676 } else { 677 MB_FAILURE(pos); 678 } 679 } else if (this_char == 0x8f) { 680 /* peek at the next two char */ 681 unsigned char next2_char; 682 CHECK_LEN(pos, 2); 683 next_char = str[pos]; 684 next2_char = str[pos + 1]; 685 pos += 2; 686 if ((next_char >= 0xa1 && next_char <= 0xfe) && 687 (next2_char >= 0xa1 && next2_char <= 0xfe)) { 688 /* JIS X 0212 hojo-kanji */ 689 MB_WRITE(this_char); 690 MB_WRITE(next_char); 691 MB_WRITE(next2_char); 692 this_char = (this_char << 16) | (next_char << 8) | next2_char; 693 } else { 694 MB_FAILURE(pos); 695 } 696 } else { 697 MB_WRITE(this_char); 698 } 699 break; 700 } 701 default: 702 /* single-byte charsets */ 703 CHECK_LEN(pos, 1); 704 this_char = str[pos++]; 705 MB_WRITE(this_char); 706 break; 707 } 708 MB_RETURN; 709} 710/* }}} */ 711 712/* {{{ entity_charset determine_charset 713 * returns the charset identifier based on current locale or a hint. 714 * defaults to iso-8859-1 */ 715static enum entity_charset determine_charset(char *charset_hint TSRMLS_DC) 716{ 717 int i; 718 enum entity_charset charset = cs_8859_1; 719 int len = 0; 720 zval *uf_result = NULL; 721 722 /* Guarantee default behaviour for backwards compatibility */ 723 if (charset_hint == NULL) 724 return cs_8859_1; 725 726 if ((len = strlen(charset_hint)) != 0) { 727 goto det_charset; 728 } 729#if HAVE_MBSTRING 730#if !defined(COMPILE_DL_MBSTRING) 731 /* XXX: Ugly things. Why don't we look for a more sophisticated way? */ 732 switch (MBSTRG(current_internal_encoding)) { 733 case mbfl_no_encoding_8859_1: 734 return cs_8859_1; 735 736 case mbfl_no_encoding_utf8: 737 return cs_utf_8; 738 739 case mbfl_no_encoding_euc_jp: 740 case mbfl_no_encoding_eucjp_win: 741 return cs_eucjp; 742 743 case mbfl_no_encoding_sjis: 744 case mbfl_no_encoding_sjis_open: 745 case mbfl_no_encoding_cp932: 746 return cs_sjis; 747 748 case mbfl_no_encoding_cp1252: 749 return cs_cp1252; 750 751 case mbfl_no_encoding_8859_15: 752 return cs_8859_15; 753 754 case mbfl_no_encoding_big5: 755 return cs_big5; 756 757 case mbfl_no_encoding_euc_cn: 758 case mbfl_no_encoding_hz: 759 case mbfl_no_encoding_cp936: 760 return cs_gb2312; 761 762 case mbfl_no_encoding_koi8r: 763 return cs_koi8r; 764 765 case mbfl_no_encoding_cp866: 766 return cs_cp866; 767 768 case mbfl_no_encoding_cp1251: 769 return cs_cp1251; 770 771 case mbfl_no_encoding_8859_5: 772 return cs_8859_5; 773 774 default: 775 ; 776 } 777#else 778 { 779 zval nm_mb_internal_encoding; 780 781 ZVAL_STRING(&nm_mb_internal_encoding, "mb_internal_encoding", 0); 782 783 if (call_user_function_ex(CG(function_table), NULL, &nm_mb_internal_encoding, &uf_result, 0, NULL, 1, NULL TSRMLS_CC) != FAILURE) { 784 785 charset_hint = Z_STRVAL_P(uf_result); 786 len = Z_STRLEN_P(uf_result); 787 788 if (charset_hint != NULL && len != 0) { 789 if (len == 4) { /* sizeof(none|auto|pass)-1 */ 790 if (!memcmp("pass", charset_hint, sizeof("pass") - 1) || 791 !memcmp("auto", charset_hint, sizeof("auto") - 1) || 792 !memcmp("none", charset_hint, sizeof("none") - 1)) { 793 794 charset_hint = NULL; 795 len = 0; 796 } 797 } else { 798 /* Jump to det_charset only if mbstring isn't one of above eq pass, auto, none. 799 Otherwise try default_charset next */ 800 goto det_charset; 801 } 802 } 803 } 804 } 805#endif 806#endif 807 808 charset_hint = SG(default_charset); 809 if (charset_hint != NULL && (len=strlen(charset_hint)) != 0) { 810 goto det_charset; 811 } 812 813 /* try to detect the charset for the locale */ 814#if HAVE_NL_LANGINFO && HAVE_LOCALE_H && defined(CODESET) 815 charset_hint = nl_langinfo(CODESET); 816 if (charset_hint != NULL && (len=strlen(charset_hint)) != 0) { 817 goto det_charset; 818 } 819#endif 820 821#if HAVE_LOCALE_H 822 /* try to figure out the charset from the locale */ 823 { 824 char *localename; 825 char *dot, *at; 826 827 /* lang[_territory][.codeset][@modifier] */ 828 localename = setlocale(LC_CTYPE, NULL); 829 830 dot = strchr(localename, '.'); 831 if (dot) { 832 dot++; 833 /* locale specifies a codeset */ 834 at = strchr(dot, '@'); 835 if (at) 836 len = at - dot; 837 else 838 len = strlen(dot); 839 charset_hint = dot; 840 } else { 841 /* no explicit name; see if the name itself 842 * is the charset */ 843 charset_hint = localename; 844 len = strlen(charset_hint); 845 } 846 } 847#endif 848 849det_charset: 850 851 if (charset_hint) { 852 int found = 0; 853 854 /* now walk the charset map and look for the codeset */ 855 for (i = 0; charset_map[i].codeset; i++) { 856 if (len == strlen(charset_map[i].codeset) && strncasecmp(charset_hint, charset_map[i].codeset, len) == 0) { 857 charset = charset_map[i].charset; 858 found = 1; 859 break; 860 } 861 } 862 if (!found) { 863 php_error_docref(NULL TSRMLS_CC, E_WARNING, "charset `%s' not supported, assuming iso-8859-1", 864 charset_hint); 865 } 866 } 867 if (uf_result != NULL) { 868 zval_ptr_dtor(&uf_result); 869 } 870 return charset; 871} 872/* }}} */ 873 874/* {{{ php_utf32_utf8 */ 875size_t php_utf32_utf8(unsigned char *buf, unsigned k) 876{ 877 size_t retval = 0; 878 879 if (k < 0x80) { 880 buf[0] = k; 881 retval = 1; 882 } else if (k < 0x800) { 883 buf[0] = 0xc0 | (k >> 6); 884 buf[1] = 0x80 | (k & 0x3f); 885 retval = 2; 886 } else if (k < 0x10000) { 887 buf[0] = 0xe0 | (k >> 12); 888 buf[1] = 0x80 | ((k >> 6) & 0x3f); 889 buf[2] = 0x80 | (k & 0x3f); 890 retval = 3; 891 } else if (k < 0x200000) { 892 buf[0] = 0xf0 | (k >> 18); 893 buf[1] = 0x80 | ((k >> 12) & 0x3f); 894 buf[2] = 0x80 | ((k >> 6) & 0x3f); 895 buf[3] = 0x80 | (k & 0x3f); 896 retval = 4; 897 } else if (k < 0x4000000) { 898 buf[0] = 0xf8 | (k >> 24); 899 buf[1] = 0x80 | ((k >> 18) & 0x3f); 900 buf[2] = 0x80 | ((k >> 12) & 0x3f); 901 buf[3] = 0x80 | ((k >> 6) & 0x3f); 902 buf[4] = 0x80 | (k & 0x3f); 903 retval = 5; 904 } else { 905 buf[0] = 0xfc | (k >> 30); 906 buf[1] = 0x80 | ((k >> 24) & 0x3f); 907 buf[2] = 0x80 | ((k >> 18) & 0x3f); 908 buf[3] = 0x80 | ((k >> 12) & 0x3f); 909 buf[4] = 0x80 | ((k >> 6) & 0x3f); 910 buf[5] = 0x80 | (k & 0x3f); 911 retval = 6; 912 } 913 buf[retval] = '\0'; 914 915 return retval; 916} 917/* }}} */ 918 919/* {{{ php_unescape_html_entities 920 */ 921PHPAPI char *php_unescape_html_entities(unsigned char *old, int oldlen, int *newlen, int all, int quote_style, char *hint_charset TSRMLS_DC) 922{ 923 int retlen; 924 int j, k; 925 char *replaced, *ret, *p, *q, *lim, *next; 926 enum entity_charset charset = determine_charset(hint_charset TSRMLS_CC); 927 unsigned char replacement[15]; 928 int replacement_len; 929 930 ret = estrndup(old, oldlen); 931 retlen = oldlen; 932 if (!retlen) { 933 goto empty_source; 934 } 935 936 if (all) { 937 /* look for a match in the maps for this charset */ 938 for (j = 0; entity_map[j].charset != cs_terminator; j++) { 939 if (entity_map[j].charset != charset) 940 continue; 941 942 for (k = entity_map[j].basechar; k <= entity_map[j].endchar; k++) { 943 unsigned char entity[32]; 944 int entity_length = 0; 945 946 if (entity_map[j].table[k - entity_map[j].basechar] == NULL) 947 continue; 948 949 entity_length = slprintf(entity, sizeof(entity), "&%s;", entity_map[j].table[k - entity_map[j].basechar]); 950 if (entity_length >= sizeof(entity)) { 951 continue; 952 } 953 954 /* When we have MBCS entities in the tables above, this will need to handle it */ 955 replacement_len = 0; 956 switch (charset) { 957 case cs_8859_1: 958 case cs_cp1252: 959 case cs_8859_15: 960 case cs_cp1251: 961 case cs_8859_5: 962 case cs_cp866: 963 case cs_koi8r: 964 replacement[0] = k; 965 replacement[1] = '\0'; 966 replacement_len = 1; 967 break; 968 969 case cs_big5: 970 case cs_gb2312: 971 case cs_big5hkscs: 972 case cs_sjis: 973 case cs_eucjp: 974 /* we cannot properly handle those multibyte encodings 975 * with php_str_to_str. skip it. */ 976 continue; 977 978 case cs_utf_8: 979 replacement_len = php_utf32_utf8(replacement, k); 980 break; 981 982 default: 983 php_error_docref(NULL TSRMLS_CC, E_WARNING, "cannot yet handle MBCS!"); 984 efree(ret); 985 return NULL; 986 } 987 988 if (php_memnstr(ret, entity, entity_length, ret+retlen)) { 989 replaced = php_str_to_str(ret, retlen, entity, entity_length, replacement, replacement_len, &retlen); 990 efree(ret); 991 ret = replaced; 992 } 993 } 994 } 995 } 996 997 for (j = 0; basic_entities[j].charcode != 0; j++) { 998 999 if (basic_entities[j].flags && (quote_style & basic_entities[j].flags) == 0) 1000 continue; 1001 1002 replacement[0] = (unsigned char)basic_entities[j].charcode; 1003 replacement[1] = '\0'; 1004 1005 if (php_memnstr(ret, basic_entities[j].entity, basic_entities[j].entitylen, ret+retlen)) { 1006 replaced = php_str_to_str(ret, retlen, basic_entities[j].entity, basic_entities[j].entitylen, replacement, 1, &retlen); 1007 efree(ret); 1008 ret = replaced; 1009 } 1010 } 1011 1012 /* replace numeric entities & "&" */ 1013 lim = ret + retlen; 1014 for (p = ret, q = ret; p < lim;) { 1015 int code; 1016 1017 if (p[0] == '&') { 1018 if (p + 2 < lim) { 1019 if (p[1] == '#') { 1020 int invalid_code = 0; 1021 1022 if (p[2] == 'x' || p[2] == 'X') { 1023 code = strtol(p + 3, &next, 16); 1024 } else { 1025 code = strtol(p + 2, &next, 10); 1026 } 1027 1028 if ((code == '\'' && !(quote_style & ENT_HTML_QUOTE_SINGLE)) || 1029 (code == '"' && !(quote_style & ENT_HTML_QUOTE_DOUBLE))) { 1030 invalid_code = 1; 1031 } 1032 1033 if (next != NULL && *next == ';' && !invalid_code) { 1034 switch (charset) { 1035 case cs_utf_8: 1036 q += php_utf32_utf8(q, code); 1037 break; 1038 1039 case cs_8859_1: 1040 case cs_8859_5: 1041 case cs_8859_15: 1042 if ((code >= 0x80 && code < 0xa0) || code > 0xff) { 1043 invalid_code = 1; 1044 } else { 1045 *(q++) = code; 1046 } 1047 break; 1048 1049 case cs_cp1252: 1050 if (code > 0xff) { 1051 invalid_code = 1; 1052 } else { 1053 *(q++) = code; 1054 } 1055 break; 1056 1057 case cs_cp1251: 1058 case cs_cp866: 1059 case cs_big5: 1060 case cs_big5hkscs: 1061 case cs_sjis: 1062 case cs_eucjp: 1063 if (code >= 0x80) { 1064 invalid_code = 1; 1065 } else { 1066 *(q++) = code; 1067 } 1068 break; 1069 1070 case cs_gb2312: 1071 if (code >= 0x81) { 1072 invalid_code = 1; 1073 } else { 1074 *(q++) = code; 1075 } 1076 break; 1077 1078 default: 1079 /* for backwards compatilibity */ 1080 invalid_code = 1; 1081 break; 1082 } 1083 if (invalid_code) { 1084 for (; p <= next; p++) { 1085 *(q++) = *p; 1086 } 1087 } 1088 p = next + 1; 1089 } else { 1090 *(q++) = *(p++); 1091 *(q++) = *(p++); 1092 } 1093 } else if (p + 4 < lim && 1094 p[1] == 'a' && p[2] == 'm' &&p[3] == 'p' && 1095 p[4] == ';') { 1096 *(q++) = '&'; 1097 p += 5; 1098 } else { 1099 *(q++) = *(p++); 1100 *(q++) = *(p++); 1101 } 1102 } else { 1103 *(q++) = *(p++); 1104 } 1105 } else { 1106 *(q++) = *(p++); 1107 } 1108 } 1109 *q = '\0'; 1110 retlen = (size_t)(q - ret); 1111empty_source: 1112 *newlen = retlen; 1113 return ret; 1114} 1115/* }}} */ 1116 1117PHPAPI char *php_escape_html_entities(unsigned char *old, int oldlen, int *newlen, int all, int quote_style, char *hint_charset TSRMLS_DC) 1118{ 1119 return php_escape_html_entities_ex(old, oldlen, newlen, all, quote_style, hint_charset, 1 TSRMLS_CC); 1120} 1121 1122 1123/* {{{ php_escape_html_entities 1124 */ 1125PHPAPI char *php_escape_html_entities_ex(unsigned char *old, int oldlen, int *newlen, int all, int quote_style, char *hint_charset, zend_bool double_encode TSRMLS_DC) 1126{ 1127 int i, j, maxlen, len; 1128 char *replaced; 1129 enum entity_charset charset = determine_charset(hint_charset TSRMLS_CC); 1130 int matches_map; 1131 1132 maxlen = 2 * oldlen; 1133 if (maxlen < 128) 1134 maxlen = 128; 1135 replaced = emalloc (maxlen); 1136 len = 0; 1137 i = 0; 1138 while (i < oldlen) { 1139 unsigned char mbsequence[16]; /* allow up to 15 characters in a multibyte sequence */ 1140 int mbseqlen = sizeof(mbsequence); 1141 int status = SUCCESS; 1142 unsigned int this_char = get_next_char(charset, old, oldlen, &i, mbsequence, &mbseqlen, &status); 1143 1144 if(status == FAILURE) { 1145 /* invalid MB sequence */ 1146 if (quote_style & ENT_HTML_IGNORE_ERRORS) { 1147 continue; 1148 } 1149 efree(replaced); 1150 if(!PG(display_errors)) { 1151 php_error_docref(NULL TSRMLS_CC, E_WARNING, "Invalid multibyte sequence in argument"); 1152 } 1153 *newlen = 0; 1154 return STR_EMPTY_ALLOC(); 1155 } 1156 matches_map = 0; 1157 1158 if (len + 16 > maxlen) 1159 replaced = erealloc (replaced, maxlen += 128); 1160 1161 if (all) { 1162 /* look for a match in the maps for this charset */ 1163 unsigned char *rep = NULL; 1164 1165 1166 for (j = 0; entity_map[j].charset != cs_terminator; j++) { 1167 if (entity_map[j].charset == charset 1168 && this_char >= entity_map[j].basechar 1169 && this_char <= entity_map[j].endchar) { 1170 rep = (unsigned char*)entity_map[j].table[this_char - entity_map[j].basechar]; 1171 if (rep == NULL) { 1172 /* there is no entity for this position; fall through and 1173 * just output the character itself */ 1174 break; 1175 } 1176 1177 matches_map = 1; 1178 break; 1179 } 1180 } 1181 1182 if (matches_map) { 1183 int l = strlen(rep); 1184 /* increase the buffer size */ 1185 if (len + 2 + l >= maxlen) { 1186 replaced = erealloc(replaced, maxlen += 128); 1187 } 1188 1189 replaced[len++] = '&'; 1190 strlcpy(replaced + len, rep, maxlen); 1191 len += l; 1192 replaced[len++] = ';'; 1193 } 1194 } 1195 if (!matches_map) { 1196 int is_basic = 0; 1197 1198 if (this_char == '&') { 1199 if (double_encode) { 1200encode_amp: 1201 memcpy(replaced + len, "&", sizeof("&") - 1); 1202 len += sizeof("&") - 1; 1203 } else { 1204 char *e = memchr(old + i, ';', oldlen - i); 1205 char *s = old + i; 1206 1207 if (!e || (e - s) > 10) { /* minor optimization to avoid "entities" over 10 chars in length */ 1208 goto encode_amp; 1209 } else { 1210 if (*s == '#') { /* numeric entities */ 1211 s++; 1212 /* Hex (Z) */ 1213 if (*s == 'x' || *s == 'X') { 1214 s++; 1215 while (s < e) { 1216 if (!isxdigit((int)*(unsigned char *)s++)) { 1217 goto encode_amp; 1218 } 1219 } 1220 /* Dec (Z)*/ 1221 } else { 1222 while (s < e) { 1223 if (!isdigit((int)*(unsigned char *)s++)) { 1224 goto encode_amp; 1225 } 1226 } 1227 } 1228 } else { /* text entities */ 1229 while (s < e) { 1230 if (!isalnum((int)*(unsigned char *)s++)) { 1231 goto encode_amp; 1232 } 1233 } 1234 } 1235 replaced[len++] = '&'; 1236 } 1237 } 1238 is_basic = 1; 1239 } else { 1240 for (j = 0; basic_entities[j].charcode != 0; j++) { 1241 if ((basic_entities[j].charcode != this_char) || 1242 (basic_entities[j].flags && 1243 (quote_style & basic_entities[j].flags) == 0)) { 1244 continue; 1245 } 1246 1247 memcpy(replaced + len, basic_entities[j].entity, basic_entities[j].entitylen); 1248 len += basic_entities[j].entitylen; 1249 1250 is_basic = 1; 1251 break; 1252 } 1253 } 1254 1255 if (!is_basic) { 1256 /* a wide char without a named entity; pass through the original sequence */ 1257 if (mbseqlen > 1) { 1258 memcpy(replaced + len, mbsequence, mbseqlen); 1259 len += mbseqlen; 1260 } else { 1261 replaced[len++] = (unsigned char)this_char; 1262 } 1263 } 1264 } 1265 } 1266 replaced[len] = '\0'; 1267 *newlen = len; 1268 1269 return replaced; 1270 1271 1272} 1273/* }}} */ 1274 1275/* {{{ php_html_entities 1276 */ 1277static void php_html_entities(INTERNAL_FUNCTION_PARAMETERS, int all) 1278{ 1279 char *str, *hint_charset = NULL; 1280 int str_len, hint_charset_len = 0; 1281 int len; 1282 long quote_style = ENT_COMPAT; 1283 char *replaced; 1284 zend_bool double_encode = 1; 1285 1286 if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "s|ls!b", &str, &str_len, "e_style, &hint_charset, &hint_charset_len, &double_encode) == FAILURE) { 1287 return; 1288 } 1289 1290 replaced = php_escape_html_entities_ex(str, str_len, &len, all, quote_style, hint_charset, double_encode TSRMLS_CC); 1291 RETVAL_STRINGL(replaced, len, 0); 1292} 1293/* }}} */ 1294 1295#define HTML_SPECIALCHARS 0 1296#define HTML_ENTITIES 1 1297 1298/* {{{ register_html_constants 1299 */ 1300void register_html_constants(INIT_FUNC_ARGS) 1301{ 1302 REGISTER_LONG_CONSTANT("HTML_SPECIALCHARS", HTML_SPECIALCHARS, CONST_PERSISTENT|CONST_CS); 1303 REGISTER_LONG_CONSTANT("HTML_ENTITIES", HTML_ENTITIES, CONST_PERSISTENT|CONST_CS); 1304 REGISTER_LONG_CONSTANT("ENT_COMPAT", ENT_COMPAT, CONST_PERSISTENT|CONST_CS); 1305 REGISTER_LONG_CONSTANT("ENT_QUOTES", ENT_QUOTES, CONST_PERSISTENT|CONST_CS); 1306 REGISTER_LONG_CONSTANT("ENT_NOQUOTES", ENT_NOQUOTES, CONST_PERSISTENT|CONST_CS); 1307 REGISTER_LONG_CONSTANT("ENT_IGNORE", ENT_IGNORE, CONST_PERSISTENT|CONST_CS); 1308} 1309/* }}} */ 1310 1311/* {{{ proto string htmlspecialchars(string string [, int quote_style[, string charset[, bool double_encode]]]) 1312 Convert special characters to HTML entities */ 1313PHP_FUNCTION(htmlspecialchars) 1314{ 1315 php_html_entities(INTERNAL_FUNCTION_PARAM_PASSTHRU, 0); 1316} 1317/* }}} */ 1318 1319/* {{{ proto string htmlspecialchars_decode(string string [, int quote_style]) 1320 Convert special HTML entities back to characters */ 1321PHP_FUNCTION(htmlspecialchars_decode) 1322{ 1323 char *str, *new_str, *e, *p; 1324 int len, j, i, new_len; 1325 long quote_style = ENT_COMPAT; 1326 struct basic_entities_dec basic_entities_dec[8]; 1327 1328 if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "s|l", &str, &len, "e_style) == FAILURE) { 1329 return; 1330 } 1331 1332 new_str = estrndup(str, len); 1333 new_len = len; 1334 e = new_str + new_len; 1335 1336 if (!(p = memchr(new_str, '&', new_len))) { 1337 RETURN_STRINGL(new_str, new_len, 0); 1338 } 1339 1340 for (j = 0, i = 0; basic_entities[i].charcode != 0; i++) { 1341 if (basic_entities[i].flags && !(quote_style & basic_entities[i].flags)) { 1342 continue; 1343 } 1344 basic_entities_dec[j].charcode = basic_entities[i].charcode; 1345 memcpy(basic_entities_dec[j].entity, basic_entities[i].entity, basic_entities[i].entitylen + 1); 1346 basic_entities_dec[j].entitylen = basic_entities[i].entitylen; 1347 j++; 1348 } 1349 basic_entities_dec[j].charcode = '&'; 1350 basic_entities_dec[j].entitylen = sizeof("&") - 1; 1351 memcpy(basic_entities_dec[j].entity, "&", sizeof("&")); 1352 i = j + 1; 1353 1354 do { 1355 int l = e - p; 1356 1357 for (j = 0; j < i; j++) { 1358 if (basic_entities_dec[j].entitylen > l) { 1359 continue; 1360 } 1361 if (!memcmp(p, basic_entities_dec[j].entity, basic_entities_dec[j].entitylen)) { 1362 int e_len = basic_entities_dec[j].entitylen - 1; 1363 1364 *p++ = basic_entities_dec[j].charcode; 1365 memmove(p, p + e_len, (e - p - e_len)); 1366 e -= e_len; 1367 goto done; 1368 } 1369 } 1370 p++; 1371 1372done: 1373 if (p >= e) { 1374 break; 1375 } 1376 } while ((p = memchr(p, '&', (e - p)))); 1377 1378 new_len = e - new_str; 1379 1380 new_str[new_len] = '\0'; 1381 RETURN_STRINGL(new_str, new_len, 0); 1382} 1383/* }}} */ 1384 1385/* {{{ proto string html_entity_decode(string string [, int quote_style][, string charset]) 1386 Convert all HTML entities to their applicable characters */ 1387PHP_FUNCTION(html_entity_decode) 1388{ 1389 char *str, *hint_charset = NULL; 1390 int str_len, hint_charset_len = 0, len; 1391 long quote_style = ENT_COMPAT; 1392 char *replaced; 1393 1394 if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "s|ls", &str, &str_len, 1395 "e_style, &hint_charset, &hint_charset_len) == FAILURE) { 1396 return; 1397 } 1398 1399 replaced = php_unescape_html_entities(str, str_len, &len, 1, quote_style, hint_charset TSRMLS_CC); 1400 if (replaced) { 1401 RETURN_STRINGL(replaced, len, 0); 1402 } 1403 RETURN_FALSE; 1404} 1405/* }}} */ 1406 1407 1408/* {{{ proto string htmlentities(string string [, int quote_style[, string charset[, bool double_encode]]]) 1409 Convert all applicable characters to HTML entities */ 1410PHP_FUNCTION(htmlentities) 1411{ 1412 php_html_entities(INTERNAL_FUNCTION_PARAM_PASSTHRU, 1); 1413} 1414/* }}} */ 1415 1416/* {{{ proto array get_html_translation_table([int table [, int quote_style [, string charset_hint]]]) 1417 Returns the internal translation table used by htmlspecialchars and htmlentities */ 1418PHP_FUNCTION(get_html_translation_table) 1419{ 1420 long which = HTML_SPECIALCHARS, quote_style = ENT_COMPAT; 1421 unsigned int i; 1422 int j; 1423 unsigned char ind[5]; /* max # of 8-bit code units (4; for UTF-8) + 1 for \0 */ 1424 void *dummy; 1425 char *charset_hint = NULL; 1426 int charset_hint_len; 1427 enum entity_charset charset; 1428 1429 if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "|lls", 1430 &which, "e_style, &charset_hint, &charset_hint_len) == FAILURE) { 1431 return; 1432 } 1433 1434 charset = determine_charset(charset_hint TSRMLS_CC); 1435 1436 array_init(return_value); 1437 1438 switch (which) { 1439 case HTML_ENTITIES: 1440 for (j = 0; entity_map[j].charset != cs_terminator; j++) { 1441 if (entity_map[j].charset != charset) 1442 continue; 1443 for (i = 0; i <= entity_map[j].endchar - entity_map[j].basechar; i++) { 1444 char buffer[16]; 1445 unsigned k; 1446 size_t written; 1447 1448 if (entity_map[j].table[i] == NULL) 1449 continue; 1450 1451 k = i + entity_map[j].basechar; 1452 1453 switch (charset) { 1454 case cs_utf_8: 1455 written = php_utf32_utf8(ind, k); 1456 ind[written] = '\0'; 1457 break; 1458 case cs_big5: 1459 case cs_gb2312: 1460 case cs_big5hkscs: 1461 case cs_sjis: 1462 /* we have no mappings for these, but if we had... */ 1463 /* break through */ 1464 default: /* one byte */ 1465 written = 1; 1466 ind[0] = (unsigned char)k; 1467 ind[1] = '\0'; 1468 break; 1469 } 1470 1471 snprintf(buffer, sizeof(buffer), "&%s;", entity_map[j].table[i]); 1472 if (zend_hash_find(Z_ARRVAL_P(return_value), (const char*)ind, written+1, &dummy) == FAILURE) { 1473 /* in case of the single quote, which is repeated, the first one wins, 1474 * so don't replace the existint mapping */ 1475 add_assoc_string(return_value, (const char*)ind, buffer, 1); 1476 } 1477 } 1478 } 1479 /* break thru */ 1480 1481 case HTML_SPECIALCHARS: 1482 add_assoc_stringl(return_value, "&", "&", sizeof("&") - 1, 1); 1483 for (j = 0; basic_entities[j].charcode != 0; j++) { 1484 if (basic_entities[j].flags && (quote_style & basic_entities[j].flags) == 0) 1485 continue; 1486 1487 ind[0] = (unsigned char)basic_entities[j].charcode; 1488 ind[1] = '\0'; 1489 if (zend_hash_find(Z_ARRVAL_P(return_value), (const char*)ind, 2, &dummy) == FAILURE) { 1490 add_assoc_stringl(return_value, ind, basic_entities[j].entity, 1491 basic_entities[j].entitylen, 1); 1492 } 1493 } 1494 1495 break; 1496 } 1497} 1498/* }}} */ 1499 1500/* 1501 * Local variables: 1502 * tab-width: 4 1503 * c-basic-offset: 4 1504 * End: 1505 * vim600: sw=4 ts=4 fdm=marker 1506 * vim<600: sw=4 ts=4 1507 */ 1508