1/* 2 * "streamable kanji code filter and converter" 3 * Copyright (c) 1998-2002 HappySize, Inc. All rights reserved. 4 * 5 * LICENSE NOTICES 6 * 7 * This file is part of "streamable kanji code filter and converter", 8 * which is distributed under the terms of GNU Lesser General Public 9 * License (version 2) as published by the Free Software Foundation. 10 * 11 * This software is distributed in the hope that it will be useful, 12 * but WITHOUT ANY WARRANTY; without even the implied warranty of 13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 * GNU Lesser General Public License for more details. 15 * 16 * You should have received a copy of the GNU Lesser General Public 17 * License along with "streamable kanji code filter and converter"; 18 * if not, write to the Free Software Foundation, Inc., 59 Temple Place, 19 * Suite 330, Boston, MA 02111-1307 USA 20 * 21 * The author of this file: Rui Hirokawa <hirokawa@php.net> 22 * 23 */ 24/* 25 * The source code included in this files was separated from mbfilter_tw.c 26 * by moriyoshi koizumi <moriyoshi@php.net> on 4 dec 2002. 27 * 28 */ 29 30#ifdef HAVE_CONFIG_H 31#include "config.h" 32#endif 33 34#include "mbfilter.h" 35#include "mbfilter_big5.h" 36 37#include "unicode_table_big5.h" 38 39static int mbfl_filt_ident_big5(int c, mbfl_identify_filter *filter); 40 41static const unsigned char mblen_table_big5[] = { /* 0x81-0xFE */ 42 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 43 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 44 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 45 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 46 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 47 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 48 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 49 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 50 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 51 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 52 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 53 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 54 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 55 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 56 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 57 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1 58}; 59 60static const char *mbfl_encoding_big5_aliases[] = {"CN-BIG5", "BIG-FIVE", "BIGFIVE", NULL}; 61 62const mbfl_encoding mbfl_encoding_big5 = { 63 mbfl_no_encoding_big5, 64 "BIG-5", 65 "BIG5", 66 (const char *(*)[])&mbfl_encoding_big5_aliases, 67 mblen_table_big5, 68 MBFL_ENCTYPE_MBCS | MBFL_ENCTYPE_GL_UNSAFE 69}; 70 71const mbfl_encoding mbfl_encoding_cp950 = { 72 mbfl_no_encoding_cp950, 73 "CP950", 74 "BIG5", 75 NULL, 76 mblen_table_big5, 77 MBFL_ENCTYPE_MBCS | MBFL_ENCTYPE_GL_UNSAFE 78}; 79 80const struct mbfl_identify_vtbl vtbl_identify_big5 = { 81 mbfl_no_encoding_big5, 82 mbfl_filt_ident_common_ctor, 83 mbfl_filt_ident_common_dtor, 84 mbfl_filt_ident_big5 85}; 86 87const struct mbfl_identify_vtbl vtbl_identify_cp950 = { 88 mbfl_no_encoding_cp950, 89 mbfl_filt_ident_common_ctor, 90 mbfl_filt_ident_common_dtor, 91 mbfl_filt_ident_big5 92}; 93 94const struct mbfl_convert_vtbl vtbl_big5_wchar = { 95 mbfl_no_encoding_big5, 96 mbfl_no_encoding_wchar, 97 mbfl_filt_conv_common_ctor, 98 mbfl_filt_conv_common_dtor, 99 mbfl_filt_conv_big5_wchar, 100 mbfl_filt_conv_common_flush 101}; 102 103const struct mbfl_convert_vtbl vtbl_wchar_big5 = { 104 mbfl_no_encoding_wchar, 105 mbfl_no_encoding_big5, 106 mbfl_filt_conv_common_ctor, 107 mbfl_filt_conv_common_dtor, 108 mbfl_filt_conv_wchar_big5, 109 mbfl_filt_conv_common_flush 110}; 111 112const struct mbfl_convert_vtbl vtbl_cp950_wchar = { 113 mbfl_no_encoding_cp950, 114 mbfl_no_encoding_wchar, 115 mbfl_filt_conv_common_ctor, 116 mbfl_filt_conv_common_dtor, 117 mbfl_filt_conv_big5_wchar, 118 mbfl_filt_conv_common_flush 119}; 120 121const struct mbfl_convert_vtbl vtbl_wchar_cp950 = { 122 mbfl_no_encoding_wchar, 123 mbfl_no_encoding_cp950, 124 mbfl_filt_conv_common_ctor, 125 mbfl_filt_conv_common_dtor, 126 mbfl_filt_conv_wchar_big5, 127 mbfl_filt_conv_common_flush 128}; 129 130#define CK(statement) do { if ((statement) < 0) return (-1); } while (0) 131 132/* 63 + 94 = 157 or 94 */ 133static unsigned short cp950_pua_tbl[][4] = { 134 {0xe000,0xe310,0xfa40,0xfefe}, 135 {0xe311,0xeeb7,0x8e40,0xa0fe}, 136 {0xeeb8,0xf6b0,0x8140,0x8dfe}, 137 {0xf6b1,0xf70e,0xc6a1,0xc6fe}, 138 {0xf70f,0xf848,0xc740,0xc8fe}, 139}; 140 141/* 142 * Big5 => wchar 143 */ 144int 145mbfl_filt_conv_big5_wchar(int c, mbfl_convert_filter *filter) 146{ 147 int k; 148 int c1, w, c2; 149 150 switch (filter->status) { 151 case 0: 152 if (filter->from->no_encoding == mbfl_no_encoding_cp950) { 153 c1 = 0x80; 154 } else { 155 c1 = 0xa0; 156 } 157 158 if (c >= 0 && c <= 0x80) { /* latin */ 159 CK((*filter->output_function)(c, filter->data)); 160 } else if (c == 0xff) { 161 CK((*filter->output_function)(0xf8f8, filter->data)); 162 } else if (c > c1 && c < 0xff) { /* dbcs lead byte */ 163 filter->status = 1; 164 filter->cache = c; 165 } else { 166 w = c & MBFL_WCSGROUP_MASK; 167 w |= MBFL_WCSGROUP_THROUGH; 168 CK((*filter->output_function)(w, filter->data)); 169 } 170 break; 171 172 case 1: /* dbcs second byte */ 173 filter->status = 0; 174 c1 = filter->cache; 175 if ((c > 0x39 && c < 0x7f) | (c > 0xa0 && c < 0xff)) { 176 if (c < 0x7f){ 177 w = (c1 - 0xa1)*157 + (c - 0x40); 178 } else { 179 w = (c1 - 0xa1)*157 + (c - 0xa1) + 0x3f; 180 } 181 if (w >= 0 && w < big5_ucs_table_size) { 182 w = big5_ucs_table[w]; 183 } else { 184 w = 0; 185 } 186 187 if (filter->from->no_encoding == mbfl_no_encoding_cp950) { 188 /* PUA for CP950 */ 189 if (w <= 0 && 190 (((c1 >= 0xfa && c1 <= 0xfe) || (c1 >= 0x8e && c1 <= 0xa0) || 191 (c1 >= 0x81 && c1 <= 0x8d) ||(c1 >= 0xc7 && c1 <= 0xc8)) 192 && ((c > 0x39 && c < 0x7f) || (c > 0xa0 && c < 0xff))) || 193 ((c1 == 0xc6) && (c > 0xa0 && c < 0xff))) { 194 c2 = c1 << 8 | c; 195 for (k = 0; k < sizeof(cp950_pua_tbl)/(sizeof(unsigned short)*4); k++) { 196 if (c2 >= cp950_pua_tbl[k][2] && c2 <= cp950_pua_tbl[k][3]) { 197 break; 198 } 199 } 200 201 if ((cp950_pua_tbl[k][2] & 0xff) == 0x40) { 202 w = 157*(c1 - (cp950_pua_tbl[k][2]>>8)) + c - (c >= 0xa1 ? 0x62 : 0x40) 203 + cp950_pua_tbl[k][0]; 204 } else { 205 w = c2 - cp950_pua_tbl[k][2] + cp950_pua_tbl[k][0]; 206 } 207 } 208 } 209 210 if (w <= 0) { 211 w = (c1 << 8) | c; 212 w &= MBFL_WCSPLANE_MASK; 213 w |= MBFL_WCSPLANE_BIG5; 214 } 215 CK((*filter->output_function)(w, filter->data)); 216 } else if ((c >= 0 && c < 0x21) || c == 0x7f) { /* CTLs */ 217 CK((*filter->output_function)(c, filter->data)); 218 } else { 219 w = (c1 << 8) | c; 220 w &= MBFL_WCSGROUP_MASK; 221 w |= MBFL_WCSGROUP_THROUGH; 222 CK((*filter->output_function)(w, filter->data)); 223 } 224 break; 225 226 default: 227 filter->status = 0; 228 break; 229 } 230 231 return c; 232} 233 234/* 235 * wchar => Big5 236 */ 237int 238mbfl_filt_conv_wchar_big5(int c, mbfl_convert_filter *filter) 239{ 240 int k; 241 int c1, s, c2; 242 243 s = 0; 244 if (c >= ucs_a1_big5_table_min && c < ucs_a1_big5_table_max) { 245 s = ucs_a1_big5_table[c - ucs_a1_big5_table_min]; 246 } else if (c >= ucs_a2_big5_table_min && c < ucs_a2_big5_table_max) { 247 s = ucs_a2_big5_table[c - ucs_a2_big5_table_min]; 248 } else if (c >= ucs_a3_big5_table_min && c < ucs_a3_big5_table_max) { 249 s = ucs_a3_big5_table[c - ucs_a3_big5_table_min]; 250 } else if (c >= ucs_i_big5_table_min && c < ucs_i_big5_table_max) { 251 s = ucs_i_big5_table[c - ucs_i_big5_table_min]; 252 } else if (c >= ucs_pua_big5_table_min && c < ucs_pua_big5_table_max) { 253 s = ucs_pua_big5_table[c - ucs_pua_big5_table_min]; 254 } else if (c >= ucs_r1_big5_table_min && c < ucs_r1_big5_table_max) { 255 s = ucs_r1_big5_table[c - ucs_r1_big5_table_min]; 256 } else if (c >= ucs_r2_big5_table_min && c < ucs_r2_big5_table_max) { 257 s = ucs_r2_big5_table[c - ucs_r2_big5_table_min]; 258 } 259 260 if (filter->to->no_encoding == mbfl_no_encoding_cp950) { 261 if (c >= 0xe000 && c <= 0xf848) { /* PUA for CP950 */ 262 for (k = 0; k < sizeof(cp950_pua_tbl)/(sizeof(unsigned short)*4); k++) { 263 if (c <= cp950_pua_tbl[k][1]) { 264 break; 265 } 266 } 267 c1 = c - cp950_pua_tbl[k][0]; 268 if ((cp950_pua_tbl[k][2] & 0xff) == 0x40) { 269 c2 = cp950_pua_tbl[k][2] >> 8; 270 s = ((c1 / 157) + c2) << 8; c1 %= 157; 271 s |= c1 + (c1 >= 0x3f ? 0x62 : 0x40); 272 } else { 273 s = c1 + cp950_pua_tbl[k][2]; 274 } 275 } 276 277 if (c == 0x80) { 278 s = 0x80; 279 } else if (c == 0xf8f8) { 280 s = 0xff; 281 } else if (c == 0x256d) { 282 s = 0xa27e; 283 } else if (c == 0x256e) { 284 s = 0xa2a1; 285 } else if (c == 0x256f) { 286 s = 0xa2a3; 287 } else if (c == 0x2570) { 288 s = 0xa2a2; 289 } 290 } 291 292 if (s <= 0) { 293 c1 = c & ~MBFL_WCSPLANE_MASK; 294 if (c1 == MBFL_WCSPLANE_BIG5) { 295 s = c & MBFL_WCSPLANE_MASK; 296 } 297 if (c == 0) { 298 s = 0; 299 } else if (s <= 0) { 300 s = -1; 301 } 302 } 303 if (s >= 0) { 304 if (s <= 0x80 || s == 0xff) { /* latin */ 305 CK((*filter->output_function)(s, filter->data)); 306 } else { 307 CK((*filter->output_function)((s >> 8) & 0xff, filter->data)); 308 CK((*filter->output_function)(s & 0xff, filter->data)); 309 } 310 } else { 311 if (filter->illegal_mode != MBFL_OUTPUTFILTER_ILLEGAL_MODE_NONE) { 312 CK(mbfl_filt_conv_illegal_output(c, filter)); 313 } 314 } 315 316 return c; 317} 318 319static int mbfl_filt_ident_big5(int c, mbfl_identify_filter *filter) 320{ 321 int c1; 322 if (filter->encoding->no_encoding == mbfl_no_encoding_cp950) { 323 c1 = 0x80; 324 } else { 325 c1 = 0xa0; 326 } 327 328 if (filter->status) { /* kanji second char */ 329 if (c < 0x40 || (c > 0x7e && c < 0xa1) ||c > 0xfe) { /* bad */ 330 filter->flag = 1; 331 } 332 filter->status = 0; 333 } else if (c >= 0 && c < 0x80) { /* latin ok */ 334 ; 335 } else if (c > c1 && c < 0xff) { /* DBCS lead byte */ 336 filter->status = 1; 337 } else { /* bad */ 338 filter->flag = 1; 339 } 340 341 return c; 342} 343 344 345