1/*
2 * "streamable kanji code filter and converter"
3 * Copyright (c) 1998-2002 HappySize, Inc. All rights reserved.
4 *
5 * LICENSE NOTICES
6 *
7 * This file is part of "streamable kanji code filter and converter",
8 * which is distributed under the terms of GNU Lesser General Public
9 * License (version 2) as published by the Free Software Foundation.
10 *
11 * This software is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14 * GNU Lesser General Public License for more details.
15 *
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with "streamable kanji code filter and converter";
18 * if not, write to the Free Software Foundation, Inc., 59 Temple Place,
19 * Suite 330, Boston, MA  02111-1307  USA
20 *
21 * The author of this file: Rui Hirokawa <hirokawa@php.net>
22 *
23 */
24/*
 * The source code included in this file was separated from mbfilter_tw.c
26 * by moriyoshi koizumi <moriyoshi@php.net> on 4 dec 2002.
27 *
28 */
29
30#ifdef HAVE_CONFIG_H
31#include "config.h"
32#endif
33
34#include "mbfilter.h"
35#include "mbfilter_big5.h"
36
37#include "unicode_table_big5.h"
38
39static int mbfl_filt_ident_big5(int c, mbfl_identify_filter *filter);
40
/*
 * Per-byte length table, indexed by byte value 0x00-0xFF.
 * Entries 0x81-0xFE are 2; every other entry (0x00-0x80 and 0xFF) is 1.
 */
static const unsigned char mblen_table_big5[] = {
    /* 0x00 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x10 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x20 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x30 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x40 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x50 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x60 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x70 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x80 */ 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0x90 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xA0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xB0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xC0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xD0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xE0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xF0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1
};
59
60static const char *mbfl_encoding_big5_aliases[] = {"CN-BIG5", "BIG-FIVE", "BIGFIVE", NULL};
61
62const mbfl_encoding mbfl_encoding_big5 = {
63    mbfl_no_encoding_big5,
64    "BIG-5",
65    "BIG5",
66    (const char *(*)[])&mbfl_encoding_big5_aliases,
67    mblen_table_big5,
68    MBFL_ENCTYPE_MBCS | MBFL_ENCTYPE_GL_UNSAFE
69};
70
71const mbfl_encoding mbfl_encoding_cp950 = {
72    mbfl_no_encoding_cp950,
73    "CP950",
74    "BIG5",
75    NULL,
76    mblen_table_big5,
77    MBFL_ENCTYPE_MBCS | MBFL_ENCTYPE_GL_UNSAFE
78};
79
80const struct mbfl_identify_vtbl vtbl_identify_big5 = {
81    mbfl_no_encoding_big5,
82    mbfl_filt_ident_common_ctor,
83    mbfl_filt_ident_common_dtor,
84    mbfl_filt_ident_big5
85};
86
87const struct mbfl_identify_vtbl vtbl_identify_cp950 = {
88    mbfl_no_encoding_cp950,
89    mbfl_filt_ident_common_ctor,
90    mbfl_filt_ident_common_dtor,
91    mbfl_filt_ident_big5
92};
93
94const struct mbfl_convert_vtbl vtbl_big5_wchar = {
95    mbfl_no_encoding_big5,
96    mbfl_no_encoding_wchar,
97    mbfl_filt_conv_common_ctor,
98    mbfl_filt_conv_common_dtor,
99    mbfl_filt_conv_big5_wchar,
100    mbfl_filt_conv_common_flush
101};
102
103const struct mbfl_convert_vtbl vtbl_wchar_big5 = {
104    mbfl_no_encoding_wchar,
105    mbfl_no_encoding_big5,
106    mbfl_filt_conv_common_ctor,
107    mbfl_filt_conv_common_dtor,
108    mbfl_filt_conv_wchar_big5,
109    mbfl_filt_conv_common_flush
110};
111
112const struct mbfl_convert_vtbl vtbl_cp950_wchar = {
113    mbfl_no_encoding_cp950,
114    mbfl_no_encoding_wchar,
115    mbfl_filt_conv_common_ctor,
116    mbfl_filt_conv_common_dtor,
117    mbfl_filt_conv_big5_wchar,
118    mbfl_filt_conv_common_flush
119};
120
121const struct mbfl_convert_vtbl vtbl_wchar_cp950 = {
122    mbfl_no_encoding_wchar,
123    mbfl_no_encoding_cp950,
124    mbfl_filt_conv_common_ctor,
125    mbfl_filt_conv_common_dtor,
126    mbfl_filt_conv_wchar_big5,
127    mbfl_filt_conv_common_flush
128};
129
/*
 * Evaluate `statement` exactly once; when its value is negative, make the
 * enclosing function return -1.
 */
#define CK(statement)               \
    do {                            \
        if ((statement) < 0) {      \
            return (-1);            \
        }                           \
    } while (0)
131
/*
 * CP950 private-use-area mapping, one row per contiguous range:
 *   { first Unicode code point, last Unicode code point,
 *     first double-byte code,   last double-byte code }
 *
 * A lead byte spans 63 + 94 = 157 cells when the range starts at trail byte
 * 0x40 (63 trail bytes 0x40-0x7e followed by 94 trail bytes 0xa1-0xfe), or
 * 94 cells when only trail bytes 0xa1-0xfe are used (the 0xc6a1 row).
 *
 * The table is only ever read, hence const.
 */
static const unsigned short cp950_pua_tbl[][4] = {
    {0xe000, 0xe310, 0xfa40, 0xfefe},
    {0xe311, 0xeeb7, 0x8e40, 0xa0fe},
    {0xeeb8, 0xf6b0, 0x8140, 0x8dfe},
    {0xf6b1, 0xf70e, 0xc6a1, 0xc6fe},
    {0xf70f, 0xf848, 0xc740, 0xc8fe},
};
140
141/*
142 * Big5 => wchar
143 */
144int
145mbfl_filt_conv_big5_wchar(int c, mbfl_convert_filter *filter)
146{
147    int k;
148    int c1, w, c2;
149
150    switch (filter->status) {
151    case 0:
152        if (filter->from->no_encoding == mbfl_no_encoding_cp950) {
153            c1 = 0x80;
154        } else {
155            c1 = 0xa0;
156        }
157
158        if (c >= 0 && c <= 0x80) {  /* latin */
159            CK((*filter->output_function)(c, filter->data));
160        } else if (c == 0xff) {
161            CK((*filter->output_function)(0xf8f8, filter->data));
162        } else if (c > c1 && c < 0xff) {    /* dbcs lead byte */
163            filter->status = 1;
164            filter->cache = c;
165        } else {
166            w = c & MBFL_WCSGROUP_MASK;
167            w |= MBFL_WCSGROUP_THROUGH;
168            CK((*filter->output_function)(w, filter->data));
169        }
170        break;
171
172    case 1:     /* dbcs second byte */
173        filter->status = 0;
174        c1 = filter->cache;
175        if ((c > 0x39 && c < 0x7f) | (c > 0xa0 && c < 0xff)) {
176            if (c < 0x7f){
177                w = (c1 - 0xa1)*157 + (c - 0x40);
178            } else {
179                w = (c1 - 0xa1)*157 + (c - 0xa1) + 0x3f;
180            }
181            if (w >= 0 && w < big5_ucs_table_size) {
182                w = big5_ucs_table[w];
183            } else {
184                w = 0;
185            }
186
187            if (filter->from->no_encoding == mbfl_no_encoding_cp950) {
188                /* PUA for CP950 */
189                if (w <= 0 &&
190                    (((c1 >= 0xfa && c1 <= 0xfe) || (c1 >= 0x8e && c1 <= 0xa0) ||
191                      (c1 >= 0x81 && c1 <= 0x8d) ||(c1 >= 0xc7 && c1 <= 0xc8))
192                     && ((c > 0x39 && c < 0x7f) || (c > 0xa0 && c < 0xff))) ||
193                    ((c1 == 0xc6) && (c > 0xa0 && c < 0xff))) {
194                    c2 = c1 << 8 | c;
195                    for (k = 0; k < sizeof(cp950_pua_tbl)/(sizeof(unsigned short)*4); k++) {
196                        if (c2 >= cp950_pua_tbl[k][2] && c2 <= cp950_pua_tbl[k][3]) {
197                            break;
198                        }
199                    }
200
201                    if ((cp950_pua_tbl[k][2] & 0xff) == 0x40) {
202                        w = 157*(c1 - (cp950_pua_tbl[k][2]>>8)) + c - (c >= 0xa1 ? 0x62 : 0x40)
203                            + cp950_pua_tbl[k][0];
204                    } else {
205                        w = c2 - cp950_pua_tbl[k][2] + cp950_pua_tbl[k][0];
206                    }
207                }
208            }
209
210            if (w <= 0) {
211                w = (c1 << 8) | c;
212                w &= MBFL_WCSPLANE_MASK;
213                w |= MBFL_WCSPLANE_BIG5;
214            }
215            CK((*filter->output_function)(w, filter->data));
216        } else if ((c >= 0 && c < 0x21) || c == 0x7f) {     /* CTLs */
217            CK((*filter->output_function)(c, filter->data));
218        } else {
219            w = (c1 << 8) | c;
220            w &= MBFL_WCSGROUP_MASK;
221            w |= MBFL_WCSGROUP_THROUGH;
222            CK((*filter->output_function)(w, filter->data));
223        }
224        break;
225
226    default:
227        filter->status = 0;
228        break;
229    }
230
231    return c;
232}
233
234/*
235 * wchar => Big5
236 */
237int
238mbfl_filt_conv_wchar_big5(int c, mbfl_convert_filter *filter)
239{
240    int k;
241    int c1, s, c2;
242
243    s = 0;
244    if (c >= ucs_a1_big5_table_min && c < ucs_a1_big5_table_max) {
245        s = ucs_a1_big5_table[c - ucs_a1_big5_table_min];
246    } else if (c >= ucs_a2_big5_table_min && c < ucs_a2_big5_table_max) {
247        s = ucs_a2_big5_table[c - ucs_a2_big5_table_min];
248    } else if (c >= ucs_a3_big5_table_min && c < ucs_a3_big5_table_max) {
249        s = ucs_a3_big5_table[c - ucs_a3_big5_table_min];
250    } else if (c >= ucs_i_big5_table_min && c < ucs_i_big5_table_max) {
251        s = ucs_i_big5_table[c - ucs_i_big5_table_min];
252    } else if (c >= ucs_pua_big5_table_min && c < ucs_pua_big5_table_max) {
253        s = ucs_pua_big5_table[c - ucs_pua_big5_table_min];
254    } else if (c >= ucs_r1_big5_table_min && c < ucs_r1_big5_table_max) {
255        s = ucs_r1_big5_table[c - ucs_r1_big5_table_min];
256    } else if (c >= ucs_r2_big5_table_min && c < ucs_r2_big5_table_max) {
257        s = ucs_r2_big5_table[c - ucs_r2_big5_table_min];
258    }
259
260    if (filter->to->no_encoding == mbfl_no_encoding_cp950) {
261        if (c >= 0xe000 && c <= 0xf848) { /* PUA for CP950 */
262            for (k = 0; k < sizeof(cp950_pua_tbl)/(sizeof(unsigned short)*4); k++) {
263                if (c <= cp950_pua_tbl[k][1]) {
264                    break;
265                }
266            }
267            c1 = c - cp950_pua_tbl[k][0];
268            if ((cp950_pua_tbl[k][2] & 0xff) == 0x40) {
269                c2 = cp950_pua_tbl[k][2] >> 8;
270                s = ((c1 / 157) + c2) << 8; c1 %= 157;
271                s |= c1 + (c1 >= 0x3f ? 0x62 : 0x40);
272            } else {
273                s = c1 + cp950_pua_tbl[k][2];
274            }
275        }
276
277        if (c == 0x80) {
278            s = 0x80;
279        } else if (c == 0xf8f8) {
280            s = 0xff;
281        } else if (c == 0x256d) {
282            s = 0xa27e;
283        } else if (c == 0x256e) {
284            s = 0xa2a1;
285        } else if (c == 0x256f) {
286            s = 0xa2a3;
287        } else if (c == 0x2570) {
288            s = 0xa2a2;
289        }
290    }
291
292    if (s <= 0) {
293        c1 = c & ~MBFL_WCSPLANE_MASK;
294        if (c1 == MBFL_WCSPLANE_BIG5) {
295            s = c & MBFL_WCSPLANE_MASK;
296        }
297        if (c == 0) {
298            s = 0;
299        } else if (s <= 0) {
300            s = -1;
301        }
302    }
303    if (s >= 0) {
304        if (s <= 0x80 || s == 0xff) {   /* latin */
305            CK((*filter->output_function)(s, filter->data));
306        } else {
307            CK((*filter->output_function)((s >> 8) & 0xff, filter->data));
308            CK((*filter->output_function)(s & 0xff, filter->data));
309        }
310    } else {
311        if (filter->illegal_mode != MBFL_OUTPUTFILTER_ILLEGAL_MODE_NONE) {
312            CK(mbfl_filt_conv_illegal_output(c, filter));
313        }
314    }
315
316    return c;
317}
318
319static int mbfl_filt_ident_big5(int c, mbfl_identify_filter *filter)
320{
321    int c1;
322    if (filter->encoding->no_encoding == mbfl_no_encoding_cp950) {
323        c1 = 0x80;
324    } else {
325        c1 = 0xa0;
326    }
327
328    if (filter->status) {       /* kanji second char */
329        if (c < 0x40 || (c > 0x7e && c < 0xa1) ||c > 0xfe) {    /* bad */
330            filter->flag = 1;
331        }
332        filter->status = 0;
333    } else if (c >= 0 && c < 0x80) {    /* latin  ok */
334        ;
335    } else if (c > c1 && c < 0xff) {    /* DBCS lead byte */
336        filter->status = 1;
337    } else {                            /* bad */
338        filter->flag = 1;
339    }
340
341    return c;
342}
343
344
345