imapext-2007

view src/c-client/utf8.c @ 0:ada5e610ab86

imap-2007e
author yuuji@gentei.org
date Mon, 14 Sep 2009 15:17:45 +0900
parents
children
line source
1 /* ========================================================================
2 * Copyright 1988-2008 University of Washington
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 *
11 * ========================================================================
12 */
14 /*
15 * Program: UTF-8 routines
16 *
17 * Author: Mark Crispin
18 * Networks and Distributed Computing
19 * Computing & Communications
20 * University of Washington
21 * Administration Building, AG-44
22 * Seattle, WA 98195
23 * Internet: MRC@CAC.Washington.EDU
24 *
25 * Date: 11 June 1997
26 * Last Edited: 17 January 2008
27 */
30 #include <stdio.h>
31 #include <ctype.h>
32 #include "c-client.h"
34 /* *** IMPORTANT ***
35 *
36 * There is a very important difference between "character set" and "charset",
37 * and the comments in this file reflect these differences. A "character set"
38 * (also known as "coded character set") is a mapping between codepoints and
39 * characters. A "charset" is as defined in MIME, and incorporates one or more
40 * coded character sets in a character encoding scheme. See RFC 2130 for more
41 * details.
42 */
45 /* Character set conversion tables */
47 #include "iso_8859.c" /* 8-bit single-byte coded graphic */
48 #include "koi8_r.c" /* Cyrillic - Russia */
49 #include "koi8_u.c" /* Cyrillic - Ukraine */
50 #include "tis_620.c" /* Thai */
51 #include "viscii.c" /* Vietnamese */
52 #include "windows.c" /* Windows */
53 #include "ibm.c" /* IBM */
54 #include "gb_2312.c" /* Chinese (PRC) - simplified */
55 #include "gb_12345.c" /* Chinese (PRC) - traditional */
56 #include "jis_0208.c" /* Japanese - basic */
57 #include "jis_0212.c" /* Japanese - supplementary */
58 #include "ksc_5601.c" /* Korean */
59 #include "big5.c" /* Taiwanese (ROC) - industrial standard */
60 #include "cns11643.c" /* Taiwanese (ROC) - national standard */
63 #include "widths.c" /* Unicode character widths */
64 #include "tmap.c" /* Unicode titlecase mapping */
65 #include "decomtab.c" /* Unicode decomposions */
67 /* EUC parameters
 *
 * Each utf8_eucparam initializer below lists, in order, a base ku, a base
 * ten, a max ku, a max ten and a conversion table pointer.  utf8_rmap_gen()
 * in this file reads these back through the members base_ku, base_ten,
 * max_ku, max_ten and tab.
 * NOTE(review): the member order is assumed to match the initializer order;
 * confirm against the struct utf8_eucparam declaration.
 *
 * Every table is compiled only when its xxxTOUNICODE macro is defined.
 */
69 #ifdef GBTOUNICODE /* PRC simplified Chinese */
70 static const struct utf8_eucparam gb_param = {
71 BASE_GB2312_KU,BASE_GB2312_TEN,MAX_GB2312_KU,MAX_GB2312_TEN,
72 (void *) gb2312tab};
73 #endif
76 #ifdef GB12345TOUNICODE /* PRC traditional Chinese */
77 static const struct utf8_eucparam gbt_param = {
78 BASE_GB12345_KU,BASE_GB12345_TEN,MAX_GB12345_KU,MAX_GB12345_TEN,
79 (void *) gb12345tab};
80 #endif
83 #ifdef BIG5TOUNICODE /* ROC traditional Chinese */
/* Two entries, one per ten range; only the first carries a table pointer.
 * utf8_rmap_gen() indexes both ranges through the first entry's table and
 * requires the ku parameters of the two entries to agree.
 */
84 static const struct utf8_eucparam big5_param[] = {
85 {BASE_BIG5_KU,BASE_BIG5_TEN_0,MAX_BIG5_KU,MAX_BIG5_TEN_0,(void *) big5tab},
86 {BASE_BIG5_KU,BASE_BIG5_TEN_1,MAX_BIG5_KU,MAX_BIG5_TEN_1,NIL}
87 };
88 #endif
91 #ifdef JISTOUNICODE /* Japanese */
/* Three entries: JIS X 0208, 8-bit kana, and JIS X 0212 (or an all-zero
 * placeholder when JIS0212TOUNICODE is not defined).  utf8_rmap_gen() builds
 * the CT_EUC reverse map from the first entry only.
 */
92 static const struct utf8_eucparam jis_param[] = {
93 {BASE_JIS0208_KU,BASE_JIS0208_TEN,MAX_JIS0208_KU,MAX_JIS0208_TEN,
94 (void *) jis0208tab},
95 {MIN_KANA_8,0,MAX_KANA_8,0,(void *) KANA_8},
96 #ifdef JIS0212TOUNICODE /* Japanese extended */
97 {BASE_JIS0212_KU,BASE_JIS0212_TEN,MAX_JIS0212_KU,MAX_JIS0212_TEN,
98 (void *) jis0212tab}
99 #else
100 {0,0,0,0,NIL}
101 #endif
102 };
103 #endif
106 #ifdef KSCTOUNICODE /* Korean */
107 static const struct utf8_eucparam ksc_param = {
108 BASE_KSC5601_KU,BASE_KSC5601_TEN,MAX_KSC5601_KU,MAX_KSC5601_TEN,
109 (void *) ksc5601tab};
110 #endif
112 /* List of supported charsets
 *
 * Each initializer supplies, in order: the charset name, a CT_xxx type code,
 * CF_xxx flag bits, a conversion table pointer (or NIL when the type needs
 * none), SC_xxx script bits (or NIL), and a final string or NIL.
 * NOTE(review): the final string looks like the name of a preferred or
 * equivalent charset; confirm against the CHARSET declaration.
 *
 * utf8_charset() scans this table linearly and returns the first entry whose
 * name matches, and utf8_badcharset() lists the names in table order, so the
 * order of entries is visible to callers.  The closing NIL entry terminates
 * both scans (they stop at the first entry with a NIL name).
 *
 * Entries for multi-byte charsets are present only when the corresponding
 * xxxTOUNICODE macros are defined.
 */
114 static const CHARSET utf8_csvalid[] = {
115 {"US-ASCII",CT_ASCII,CF_PRIMARY | CF_DISPLAY | CF_POSTING,
116 NIL,NIL,NIL},
117 {"UTF-8",CT_UTF8,CF_PRIMARY | CF_DISPLAY | CF_POSTING,
118 NIL,SC_UNICODE,NIL},
119 {"UTF-7",CT_UTF7,CF_PRIMARY | CF_POSTING | CF_UNSUPRT,
120 NIL,SC_UNICODE,"UTF-8"},
121 {"ISO-8859-1",CT_1BYTE0,CF_PRIMARY | CF_DISPLAY | CF_POSTING,
122 NIL,SC_LATIN_1,NIL},
123 {"ISO-8859-2",CT_1BYTE,CF_PRIMARY | CF_DISPLAY | CF_POSTING,
124 (void *) iso8859_2tab,SC_LATIN_2,NIL},
125 {"ISO-8859-3",CT_1BYTE,CF_PRIMARY | CF_DISPLAY | CF_POSTING,
126 (void *) iso8859_3tab,SC_LATIN_3,NIL},
127 {"ISO-8859-4",CT_1BYTE,CF_PRIMARY | CF_DISPLAY | CF_POSTING,
128 (void *) iso8859_4tab,SC_LATIN_4,NIL},
129 {"ISO-8859-5",CT_1BYTE,CF_PRIMARY | CF_DISPLAY | CF_POSTING,
130 (void *) iso8859_5tab,SC_CYRILLIC,"KOI8-R"},
131 {"ISO-8859-6",CT_1BYTE,CF_PRIMARY | CF_DISPLAY | CF_POSTING,
132 (void *) iso8859_6tab,SC_ARABIC,NIL},
133 {"ISO-8859-7",CT_1BYTE,CF_PRIMARY | CF_DISPLAY | CF_POSTING,
134 (void *) iso8859_7tab,SC_GREEK,NIL},
135 {"ISO-8859-8",CT_1BYTE,CF_PRIMARY | CF_DISPLAY | CF_POSTING,
136 (void *) iso8859_8tab,SC_HEBREW,NIL},
137 {"ISO-8859-9",CT_1BYTE,CF_PRIMARY | CF_DISPLAY | CF_POSTING,
138 (void *) iso8859_9tab,SC_LATIN_5,NIL},
139 {"ISO-8859-10",CT_1BYTE,CF_PRIMARY | CF_DISPLAY | CF_POSTING,
140 (void *) iso8859_10tab,SC_LATIN_6,NIL},
141 {"ISO-8859-11",CT_1BYTE,CF_PRIMARY | CF_DISPLAY | CF_POSTING,
142 (void *) iso8859_11tab,SC_THAI,NIL},
143 #if 0 /* ISO 8859-12 reserved for ISCII(?) */
144 {"ISO-8859-12",CT_1BYTE,CF_PRIMARY | CF_DISPLAY | CF_POSTING,
145 (void *) iso8859_12tab,NIL,NIL},
146 #endif
147 {"ISO-8859-13",CT_1BYTE,CF_PRIMARY | CF_DISPLAY | CF_POSTING,
148 (void *) iso8859_13tab,SC_LATIN_7,NIL},
149 {"ISO-8859-14",CT_1BYTE,CF_PRIMARY | CF_DISPLAY | CF_POSTING,
150 (void *) iso8859_14tab,SC_LATIN_8,NIL},
151 {"ISO-8859-15",CT_1BYTE,CF_PRIMARY | CF_DISPLAY | CF_POSTING,
152 (void *) iso8859_15tab,SC_LATIN_9,NIL},
153 {"ISO-8859-16",CT_1BYTE,CF_PRIMARY | CF_DISPLAY | CF_POSTING,
154 (void *) iso8859_16tab,SC_LATIN_10,NIL},
155 {"KOI8-R",CT_1BYTE,CF_PRIMARY | CF_DISPLAY | CF_POSTING,
156 (void *) koi8rtab,SC_CYRILLIC,NIL},
157 {"KOI8-U",CT_1BYTE,CF_PRIMARY | CF_DISPLAY | CF_POSTING,
158 (void *) koi8utab,SC_CYRILLIC | SC_UKRANIAN,NIL},
159 {"KOI8-RU",CT_1BYTE,CF_DISPLAY,
160 (void *) koi8utab,SC_CYRILLIC | SC_UKRANIAN,"KOI8-U"},
161 {"TIS-620",CT_1BYTE,CF_PRIMARY | CF_DISPLAY | CF_POSTING,
162 (void *) tis620tab,SC_THAI,"ISO-8859-11"},
163 {"VISCII",CT_1BYTE8,CF_PRIMARY | CF_DISPLAY | CF_POSTING,
164 (void *) visciitab,SC_VIETNAMESE,NIL},
166 #ifdef GBTOUNICODE
167 {"GBK",CT_DBYTE,CF_PRIMARY | CF_DISPLAY | CF_POSTING,
168 (void *) &gb_param,SC_CHINESE_SIMPLIFIED,NIL},
169 {"GB2312",CT_DBYTE,CF_PRIMARY | CF_DISPLAY | CF_POSTING,
170 (void *) &gb_param,SC_CHINESE_SIMPLIFIED,"GBK"},
171 {"CN-GB",CT_DBYTE,CF_DISPLAY,
172 (void *) &gb_param,SC_CHINESE_SIMPLIFIED,"GBK"},
173 #ifdef CNS1TOUNICODE
174 {"ISO-2022-CN",CT_2022,CF_PRIMARY | CF_UNSUPRT,
175 NIL,SC_CHINESE_SIMPLIFIED | SC_CHINESE_TRADITIONAL,
176 NIL},
177 #endif
178 #endif
179 #ifdef GB12345TOUNICODE
180 {"CN-GB-12345",CT_DBYTE,CF_PRIMARY | CF_DISPLAY,
181 (void *) &gbt_param,SC_CHINESE_TRADITIONAL,"BIG5"},
182 #endif
183 #ifdef BIG5TOUNICODE
184 {"BIG5",CT_DBYTE2,CF_PRIMARY | CF_DISPLAY | CF_POSTING,
185 (void *) big5_param,SC_CHINESE_TRADITIONAL,NIL},
186 {"CN-BIG5",CT_DBYTE2,CF_DISPLAY,
187 (void *) big5_param,SC_CHINESE_TRADITIONAL,"BIG5"},
188 {"BIG-5",CT_DBYTE2,CF_DISPLAY,
189 (void *) big5_param,SC_CHINESE_TRADITIONAL,"BIG5"},
190 #endif
191 #ifdef JISTOUNICODE
192 {"ISO-2022-JP",CT_2022,CF_PRIMARY | CF_DISPLAY | CF_POSTING,
193 NIL,SC_JAPANESE,NIL},
194 {"EUC-JP",CT_EUC,CF_PRIMARY | CF_DISPLAY,
195 (void *) jis_param,SC_JAPANESE,"ISO-2022-JP"},
196 {"SHIFT_JIS",CT_SJIS,CF_PRIMARY | CF_DISPLAY,
197 NIL,SC_JAPANESE,"ISO-2022-JP"},
198 {"SHIFT-JIS",CT_SJIS,CF_PRIMARY | CF_DISPLAY,
199 NIL,SC_JAPANESE,"ISO-2022-JP"},
200 #ifdef JIS0212TOUNICODE
201 {"ISO-2022-JP-1",CT_2022,CF_UNSUPRT,
202 NIL,SC_JAPANESE,"ISO-2022-JP"},
203 #ifdef GBTOUNICODE
204 #ifdef KSCTOUNICODE
205 {"ISO-2022-JP-2",CT_2022,CF_UNSUPRT,
206 NIL,
207 SC_LATIN_1 | SC_LATIN_2 | SC_LATIN_3 | SC_LATIN_4 | SC_LATIN_5 |
208 SC_LATIN_6 | SC_LATIN_7 | SC_LATIN_8 | SC_LATIN_9 | SC_LATIN_10 |
209 SC_ARABIC | SC_CYRILLIC | SC_GREEK | SC_HEBREW | SC_THAI |
210 SC_VIETNAMESE | SC_CHINESE_SIMPLIFIED | SC_JAPANESE | SC_KOREAN
211 #ifdef CNS1TOUNICODE
212 | SC_CHINESE_TRADITIONAL
213 #endif
214 ,"UTF-8"},
215 #endif
216 #endif
217 #endif
218 #endif
220 #ifdef KSCTOUNICODE
221 {"ISO-2022-KR",CT_2022,CF_PRIMARY | CF_DISPLAY | CF_UNSUPRT,
222 NIL,SC_KOREAN,"EUC-KR"},
223 {"EUC-KR",CT_DBYTE,CF_PRIMARY | CF_DISPLAY | CF_POSTING,
224 (void *) &ksc_param,SC_KOREAN,NIL},
225 {"KSC5601",CT_DBYTE,CF_PRIMARY | CF_DISPLAY,
226 (void *) &ksc_param,SC_KOREAN,"EUC-KR"},
227 {"KSC_5601",CT_DBYTE,CF_PRIMARY | CF_DISPLAY,
228 (void *) &ksc_param,SC_KOREAN,"EUC-KR"},
229 {"KS_C_5601-1987",CT_DBYTE,CF_DISPLAY,
230 (void *) &ksc_param,SC_KOREAN,"EUC-KR"},
231 {"KS_C_5601-1989",CT_DBYTE,CF_DISPLAY,
232 (void *) &ksc_param,SC_KOREAN,"EUC-KR"},
233 {"KS_C_5601-1992",CT_DBYTE,CF_DISPLAY,
234 (void *) &ksc_param,SC_KOREAN,"EUC-KR"},
235 {"KS_C_5601-1997",CT_DBYTE,CF_DISPLAY,
236 (void *) &ksc_param,SC_KOREAN,"EUC-KR"},
237 #endif
239 /* deep sigh */
240 {"WINDOWS-874",CT_1BYTE,CF_PRIMARY | CF_DISPLAY,
241 (void *) windows_874tab,SC_THAI,"ISO-8859-11"},
242 {"CP874",CT_1BYTE,CF_DISPLAY,
243 (void *) windows_874tab,SC_THAI,"ISO-8859-11"},
244 #ifdef GBTOUNICODE
245 {"WINDOWS-936",CT_DBYTE,CF_PRIMARY | CF_DISPLAY,
246 (void *) &gb_param,SC_CHINESE_SIMPLIFIED,"GBK"},
247 {"CP936",CT_DBYTE,CF_DISPLAY,
248 (void *) &gb_param,SC_CHINESE_SIMPLIFIED,"GBK"},
249 #endif
250 #ifdef KSCTOUNICODE
251 {"WINDOWS-949",CT_DBYTE,CF_PRIMARY | CF_DISPLAY,
252 (void *) &ksc_param,SC_KOREAN,"EUC-KR"},
253 {"CP949",CT_DBYTE,CF_DISPLAY,
254 (void *) &ksc_param,SC_KOREAN,"EUC-KR"},
255 {"X-WINDOWS-949",CT_DBYTE,CF_PRIMARY | CF_DISPLAY,
256 (void *) &ksc_param,SC_KOREAN,"EUC-KR"},
257 #endif
258 {"WINDOWS-1250",CT_1BYTE,CF_PRIMARY | CF_DISPLAY,
259 (void *) windows_1250tab,SC_LATIN_2,"ISO-8859-2"},
260 {"CP1250",CT_1BYTE,CF_DISPLAY,
261 (void *) windows_1250tab,SC_LATIN_2,"ISO-8859-2"},
262 {"WINDOWS-1251",CT_1BYTE,CF_PRIMARY | CF_DISPLAY | CF_POSTING,
263 (void *) windows_1251tab,SC_CYRILLIC,"KOI8-R"},
264 {"CP1251",CT_1BYTE,CF_DISPLAY,
265 (void *) windows_1251tab,SC_CYRILLIC,"KOI8-R"},
266 {"WINDOWS-1252",CT_1BYTE,CF_PRIMARY | CF_DISPLAY,
267 (void *) windows_1252tab,SC_LATIN_1,"ISO-8859-1"},
268 {"CP1252",CT_1BYTE,CF_DISPLAY,
269 (void *) windows_1252tab,SC_LATIN_1,"ISO-8859-1"},
270 {"WINDOWS-1253",CT_1BYTE,CF_PRIMARY | CF_DISPLAY,
271 (void *) windows_1253tab,SC_GREEK,"ISO-8859-7"},
272 {"CP1253",CT_1BYTE,CF_DISPLAY,
273 (void *) windows_1253tab,SC_GREEK,"ISO-8859-7"},
274 {"WINDOWS-1254",CT_1BYTE,CF_PRIMARY | CF_DISPLAY,
275 (void *) windows_1254tab,SC_LATIN_5,"ISO-8859-9"},
276 {"CP1254",CT_1BYTE,CF_DISPLAY,
277 (void *) windows_1254tab,SC_LATIN_5,"ISO-8859-9"},
278 {"WINDOWS-1255",CT_1BYTE,CF_PRIMARY | CF_DISPLAY,
279 (void *) windows_1255tab,SC_HEBREW,"ISO-8859-8"},
280 {"CP1255",CT_1BYTE,CF_DISPLAY,
281 (void *) windows_1255tab,SC_HEBREW,"ISO-8859-8"},
282 {"WINDOWS-1256",CT_1BYTE,CF_PRIMARY | CF_DISPLAY,
283 (void *) windows_1256tab,SC_ARABIC,"ISO-8859-6"},
284 {"CP1256",CT_1BYTE,CF_DISPLAY,
285 (void *) windows_1256tab,SC_ARABIC,"ISO-8859-6"},
286 {"WINDOWS-1257",CT_1BYTE,CF_PRIMARY | CF_DISPLAY,
287 (void *) windows_1257tab,SC_LATIN_7,"ISO-8859-13"},
288 {"CP1257",CT_1BYTE,CF_DISPLAY,
289 (void *) windows_1257tab,SC_LATIN_7,"ISO-8859-13"},
290 {"WINDOWS-1258",CT_1BYTE,CF_PRIMARY | CF_DISPLAY,
291 (void *) windows_1258tab,SC_VIETNAMESE,"VISCII"},
292 {"CP1258",CT_1BYTE,CF_DISPLAY,
293 (void *) windows_1258tab,SC_VIETNAMESE,"VISCII"},
295 /* deeper sigh */
296 {"IBM367",CT_ASCII,CF_PRIMARY | CF_DISPLAY,
297 NIL,NIL,"US-ASCII"},
298 {"IBM437",CT_1BYTE,CF_PRIMARY | CF_DISPLAY,
299 (void *) ibm_437tab,SC_LATIN_1,"ISO-8859-1"},
300 {"IBM737",CT_1BYTE,CF_PRIMARY | CF_DISPLAY,
301 (void *) ibm_737tab,SC_GREEK,"ISO-8859-7"},
302 {"IBM775",CT_1BYTE,CF_PRIMARY | CF_DISPLAY,
303 (void *) ibm_775tab,SC_LATIN_7,"ISO-8859-13"},
304 {"IBM850",CT_1BYTE,CF_PRIMARY | CF_DISPLAY,
305 (void *) ibm_850tab,SC_LATIN_1,"ISO-8859-1"},
306 {"IBM852",CT_1BYTE,CF_PRIMARY | CF_DISPLAY,
307 (void *) ibm_852tab,SC_LATIN_2,"ISO-8859-2"},
308 {"IBM855",CT_1BYTE,CF_PRIMARY | CF_DISPLAY,
309 (void *) ibm_855tab,SC_CYRILLIC,"ISO-8859-5"},
310 {"IBM857",CT_1BYTE,CF_PRIMARY | CF_DISPLAY,
311 (void *) ibm_857tab,SC_LATIN_5,"ISO-8859-9"},
312 {"IBM860",CT_1BYTE,CF_PRIMARY | CF_DISPLAY,
313 (void *) ibm_860tab,SC_LATIN_1,"ISO-8859-1"},
314 {"IBM861",CT_1BYTE,CF_PRIMARY | CF_DISPLAY,
315 (void *) ibm_861tab,SC_LATIN_6,"ISO-8859-10"},
316 {"IBM862",CT_1BYTE,CF_PRIMARY | CF_DISPLAY,
317 (void *) ibm_862tab,SC_HEBREW,"ISO-8859-8"},
318 {"IBM863",CT_1BYTE,CF_PRIMARY | CF_DISPLAY,
319 (void *) ibm_863tab,SC_LATIN_1,"ISO-8859-1"},
320 {"IBM864",CT_1BYTE,CF_PRIMARY | CF_DISPLAY,
321 (void *) ibm_864tab,SC_ARABIC,"ISO-8859-6"},
322 {"IBM865",CT_1BYTE,CF_PRIMARY | CF_DISPLAY,
323 (void *) ibm_865tab,SC_LATIN_6,"ISO-8859-10"},
324 {"IBM866",CT_1BYTE,CF_PRIMARY | CF_DISPLAY,
325 (void *) ibm_866tab,SC_CYRILLIC,"KOI8-R"},
326 {"IBM869",CT_1BYTE,CF_PRIMARY | CF_DISPLAY,
327 (void *) ibm_869tab,SC_GREEK,"ISO-8859-7"},
328 {"IBM874",CT_1BYTE,CF_PRIMARY | CF_DISPLAY,
329 (void *) ibm_874tab,SC_THAI,"ISO-8859-11"},
330 /* deepest sigh */
331 {"ANSI_X3.4-1968",CT_ASCII,CF_DISPLAY,
332 NIL,NIL,"US-ASCII"},
333 {"UNICODE-1-1-UTF-7",CT_UTF7,CF_UNSUPRT,
334 NIL,SC_UNICODE,"UTF-8"},
335 /* these should never appear in email */
336 {"UCS-2",CT_UCS2,CF_PRIMARY | CF_DISPLAY | CF_NOEMAIL,
337 NIL,SC_UNICODE,"UTF-8"},
338 {"UCS-4",CT_UCS4,CF_PRIMARY | CF_DISPLAY | CF_NOEMAIL,
339 NIL,SC_UNICODE,"UTF-8"},
340 {"UTF-16",CT_UTF16,CF_PRIMARY | CF_DISPLAY | CF_NOEMAIL,
341 NIL,SC_UNICODE,"UTF-8"},
342 NIL
343 };
345 /* Non-Unicode Script table
 *
 * Each initializer supplies a script name, an optional second string (NIL
 * when absent) and the SC_xxx bit for that script.
 * NOTE(review): the second string reads like a human-oriented description
 * (regions or language family); confirm against the SCRIPT declaration.
 *
 * utf8_script() scans this table linearly by name and stops at the closing
 * NIL entry.  The names are lookup keys, so their exact spelling (including
 * "Ukranian") is part of the run-time behavior.
 */
347 static const SCRIPT utf8_scvalid[] = {
348 {"Arabic",NIL,SC_ARABIC},
349 {"Chinese Simplified","China, Singapore",SC_CHINESE_SIMPLIFIED},
350 {"Chinese Traditional","Taiwan, Hong Kong, Macao",SC_CHINESE_TRADITIONAL},
351 {"Cyrillic",NIL,SC_CYRILLIC},
352 {"Cyrillic Ukranian",NIL,SC_UKRANIAN},
353 {"Greek",NIL,SC_GREEK},
354 {"Hebrew",NIL,SC_HEBREW},
355 {"Japanese",NIL,SC_JAPANESE},
356 {"Korean",NIL,SC_KOREAN},
357 {"Latin-1","Western Europe",SC_LATIN_1},
358 {"Latin-2","Eastern Europe",SC_LATIN_2},
359 {"Latin-3","Southern Europe",SC_LATIN_3},
360 {"Latin-4","Northern Europe",SC_LATIN_4},
361 {"Latin-5","Turkish",SC_LATIN_5},
362 {"Latin-6","Nordic",SC_LATIN_6},
363 {"Latin-7","Baltic",SC_LATIN_7},
364 {"Latin-8","Celtic",SC_LATIN_8},
365 {"Latin-9","Euro",SC_LATIN_9},
366 {"Latin-10","Balkan",SC_LATIN_10},
367 {"Thai",NIL,SC_THAI},
368 {"Vietnamese",NIL,SC_VIETNAMESE},
369 NIL
370 };
372 /* Look up script name or return entire table
373 * Accepts: script name or NIL
374 * Returns: pointer to script table entry or NIL if unknown
375 */
377 SCRIPT *utf8_script (char *script)
378 {
379 unsigned long i;
380 if (!script) return (SCRIPT *) &utf8_scvalid[0];
381 else if (*script && (strlen (script) < 128))
382 for (i = 0; utf8_scvalid[i].name; i++)
383 if (!compare_cstring (script,utf8_scvalid[i].name))
384 return (SCRIPT *) &utf8_scvalid[i];
385 return NIL; /* failed */
386 }
389 /* Look up charset name or return entire table
390 * Accepts: charset name or NIL
391 * Returns: charset table entry or NIL if unknown
392 */
394 const CHARSET *utf8_charset (char *charset)
395 {
396 unsigned long i;
397 if (!charset) return (CHARSET *) &utf8_csvalid[0];
398 else if (*charset && (strlen (charset) < 128))
399 for (i = 0; utf8_csvalid[i].name; i++)
400 if (!compare_cstring (charset,utf8_csvalid[i].name))
401 return (CHARSET *) &utf8_csvalid[i];
402 return NIL; /* failed */
403 }
405 /* Validate charset and generate error message if invalid
406 * Accepts: bad character set
407 * Returns: NIL if good charset, else error message string
408 */
410 #define BADCSS "[BADCHARSET ("
411 #define BADCSE ")] Unknown charset: "
413 char *utf8_badcharset (char *charset)
414 {
415 char *msg = NIL;
416 if (!utf8_charset (charset)) {
417 char *s,*t;
418 unsigned long i,j;
419 /* calculate size of header, trailer, and bad
420 * charset plus charset names */
421 for (i = 0, j = sizeof (BADCSS) + sizeof (BADCSE) + strlen (charset) - 2;
422 utf8_csvalid[i].name; i++)
423 j += strlen (utf8_csvalid[i].name) + 1;
424 /* not built right */
425 if (!i) fatal ("No valid charsets!");
426 /* header */
427 for (s = msg = (char *) fs_get (j), t = BADCSS; *t; *s++ = *t++);
428 /* each charset */
429 for (i = 0; utf8_csvalid[i].name; *s++ = ' ', i++)
430 for (t = utf8_csvalid[i].name; *t; *s++ = *t++);
431 /* back over last space, trailer */
432 for (t = BADCSE, --s; *t; *s++ = *t++);
433 /* finally bogus charset */
434 for (t = charset; *t; *s++ = *t++);
435 *s++ = '\0'; /* finally tie off string */
436 if (s != (msg + j)) fatal ("charset msg botch");
437 }
438 return msg;
439 }
441 /* Convert charset labelled sized text to UTF-8
442 * Accepts: source sized text
443 * charset
444 * pointer to returned sized text if non-NIL
445 * flags
446 * Returns: T if successful, NIL if failure
447 */
449 long utf8_text (SIZEDTEXT *text,char *charset,SIZEDTEXT *ret,long flags)
450 {
451 ucs4cn_t cv = (flags & U8T_CASECANON) ? ucs4_titlecase : NIL;
452 ucs4de_t de = (flags & U8T_DECOMPOSE) ? ucs4_decompose_recursive : NIL;
453 const CHARSET *cs = (charset && *charset) ?
454 utf8_charset (charset) : utf8_infercharset (text);
455 if (cs) return (text && ret) ? utf8_text_cs (text,cs,ret,cv,de) : LONGT;
456 if (ret) { /* no conversion possible */
457 ret->data = text->data; /* so return source */
458 ret->size = text->size;
459 }
460 return NIL; /* failure */
461 }
464 /* Operations used in converting data
 *
 * All four macros share one shape:
 *   1. if cv is non-NIL, replace c by (*cv) (c);
 *   2. if de is non-NIL, replace c by (*de) (c,&more), where more is a local
 *      continuation pointer that starts out NIL;
 *   3. count or write c, then, for as long as more is non-NIL and
 *      (*de) (U8G_ERROR,&more) yields a non-zero value, count or write that
 *      value as well.
 *
 * The _COUNT forms add the encoded size of each value to count; the _WRITE
 * forms store each value through b.  The _BMP forms use UTF8_SIZE_BMP and
 * UTF8_PUT_BMP, the others use utf8_size() and utf8_put().
 *
 * Usage notes: c is assigned to, so it must be a modifiable lvalue, and cv
 * and de are evaluated more than once.  Each macro expands to a bare brace
 * block rather than a do { } while (0) statement.
 * NOTE(review): call sites are outside this view; check them before changing
 * the expansion shape.
 */
466 #define UTF8_COUNT_BMP(count,c,cv,de) { \
467 void *more = NIL; \
468 if (cv) c = (*cv) (c); \
469 if (de) c = (*de) (c,&more); \
470 do count += UTF8_SIZE_BMP(c); \
471 while (more && (c = (*de) (U8G_ERROR,&more)));\
472 }
474 #define UTF8_WRITE_BMP(b,c,cv,de) { \
475 void *more = NIL; \
476 if (cv) c = (*cv) (c); \
477 if (de) c = (*de) (c,&more); \
478 do UTF8_PUT_BMP (b,c) \
479 while (more && (c = (*de) (U8G_ERROR,&more)));\
480 }
482 #define UTF8_COUNT(count,c,cv,de) { \
483 void *more = NIL; \
484 if (cv) c = (*cv) (c); \
485 if (de) c = (*de) (c,&more); \
486 do count += utf8_size (c); \
487 while (more && (c = (*de) (U8G_ERROR,&more)));\
488 }
490 #define UTF8_WRITE(b,c,cv,de) { \
491 void *more = NIL; \
492 if (cv) c = (*cv) (c); \
493 if (de) c = (*de) (c,&more); \
494 do b = utf8_put (b,c); \
495 while (more && (c = (*de) (U8G_ERROR,&more)));\
496 }
498 /* Convert sized text to UTF-8 given CHARSET block
499 * Accepts: source sized text
500 * CHARSET block
501 * pointer to returned sized text
502 * canonicalization function
503 * decomposition function
504 * Returns: T if successful, NIL if failure
505 */
507 long utf8_text_cs (SIZEDTEXT *text,const CHARSET *cs,SIZEDTEXT *ret,
508 ucs4cn_t cv,ucs4de_t de)
509 {
510 ret->data = text->data; /* default to source */
511 ret->size = text->size;
512 switch (cs->type) { /* convert if type known */
513 case CT_ASCII: /* 7-bit ASCII no table */
514 case CT_UTF8: /* variable UTF-8 encoded Unicode no table */
515 if (cv || de) utf8_text_utf8 (text,ret,cv,de);
516 break;
517 case CT_1BYTE0: /* 1 byte no table */
518 utf8_text_1byte0 (text,ret,cv,de);
519 break;
520 case CT_1BYTE: /* 1 byte ASCII + table 0x80-0xff */
521 utf8_text_1byte (text,ret,cs->tab,cv,de);
522 break;
523 case CT_1BYTE8: /* 1 byte table 0x00 - 0xff */
524 utf8_text_1byte8 (text,ret,cs->tab,cv,de);
525 break;
526 case CT_EUC: /* 2 byte ASCII + utf8_eucparam base/CS2/CS3 */
527 utf8_text_euc (text,ret,cs->tab,cv,de);
528 break;
529 case CT_DBYTE: /* 2 byte ASCII + utf8_eucparam */
530 utf8_text_dbyte (text,ret,cs->tab,cv,de);
531 break;
532 case CT_DBYTE2: /* 2 byte ASCII + utf8_eucparam plane1/2 */
533 utf8_text_dbyte2 (text,ret,cs->tab,cv,de);
534 break;
535 case CT_UTF7: /* variable UTF-7 encoded Unicode no table */
536 utf8_text_utf7 (text,ret,cv,de);
537 break;
538 case CT_UCS2: /* 2 byte 16-bit Unicode no table */
539 utf8_text_ucs2 (text,ret,cv,de);
540 break;
541 case CT_UCS4: /* 4 byte 32-bit Unicode no table */
542 utf8_text_ucs4 (text,ret,cv,de);
543 break;
544 case CT_UTF16: /* variable UTF-16 encoded Unicode no table */
545 utf8_text_utf16 (text,ret,cv,de);
546 break;
547 case CT_2022: /* variable ISO-2022 encoded no table*/
548 utf8_text_2022 (text,ret,cv,de);
549 break;
550 case CT_SJIS: /* 2 byte Shift-JIS encoded JIS no table */
551 utf8_text_sjis (text,ret,cv,de);
552 break;
553 default: /* unknown character set type */
554 return NIL;
555 }
556 return LONGT; /* return success */
557 }
559 /* Reverse mapping routines
560 *
561 * These routines only support character sets, not all possible charsets. In
562 * particular, they do not support any Unicode encodings or ISO 2022.
563 *
564 * As a special dispensation, utf8_cstext() and utf8_cstocstext() support
565 * ISO-2022-JP if EUC-JP can be reverse mapped; and utf8_rmaptext()
566 * will generate ISO-2022-JP using an EUC-JP rmap if flagged to do so.
567 *
568 * No attempt is made to map "equivalent" Unicode characters or Unicode
569 * characters that have the same glyph; nor is there any attempt to handle
570 * combining characters or otherwise do any stringprep. Maybe later.
571 */
574 /* Convert UTF-8 sized text to charset
575 * Accepts: source sized text
576 * destination charset
577 * pointer to returned sized text
578 * substitute character if not in cs, else NIL to return failure
579 * Returns: T if successful, NIL if failure
580 */
583 long utf8_cstext (SIZEDTEXT *text,char *charset,SIZEDTEXT *ret,
584 unsigned long errch)
585 {
586 short iso2022jp = !compare_cstring (charset,"ISO-2022-JP");
587 unsigned short *rmap = utf8_rmap (iso2022jp ? "EUC-JP" : charset);
588 return rmap ? utf8_rmaptext (text,rmap,ret,errch,iso2022jp) : NIL;
589 }
591 /* Convert charset labelled sized text to another charset
592 * Accepts: source sized text
593 * source charset
594 * pointer to returned sized text
595 * destination charset
596 * substitute character if not in dest cs, else NIL to return failure
597 * Returns: T if successful, NIL if failure
598 *
599 * This routine has the same restricts as utf8_cstext().
600 */
602 long utf8_cstocstext (SIZEDTEXT *src,char *sc,SIZEDTEXT *dst,char *dc,
603 unsigned long errch)
604 {
605 SIZEDTEXT utf8;
606 const CHARSET *scs,*dcs;
607 unsigned short *rmap;
608 long ret = NIL;
609 long iso2022jp;
610 /* lookup charsets and reverse map */
611 if ((dc && (dcs = utf8_charset (dc))) &&
612 (rmap = (iso2022jp = ((dcs->type == CT_2022) &&
613 !compare_cstring (dcs->name,"ISO-2022-JP"))) ?
614 utf8_rmap ("EUC-JP") : utf8_rmap_cs (dcs)) &&
615 (scs = (sc && *sc) ? utf8_charset (sc) : utf8_infercharset (src))) {
616 /* init temporary buffer */
617 memset (&utf8,NIL,sizeof (SIZEDTEXT));
618 /* source cs equivalent to dest cs? */
619 if ((scs->type == dcs->type) && (scs->tab == dcs->tab)) {
620 dst->data = src->data; /* yes, just copy pointers */
621 dst->size = src->size;
622 ret = LONGT;
623 }
624 /* otherwise do the conversion */
625 else ret = (utf8_text_cs (src,scs,&utf8,NIL,NIL) &&
626 utf8_rmaptext (&utf8,rmap,dst,errch,iso2022jp));
627 /* flush temporary buffer */
628 if (utf8.data && (utf8.data != src->data) && (utf8.data != dst->data))
629 fs_give ((void **) &utf8.data);
630 }
631 return ret;
632 }
634 /* Cached rmap
 *
 * A single reverse map is cached.  utf8_rmap_cs() hands the cached buffer to
 * utf8_rmap_gen() for reuse when a different charset is requested, so a map
 * returned earlier is overwritten in place by the next successful request
 * for another charset.
 */
636 static const CHARSET *currmapcs = NIL;
637 static unsigned short *currmap = NIL;
640 /* Cache and return map for UTF-8 -> character set
641 * Accepts: character set name
642 * Returns: cached map if character set found, else NIL
643 */
645 unsigned short *utf8_rmap (char *charset)
646 {
647 return (currmapcs && !compare_cstring (charset,currmapcs->name)) ? currmap :
648 utf8_rmap_cs (utf8_charset (charset));
649 }
652 /* Cache and return map for UTF-8 -> character set given CHARSET block
653 * Accepts: CHARSET block
654 * Returns: cached map if character set found, else NIL
655 */
657 unsigned short *utf8_rmap_cs (const CHARSET *cs)
658 {
659 unsigned short *ret = NIL;
660 if (!cs); /* have charset? */
661 else if (cs == currmapcs) ret = currmap;
662 else if (ret = utf8_rmap_gen (cs,currmap)) {
663 currmapcs = cs;
664 currmap = ret;
665 }
666 return ret;
667 }
669 /* Return map for UTF-8 -> character set given CHARSET block
670 * Accepts: CHARSET block
671 * old map to recycle
672 * Returns: map if character set found, else NIL
673 */
675 unsigned short *utf8_rmap_gen (const CHARSET *cs,unsigned short *oldmap)
676 {
677 unsigned short u,*tab,*rmap;
678 unsigned int i,m,ku,ten;
679 struct utf8_eucparam *param,*p2;
680 switch (cs->type) { /* is a character set? */
681 case CT_ASCII: /* 7-bit ASCII no table */
682 case CT_1BYTE0: /* 1 byte no table */
683 case CT_1BYTE: /* 1 byte ASCII + table 0x80-0xff */
684 case CT_1BYTE8: /* 1 byte table 0x00 - 0xff */
685 case CT_EUC: /* 2 byte ASCII + utf8_eucparam base/CS2/CS3 */
686 case CT_DBYTE: /* 2 byte ASCII + utf8_eucparam */
687 case CT_DBYTE2: /* 2 byte ASCII + utf8_eucparam plane1/2 */
688 case CT_SJIS: /* 2 byte Shift-JIS */
689 rmap = oldmap ? oldmap : /* recycle old map if supplied else make new */
690 (unsigned short *) fs_get (65536 * sizeof (unsigned short));
691 /* initialize table for ASCII */
692 for (i = 0; i < 128; i++) rmap[i] = (unsigned short) i;
693 /* populate remainder of table with NOCHAR */
694 #define NOCHARBYTE (NOCHAR & 0xff)
695 #if NOCHAR - ((NOCHARBYTE << 8) | NOCHARBYTE)
696 while (i < 65536) rmap[i++] = NOCHAR;
697 #else
698 memset (rmap + 128,NOCHARBYTE,(65536 - 128) * sizeof (unsigned short));
699 #endif
700 break;
701 default: /* unsupported charset type */
702 rmap = NIL; /* no map possible */
703 }
704 if (rmap) { /* have a map? */
705 switch (cs->type) { /* additional reverse map actions */
706 case CT_1BYTE0: /* 1 byte no table */
707 for (i = 128; i < 256; i++) rmap[i] = (unsigned short) i;
708 break;
709 case CT_1BYTE: /* 1 byte ASCII + table 0x80-0xff */
710 for (tab = (unsigned short *) cs->tab,i = 128; i < 256; i++)
711 if (tab[i & BITS7] != UBOGON) rmap[tab[i & BITS7]] = (unsigned short)i;
712 break;
713 case CT_1BYTE8: /* 1 byte table 0x00 - 0xff */
714 for (tab = (unsigned short *) cs->tab,i = 0; i < 256; i++)
715 if (tab[i] != UBOGON) rmap[tab[i]] = (unsigned short) i;
716 break;
717 case CT_EUC: /* 2 byte ASCII + utf8_eucparam base/CS2/CS3 */
718 for (param = (struct utf8_eucparam *) cs->tab,
719 tab = (unsigned short *) param->tab, ku = 0;
720 ku < param->max_ku; ku++)
721 for (ten = 0; ten < param->max_ten; ten++)
722 if ((u = tab[(ku * param->max_ten) + ten]) != UBOGON)
723 rmap[u] = ((ku + param->base_ku) << 8) +
724 (ten + param->base_ten) + 0x8080;
725 break;
727 case CT_DBYTE: /* 2 byte ASCII + utf8_eucparam */
728 for (param = (struct utf8_eucparam *) cs->tab,
729 tab = (unsigned short *) param->tab, ku = 0;
730 ku < param->max_ku; ku++)
731 for (ten = 0; ten < param->max_ten; ten++)
732 if ((u = tab[(ku * param->max_ten) + ten]) != UBOGON)
733 rmap[u] = ((ku + param->base_ku) << 8) + (ten + param->base_ten);
734 break;
735 case CT_DBYTE2: /* 2 byte ASCII + utf8_eucparam plane1/2 */
736 param = (struct utf8_eucparam *) cs->tab;
737 p2 = param + 1; /* plane 2 parameters */
738 /* only ten parameters should differ */
739 if ((param->base_ku != p2->base_ku) || (param->max_ku != p2->max_ku))
740 fatal ("ku definition error for CT_DBYTE2 charset");
741 /* total codepoints in each ku */
742 m = param->max_ten + p2->max_ten;
743 tab = (unsigned short *) param->tab;
744 for (ku = 0; ku < param->max_ku; ku++) {
745 for (ten = 0; ten < param->max_ten; ten++)
746 if ((u = tab[(ku * m) + ten]) != UBOGON)
747 rmap[u] = ((ku + param->base_ku) << 8) + (ten + param->base_ten);
748 for (ten = 0; ten < p2->max_ten; ten++)
749 if ((u = tab[(ku * m) + param->max_ten + ten]) != UBOGON)
750 rmap[u] = ((ku + param->base_ku) << 8) + (ten + p2->base_ten);
751 }
752 break;
753 case CT_SJIS: /* 2 byte Shift-JIS */
754 for (ku = 0; ku < MAX_JIS0208_KU; ku++)
755 for (ten = 0; ten < MAX_JIS0208_TEN; ten++)
756 if ((u = jis0208tab[ku][ten]) != UBOGON) {
757 int sku = ku + BASE_JIS0208_KU;
758 int sten = ten + BASE_JIS0208_TEN;
759 rmap[u] = ((((sku + 1) >> 1) + ((sku < 95) ? 112 : 176)) << 8) +
760 sten + ((sku % 2) ? ((sten > 95) ? 32 : 31) : 126);
761 }
762 /* JIS Roman */
763 rmap[UCS2_YEN] = JISROMAN_YEN;
764 rmap[UCS2_OVERLINE] = JISROMAN_OVERLINE;
765 /* JIS hankaku katakana */
766 for (u = 0; u < (MAX_KANA_8 - MIN_KANA_8); u++)
767 rmap[UCS2_KATAKANA + u] = MIN_KANA_8 + u;
768 break;
769 }
770 /* hack: map NBSP to SP if otherwise no map */
771 if (rmap[0x00a0] == NOCHAR) rmap[0x00a0] = rmap[0x0020];
772 }
773 return rmap; /* return map */
774 }
776 /* Convert UTF-8 sized text to charset using rmap
777 * Accepts: source sized text
778 * conversion rmap
779 * pointer to returned sized text
780 * substitute character if not in rmap, else NIL to return failure
781 * ISO-2022-JP conversion flag
782 * Returns T if successful, NIL if failure
783 *
784 * This routine doesn't try to convert to all possible charsets; in particular
785 * it doesn't support other Unicode encodings or any ISO 2022 other than
786 * ISO-2022-JP.
787 */
789 long utf8_rmaptext (SIZEDTEXT *text,unsigned short *rmap,SIZEDTEXT *ret,
790 unsigned long errch,long iso2022jp)
791 {
792 unsigned long i,u,c;
793 /* get size of buffer */
794 if (i = utf8_rmapsize (text,rmap,errch,iso2022jp)) {
795 unsigned char *s = text->data;
796 unsigned char *t = ret->data = (unsigned char *) fs_get (i);
797 ret->size = i - 1; /* number of octets in destination buffer */
798 /* start non-zero ISO-2022-JP state at 1 */
799 if (iso2022jp) iso2022jp = 1;
800 /* convert string, ignore BOM */
801 for (i = text->size; i;) if ((u = utf8_get (&s,&i)) != UCS2_BOM) {
802 /* substitute error character for NOCHAR */
803 if ((u & U8GM_NONBMP) || ((c = rmap[u]) == NOCHAR)) c = errch;
804 switch (iso2022jp) { /* depends upon ISO 2022 mode */
805 case 0: /* ISO 2022 not in effect */
806 /* two-byte character */
807 if (c > 0xff) *t++ = (unsigned char) (c >> 8);
808 /* single-byte or low-byte of two-byte */
809 *t++ = (unsigned char) (c & 0xff);
810 break;
811 case 1: /* ISO 2022 Roman */
812 /* <ch> */
813 if (c < 0x80) *t++ = (unsigned char) c;
814 else { /* JIS character */
815 *t++ = I2C_ESC; /* ESC $ B <hi> <lo> */
816 *t++ = I2C_MULTI;
817 *t++ = I2CS_94x94_JIS_NEW;
818 *t++ = (unsigned char) (c >> 8) & 0x7f;
819 *t++ = (unsigned char) c & 0x7f;
820 iso2022jp = 2; /* shift to ISO 2022 JIS */
821 }
822 break;
823 case 2: /* ISO 2022 JIS */
824 if (c > 0x7f) { /* <hi> <lo> */
825 *t++ = (unsigned char) (c >> 8) & 0x7f;
826 *t++ = (unsigned char) c & 0x7f;
827 }
828 else { /* ASCII character */
829 *t++ = I2C_ESC; /* ESC ( J <ch> */
830 *t++ = I2C_G0_94;
831 *t++ = I2CS_94_JIS_ROMAN;
832 *t++ = (unsigned char) c;
833 iso2022jp = 1; /* shift to ISO 2022 Roman */
834 }
835 break;
836 }
837 }
838 if (iso2022jp == 2) { /* ISO-2022-JP string must end in Roman */
839 *t++ = I2C_ESC; /* ESC ( J */
840 *t++ = I2C_G0_94;
841 *t++ = I2CS_94_JIS_ROMAN;
842 }
843 *t++ = NIL; /* tie off returned data */
844 return LONGT; /* return success */
845 }
846 ret->data = NIL;
847 ret->size = 0;
848 return NIL; /* failure */
849 }
851 /* Calculate size of convertsion of UTF-8 sized text to charset using rmap
852 * Accepts: source sized text
853 * conversion rmap
854 * pointer to returned sized text
855 * substitute character if not in rmap, else NIL to return failure
856 * ISO-2022-JP conversion flag
857 * Returns size+1 if successful, NIL if failure
858 *
859 * This routine doesn't try to handle to all possible charsets; in particular
860 * it doesn't support other Unicode encodings or any ISO 2022 other than
861 * ISO-2022-JP.
862 */
864 unsigned long utf8_rmapsize (SIZEDTEXT *text,unsigned short *rmap,
865 unsigned long errch,long iso2022jp)
866 {
867 unsigned long i,u,c;
868 unsigned long ret = 1; /* terminating NUL */
869 unsigned char *s = text->data;
870 if (iso2022jp) iso2022jp = 1; /* start non-zero ISO-2022-JP state at 1 */
871 for (i = text->size; i;) if ((u = utf8_get (&s,&i)) != UCS2_BOM) {
872 if ((u & U8GM_NONBMP) || (((c = rmap[u]) == NOCHAR) && !(c = errch)))
873 return NIL; /* not in BMP, or NOCHAR and no err char */
874 switch (iso2022jp) { /* depends upon ISO 2022 mode */
875 case 0: /* ISO 2022 not in effect */
876 ret += (c > 0xff) ? 2 : 1;
877 break;
878 case 1: /* ISO 2022 Roman */
879 if (c < 0x80) ret += 1; /* <ch> */
880 else { /* JIS character */
881 ret += 5; /* ESC $ B <hi> <lo> */
882 iso2022jp = 2; /* shift to ISO 2022 JIS */
883 }
884 break;
885 case 2: /* ISO 2022 JIS */
886 if (c > 0x7f) ret += 2; /* <hi> <lo> */
887 else { /* ASCII character */
888 ret += 4; /* ESC ( J <ch> */
889 iso2022jp = 1; /* shift to ISO 2022 Roman */
890 }
891 break;
892 }
893 }
894 if (iso2022jp == 2) { /* ISO-2022-JP string must end in Roman */
895 ret += 3; /* ESC ( J */
896 iso2022jp = 1; /* reset state to Roman */
897 }
898 return ret;
899 }
901 /* Convert UCS-4 to charset using rmap
902 * Accepts: source UCS-4 character(s)
903 * numver of UCS-4 characters
904 * conversion rmap
905 * pointer to returned sized text
906 * substitute character if not in rmap, else NIL to return failure
907 * Returns T if successful, NIL if failure
908 *
909 * Currently only supports BMP characters, and does not support ISO-2022
910 */
912 long ucs4_rmaptext (unsigned long *ucs4,unsigned long len,unsigned short *rmap,
913 SIZEDTEXT *ret,unsigned long errch)
914 {
915 long size = ucs4_rmaplen (ucs4,len,rmap,errch);
916 return (size >= 0) ? /* build in newly-created buffer */
917 ucs4_rmapbuf (ret->data = (unsigned char *) fs_get ((ret->size = size) +1),
918 ucs4,len,rmap,errch) : NIL;
919 }
921 /* Return size of UCS-4 string converted to other CS via rmap
922 * Accepts: source UCS-4 character(s)
923 * numver of UCS-4 characters
924 * conversion rmap
925 * substitute character if not in rmap, else NIL to return failure
926 * Returns: length if success, negative if failure (no-convert)
927 */
929 long ucs4_rmaplen (unsigned long *ucs4,unsigned long len,unsigned short *rmap,
930 unsigned long errch)
931 {
932 long ret;
933 unsigned long i,u,c;
934 /* count non-BOM characters */
935 for (ret = 0,i = 0; i < len; ++i) if ((u = ucs4[i]) != UCS2_BOM) {
936 if ((u & U8GM_NONBMP) || (((c = rmap[u]) == NOCHAR) && !(c = errch)))
937 return -1; /* not in BMP, or NOCHAR and no err char? */
938 ret += (c > 0xff) ? 2 : 1;
939 }
940 return ret;
941 }
944 /* Stuff buffer with UCS-4 string converted to other CS via rmap
945 * Accepts: destination buffer
946 * source UCS-4 character(s)
947 * number of UCS-4 characters
948 * conversion rmap
949 * substitute character if not in rmap, else NIL to return failure
950 * Returns: T, always
951 */
953 long ucs4_rmapbuf (unsigned char *t,unsigned long *ucs4,unsigned long len,
954 unsigned short *rmap,unsigned long errch)
955 {
956 unsigned long i,u,c;
957 /* convert non-BOM characters */
958 for (i = 0; i < len; ++i) if ((u = ucs4[i]) != UCS2_BOM) {
959 /* substitute error character for NOCHAR */
960 if ((u & U8GM_NONBMP) || ((c = rmap[u]) == NOCHAR)) c = errch;
961 /* two-byte character? */
962 if (c > 0xff) *t++ = (unsigned char) (c >> 8);
963 /* single-byte or low-byte of two-byte */
964 *t++ = (unsigned char) (c & 0xff);
965 }
966 *t++ = NIL; /* tie off returned data */
967 return LONGT;
968 }
970 /* Return UCS-4 Unicode character from UTF-8 string
971 * Accepts: pointer to string
972 * remaining octets in string
973 * Returns: UCS-4 character with pointer and count updated
974 * or error code with pointer and count unchanged
975 */
977 unsigned long utf8_get (unsigned char **s,unsigned long *i)
978 {
979 unsigned char *t = *s;
980 unsigned long j = *i;
981 /* decode raw UTF-8 string */
982 unsigned long ret = utf8_get_raw (&t,&j);
983 if (ret & U8G_ERROR); /* invalid raw UTF-8 decoding? */
984 /* no, is it surrogate? */
985 else if ((ret >= UTF16_SURR) && (ret <= UTF16_MAXSURR)) ret = U8G_SURROGA;
986 /* or in non-Unicode ISO 10646 space? */
987 else if (ret > UCS4_MAXUNICODE) ret = U8G_NOTUNIC;
988 else {
989 *s = t; /* all is well, update pointer */
990 *i = j; /* and counter */
991 }
992 return ret; /* return value */
993 }
995 /* Return raw (including non-Unicode) UCS-4 character from UTF-8 string
996 * Accepts: pointer to string
997 * remaining octets in string
998 * Returns: UCS-4 character with pointer and count updated
999 * or error code with pointer and count unchanged
1000 */
1002 unsigned long utf8_get_raw (unsigned char **s,unsigned long *i)
1004 unsigned char c,c1;
1005 unsigned char *t = *s;
1006 unsigned long j = *i;
1007 unsigned long ret = U8G_NOTUTF8;
1008 int more = 0;
1009 do { /* make sure have source octets available */
1010 if (!j--) return more ? U8G_ENDSTRI : U8G_ENDSTRG;
1011 /* UTF-8 continuation? */
1012 else if (((c = *t++) > 0x7f) && (c < 0xc0)) {
1013 /* continuation when not in progress */
1014 if (!more) return U8G_BADCONT;
1015 --more; /* found a continuation octet */
1016 ret <<= 6; /* shift current value by 6 bits */
1017 ret |= c & 0x3f; /* merge continuation octet */
1019 /* incomplete UTF-8 character */
1020 else if (more) return U8G_INCMPLT;
1021 else { /* start of sequence */
1022 c1 = j ? *t : 0xbf; /* assume valid continuation if incomplete */
1023 if (c < 0x80) ret = c; /* U+0000 - U+007f */
1024 else if (c < 0xc2); /* c0 and c1 never valid */
1025 else if (c < 0xe0) { /* U+0080 - U+07ff */
1026 if (c &= 0x1f) more = 1;
1028 else if (c < 0xf0) { /* U+0800 - U+ffff */
1029 if ((c &= 0x0f) || (c1 >= 0xa0)) more = 2;
1031 else if (c < 0xf8) { /* U+10000 - U+10ffff (and 110000 - 1fffff) */
1032 if ((c &= 0x07) || (c1 >= 0x90)) more = 3;
1034 else if (c < 0xfc) { /* ISO 10646 200000 - 3ffffff */
1035 if ((c &= 0x03) || (c1 >= 0x88)) more = 4;
1037 else if (c < 0xfe) { /* ISO 10646 4000000 - 7fffffff */
1038 if ((c &= 0x01) || (c1 >= 0x84)) more = 5;
1040 /* fe and ff never valid */
1041 if (more) { /* multi-octet, make sure more to come */
1042 if (!j) return U8G_ENDSTRI;
1043 ret = c; /* continuation needed, save start bits */
1046 } while (more);
1047 if (!(ret & U8G_ERROR)) { /* success return? */
1048 *s = t; /* yes, update pointer */
1049 *i = j; /* and counter */
1051 return ret; /* return value */
1054 /* Return UCS-4 character from named charset string
1055 * Accepts: charset
1056 * pointer to string
1057 * remaining octets in string
1058 * Returns: UCS-4 character with pointer and count updated, negative if error
1060 * Error codes are the same as utf8_get().
1061 */
1063 unsigned long ucs4_cs_get (CHARSET *cs,unsigned char **s,unsigned long *i)
1065 unsigned char c,c1,ku,ten;
1066 unsigned long ret,d;
1067 unsigned char *t = *s;
1068 unsigned long j = *i;
1069 struct utf8_eucparam *p1,*p2,*p3;
1070 if (j--) c = *t++; /* get first octet */
1071 else return U8G_ENDSTRG; /* empty string */
1072 switch (cs->type) { /* convert if type known */
1073 case CT_UTF8: /* variable UTF-8 encoded Unicode no table */
1074 return utf8_get (s,i);
1075 case CT_ASCII: /* 7-bit ASCII no table */
1076 if (c >= 0x80) return U8G_NOTUTF8;
1077 case CT_1BYTE0: /* 1 byte no table */
1078 ret = c; /* identity */
1079 break;
1080 case CT_1BYTE: /* 1 byte ASCII + table 0x80-0xff */
1081 ret = (c > 0x80) ? ((unsigned short *) cs->tab)[c & BITS7] : c;
1082 break;
1083 case CT_1BYTE8: /* 1 byte table 0x00 - 0xff */
1084 ret = ((unsigned short *) cs->tab)[c];
1085 break;
1087 case CT_EUC: /* 2 byte ASCII + utf8_eucparam base/CS2/CS3 */
1088 if (c & BIT8) {
1089 p1 = (struct utf8_eucparam *) cs->tab;
1090 p2 = p1 + 1;
1091 p3 = p1 + 2;
1092 if (j--) c1 = *t++; /* get second octet */
1093 else return U8G_ENDSTRI;
1094 if (!(c1 & BIT8)) return U8G_NOTUTF8;
1095 switch (c) { /* check 8bit code set */
1096 case EUC_CS2: /* CS2 */
1097 if (p2->base_ku) { /* CS2 set up? */
1098 if (p2->base_ten) { /* yes, multibyte? */
1099 if (j--) c = *t++; /* get second octet */
1100 else return U8G_ENDSTRI;
1101 if ((c & BIT8) &&
1102 ((ku = (c1 & BITS7) - p2->base_ku) < p2->max_ku) &&
1103 ((ten = (c & BITS7) - p2->base_ten) < p2->max_ten)) {
1104 ret = ((unsigned short *) p2->tab)[(ku*p2->max_ten) + ten];
1105 break;
1108 else if ((c1 >= p2->base_ku) && (c1 < p2->max_ku)) {
1109 ret = c1 + ((unsigned long) p2->tab);
1110 break;
1113 return U8G_NOTUTF8; /* CS2 not set up or bogus */
1114 case EUC_CS3: /* CS3 */
1115 if (p3->base_ku) { /* CS3 set up? */
1116 if (p3->base_ten) { /* yes, multibyte? */
1117 if (j--) c = *t++; /* get second octet */
1118 else return U8G_ENDSTRI;
1119 if ((c & BIT8) &&
1120 ((ku = (c1 & BITS7) - p3->base_ku) < p3->max_ku) &&
1121 ((ten = (c & BITS7) - p3->base_ten) < p3->max_ten)) {
1122 ret = ((unsigned short *) p3->tab)[(ku*p3->max_ten) + ten];
1123 break;
1126 else if ((c1 >= p3->base_ku) && (c1 < p3->max_ku)) {
1127 ret = c1 + ((unsigned long) p3->tab);
1128 break;
1131 return U8G_NOTUTF8; /* CS3 not set up or bogus */
1132 default:
1133 if (((ku = (c & BITS7) - p1->base_ku) >= p1->max_ku) ||
1134 ((ten = (c1 & BITS7) - p1->base_ten) >= p1->max_ten))
1135 return U8G_NOTUTF8;
1136 ret = ((unsigned short *) p1->tab)[(ku*p1->max_ten) + ten];
1137 /* special hack for JIS X 0212: merge rows less than 10 */
1138 if ((ret == UBOGON) && ku && (ku < 10) && p3->tab && p3->base_ten)
1139 ret = ((unsigned short *) p3->tab)
1140 [((ku - (p3->base_ku - p1->base_ku))*p3->max_ten) + ten];
1141 break;
1144 else ret = c; /* ASCII character */
1145 break;
1147 case CT_DBYTE: /* 2 byte ASCII + utf8_eucparam */
1148 if (c & BIT8) { /* double-byte character? */
1149 p1 = (struct utf8_eucparam *) cs->tab;
1150 if (j--) c1 = *t++; /* get second octet */
1151 else return U8G_ENDSTRI;
1152 if (((ku = c - p1->base_ku) < p1->max_ku) &&
1153 ((ten = c1 - p1->base_ten) < p1->max_ten))
1154 ret = ((unsigned short *) p1->tab)[(ku*p1->max_ten) + ten];
1155 else return U8G_NOTUTF8;
1157 else ret = c; /* ASCII character */
1158 break;
1159 case CT_DBYTE2: /* 2 byte ASCII + utf8_eucparam plane1/2 */
1160 if (c & BIT8) { /* double-byte character? */
1161 p1 = (struct utf8_eucparam *) cs->tab;
1162 p2 = p1 + 1;
1163 if (j--) c1 = *t++; /* get second octet */
1164 else return U8G_ENDSTRI;
1165 if (c1 & BIT8) { /* high vs. low plane */
1166 if ((ku = c - p2->base_ku) < p2->max_ku &&
1167 ((ten = c1 - p2->base_ten) < p2->max_ten))
1168 ret = ((unsigned short *) p1->tab)
1169 [(ku*(p1->max_ten + p2->max_ten)) + p1->max_ten + ten];
1170 else return U8G_NOTUTF8;
1172 else if ((ku = c - p1->base_ku) < p1->max_ku &&
1173 ((ten = c1 - p1->base_ten) < p1->max_ten))
1174 ret = ((unsigned short *) p1->tab)
1175 [(ku*(p1->max_ten + p2->max_ten)) + ten];
1176 else return U8G_NOTUTF8;
1178 else ret = c; /* ASCII character */
1179 break;
1180 case CT_SJIS: /* 2 byte Shift-JIS encoded JIS no table */
1181 /* compromise - do yen sign but not overline */
1182 if (!(c & BIT8)) ret = (c == JISROMAN_YEN) ? UCS2_YEN : c;
1183 /* half-width katakana? */
1184 else if ((c >= MIN_KANA_8) && (c < MAX_KANA_8)) ret = c + KANA_8;
1185 else { /* Shift-JIS */
1186 if (j--) c1 = *t++; /* get second octet */
1187 else return U8G_ENDSTRI;
1188 SJISTOJIS (c,c1);
1189 c = JISTOUNICODE (c,c1,ku,ten);
1191 break;
1193 case CT_UCS2: /* 2 byte 16-bit Unicode no table */
1194 ret = c << 8;
1195 if (j--) c = *t++; /* get second octet */
1196 else return U8G_ENDSTRI; /* empty string */
1197 ret |= c;
1198 break;
1199 case CT_UCS4: /* 4 byte 32-bit Unicode no table */
1200 if (c & 0x80) return U8G_NOTUTF8;
1201 if (j < 3) return U8G_ENDSTRI;
1202 j -= 3; /* count three octets */
1203 ret = c << 24;
1204 ret |= (*t++) << 16;
1205 ret |= (*t++) << 8;
1206 ret |= (*t++);
1207 break;
1208 case CT_UTF16: /* variable UTF-16 encoded Unicode no table */
1209 ret = c << 8;
1210 if (j--) c = *t++; /* get second octet */
1211 else return U8G_ENDSTRI; /* empty string */
1212 ret |= c;
1213 /* surrogate? */
1214 if ((ret >= UTF16_SURR) && (ret <= UTF16_MAXSURR)) {
1215 /* invalid first surrogate */
1216 if ((ret > UTF16_SURRHEND) || (j < 2)) return U8G_NOTUTF8;
1217 j -= 2; /* count two octets */
1218 d = (*t++) << 8; /* first octet of second surrogate */
1219 d |= *t++; /* second octet of second surrogate */
1220 if ((d < UTF16_SURRL) || (d > UTF16_SURRLEND)) return U8G_NOTUTF8;
1221 ret = UTF16_BASE + ((ret & UTF16_MASK) << UTF16_SHIFT) +
1222 (d & UTF16_MASK);
1224 break;
1225 default: /* unknown/unsupported character set type */
1226 return U8G_NOTUTF8;
1228 *s = t; /* update pointer and counter */
1229 *i = j;
1230 return ret;
1233 /* Produce charset validity map for BMP
1234 * Accepts: list of charsets to map
1235 * Returns: validity map, indexed by BMP codepoint
1237 * Bit 0x1 is the "not-CJK" character bit
1238 */
1240 unsigned long *utf8_csvalidmap (char *charsets[])
1242 unsigned short u,*tab;
1243 unsigned int m,ku,ten;
1244 unsigned long i,csi,csb;
1245 struct utf8_eucparam *param,*p2;
1246 char *s;
1247 const CHARSET *cs;
1248 unsigned long *ret = (unsigned long *)
1249 fs_get (i = 0x10000 * sizeof (unsigned long));
1250 memset (ret,0,i); /* zero the entire vector */
1251 /* mark all the non-CJK codepoints */
1252 /* U+0000 - U+2E7F non-CJK */
1253 for (i = 0; i < 0x2E7F; ++i) ret[i] = 0x1;
1254 /* U+2E80 - U+2EFF CJK Radicals Supplement
1255 * U+2F00 - U+2FDF Kangxi Radicals
1256 * U+2FE0 - U+2FEF unassigned
1257 * U+2FF0 - U+2FFF Ideographic Description Characters
1258 * U+3000 - U+303F CJK Symbols and Punctuation
1259 * U+3040 - U+309F Hiragana
1260 * U+30A0 - U+30FF Katakana
1261 * U+3100 - U+312F BoPoMoFo
1262 * U+3130 - U+318F Hangul Compatibility Jamo
1263 * U+3190 - U+319F Kanbun
1264 * U+31A0 - U+31BF BoPoMoFo Extended
1265 * U+31C0 - U+31EF CJK Strokes
1266 * U+31F0 - U+31FF Katakana Phonetic Extensions
1267 * U+3200 - U+32FF Enclosed CJK Letters and Months
1268 * U+3300 - U+33FF CJK Compatibility
1269 * U+3400 - U+4DBF CJK Unified Ideographs Extension A
1270 * U+4DC0 - U+4DFF Yijing Hexagram Symbols
1271 * U+4E00 - U+9FFF CJK Unified Ideographs
1272 * U+A000 - U+A48F Yi Syllables
1273 * U+A490 - U+A4CF Yi Radicals
1274 * U+A700 - U+A71F Modifier Tone Letters
1275 */
1276 for (i = 0xa720; i < 0xabff; ++i) ret[i] = 0x1;
1277 /* U+AC00 - U+D7FF Hangul Syllables */
1278 for (i = 0xd800; i < 0xf8ff; ++i) ret[i] = 0x1;
1279 /* U+F900 - U+FAFF CJK Compatibility Ideographs */
1280 for (i = 0xfb00; i < 0xfe2f; ++i) ret[i] = 0x1;
1281 /* U+FE30 - U+FE4F CJK Compatibility Forms
1282 * U+FE50 - U+FE6F Small Form Variants (for CNS 11643)
1283 */
1284 for (i = 0xfe70; i < 0xfeff; ++i) ret[i] = 0x1;
1285 /* U+FF00 - U+FFEF CJK Compatibility Ideographs */
1286 for (i = 0xfff0; i < 0x10000; ++i) ret[i] = 0x1;
1288 /* for each supplied charset */
1289 for (csi = 1; ret && charsets && (s = charsets[csi - 1]); ++csi) {
1290 /* substitute EUC-JP for ISO-2022-JP */
1291 if (!compare_cstring (s,"ISO-2022-JP")) s = "EUC-JP";
1292 /* look up charset */
1293 if (cs = utf8_charset (s)) {
1294 csb = 1 << csi; /* charset bit */
1295 switch (cs->type) {
1296 case CT_ASCII: /* 7-bit ASCII no table */
1297 case CT_1BYTE0: /* 1 byte no table */
1298 case CT_1BYTE: /* 1 byte ASCII + table 0x80-0xff */
1299 case CT_1BYTE8: /* 1 byte table 0x00 - 0xff */
1300 case CT_EUC: /* 2 byte ASCII + utf8_eucparam base/CS2/CS3 */
1301 case CT_DBYTE: /* 2 byte ASCII + utf8_eucparam */
1302 case CT_DBYTE2: /* 2 byte ASCII + utf8_eucparam plane1/2 */
1303 case CT_SJIS: /* 2 byte Shift-JIS */
1304 /* supported charset type, all ASCII is OK */
1305 for (i = 0; i < 128; ++i) ret[i] |= csb;
1306 break;
1307 default: /* unsupported charset type */
1308 fs_give ((void **) &ret);
1309 break;
1311 /* now do additional operations */
1312 if (ret) switch (cs->type) {
1313 case CT_1BYTE0: /* 1 byte no table */
1314 for (i = 128; i < 256; i++) ret[i] |= csb;
1315 break;
1316 case CT_1BYTE: /* 1 byte ASCII + table 0x80-0xff */
1317 for (tab = (unsigned short *) cs->tab,i = 128; i < 256; i++)
1318 if (tab[i & BITS7] != UBOGON) ret[tab[i & BITS7]] |= csb;
1319 break;
1320 case CT_1BYTE8: /* 1 byte table 0x00 - 0xff */
1321 for (tab = (unsigned short *) cs->tab,i = 0; i < 256; i++)
1322 if (tab[i] != UBOGON) ret[tab[i]] |= csb;
1323 break;
1324 case CT_EUC: /* 2 byte ASCII + utf8_eucparam base/CS2/CS3 */
1325 for (param = (struct utf8_eucparam *) cs->tab,
1326 tab = (unsigned short *) param->tab, ku = 0;
1327 ku < param->max_ku; ku++)
1328 for (ten = 0; ten < param->max_ten; ten++)
1329 if ((u = tab[(ku * param->max_ten) + ten]) != UBOGON)
1330 ret[u] |= csb;
1331 break;
1333 case CT_DBYTE: /* 2 byte ASCII + utf8_eucparam */
1334 for (param = (struct utf8_eucparam *) cs->tab,
1335 tab = (unsigned short *) param->tab, ku = 0;
1336 ku < param->max_ku; ku++)
1337 for (ten = 0; ten < param->max_ten; ten++)
1338 if ((u = tab[(ku * param->max_ten) + ten]) != UBOGON)
1339 ret[u] |= csb;
1340 break;
1341 case CT_DBYTE2: /* 2 byte ASCII + utf8_eucparam plane1/2 */
1342 param = (struct utf8_eucparam *) cs->tab;
1343 p2 = param + 1; /* plane 2 parameters */
1344 /* only ten parameters should differ */
1345 if ((param->base_ku != p2->base_ku) || (param->max_ku != p2->max_ku))
1346 fatal ("ku definition error for CT_DBYTE2 charset");
1347 /* total codepoints in each ku */
1348 m = param->max_ten + p2->max_ten;
1349 tab = (unsigned short *) param->tab;
1350 for (ku = 0; ku < param->max_ku; ku++) {
1351 for (ten = 0; ten < param->max_ten; ten++)
1352 if ((u = tab[(ku * m) + ten]) != UBOGON)
1353 ret[u] |= csb;
1354 for (ten = 0; ten < p2->max_ten; ten++)
1355 if ((u = tab[(ku * m) + param->max_ten + ten]) != UBOGON)
1356 ret[u] |= csb;
1358 break;
1359 case CT_SJIS: /* 2 byte Shift-JIS */
1360 for (ku = 0; ku < MAX_JIS0208_KU; ku++)
1361 for (ten = 0; ten < MAX_JIS0208_TEN; ten++)
1362 if ((u = jis0208tab[ku][ten]) != UBOGON) ret[u] |= csb;
1363 /* JIS hankaku katakana */
1364 for (u = 0; u < (MAX_KANA_8 - MIN_KANA_8); u++)
1365 ret[UCS2_KATAKANA + u] |= csb;
1366 break;
1369 /* invalid charset, punt */
1370 else fs_give ((void **) &ret);
1372 return ret;
1375 /* Infer charset from unlabelled sized text
1376 * Accepts: sized text
1377 * Returns: charset if one inferred, or NIL if unknown
1378 */
1380 const CHARSET *utf8_infercharset (SIZEDTEXT *src)
1382 long iso2022jp = NIL;
1383 long eightbit = NIL;
1384 unsigned long i;
1385 /* look for ISO 2022 */
1386 if (src) for (i = 0; i < src->size; i++) {
1387 /* ESC sequence? */
1388 if ((src->data[i] == I2C_ESC) && (++i < src->size)) switch (src->data[i]) {
1389 case I2C_MULTI: /* yes, multibyte? */
1390 if (++i < src->size) switch (src->data[i]) {
1391 case I2CS_94x94_JIS_OLD: /* JIS X 0208-1978 */
1392 case I2CS_94x94_JIS_NEW: /* JIS X 0208-1983 */
1393 case I2CS_94x94_JIS_EXT: /* JIS X 0212-1990 (kludge...) */
1394 iso2022jp = T; /* found an ISO-2022-JP sequence */
1395 break;
1396 default: /* other multibyte */
1397 return NIL; /* definitely invalid */
1399 break;
1400 case I2C_G0_94: /* single byte */
1401 if (++i < src->size) switch (src->data[i]) {
1402 case I2CS_94_JIS_BUGROM: /* in case old buggy software */
1403 case I2CS_94_JIS_ROMAN: /* JIS X 0201-1976 left half */
1404 case I2CS_94_ASCII: /* ASCII */
1405 case I2CS_94_BRITISH: /* good enough for gov't work */
1406 break;
1407 default: /* other 94 single byte */
1408 return NIL; /* definitely invalid */
1411 /* if possible UTF-8 and not ISO-2022-JP */
1412 else if (!iso2022jp && (eightbit >= 0) && (src->data[i] & BIT8) &&
1413 (eightbit = utf8_validate (src->data + i,src->size - i)) > 0)
1414 i += eightbit - 1; /* skip past all but last of UTF-8 char */
1416 /* ISO-2022-JP overrides other guesses */
1417 if (iso2022jp) return utf8_charset ("ISO-2022-JP");
1418 if (eightbit > 0) return utf8_charset ("UTF-8");
1419 return eightbit ? NIL : utf8_charset ("US-ASCII");
1423 /* Validate that character at this position is UTF-8
1424 * Accepts: string pointer
1425 * size of remaining string
1426 * Returns: size of UTF-8 character in octets or -1 if not UTF-8
1427 */
1429 long utf8_validate (unsigned char *s,unsigned long i)
1431 unsigned long j = i;
1432 return (utf8_get (&s,&i) & U8G_ERROR) ? -1 : j - i;
1435 /* Convert ISO 8859-1 to UTF-8
1436 * Accepts: source sized text
1437 * pointer to return sized text
1438 * canonicalization function
1439 */
1441 void utf8_text_1byte0 (SIZEDTEXT *text,SIZEDTEXT *ret,ucs4cn_t cv,ucs4de_t de)
1443 unsigned long i;
1444 unsigned char *s;
1445 unsigned int c;
1446 for (ret->size = i = 0; i < text->size;) {
1447 c = text->data[i++];
1448 UTF8_COUNT_BMP (ret->size,c,cv,de)
1450 (s = ret->data = (unsigned char *) fs_get (ret->size + 1))[ret->size] =NIL;
1451 for (i = 0; i < text->size;) {
1452 c = text->data[i++];
1453 UTF8_WRITE_BMP (s,c,cv,de) /* convert UCS-2 to UTF-8 */
1458 /* Convert single byte ASCII+8bit character set sized text to UTF-8
1459 * Accepts: source sized text
1460 * pointer to return sized text
1461 * conversion table
1462 * canonicalization function
1463 */
1465 void utf8_text_1byte (SIZEDTEXT *text,SIZEDTEXT *ret,void *tab,ucs4cn_t cv,
1466 ucs4de_t de)
1468 unsigned long i;
1469 unsigned char *s;
1470 unsigned int c;
1471 unsigned short *tbl = (unsigned short *) tab;
1472 for (ret->size = i = 0; i < text->size;) {
1473 if ((c = text->data[i++]) & BIT8) c = tbl[c & BITS7];
1474 UTF8_COUNT_BMP (ret->size,c,cv,de)
1476 (s = ret->data = (unsigned char *) fs_get (ret->size + 1))[ret->size] =NIL;
1477 for (i = 0; i < text->size;) {
1478 if ((c = text->data[i++]) & BIT8) c = tbl[c & BITS7];
1479 UTF8_WRITE_BMP (s,c,cv,de) /* convert UCS-2 to UTF-8 */
1483 /* Convert single byte 8bit character set sized text to UTF-8
1484 * Accepts: source sized text
1485 * pointer to return sized text
1486 * conversion table
1487 * canonicalization function
1488 */
1490 void utf8_text_1byte8 (SIZEDTEXT *text,SIZEDTEXT *ret,void *tab,ucs4cn_t cv,
1491 ucs4de_t de)
1493 unsigned long i;
1494 unsigned char *s;
1495 unsigned int c;
1496 unsigned short *tbl = (unsigned short *) tab;
1497 for (ret->size = i = 0; i < text->size;) {
1498 c = tbl[text->data[i++]];
1499 UTF8_COUNT_BMP (ret->size,c,cv,de)
1501 (s = ret->data = (unsigned char *) fs_get (ret->size + 1))[ret->size] =NIL;
1502 for (i = 0; i < text->size;) {
1503 c = tbl[text->data[i++]];
1504 UTF8_WRITE_BMP (s,c,cv,de) /* convert UCS-2 to UTF-8 */
1508 /* Convert EUC sized text to UTF-8
1509 * Accepts: source sized text
1510 * pointer to return sized text
1511 * EUC parameter table
1512 * canonicalization function
1513 */
1515 void utf8_text_euc (SIZEDTEXT *text,SIZEDTEXT *ret,void *tab,ucs4cn_t cv,
1516 ucs4de_t de)
1518 unsigned long i;
1519 unsigned char *s;
1520 unsigned int pass,c,c1,ku,ten;
1521 struct utf8_eucparam *p1 = (struct utf8_eucparam *) tab;
1522 struct utf8_eucparam *p2 = p1 + 1;
1523 struct utf8_eucparam *p3 = p1 + 2;
1524 unsigned short *t1 = (unsigned short *) p1->tab;
1525 unsigned short *t2 = (unsigned short *) p2->tab;
1526 unsigned short *t3 = (unsigned short *) p3->tab;
1527 for (pass = 0,s = NIL,ret->size = 0; pass <= 1; pass++) {
1528 for (i = 0; i < text->size;) {
1529 /* not CS0? */
1530 if ((c = text->data[i++]) & BIT8) {
1531 /* yes, must have another high byte */
1532 if ((i >= text->size) || !((c1 = text->data[i++]) & BIT8))
1533 c = UBOGON; /* out of space or bogon */
1534 else switch (c) { /* check 8bit code set */
1535 case EUC_CS2: /* CS2 */
1536 if (p2->base_ku) { /* CS2 set up? */
1537 if (p2->base_ten) /* yes, multibyte? */
1538 c = ((i < text->size) && ((c = text->data[i++]) & BIT8) &&
1539 ((ku = (c1 & BITS7) - p2->base_ku) < p2->max_ku) &&
1540 ((ten = (c & BITS7) - p2->base_ten) < p2->max_ten)) ?
1541 t2[(ku*p2->max_ten) + ten] : UBOGON;
1542 else c = ((c1 >= p2->base_ku) && (c1 < p2->max_ku)) ?
1543 c1 + ((unsigned long) p2->tab) : UBOGON;
1545 else { /* CS2 not set up */
1546 c = UBOGON; /* swallow byte, say bogon */
1547 if (i < text->size) i++;
1549 break;
1550 case EUC_CS3: /* CS3 */
1551 if (p3->base_ku) { /* CS3 set up? */
1552 if (p3->base_ten) /* yes, multibyte? */
1553 c = ((i < text->size) && ((c = text->data[i++]) & BIT8) &&
1554 ((ku = (c1 & BITS7) - p3->base_ku) < p3->max_ku) &&
1555 ((ten = (c & BITS7) - p3->base_ten) < p3->max_ten)) ?
1556 t3[(ku*p3->max_ten) + ten] : UBOGON;
1557 else c = ((c1 >= p3->base_ku) && (c1 < p3->max_ku)) ?
1558 c1 + ((unsigned long) p3->tab) : UBOGON;
1560 else { /* CS3 not set up */
1561 c = UBOGON; /* swallow byte, say bogon */
1562 if (i < text->size) i++;
1564 break;
1566 default:
1567 if (((ku = (c & BITS7) - p1->base_ku) >= p1->max_ku) ||
1568 ((ten = (c1 & BITS7) - p1->base_ten) >= p1->max_ten)) c = UBOGON;
1569 else if (((c = t1[(ku*p1->max_ten) + ten]) == UBOGON) &&
1570 /* special hack for JIS X 0212: merge rows less than 10 */
1571 ku && (ku < 10) && t3 && p3->base_ten)
1572 c = t3[((ku - (p3->base_ku - p1->base_ku))*p3->max_ten) + ten];
1575 /* convert if second pass */
1576 if (pass) UTF8_WRITE_BMP (s,c,cv,de)
1577 else UTF8_COUNT_BMP (ret->size,c,cv,de);
1579 if (!pass) (s = ret->data = (unsigned char *)
1580 fs_get (ret->size + 1))[ret->size] =NIL;
1585 /* Convert ASCII + double-byte sized text to UTF-8
1586 * Accepts: source sized text
1587 * pointer to return sized text
1588 * conversion table
1589 * canonicalization function
1590 */
1592 void utf8_text_dbyte (SIZEDTEXT *text,SIZEDTEXT *ret,void *tab,ucs4cn_t cv,
1593 ucs4de_t de)
1595 unsigned long i;
1596 unsigned char *s;
1597 unsigned int c,c1,ku,ten;
1598 struct utf8_eucparam *p1 = (struct utf8_eucparam *) tab;
1599 unsigned short *t1 = (unsigned short *) p1->tab;
1600 for (ret->size = i = 0; i < text->size;) {
1601 if ((c = text->data[i++]) & BIT8) {
1602 /* special hack for GBK: 0x80 is Euro */
1603 if ((c == 0x80) && (t1 == (unsigned short *) gb2312tab)) c = UCS2_EURO;
1604 else c = ((i < text->size) && (c1 = text->data[i++]) &&
1605 ((ku = c - p1->base_ku) < p1->max_ku) &&
1606 ((ten = c1 - p1->base_ten) < p1->max_ten)) ?
1607 t1[(ku*p1->max_ten) + ten] : UBOGON;
1609 UTF8_COUNT_BMP (ret->size,c,cv,de)
1611 (s = ret->data = (unsigned char *) fs_get (ret->size + 1))[ret->size] = NIL;
1612 for (i = 0; i < text->size;) {
1613 if ((c = text->data[i++]) & BIT8) {
1614 /* special hack for GBK: 0x80 is Euro */
1615 if ((c == 0x80) && (t1 == (unsigned short *) gb2312tab)) c = UCS2_EURO;
1616 else c = ((i < text->size) && (c1 = text->data[i++]) &&
1617 ((ku = c - p1->base_ku) < p1->max_ku) &&
1618 ((ten = c1 - p1->base_ten) < p1->max_ten)) ?
1619 t1[(ku*p1->max_ten) + ten] : UBOGON;
1621 UTF8_WRITE_BMP (s,c,cv,de) /* convert UCS-2 to UTF-8 */
1625 /* Convert ASCII + double byte 2 plane sized text to UTF-8
1626 * Accepts: source sized text
1627 * pointer to return sized text
1628 * conversion table
1629 * canonicalization function
1630 */
1632 void utf8_text_dbyte2 (SIZEDTEXT *text,SIZEDTEXT *ret,void *tab,ucs4cn_t cv,
1633 ucs4de_t de)
1635 unsigned long i;
1636 unsigned char *s;
1637 unsigned int c,c1,ku,ten;
1638 struct utf8_eucparam *p1 = (struct utf8_eucparam *) tab;
1639 struct utf8_eucparam *p2 = p1 + 1;
1640 unsigned short *t = (unsigned short *) p1->tab;
1641 for (ret->size = i = 0; i < text->size;) {
1642 if ((c = text->data[i++]) & BIT8) {
1643 if ((i >= text->size) || !(c1 = text->data[i++]))
1644 c = UBOGON; /* out of space or bogon */
1645 else if (c1 & BIT8) /* high vs. low plane */
1646 c = ((ku = c - p2->base_ku) < p2->max_ku &&
1647 ((ten = c1 - p2->base_ten) < p2->max_ten)) ?
1648 t[(ku*(p1->max_ten + p2->max_ten)) + p1->max_ten + ten] :UBOGON;
1649 else c = ((ku = c - p1->base_ku) < p1->max_ku &&
1650 ((ten = c1 - p1->base_ten) < p1->max_ten)) ?
1651 t[(ku*(p1->max_ten + p2->max_ten)) + ten] : UBOGON;
1653 UTF8_COUNT_BMP (ret->size,c,cv,de)
1655 (s = ret->data = (unsigned char *) fs_get (ret->size + 1))[ret->size] = NIL;
1656 for (i = 0; i < text->size;) {
1657 if ((c = text->data[i++]) & BIT8) {
1658 if ((i >= text->size) || !(c1 = text->data[i++]))
1659 c = UBOGON; /* out of space or bogon */
1660 else if (c1 & BIT8) /* high vs. low plane */
1661 c = ((ku = c - p2->base_ku) < p2->max_ku &&
1662 ((ten = c1 - p2->base_ten) < p2->max_ten)) ?
1663 t[(ku*(p1->max_ten + p2->max_ten)) + p1->max_ten + ten] :UBOGON;
1664 else c = ((ku = c - p1->base_ku) < p1->max_ku &&
1665 ((ten = c1 - p1->base_ten) < p1->max_ten)) ?
1666 t[(ku*(p1->max_ten + p2->max_ten)) + ten] : UBOGON;
1668 UTF8_WRITE_BMP (s,c,cv,de) /* convert UCS-2 to UTF-8 */
#ifdef JISTOUNICODE             /* Japanese */
/* Convert Shift JIS sized text to UTF-8
 * Accepts: source sized text
 *          pointer to return sized text
 *          canonicalization function
 *          decomposition function (ucs4de_t); both functions are handed
 *           to the UTF8_COUNT_BMP / UTF8_WRITE_BMP macros
 *
 * Two passes are made over the source: pass 0 counts the result size,
 * pass 1 writes into the buffer obtained in between.  Both passes run the
 * very same decoding code, so a lead octet that is the last octet of the
 * text is UBOGON in both, and the second octet is never fetched from
 * beyond the end of the source.
 */

void utf8_text_sjis (SIZEDTEXT *text,SIZEDTEXT *ret,ucs4cn_t cv,
                     ucs4de_t de)
{
  unsigned long i;
  unsigned char *s = NIL;
  unsigned int pass,c,c1,ku,ten;
  ret->size = 0;
  for (pass = 0; pass <= 1; pass++) {
    for (i = 0; i < text->size;) {
      if ((c = text->data[i++]) & BIT8) {
                                /* half-width katakana */
        if ((c >= MIN_KANA_8) && (c < MAX_KANA_8)) c += KANA_8;
                                /* lead octet with nothing after it */
        else if (i >= text->size) c = UBOGON;
        else {                  /* Shift-JIS */
          c1 = text->data[i++];
          SJISTOJIS (c,c1);
          c = JISTOUNICODE (c,c1,ku,ten);
        }
      }
                                /* compromise - do yen sign but not overline */
      else if (c == JISROMAN_YEN) c = UCS2_YEN;
                                /* convert if second pass */
      if (pass) UTF8_WRITE_BMP (s,c,cv,de)
      else UTF8_COUNT_BMP (ret->size,c,cv,de)
    }
    if (!pass) {                /* counted, now get the buffer */
      ret->data = (unsigned char *) fs_get (ret->size + 1);
      ret->data[ret->size] = NIL;
      s = ret->data;
    }
  }
}
#endif
1718 /* Convert ISO-2022 sized text to UTF-8
1719 * Accepts: source sized text
1720 * pointer to returned sized text
1721 * canonicalization function
1722 */
1724 void utf8_text_2022 (SIZEDTEXT *text,SIZEDTEXT *ret,ucs4cn_t cv,ucs4de_t de)
1726 unsigned long i;
1727 unsigned char *s;
1728 unsigned int pass,state,c,co,gi,gl,gr,g[4],ku,ten;
1729 for (pass = 0,s = NIL,ret->size = 0; pass <= 1; pass++) {
1730 gi = 0; /* quell compiler warnings */
1731 state = I2S_CHAR; /* initialize engine */
1732 g[0]= g[2] = I2CS_ASCII; /* G0 and G2 are ASCII */
1733 g[1]= g[3] = I2CS_ISO8859_1;/* G1 and G3 are ISO-8850-1 */
1734 gl = I2C_G0; gr = I2C_G1; /* left is G0, right is G1 */
1735 for (i = 0; i < text->size;) {
1736 c = text->data[i++];
1737 switch (state) { /* dispatch based upon engine state */
1738 case I2S_ESC: /* ESC seen */
1739 switch (c) { /* process intermediate character */
1740 case I2C_MULTI: /* multibyte character? */
1741 state = I2S_MUL; /* mark multibyte flag seen */
1742 break;
1743 case I2C_SS2: /* single shift GL to G2 */
1744 case I2C_SS2_ALT: /* Taiwan SeedNet */
1745 gl |= I2C_SG2;
1746 break;
1747 case I2C_SS3: /* single shift GL to G3 */
1748 case I2C_SS3_ALT: /* Taiwan SeedNet */
1749 gl |= I2C_SG3;
1750 break;
1751 case I2C_LS2: /* shift GL to G2 */
1752 gl = I2C_G2;
1753 break;
1754 case I2C_LS3: /* shift GL to G3 */
1755 gl = I2C_G3;
1756 break;
1757 case I2C_LS1R: /* shift GR to G1 */
1758 gr = I2C_G1;
1759 break;
1760 case I2C_LS2R: /* shift GR to G2 */
1761 gr = I2C_G2;
1762 break;
1763 case I2C_LS3R: /* shift GR to G3 */
1764 gr = I2C_G3;
1765 break;
1766 case I2C_G0_94: case I2C_G1_94: case I2C_G2_94: case I2C_G3_94:
1767 g[gi = c - I2C_G0_94] = (state == I2S_MUL) ? I2CS_94x94 : I2CS_94;
1768 state = I2S_INT; /* ready for character set */
1769 break;
1770 case I2C_G0_96: case I2C_G1_96: case I2C_G2_96: case I2C_G3_96:
1771 g[gi = c - I2C_G0_96] = (state == I2S_MUL) ? I2CS_96x96 : I2CS_96;
1772 state = I2S_INT; /* ready for character set */
1773 break;
1774 default: /* bogon */
1775 if (pass) *s++ = I2C_ESC,*s++ = c;
1776 else ret->size += 2;
1777 state = I2S_CHAR; /* return to previous state */
1779 break;
1781 case I2S_MUL: /* ESC $ */
1782 switch (c) { /* process multibyte intermediate character */
1783 case I2C_G0_94: case I2C_G1_94: case I2C_G2_94: case I2C_G3_94:
1784 g[gi = c - I2C_G0_94] = I2CS_94x94;
1785 state = I2S_INT; /* ready for character set */
1786 break;
1787 case I2C_G0_96: case I2C_G1_96: case I2C_G2_96: case I2C_G3_96:
1788 g[gi = c - I2C_G0_96] = I2CS_96x96;
1789 state = I2S_INT; /* ready for character set */
1790 break;
1791 default: /* probably omitted I2CS_94x94 */
1792 g[gi = I2C_G0] = I2CS_94x94 | c;
1793 state = I2S_CHAR; /* return to character state */
1795 break;
1796 case I2S_INT:
1797 state = I2S_CHAR; /* return to character state */
1798 g[gi] |= c; /* set character set */
1799 break;
1801 case I2S_CHAR: /* character data */
1802 switch (c) {
1803 case I2C_ESC: /* ESC character */
1804 state = I2S_ESC; /* see if ISO-2022 prefix */
1805 break;
1806 case I2C_SI: /* shift GL to G0 */
1807 gl = I2C_G0;
1808 break;
1809 case I2C_SO: /* shift GL to G1 */
1810 gl = I2C_G1;
1811 break;
1812 case I2C_SS2_ALT: /* single shift GL to G2 */
1813 case I2C_SS2_ALT_7:
1814 gl |= I2C_SG2;
1815 break;
1816 case I2C_SS3_ALT: /* single shift GL to G3 */
1817 case I2C_SS3_ALT_7:
1818 gl |= I2C_SG3;
1819 break;
1821 default: /* ordinary character */
1822 co = c; /* note original character */
1823 if (gl & (3 << 2)) { /* single shifted? */
1824 gi = g[gl >> 2]; /* get shifted character set */
1825 gl &= 0x3; /* cancel shift */
1827 /* select left or right half */
1828 else gi = (c & BIT8) ? g[gr] : g[gl];
1829 c &= BITS7; /* make 7-bit */
1830 switch (gi) { /* interpret in character set */
1831 case I2CS_ASCII: /* ASCII */
1832 break; /* easy! */
1833 case I2CS_BRITISH: /* British ASCII */
1834 /* Pound sterling sign */
1835 if (c == BRITISH_POUNDSTERLING) c = UCS2_POUNDSTERLING;
1836 break;
1837 case I2CS_JIS_ROMAN: /* JIS Roman */
1838 case I2CS_JIS_BUGROM: /* old bugs */
1839 switch (c) { /* two exceptions to ASCII */
1840 case JISROMAN_YEN: /* Yen sign */
1841 c = UCS2_YEN;
1842 break;
1843 /* overline */
1844 case JISROMAN_OVERLINE:
1845 c = UCS2_OVERLINE;
1846 break;
1848 break;
1849 case I2CS_JIS_KANA: /* JIS hankaku katakana */
1850 if ((c >= MIN_KANA_7) && (c < MAX_KANA_7)) c += KANA_7;
1851 break;
1853 case I2CS_ISO8859_1: /* Latin-1 (West European) */
1854 c |= BIT8; /* just turn on high bit */
1855 break;
1856 case I2CS_ISO8859_2: /* Latin-2 (Czech, Slovak) */
1857 c = iso8859_2tab[c];
1858 break;
1859 case I2CS_ISO8859_3: /* Latin-3 (Dutch, Turkish) */
1860 c = iso8859_3tab[c];
1861 break;
1862 case I2CS_ISO8859_4: /* Latin-4 (Scandinavian) */
1863 c = iso8859_4tab[c];
1864 break;
1865 case I2CS_ISO8859_5: /* Cyrillic */
1866 c = iso8859_5tab[c];
1867 break;
1868 case I2CS_ISO8859_6: /* Arabic */
1869 c = iso8859_6tab[c];
1870 break;
1871 case I2CS_ISO8859_7: /* Greek */
1872 c = iso8859_7tab[c];
1873 break;
1874 case I2CS_ISO8859_8: /* Hebrew */
1875 c = iso8859_8tab[c];
1876 break;
1877 case I2CS_ISO8859_9: /* Latin-5 (Finnish, Portuguese) */
1878 c = iso8859_9tab[c];
1879 break;
1880 case I2CS_TIS620: /* Thai */
1881 c = tis620tab[c];
1882 break;
1883 case I2CS_ISO8859_10: /* Latin-6 (Northern Europe) */
1884 c = iso8859_10tab[c];
1885 break;
1886 case I2CS_ISO8859_13: /* Latin-7 (Baltic) */
1887 c = iso8859_13tab[c];
1888 break;
1889 case I2CS_VSCII: /* Vietnamese */
1890 c = visciitab[c];
1891 break;
1892 case I2CS_ISO8859_14: /* Latin-8 (Celtic) */
1893 c = iso8859_14tab[c];
1894 break;
1895 case I2CS_ISO8859_15: /* Latin-9 (Euro) */
1896 c = iso8859_15tab[c];
1897 break;
1898 case I2CS_ISO8859_16: /* Latin-10 (Baltic) */
1899 c = iso8859_16tab[c];
1900 break;
1902 default: /* all other character sets */
1903 /* multibyte character set */
1904 if ((gi & I2CS_MUL) && !(c & BIT8) && isgraph (c)) {
1905 c = (i < text->size) ? text->data[i++] : 0;
1906 switch (gi) {
1907 #ifdef GBTOUNICODE
1908 case I2CS_GB: /* GB 2312 */
1909 co |= BIT8; /* make into EUC */
1910 c |= BIT8;
1911 c = GBTOUNICODE (co,c,ku,ten);
1912 break;
1913 #endif
1914 #ifdef JISTOUNICODE
1915 case I2CS_JIS_OLD:/* JIS X 0208-1978 */
1916 case I2CS_JIS_NEW:/* JIS X 0208-1983 */
1917 c = JISTOUNICODE (co,c,ku,ten);
1918 break;
1919 #endif
1920 #ifdef JIS0212TOUNICODE
1921 case I2CS_JIS_EXT:/* JIS X 0212-1990 */
1922 c = JIS0212TOUNICODE (co,c,ku,ten);
1923 break;
1924 #endif
1925 #ifdef KSCTOUNICODE
1926 case I2CS_KSC: /* KSC 5601 */
1927 co |= BIT8; /* make into EUC */
1928 c |= BIT8;
1929 c = KSCTOUNICODE (co,c,ku,ten);
1930 break;
1931 #endif
1932 #ifdef CNS1TOUNICODE
1933 case I2CS_CNS1: /* CNS 11643 plane 1 */
1934 c = CNS1TOUNICODE (co,c,ku,ten);
1935 break;
1936 #endif
1937 #ifdef CNS2TOUNICODE
1938 case I2CS_CNS2: /* CNS 11643 plane 2 */
1939 c = CNS2TOUNICODE (co,c,ku,ten);
1940 break;
1941 #endif
1942 #ifdef CNS3TOUNICODE
1943 case I2CS_CNS3: /* CNS 11643 plane 3 */
1944 c = CNS3TOUNICODE (co,c,ku,ten);
1945 break;
1946 #endif
1947 #ifdef CNS4TOUNICODE
1948 case I2CS_CNS4: /* CNS 11643 plane 4 */
1949 c = CNS4TOUNICODE (co,c,ku,ten);
1950 break;
1951 #endif
1952 #ifdef CNS5TOUNICODE
1953 case I2CS_CNS5: /* CNS 11643 plane 5 */
1954 c = CNS5TOUNICODE (co,c,ku,ten);
1955 break;
1956 #endif
1957 #ifdef CNS6TOUNICODE
1958 case I2CS_CNS6: /* CNS 11643 plane 6 */
1959 c = CNS6TOUNICODE (co,c,ku,ten);
1960 break;
1961 #endif
1962 #ifdef CNS7TOUNICODE
1963 case I2CS_CNS7: /* CNS 11643 plane 7 */
1964 c = CNS7TOUNICODE (co,c,ku,ten);
1965 break;
1966 #endif
1967 default: /* unknown multibyte, treat as UCS-2 */
1968 c |= (co << 8); /* wrong, but nothing else to do */
1969 break;
1972 else c = co; /* unknown single byte, treat as 8859-1 */
1974 /* convert if second pass */
1975 if (pass) UTF8_WRITE_BMP (s,c,cv,de)
1976 else UTF8_COUNT_BMP (ret->size,c,cv,de);
1980 if (!pass) (s = ret->data = (unsigned char *)
1981 fs_get (ret->size + 1))[ret->size] = NIL;
1982 else if (((unsigned long) (s - ret->data)) != ret->size)
1983 fatal ("ISO-2022 to UTF-8 botch");
1987 /* Convert UTF-7 sized text to UTF-8
1988 * Accepts: source sized text
1989 * pointer to returned sized text
1990 * canonicalization function
1991 */
1993 void utf8_text_utf7 (SIZEDTEXT *text,SIZEDTEXT *ret,ucs4cn_t cv,ucs4de_t de)
1995 unsigned long i;
1996 unsigned char *s;
1997 unsigned int c,c1,d,uc,pass,e,e1,state,surrh;
1998 for (pass = 0,s = NIL,ret->size = 0; pass <= 1; pass++) {
1999 c1 = d = uc = e = e1 = 0;
2000 for (i = 0,state = NIL; i < text->size;) {
2001 c = text->data[i++]; /* get next byte */
2002 switch (state) {
2003 case U7_PLUS: /* previous character was + */
2004 if (c == '-') { /* +- means textual + */
2005 c = '+';
2006 state = U7_ASCII; /* revert to ASCII */
2007 break;
2009 state = U7_UNICODE; /* enter Unicode state */
2010 e = e1 = 0; /* initialize Unicode quantum position */
2011 case U7_UNICODE: /* Unicode state */
2012 if (c == '-') state = U7_MINUS;
2013 else { /* decode Unicode */
2014 /* don't use isupper/islower since this is ASCII only */
2015 if ((c >= 'A') && (c <= 'Z')) c -= 'A';
2016 else if ((c >= 'a') && (c <= 'z')) c -= 'a' - 26;
2017 else if (isdigit (c)) c -= '0' - 52;
2018 else if (c == '+') c = 62;
2019 else if (c == '/') c = 63;
2020 else state = U7_ASCII;/* end of modified BASE64 */
2022 break;
2023 case U7_MINUS: /* previous character was absorbed - */
2024 state = U7_ASCII; /* revert to ASCII */
2025 case U7_ASCII: /* ASCII state */
2026 if (c == '+') state = U7_PLUS;
2027 break;
2030 switch (state) { /* store character if in character mode */
2031 case U7_UNICODE: /* Unicode */
2032 switch (e++) { /* install based on BASE64 state */
2033 case 0:
2034 c1 = c << 2; /* byte 1: high 6 bits */
2035 break;
2036 case 1:
2037 d = c1 | (c >> 4); /* byte 1: low 2 bits */
2038 c1 = c << 4; /* byte 2: high 4 bits */
2039 break;
2040 case 2:
2041 d = c1 | (c >> 2); /* byte 2: low 4 bits */
2042 c1 = c << 6; /* byte 3: high 2 bits */
2043 break;
2044 case 3:
2045 d = c | c1; /* byte 3: low 6 bits */
2046 e = 0; /* reinitialize mechanism */
2047 break;
2049 if (e == 1) break; /* done if first BASE64 state */
2050 if (!e1) { /* first byte of UCS-2 character */
2051 uc = (d & 0xff) << 8; /* note first byte */
2052 e1 = T; /* enter second UCS-2 state */
2053 break; /* done */
2055 c = uc | (d & 0xff); /* build UCS-2 character */
2056 e1 = NIL; /* back to first UCS-2 state, drop in */
2057 /* surrogate pair? */
2058 if ((c >= UTF16_SURR) && (c <= UTF16_MAXSURR)) {
2059 /* save high surrogate for later */
2060 if (c < UTF16_SURRL) surrh = c;
2061 else c = UTF16_BASE + ((surrh & UTF16_MASK) << UTF16_SHIFT) +
2062 (c & UTF16_MASK);
2063 break; /* either way with surrogates, we're done */
2065 case U7_ASCII: /* just install if ASCII */
2066 /* convert if second pass */
2067 if (pass) UTF8_WRITE_BMP (s,c,cv,de)
2068 else UTF8_COUNT_BMP (ret->size,c,cv,de);
2071 if (!pass) (s = ret->data = (unsigned char *)
2072 fs_get (ret->size + 1))[ret->size] = NIL;
2073 else if (((unsigned long) (s - ret->data)) != ret->size)
2074 fatal ("UTF-7 to UTF-8 botch");
2079 /* Convert UTF-8 sized text to UTF-8
2080 * Accepts: source sized text
2081 * pointer to returned sized text
2082 * canonicalization function
2083 */
2085 void utf8_text_utf8 (SIZEDTEXT *text,SIZEDTEXT *ret,ucs4cn_t cv,ucs4de_t de)
2087 unsigned long i,c;
2088 unsigned char *s,*t;
2089 for (ret->size = 0, t = text->data, i = text->size; i;) {
2090 if ((c = utf8_get (&t,&i)) & U8G_ERROR) {
2091 ret->data = text->data; /* conversion failed */
2092 ret->size = text->size;
2093 return;
2095 UTF8_COUNT (ret->size,c,cv,de)
2097 (s = ret->data = (unsigned char *) fs_get (ret->size + 1))[ret->size] =NIL;
2098 for (t = text->data, i = text->size; i;) {
2099 c = utf8_get (&t,&i);
2100 UTF8_WRITE (s,c,cv,de) /* convert UCS-4 to UTF-8 */
2102 if (((unsigned long) (s - ret->data)) != ret->size)
2103 fatal ("UTF-8 to UTF-8 botch");
2106 /* Convert UCS-2 sized text to UTF-8
2107 * Accepts: source sized text
2108 * pointer to returned sized text
2109 * canonicalization function
2110 */
2112 void utf8_text_ucs2 (SIZEDTEXT *text,SIZEDTEXT *ret,ucs4cn_t cv,ucs4de_t de)
2114 unsigned long i;
2115 unsigned char *s,*t;
2116 unsigned int c;
2117 for (ret->size = 0, t = text->data, i = text->size / 2; i; --i) {
2118 c = *t++ << 8;
2119 c |= *t++;
2120 UTF8_COUNT_BMP (ret->size,c,cv,de);
2122 (s = ret->data = (unsigned char *) fs_get (ret->size + 1))[ret->size] = NIL;
2123 for (t = text->data, i = text->size / 2; i; --i) {
2124 c = *t++ << 8;
2125 c |= *t++;
2126 UTF8_WRITE_BMP (s,c,cv,de) /* convert UCS-2 to UTF-8 */
2128 if (((unsigned long) (s - ret->data)) != ret->size)
2129 fatal ("UCS-2 to UTF-8 botch");
2133 /* Convert UCS-4 sized text to UTF-8
2134 * Accepts: source sized text
2135 * pointer to returned sized text
2136 * canonicalization function
2137 */
2139 void utf8_text_ucs4 (SIZEDTEXT *text,SIZEDTEXT *ret,ucs4cn_t cv,ucs4de_t de)
2141 unsigned long i;
2142 unsigned char *s,*t;
2143 unsigned long c;
2144 for (ret->size = 0, t = text->data, i = text->size / 4; i; --i) {
2145 c = *t++ << 24; c |= *t++ << 16; c |= *t++ << 8; c |= *t++;
2146 UTF8_COUNT (ret->size,c,cv,de);
2148 (s = ret->data = (unsigned char *) fs_get (ret->size + 1))[ret->size] = NIL;
2149 for (t = text->data, i = text->size / 2; i; --i) {
2150 c = *t++ << 24; c |= *t++ << 16; c |= *t++ << 8; c |= *t++;
2151 UTF8_WRITE (s,c,cv,de) /* convert UCS-4 to UTF-8 */
2153 if (((unsigned long) (s - ret->data)) != ret->size)
2154 fatal ("UCS-4 to UTF-8 botch");
2157 /* Convert UTF-16 sized text to UTF-8
2158 * Accepts: source sized text
2159 * pointer to returned sized text
2160 * canonicalization function
2161 */
2163 void utf8_text_utf16 (SIZEDTEXT *text,SIZEDTEXT *ret,ucs4cn_t cv,ucs4de_t de)
2165 unsigned long i;
2166 unsigned char *s,*t;
2167 unsigned long c,d;
2168 for (ret->size = 0, t = text->data, i = text->size / 2; i; --i) {
2169 c = *t++ << 8;
2170 c |= *t++;
2171 /* possible surrogate? */
2172 if ((c >= UTF16_SURR) && (c <= UTF16_MAXSURR)) {
2173 /* invalid first surrogate */
2174 if ((c > UTF16_SURRHEND) || !i) c = UBOGON;
2175 else { /* get second surrogate */
2176 d = *t++ << 8;
2177 d |= *t++;
2178 --i; /* swallowed another 16-bits */
2179 /* invalid second surrogate */
2180 if ((d < UTF16_SURRL) || (d > UTF16_SURRLEND)) c = UBOGON;
2181 else c = UTF16_BASE + ((c & UTF16_MASK) << UTF16_SHIFT) +
2182 (d & UTF16_MASK);
2185 UTF8_COUNT (ret->size,c,cv,de);
2187 (s = ret->data = (unsigned char *) fs_get (ret->size + 1))[ret->size] = NIL;
2188 for (t = text->data, i = text->size / 2; i; --i) {
2189 c = *t++ << 8;
2190 c |= *t++;
2191 /* possible surrogate? */
2192 if ((c >= UTF16_SURR) && (c <= UTF16_MAXSURR)) {
2193 /* invalid first surrogate */
2194 if ((c > UTF16_SURRHEND) || !i) c = UBOGON;
2195 else { /* get second surrogate */
2196 d = *t++ << 8;
2197 d |= *t++;
2198 --i; /* swallowed another 16-bits */
2199 /* invalid second surrogate */
2200 if ((d < UTF16_SURRL) || (d > UTF16_SURRLEND)) c = UBOGON;
2201 else c = UTF16_BASE + ((c & UTF16_MASK) << UTF16_SHIFT) +
2202 (d & UTF16_MASK);
2205 UTF8_WRITE (s,c,cv,de) /* convert UCS-4 to UTF-8 */
2207 if (((unsigned long) (s - ret->data)) != ret->size)
2208 fatal ("UTF-16 to UTF-8 botch");
/* Size of UCS-4 character, possibly not in BMP, as UTF-8 octets
 * Accepts: character
 * Returns: number of octets (1 through 6), or 0 if the character is
 *	    0x80000000 or above (bogon)
 *
 * Use UTF8_SIZE macro if known to be in the BMP
 */

unsigned long utf8_size (unsigned long c)
{
				/* first value too large for 1..6 octets */
  static const unsigned long ceiling[6] = {
    0x80,0x800,0x10000,0x200000,0x4000000,0x80000000
  };
  unsigned long octets;
  for (octets = 0; octets < 6; octets++)
    if (c < ceiling[octets]) return octets + 1;
  return 0;			/* does not fit in any UTF-8 form */
}
/* Put UCS-4 character, possibly not in BMP, as UTF-8 octets
 * Accepts: destination string pointer
 *	    character
 * Returns: updated destination pointer (unchanged, with nothing written,
 *	    when utf8_size() reports 0 for the character)
 *
 * Use UTF8_PUT_BMP macro if known to be in the BMP
 */

unsigned char *utf8_put (unsigned char *s,unsigned long c)
{
				/* lead octet marker for each encoded length */
  static const unsigned char lead[6] = {0x00,0xc0,0xe0,0xf0,0xf8,0xfc};
  unsigned long len = utf8_size (c);
  unsigned long k;
  if (len) {
				/* continuation octets, last one first, each
				 * carrying the next low-order 6 bits
				 */
    for (k = len - 1; k; --k) {
      s[k] = 0x80 | (unsigned char) (c & 0x3f);
      c >>= 6;
    }
				/* what is left goes under the lead marker */
    *s = lead[len - 1] | (unsigned char) (c & 0x7f);
  }
  return s + len;
}
2265 /* Return title case of a fixed-width UCS-4 character
2266 * Accepts: character
2267 * Returns: title case of character
2268 */
2270 unsigned long ucs4_titlecase (unsigned long c)
2272 if (c <= UCS4_TMAPMAX) return ucs4_tmaptab[c];
2273 if (c < UCS4_TMAPHIMIN) return c;
2274 if (c <= UCS4_TMAPHIMAX) return c - UCS4_TMAPHIMAP;
2275 if (c < UCS4_TMAPDESERETMIN) return c;
2276 if (c <= UCS4_TMAPDESERETMAX) return c - UCS4_TMAPDESERETMAP;
2277 return c;
2281 /* Return width of a fixed-width UCS-4 character in planes 0-2
2282 * Accepts: character
2283 * Returns: width (0, 1, 2) or negative error condition if not valid
2284 */
2286 long ucs4_width (unsigned long c)
2288 long ret;
2289 /* out of range, not-a-char, or surrogates */
2290 if ((c > UCS4_MAXUNICODE) || ((c & 0xfffe) == 0xfffe) ||
2291 ((c >= UTF16_SURR) && (c <= UTF16_MAXSURR))) ret = U4W_NOTUNCD;
2292 /* private-use */
2293 else if (c >= UCS4_PVTBASE) ret = U4W_PRIVATE;
2294 /* SSP are not printing characters */
2295 else if (c >= UCS4_SSPBASE) ret = U4W_SSPCHAR;
2296 /* unassigned planes */
2297 else if (c >= UCS4_UNABASE) ret = U4W_UNASSGN;
2298 /* SIP and reserved plane 3 are wide */
2299 else if (c >= UCS4_SIPBASE) ret = 2;
2300 #if (UCS4_WIDLEN != UCS4_SIPBASE)
2301 #error "UCS4_WIDLEN != UCS4_SIPBASE"
2302 #endif
2303 /* C0/C1 controls */
2304 else if ((c <= UCS2_C0CONTROLEND) ||
2305 ((c >= UCS2_C1CONTROL) && (c <= UCS2_C1CONTROLEND)))
2306 ret = U4W_CONTROL;
2307 /* BMP and SMP get value from table */
2308 else switch (ret = (ucs4_widthtab[(c >> 2)] >> ((3 - (c & 0x3)) << 1)) &0x3){
2309 case 0: /* zero-width */
2310 if (c == 0x00ad) ret = 1; /* force U+00ad (SOFT HYPHEN) to width 1 */
2311 case 1: /* single-width */
2312 case 2: /* double-width */
2313 break;
2314 case 3: /* ambiguous width */
2315 ret = (c >= 0x2100) ? 2 : 1;/* need to do something better than this */
2316 break;
2318 return ret;
2321 /* Return screen width of UTF-8 string
2322 * Accepts: string
2323 * Returns: width or negative if not valid UTF-8
2324 */
2326 long utf8_strwidth (unsigned char *s)
2328 unsigned long c,i,ret;
2329 /* go through string */
2330 for (ret = 0; *s; ret += ucs4_width (c)) {
2331 /* It's alright to give a fake value for the byte count to utf8_get()
2332 * since the null of a null-terminated string will stop processing anyway.
2333 */
2334 i = 6; /* fake value */
2335 if ((c = utf8_get (&s,&i)) & U8G_ERROR) return -1;
2337 return ret;
2341 /* Return screen width of UTF-8 text
2342 * Accepts: SIZEDTEXT to string
2343 * Returns: width or negative if not valid UTF-8
2344 */
2346 long utf8_textwidth (SIZEDTEXT *utf8)
2348 unsigned long c;
2349 unsigned char *s = utf8->data;
2350 unsigned long i = utf8->size;
2351 unsigned long ret = 0;
2352 while (i) { /* while there's a string to process */
2353 if ((c = utf8_get (&s,&i)) & U8G_ERROR) return -1;
2354 ret += ucs4_width (c);
2356 return ret;
/* Decomposition (phew!) */

/* Values for the "type" member of struct decomposemore */

#define MORESINGLE 1		/* one pending UCS-4 value in data.single */
#define MOREMULTIPLE 2		/* pending UCS-2 values in data.multiple */


/* Pending remainder of a decomposition, handed back through the "more"
 * argument of ucs4_decompose()
 */

struct decomposemore {
  short type;			/* MORESINGLE or MOREMULTIPLE */
  union {
    unsigned long single;	/* the one remaining value */
    struct {
      unsigned short *next;	/* next remaining BMP value */
      unsigned long count;	/* how many values remain */
    } multiple;
  } data;
};
/* Singly-linked stack of pending decompositions, used as the "more" state
 * of ucs4_decompose_recursive()
 */

#define RECURSIVEMORE struct recursivemore

RECURSIVEMORE {
  struct decomposemore *more;	/* pending block from ucs4_decompose() */
  RECURSIVEMORE *next;		/* older pending entries */
};
2383 /* Return decomposition of a UCS-4 character
2384 * Accepts: character or U8G_ERROR to return next from "more"
2385 * pointer to returned more
2386 * Returns: [next] decomposed value, more set if still more decomposition
2387 */
2389 unsigned long ucs4_decompose (unsigned long c,void **more)
2391 unsigned long i,ix,ret;
2392 struct decomposemore *m;
2393 if (c & U8G_ERROR) { /* want to chase more? */
2394 /* do sanity check */
2395 if (m = (struct decomposemore *) *more) switch (m->type) {
2396 case MORESINGLE: /* single value */
2397 ret = m->data.single;
2398 fs_give (more); /* no more decomposition */
2399 break;
2400 case MOREMULTIPLE: /* multiple value */
2401 ret = *m->data.multiple.next++;
2402 if (!--m->data.multiple.count) fs_give (more);
2403 break;
2404 default: /* uh-oh */
2405 fatal ("invalid more block argument to ucs4_decompose!");
2407 else fatal ("no more block provided to ucs4_decompose!");
2410 else { /* start decomposition */
2411 *more = NIL; /* initially set no more */
2412 /* BMP low decompositions */
2413 if (c < UCS4_BMPLOMIN) ret = c;
2414 /* fix this someday */
2415 else if (c == UCS4_BMPLOMIN) ret = ucs4_dbmplotab[0];
2416 else if (c <= UCS4_BMPLOMAX) {
2417 /* within range - have a decomposition? */
2418 if (i = ucs4_dbmploixtab[c - UCS4_BMPLOMIN]) {
2419 /* get first value of decomposition */
2420 ret = ucs4_dbmplotab[ix = i & UCS4_BMPLOIXMASK];
2421 /* has continuation? */
2422 if (i & UCS4_BMPLOSIZEMASK) {
2423 m = (struct decomposemore *)
2424 (*more = memset (fs_get (sizeof (struct decomposemore)),0,
2425 sizeof (struct decomposemore)));
2426 m->type = MOREMULTIPLE;
2427 m->data.multiple.next = &ucs4_dbmplotab[++ix];
2428 m->data.multiple.count = i >> UCS4_BMPLOSIZESHIFT;
2431 else ret = c; /* in range but doesn't decompose */
2433 /* BMP CJK compatibility */
2434 else if (c < UCS4_BMPCJKMIN) ret = c;
2435 else if (c <= UCS4_BMPCJKMAX) {
2436 if (!(ret = ucs4_bmpcjk1decomptab[c - UCS4_BMPCJKMIN])) ret = c;
2438 /* BMP CJK compatibility - some not in BMP */
2439 #if UCS4_BMPCJK2MIN - (UCS4_BMPCJKMAX + 1)
2440 else if (c < UCS4_BMPCJK2MIN) ret = c;
2441 #endif
2442 else if (c <= UCS4_BMPCJK2MAX)
2443 ret = ucs4_bmpcjk2decomptab[c - UCS4_BMPCJK2MIN];
2444 /* BMP high decompositions */
2445 else if (c < UCS4_BMPHIMIN) ret = c;
2446 else if (c <= UCS4_BMPHIMAX) {
2447 /* within range - have a decomposition? */
2448 if (i = ucs4_dbmphiixtab[c - UCS4_BMPHIMIN]) {
2449 /* get first value of decomposition */
2450 ret = ucs4_dbmphitab[ix = i & UCS4_BMPHIIXMASK];
2451 /* has continuation? */
2452 if (i & UCS4_BMPHISIZEMASK) {
2453 m = (struct decomposemore *)
2454 (*more = memset (fs_get (sizeof (struct decomposemore)),0,
2455 sizeof (struct decomposemore)));
2456 m->type = MOREMULTIPLE;
2457 m->data.multiple.next = &ucs4_dbmphitab[++ix];
2458 m->data.multiple.count = i >> UCS4_BMPHISIZESHIFT;
2461 else ret = c; /* in range but doesn't decompose */
2464 /* BMP half and full width forms */
2465 else if (c < UCS4_BMPHALFFULLMIN) ret = c;
2466 else if (c <= UCS4_BMPHALFFULLMAX) {
2467 if (!(ret = ucs4_bmphalffulldecomptab[c - UCS4_BMPHALFFULLMIN])) ret = c;
2469 /* SMP music */
2470 else if (c < UCS4_SMPMUSIC1MIN) ret = c;
2471 else if (c <= UCS4_SMPMUSIC1MAX) {
2472 ret = ucs4_smpmusic1decomptab[c -= UCS4_SMPMUSIC1MIN][0];
2473 m = (struct decomposemore *)
2474 (*more = memset (fs_get (sizeof (struct decomposemore)),0,
2475 sizeof (struct decomposemore)));
2476 m->type = MORESINGLE;
2477 m->data.single = ucs4_smpmusic1decomptab[c][1];
2479 else if (c < UCS4_SMPMUSIC2MIN) ret = c;
2480 else if (c <= UCS4_SMPMUSIC2MAX) {
2481 ret = ucs4_smpmusic2decomptab[c -= UCS4_SMPMUSIC2MIN][0];
2482 m = (struct decomposemore *)
2483 (*more = memset (fs_get (sizeof (struct decomposemore)),0,
2484 sizeof (struct decomposemore)));
2485 m->type = MORESINGLE;
2486 m->data.single = ucs4_smpmusic2decomptab[c][1];
2488 /* SMP mathematical forms */
2489 else if (c < UCS4_SMPMATHMIN) ret = c;
2490 else if (c <= UCS4_SMPMATHMAX) {
2491 if (!(ret = ucs4_smpmathdecomptab[c - UCS4_SMPMATHMIN])) ret = c;
2493 /* CJK compatibility ideographs in SIP */
2494 else if (!(ret = ((c >= UCS4_SIPMIN) && (c <= UCS4_SIPMAX)) ?
2495 ucs4_sipdecomptab[c - UCS4_SIPMIN] : c)) ret = c;
2497 return ret;
2500 /* Return recursive decomposition of a UCS-4 character
2501 * Accepts: character or U8G_ERROR to return next from "more"
2502 * pointer to returned more
2503 * Returns: [next] decomposed value, more set if still more decomposition
2504 */
2506 unsigned long ucs4_decompose_recursive (unsigned long c,void **more)
2508 unsigned long c1;
2509 void *m,*mn;
2510 RECURSIVEMORE *mr;
2511 if (c & U8G_ERROR) { /* want to chase more? */
2512 mn = NIL;
2513 if (mr = (RECURSIVEMORE *) *more) switch (mr->more->type) {
2514 case MORESINGLE: /* decompose single value */
2515 c = ucs4_decompose_recursive (mr->more->data.single,&mn);
2516 *more = mr->next; /* done with this more, remove it */
2517 fs_give ((void **) &mr->more);
2518 fs_give ((void **) &mr);
2519 break;
2520 case MOREMULTIPLE: /* decompose current value in multiple */
2521 c = ucs4_decompose_recursive (*mr->more->data.multiple.next++,&mn);
2522 /* if done with this multiple decomposition */
2523 if (!--mr->more->data.multiple.count) {
2524 *more = mr->next; /* done with this more, remove it */
2525 fs_give ((void **) &mr->more);
2526 fs_give ((void **) &mr);
2528 break;
2529 default: /* uh-oh */
2530 fatal ("invalid more block argument to ucs4_decompose_recursive!");
2532 else fatal ("no more block provided to ucs4_decompose_recursive!");
2533 if (mr = mn) { /* did this value recurse on us? */
2534 mr->next = *more; /* yes, insert new more at head */
2535 *more = mr;
2538 else { /* start decomposition */
2539 *more = NIL; /* initially set no more */
2540 mr = NIL;
2541 do { /* repeatedly decompose this codepoint */
2542 c = ucs4_decompose (c1 = c,&m);
2543 if (m) { /* multi-byte decomposition */
2544 if (c1 == c) fatal ("endless multiple decomposition!");
2545 /* create a block to stash this more */
2546 mr = memset (fs_get (sizeof (RECURSIVEMORE)),0,sizeof (RECURSIVEMORE));
2547 mr->more = m; /* note the expansion */
2548 mr->next = *more; /* old list is the tail */
2549 *more = mr; /* and this is the new head */
2551 } while (c1 != c); /* until nothing more to decompose */
2553 return c;

UW-IMAP'd extensions by yuuji