imapext-2007
diff src/c-client/utf8.c @ 0:ada5e610ab86
imap-2007e
author | yuuji@gentei.org |
---|---|
date | Mon, 14 Sep 2009 15:17:45 +0900 |
parents | |
children |
line diff
1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/src/c-client/utf8.c Mon Sep 14 15:17:45 2009 +0900 1.3 @@ -0,0 +1,2554 @@ 1.4 +/* ======================================================================== 1.5 + * Copyright 1988-2008 University of Washington 1.6 + * 1.7 + * Licensed under the Apache License, Version 2.0 (the "License"); 1.8 + * you may not use this file except in compliance with the License. 1.9 + * You may obtain a copy of the License at 1.10 + * 1.11 + * http://www.apache.org/licenses/LICENSE-2.0 1.12 + * 1.13 + * 1.14 + * ======================================================================== 1.15 + */ 1.16 + 1.17 +/* 1.18 + * Program: UTF-8 routines 1.19 + * 1.20 + * Author: Mark Crispin 1.21 + * Networks and Distributed Computing 1.22 + * Computing & Communications 1.23 + * University of Washington 1.24 + * Administration Building, AG-44 1.25 + * Seattle, WA 98195 1.26 + * Internet: MRC@CAC.Washington.EDU 1.27 + * 1.28 + * Date: 11 June 1997 1.29 + * Last Edited: 17 January 2008 1.30 + */ 1.31 + 1.32 + 1.33 +#include <stdio.h> 1.34 +#include <ctype.h> 1.35 +#include "c-client.h" 1.36 + 1.37 +/* *** IMPORTANT *** 1.38 + * 1.39 + * There is a very important difference between "character set" and "charset", 1.40 + * and the comments in this file reflect these differences. A "character set" 1.41 + * (also known as "coded character set") is a mapping between codepoints and 1.42 + * characters. A "charset" is as defined in MIME, and incorporates one or more 1.43 + * coded character sets in a character encoding scheme. See RFC 2130 for more 1.44 + * details. 1.45 + */ 1.46 + 1.47 + 1.48 +/* Character set conversion tables */ 1.49 + 1.50 +#include "iso_8859.c" /* 8-bit single-byte coded graphic */ 1.51 +#include "koi8_r.c" /* Cyrillic - Russia */ 1.52 +#include "koi8_u.c" /* Cyrillic - Ukraine */ 1.53 +#include "tis_620.c" /* Thai */ 1.54 +#include "viscii.c" /* Vietnamese */ 1.55 +#include "windows.c" /* Windows */ 1.56 +#include "ibm.c" /* IBM */ 1.57 +#include "gb_2312.c" /* Chinese (PRC) - simplified */ 1.58 +#include "gb_12345.c" /* Chinese (PRC) - traditional */ 1.59 +#include "jis_0208.c" /* Japanese - basic */ 1.60 +#include "jis_0212.c" /* Japanese - supplementary */ 1.61 +#include "ksc_5601.c" /* Korean */ 1.62 +#include "big5.c" /* Taiwanese (ROC) - industrial standard */ 1.63 +#include "cns11643.c" /* Taiwanese (ROC) - national standard */ 1.64 + 1.65 + 1.66 +#include "widths.c" /* Unicode character widths */ 1.67 +#include "tmap.c" /* Unicode titlecase mapping */ 1.68 +#include "decomtab.c" /* Unicode decomposions */ 1.69 + 1.70 +/* EUC parameters */ 1.71 + 1.72 +#ifdef GBTOUNICODE /* PRC simplified Chinese */ 1.73 +static const struct utf8_eucparam gb_param = { 1.74 + BASE_GB2312_KU,BASE_GB2312_TEN,MAX_GB2312_KU,MAX_GB2312_TEN, 1.75 + (void *) gb2312tab}; 1.76 +#endif 1.77 + 1.78 + 1.79 +#ifdef GB12345TOUNICODE /* PRC traditional Chinese */ 1.80 +static const struct utf8_eucparam gbt_param = { 1.81 + BASE_GB12345_KU,BASE_GB12345_TEN,MAX_GB12345_KU,MAX_GB12345_TEN, 1.82 + (void *) gb12345tab}; 1.83 +#endif 1.84 + 1.85 + 1.86 +#ifdef BIG5TOUNICODE /* ROC traditional Chinese */ 1.87 +static const struct utf8_eucparam big5_param[] = { 1.88 + {BASE_BIG5_KU,BASE_BIG5_TEN_0,MAX_BIG5_KU,MAX_BIG5_TEN_0,(void *) big5tab}, 1.89 + {BASE_BIG5_KU,BASE_BIG5_TEN_1,MAX_BIG5_KU,MAX_BIG5_TEN_1,NIL} 1.90 +}; 1.91 +#endif 1.92 + 1.93 + 1.94 +#ifdef JISTOUNICODE /* Japanese */ 1.95 +static const struct utf8_eucparam jis_param[] = { 1.96 + {BASE_JIS0208_KU,BASE_JIS0208_TEN,MAX_JIS0208_KU,MAX_JIS0208_TEN, 1.97 + (void *) jis0208tab}, 1.98 + {MIN_KANA_8,0,MAX_KANA_8,0,(void *) KANA_8}, 1.99 +#ifdef JIS0212TOUNICODE /* Japanese extended */ 1.100 + {BASE_JIS0212_KU,BASE_JIS0212_TEN,MAX_JIS0212_KU,MAX_JIS0212_TEN, 1.101 + (void *) jis0212tab} 1.102 +#else 1.103 + {0,0,0,0,NIL} 1.104 +#endif 1.105 +}; 1.106 +#endif 1.107 + 1.108 + 1.109 +#ifdef KSCTOUNICODE /* Korean */ 1.110 +static const struct utf8_eucparam ksc_param = { 1.111 + BASE_KSC5601_KU,BASE_KSC5601_TEN,MAX_KSC5601_KU,MAX_KSC5601_TEN, 1.112 + (void *) ksc5601tab}; 1.113 +#endif 1.114 + 1.115 +/* List of supported charsets */ 1.116 + 1.117 +static const CHARSET utf8_csvalid[] = { 1.118 + {"US-ASCII",CT_ASCII,CF_PRIMARY | CF_DISPLAY | CF_POSTING, 1.119 + NIL,NIL,NIL}, 1.120 + {"UTF-8",CT_UTF8,CF_PRIMARY | CF_DISPLAY | CF_POSTING, 1.121 + NIL,SC_UNICODE,NIL}, 1.122 + {"UTF-7",CT_UTF7,CF_PRIMARY | CF_POSTING | CF_UNSUPRT, 1.123 + NIL,SC_UNICODE,"UTF-8"}, 1.124 + {"ISO-8859-1",CT_1BYTE0,CF_PRIMARY | CF_DISPLAY | CF_POSTING, 1.125 + NIL,SC_LATIN_1,NIL}, 1.126 + {"ISO-8859-2",CT_1BYTE,CF_PRIMARY | CF_DISPLAY | CF_POSTING, 1.127 + (void *) iso8859_2tab,SC_LATIN_2,NIL}, 1.128 + {"ISO-8859-3",CT_1BYTE,CF_PRIMARY | CF_DISPLAY | CF_POSTING, 1.129 + (void *) iso8859_3tab,SC_LATIN_3,NIL}, 1.130 + {"ISO-8859-4",CT_1BYTE,CF_PRIMARY | CF_DISPLAY | CF_POSTING, 1.131 + (void *) iso8859_4tab,SC_LATIN_4,NIL}, 1.132 + {"ISO-8859-5",CT_1BYTE,CF_PRIMARY | CF_DISPLAY | CF_POSTING, 1.133 + (void *) iso8859_5tab,SC_CYRILLIC,"KOI8-R"}, 1.134 + {"ISO-8859-6",CT_1BYTE,CF_PRIMARY | CF_DISPLAY | CF_POSTING, 1.135 + (void *) iso8859_6tab,SC_ARABIC,NIL}, 1.136 + {"ISO-8859-7",CT_1BYTE,CF_PRIMARY | CF_DISPLAY | CF_POSTING, 1.137 + (void *) iso8859_7tab,SC_GREEK,NIL}, 1.138 + {"ISO-8859-8",CT_1BYTE,CF_PRIMARY | CF_DISPLAY | CF_POSTING, 1.139 + (void *) iso8859_8tab,SC_HEBREW,NIL}, 1.140 + {"ISO-8859-9",CT_1BYTE,CF_PRIMARY | CF_DISPLAY | CF_POSTING, 1.141 + (void *) iso8859_9tab,SC_LATIN_5,NIL}, 1.142 + {"ISO-8859-10",CT_1BYTE,CF_PRIMARY | CF_DISPLAY | CF_POSTING, 1.143 + (void *) iso8859_10tab,SC_LATIN_6,NIL}, 1.144 + {"ISO-8859-11",CT_1BYTE,CF_PRIMARY | CF_DISPLAY | CF_POSTING, 1.145 + (void *) iso8859_11tab,SC_THAI,NIL}, 1.146 +#if 0 /* ISO 8859-12 reserved for ISCII(?) */ 1.147 + {"ISO-8859-12",CT_1BYTE,CF_PRIMARY | CF_DISPLAY | CF_POSTING, 1.148 + (void *) iso8859_12tab,NIL,NIL}, 1.149 +#endif 1.150 + {"ISO-8859-13",CT_1BYTE,CF_PRIMARY | CF_DISPLAY | CF_POSTING, 1.151 + (void *) iso8859_13tab,SC_LATIN_7,NIL}, 1.152 + {"ISO-8859-14",CT_1BYTE,CF_PRIMARY | CF_DISPLAY | CF_POSTING, 1.153 + (void *) iso8859_14tab,SC_LATIN_8,NIL}, 1.154 + {"ISO-8859-15",CT_1BYTE,CF_PRIMARY | CF_DISPLAY | CF_POSTING, 1.155 + (void *) iso8859_15tab,SC_LATIN_9,NIL}, 1.156 + {"ISO-8859-16",CT_1BYTE,CF_PRIMARY | CF_DISPLAY | CF_POSTING, 1.157 + (void *) iso8859_16tab,SC_LATIN_10,NIL}, 1.158 + {"KOI8-R",CT_1BYTE,CF_PRIMARY | CF_DISPLAY | CF_POSTING, 1.159 + (void *) koi8rtab,SC_CYRILLIC,NIL}, 1.160 + {"KOI8-U",CT_1BYTE,CF_PRIMARY | CF_DISPLAY | CF_POSTING, 1.161 + (void *) koi8utab,SC_CYRILLIC | SC_UKRANIAN,NIL}, 1.162 + {"KOI8-RU",CT_1BYTE,CF_DISPLAY, 1.163 + (void *) koi8utab,SC_CYRILLIC | SC_UKRANIAN,"KOI8-U"}, 1.164 + {"TIS-620",CT_1BYTE,CF_PRIMARY | CF_DISPLAY | CF_POSTING, 1.165 + (void *) tis620tab,SC_THAI,"ISO-8859-11"}, 1.166 + {"VISCII",CT_1BYTE8,CF_PRIMARY | CF_DISPLAY | CF_POSTING, 1.167 + (void *) visciitab,SC_VIETNAMESE,NIL}, 1.168 + 1.169 +#ifdef GBTOUNICODE 1.170 + {"GBK",CT_DBYTE,CF_PRIMARY | CF_DISPLAY | CF_POSTING, 1.171 + (void *) &gb_param,SC_CHINESE_SIMPLIFIED,NIL}, 1.172 + {"GB2312",CT_DBYTE,CF_PRIMARY | CF_DISPLAY | CF_POSTING, 1.173 + (void *) &gb_param,SC_CHINESE_SIMPLIFIED,"GBK"}, 1.174 + {"CN-GB",CT_DBYTE,CF_DISPLAY, 1.175 + (void *) &gb_param,SC_CHINESE_SIMPLIFIED,"GBK"}, 1.176 +#ifdef CNS1TOUNICODE 1.177 + {"ISO-2022-CN",CT_2022,CF_PRIMARY | CF_UNSUPRT, 1.178 + NIL,SC_CHINESE_SIMPLIFIED | SC_CHINESE_TRADITIONAL, 1.179 + NIL}, 1.180 +#endif 1.181 +#endif 1.182 +#ifdef GB12345TOUNICODE 1.183 + {"CN-GB-12345",CT_DBYTE,CF_PRIMARY | CF_DISPLAY, 1.184 + (void *) &gbt_param,SC_CHINESE_TRADITIONAL,"BIG5"}, 1.185 +#endif 1.186 +#ifdef BIG5TOUNICODE 1.187 + {"BIG5",CT_DBYTE2,CF_PRIMARY | CF_DISPLAY | CF_POSTING, 1.188 + (void *) big5_param,SC_CHINESE_TRADITIONAL,NIL}, 1.189 + {"CN-BIG5",CT_DBYTE2,CF_DISPLAY, 1.190 + (void *) big5_param,SC_CHINESE_TRADITIONAL,"BIG5"}, 1.191 + {"BIG-5",CT_DBYTE2,CF_DISPLAY, 1.192 + (void *) big5_param,SC_CHINESE_TRADITIONAL,"BIG5"}, 1.193 +#endif 1.194 +#ifdef JISTOUNICODE 1.195 + {"ISO-2022-JP",CT_2022,CF_PRIMARY | CF_DISPLAY | CF_POSTING, 1.196 + NIL,SC_JAPANESE,NIL}, 1.197 + {"EUC-JP",CT_EUC,CF_PRIMARY | CF_DISPLAY, 1.198 + (void *) jis_param,SC_JAPANESE,"ISO-2022-JP"}, 1.199 + {"SHIFT_JIS",CT_SJIS,CF_PRIMARY | CF_DISPLAY, 1.200 + NIL,SC_JAPANESE,"ISO-2022-JP"}, 1.201 + {"SHIFT-JIS",CT_SJIS,CF_PRIMARY | CF_DISPLAY, 1.202 + NIL,SC_JAPANESE,"ISO-2022-JP"}, 1.203 +#ifdef JIS0212TOUNICODE 1.204 + {"ISO-2022-JP-1",CT_2022,CF_UNSUPRT, 1.205 + NIL,SC_JAPANESE,"ISO-2022-JP"}, 1.206 +#ifdef GBTOUNICODE 1.207 +#ifdef KSCTOUNICODE 1.208 + {"ISO-2022-JP-2",CT_2022,CF_UNSUPRT, 1.209 + NIL, 1.210 + SC_LATIN_1 | SC_LATIN_2 | SC_LATIN_3 | SC_LATIN_4 | SC_LATIN_5 | 1.211 + SC_LATIN_6 | SC_LATIN_7 | SC_LATIN_8 | SC_LATIN_9 | SC_LATIN_10 | 1.212 + SC_ARABIC | SC_CYRILLIC | SC_GREEK | SC_HEBREW | SC_THAI | 1.213 + SC_VIETNAMESE | SC_CHINESE_SIMPLIFIED | SC_JAPANESE | SC_KOREAN 1.214 +#ifdef CNS1TOUNICODE 1.215 + | SC_CHINESE_TRADITIONAL 1.216 +#endif 1.217 + ,"UTF-8"}, 1.218 +#endif 1.219 +#endif 1.220 +#endif 1.221 +#endif 1.222 + 1.223 +#ifdef KSCTOUNICODE 1.224 + {"ISO-2022-KR",CT_2022,CF_PRIMARY | CF_DISPLAY | CF_UNSUPRT, 1.225 + NIL,SC_KOREAN,"EUC-KR"}, 1.226 + {"EUC-KR",CT_DBYTE,CF_PRIMARY | CF_DISPLAY | CF_POSTING, 1.227 + (void *) &ksc_param,SC_KOREAN,NIL}, 1.228 + {"KSC5601",CT_DBYTE,CF_PRIMARY | CF_DISPLAY, 1.229 + (void *) &ksc_param,SC_KOREAN,"EUC-KR"}, 1.230 + {"KSC_5601",CT_DBYTE,CF_PRIMARY | CF_DISPLAY, 1.231 + (void *) &ksc_param,SC_KOREAN,"EUC-KR"}, 1.232 + {"KS_C_5601-1987",CT_DBYTE,CF_DISPLAY, 1.233 + (void *) &ksc_param,SC_KOREAN,"EUC-KR"}, 1.234 + {"KS_C_5601-1989",CT_DBYTE,CF_DISPLAY, 1.235 + (void *) &ksc_param,SC_KOREAN,"EUC-KR"}, 1.236 + {"KS_C_5601-1992",CT_DBYTE,CF_DISPLAY, 1.237 + (void *) &ksc_param,SC_KOREAN,"EUC-KR"}, 1.238 + {"KS_C_5601-1997",CT_DBYTE,CF_DISPLAY, 1.239 + (void *) &ksc_param,SC_KOREAN,"EUC-KR"}, 1.240 +#endif 1.241 + 1.242 + /* deep sigh */ 1.243 + {"WINDOWS-874",CT_1BYTE,CF_PRIMARY | CF_DISPLAY, 1.244 + (void *) windows_874tab,SC_THAI,"ISO-8859-11"}, 1.245 + {"CP874",CT_1BYTE,CF_DISPLAY, 1.246 + (void *) windows_874tab,SC_THAI,"ISO-8859-11"}, 1.247 +#ifdef GBTOUNICODE 1.248 + {"WINDOWS-936",CT_DBYTE,CF_PRIMARY | CF_DISPLAY, 1.249 + (void *) &gb_param,SC_CHINESE_SIMPLIFIED,"GBK"}, 1.250 + {"CP936",CT_DBYTE,CF_DISPLAY, 1.251 + (void *) &gb_param,SC_CHINESE_SIMPLIFIED,"GBK"}, 1.252 +#endif 1.253 +#ifdef KSCTOUNICODE 1.254 + {"WINDOWS-949",CT_DBYTE,CF_PRIMARY | CF_DISPLAY, 1.255 + (void *) &ksc_param,SC_KOREAN,"EUC-KR"}, 1.256 + {"CP949",CT_DBYTE,CF_DISPLAY, 1.257 + (void *) &ksc_param,SC_KOREAN,"EUC-KR"}, 1.258 + {"X-WINDOWS-949",CT_DBYTE,CF_PRIMARY | CF_DISPLAY, 1.259 + (void *) &ksc_param,SC_KOREAN,"EUC-KR"}, 1.260 +#endif 1.261 + {"WINDOWS-1250",CT_1BYTE,CF_PRIMARY | CF_DISPLAY, 1.262 + (void *) windows_1250tab,SC_LATIN_2,"ISO-8859-2"}, 1.263 + {"CP1250",CT_1BYTE,CF_DISPLAY, 1.264 + (void *) windows_1250tab,SC_LATIN_2,"ISO-8859-2"}, 1.265 + {"WINDOWS-1251",CT_1BYTE,CF_PRIMARY | CF_DISPLAY | CF_POSTING, 1.266 + (void *) windows_1251tab,SC_CYRILLIC,"KOI8-R"}, 1.267 + {"CP1251",CT_1BYTE,CF_DISPLAY, 1.268 + (void *) windows_1251tab,SC_CYRILLIC,"KOI8-R"}, 1.269 + {"WINDOWS-1252",CT_1BYTE,CF_PRIMARY | CF_DISPLAY, 1.270 + (void *) windows_1252tab,SC_LATIN_1,"ISO-8859-1"}, 1.271 + {"CP1252",CT_1BYTE,CF_DISPLAY, 1.272 + (void *) windows_1252tab,SC_LATIN_1,"ISO-8859-1"}, 1.273 + {"WINDOWS-1253",CT_1BYTE,CF_PRIMARY | CF_DISPLAY, 1.274 + (void *) windows_1253tab,SC_GREEK,"ISO-8859-7"}, 1.275 + {"CP1253",CT_1BYTE,CF_DISPLAY, 1.276 + (void *) windows_1253tab,SC_GREEK,"ISO-8859-7"}, 1.277 + {"WINDOWS-1254",CT_1BYTE,CF_PRIMARY | CF_DISPLAY, 1.278 + (void *) windows_1254tab,SC_LATIN_5,"ISO-8859-9"}, 1.279 + {"CP1254",CT_1BYTE,CF_DISPLAY, 1.280 + (void *) windows_1254tab,SC_LATIN_5,"ISO-8859-9"}, 1.281 + {"WINDOWS-1255",CT_1BYTE,CF_PRIMARY | CF_DISPLAY, 1.282 + (void *) windows_1255tab,SC_HEBREW,"ISO-8859-8"}, 1.283 + {"CP1255",CT_1BYTE,CF_DISPLAY, 1.284 + (void *) windows_1255tab,SC_HEBREW,"ISO-8859-8"}, 1.285 + {"WINDOWS-1256",CT_1BYTE,CF_PRIMARY | CF_DISPLAY, 1.286 + (void *) windows_1256tab,SC_ARABIC,"ISO-8859-6"}, 1.287 + {"CP1256",CT_1BYTE,CF_DISPLAY, 1.288 + (void *) windows_1256tab,SC_ARABIC,"ISO-8859-6"}, 1.289 + {"WINDOWS-1257",CT_1BYTE,CF_PRIMARY | CF_DISPLAY, 1.290 + (void *) windows_1257tab,SC_LATIN_7,"ISO-8859-13"}, 1.291 + {"CP1257",CT_1BYTE,CF_DISPLAY, 1.292 + (void *) windows_1257tab,SC_LATIN_7,"ISO-8859-13"}, 1.293 + {"WINDOWS-1258",CT_1BYTE,CF_PRIMARY | CF_DISPLAY, 1.294 + (void *) windows_1258tab,SC_VIETNAMESE,"VISCII"}, 1.295 + {"CP1258",CT_1BYTE,CF_DISPLAY, 1.296 + (void *) windows_1258tab,SC_VIETNAMESE,"VISCII"}, 1.297 + 1.298 + /* deeper sigh */ 1.299 + {"IBM367",CT_ASCII,CF_PRIMARY | CF_DISPLAY, 1.300 + NIL,NIL,"US-ASCII"}, 1.301 + {"IBM437",CT_1BYTE,CF_PRIMARY | CF_DISPLAY, 1.302 + (void *) ibm_437tab,SC_LATIN_1,"ISO-8859-1"}, 1.303 + {"IBM737",CT_1BYTE,CF_PRIMARY | CF_DISPLAY, 1.304 + (void *) ibm_737tab,SC_GREEK,"ISO-8859-7"}, 1.305 + {"IBM775",CT_1BYTE,CF_PRIMARY | CF_DISPLAY, 1.306 + (void *) ibm_775tab,SC_LATIN_7,"ISO-8859-13"}, 1.307 + {"IBM850",CT_1BYTE,CF_PRIMARY | CF_DISPLAY, 1.308 + (void *) ibm_850tab,SC_LATIN_1,"ISO-8859-1"}, 1.309 + {"IBM852",CT_1BYTE,CF_PRIMARY | CF_DISPLAY, 1.310 + (void *) ibm_852tab,SC_LATIN_2,"ISO-8859-2"}, 1.311 + {"IBM855",CT_1BYTE,CF_PRIMARY | CF_DISPLAY, 1.312 + (void *) ibm_855tab,SC_CYRILLIC,"ISO-8859-5"}, 1.313 + {"IBM857",CT_1BYTE,CF_PRIMARY | CF_DISPLAY, 1.314 + (void *) ibm_857tab,SC_LATIN_5,"ISO-8859-9"}, 1.315 + {"IBM860",CT_1BYTE,CF_PRIMARY | CF_DISPLAY, 1.316 + (void *) ibm_860tab,SC_LATIN_1,"ISO-8859-1"}, 1.317 + {"IBM861",CT_1BYTE,CF_PRIMARY | CF_DISPLAY, 1.318 + (void *) ibm_861tab,SC_LATIN_6,"ISO-8859-10"}, 1.319 + {"IBM862",CT_1BYTE,CF_PRIMARY | CF_DISPLAY, 1.320 + (void *) ibm_862tab,SC_HEBREW,"ISO-8859-8"}, 1.321 + {"IBM863",CT_1BYTE,CF_PRIMARY | CF_DISPLAY, 1.322 + (void *) ibm_863tab,SC_LATIN_1,"ISO-8859-1"}, 1.323 + {"IBM864",CT_1BYTE,CF_PRIMARY | CF_DISPLAY, 1.324 + (void *) ibm_864tab,SC_ARABIC,"ISO-8859-6"}, 1.325 + {"IBM865",CT_1BYTE,CF_PRIMARY | CF_DISPLAY, 1.326 + (void *) ibm_865tab,SC_LATIN_6,"ISO-8859-10"}, 1.327 + {"IBM866",CT_1BYTE,CF_PRIMARY | CF_DISPLAY, 1.328 + (void *) ibm_866tab,SC_CYRILLIC,"KOI8-R"}, 1.329 + {"IBM869",CT_1BYTE,CF_PRIMARY | CF_DISPLAY, 1.330 + (void *) ibm_869tab,SC_GREEK,"ISO-8859-7"}, 1.331 + {"IBM874",CT_1BYTE,CF_PRIMARY | CF_DISPLAY, 1.332 + (void *) ibm_874tab,SC_THAI,"ISO-8859-11"}, 1.333 + /* deepest sigh */ 1.334 + {"ANSI_X3.4-1968",CT_ASCII,CF_DISPLAY, 1.335 + NIL,NIL,"US-ASCII"}, 1.336 + {"UNICODE-1-1-UTF-7",CT_UTF7,CF_UNSUPRT, 1.337 + NIL,SC_UNICODE,"UTF-8"}, 1.338 + /* these should never appear in email */ 1.339 + {"UCS-2",CT_UCS2,CF_PRIMARY | CF_DISPLAY | CF_NOEMAIL, 1.340 + NIL,SC_UNICODE,"UTF-8"}, 1.341 + {"UCS-4",CT_UCS4,CF_PRIMARY | CF_DISPLAY | CF_NOEMAIL, 1.342 + NIL,SC_UNICODE,"UTF-8"}, 1.343 + {"UTF-16",CT_UTF16,CF_PRIMARY | CF_DISPLAY | CF_NOEMAIL, 1.344 + NIL,SC_UNICODE,"UTF-8"}, 1.345 + NIL 1.346 +}; 1.347 + 1.348 +/* Non-Unicode Script table */ 1.349 + 1.350 +static const SCRIPT utf8_scvalid[] = { 1.351 + {"Arabic",NIL,SC_ARABIC}, 1.352 + {"Chinese Simplified","China, Singapore",SC_CHINESE_SIMPLIFIED}, 1.353 + {"Chinese Traditional","Taiwan, Hong Kong, Macao",SC_CHINESE_TRADITIONAL}, 1.354 + {"Cyrillic",NIL,SC_CYRILLIC}, 1.355 + {"Cyrillic Ukranian",NIL,SC_UKRANIAN}, 1.356 + {"Greek",NIL,SC_GREEK}, 1.357 + {"Hebrew",NIL,SC_HEBREW}, 1.358 + {"Japanese",NIL,SC_JAPANESE}, 1.359 + {"Korean",NIL,SC_KOREAN}, 1.360 + {"Latin-1","Western Europe",SC_LATIN_1}, 1.361 + {"Latin-2","Eastern Europe",SC_LATIN_2}, 1.362 + {"Latin-3","Southern Europe",SC_LATIN_3}, 1.363 + {"Latin-4","Northern Europe",SC_LATIN_4}, 1.364 + {"Latin-5","Turkish",SC_LATIN_5}, 1.365 + {"Latin-6","Nordic",SC_LATIN_6}, 1.366 + {"Latin-7","Baltic",SC_LATIN_7}, 1.367 + {"Latin-8","Celtic",SC_LATIN_8}, 1.368 + {"Latin-9","Euro",SC_LATIN_9}, 1.369 + {"Latin-10","Balkan",SC_LATIN_10}, 1.370 + {"Thai",NIL,SC_THAI}, 1.371 + {"Vietnamese",NIL,SC_VIETNAMESE}, 1.372 + NIL 1.373 +}; 1.374 + 1.375 +/* Look up script name or return entire table 1.376 + * Accepts: script name or NIL 1.377 + * Returns: pointer to script table entry or NIL if unknown 1.378 + */ 1.379 + 1.380 +SCRIPT *utf8_script (char *script) 1.381 +{ 1.382 + unsigned long i; 1.383 + if (!script) return (SCRIPT *) &utf8_scvalid[0]; 1.384 + else if (*script && (strlen (script) < 128)) 1.385 + for (i = 0; utf8_scvalid[i].name; i++) 1.386 + if (!compare_cstring (script,utf8_scvalid[i].name)) 1.387 + return (SCRIPT *) &utf8_scvalid[i]; 1.388 + return NIL; /* failed */ 1.389 +} 1.390 + 1.391 + 1.392 +/* Look up charset name or return entire table 1.393 + * Accepts: charset name or NIL 1.394 + * Returns: charset table entry or NIL if unknown 1.395 + */ 1.396 + 1.397 +const CHARSET *utf8_charset (char *charset) 1.398 +{ 1.399 + unsigned long i; 1.400 + if (!charset) return (CHARSET *) &utf8_csvalid[0]; 1.401 + else if (*charset && (strlen (charset) < 128)) 1.402 + for (i = 0; utf8_csvalid[i].name; i++) 1.403 + if (!compare_cstring (charset,utf8_csvalid[i].name)) 1.404 + return (CHARSET *) &utf8_csvalid[i]; 1.405 + return NIL; /* failed */ 1.406 +} 1.407 + 1.408 +/* Validate charset and generate error message if invalid 1.409 + * Accepts: bad character set 1.410 + * Returns: NIL if good charset, else error message string 1.411 + */ 1.412 + 1.413 +#define BADCSS "[BADCHARSET (" 1.414 +#define BADCSE ")] Unknown charset: " 1.415 + 1.416 +char *utf8_badcharset (char *charset) 1.417 +{ 1.418 + char *msg = NIL; 1.419 + if (!utf8_charset (charset)) { 1.420 + char *s,*t; 1.421 + unsigned long i,j; 1.422 + /* calculate size of header, trailer, and bad 1.423 + * charset plus charset names */ 1.424 + for (i = 0, j = sizeof (BADCSS) + sizeof (BADCSE) + strlen (charset) - 2; 1.425 + utf8_csvalid[i].name; i++) 1.426 + j += strlen (utf8_csvalid[i].name) + 1; 1.427 + /* not built right */ 1.428 + if (!i) fatal ("No valid charsets!"); 1.429 + /* header */ 1.430 + for (s = msg = (char *) fs_get (j), t = BADCSS; *t; *s++ = *t++); 1.431 + /* each charset */ 1.432 + for (i = 0; utf8_csvalid[i].name; *s++ = ' ', i++) 1.433 + for (t = utf8_csvalid[i].name; *t; *s++ = *t++); 1.434 + /* back over last space, trailer */ 1.435 + for (t = BADCSE, --s; *t; *s++ = *t++); 1.436 + /* finally bogus charset */ 1.437 + for (t = charset; *t; *s++ = *t++); 1.438 + *s++ = '\0'; /* finally tie off string */ 1.439 + if (s != (msg + j)) fatal ("charset msg botch"); 1.440 + } 1.441 + return msg; 1.442 +} 1.443 + 1.444 +/* Convert charset labelled sized text to UTF-8 1.445 + * Accepts: source sized text 1.446 + * charset 1.447 + * pointer to returned sized text if non-NIL 1.448 + * flags 1.449 + * Returns: T if successful, NIL if failure 1.450 + */ 1.451 + 1.452 +long utf8_text (SIZEDTEXT *text,char *charset,SIZEDTEXT *ret,long flags) 1.453 +{ 1.454 + ucs4cn_t cv = (flags & U8T_CASECANON) ? ucs4_titlecase : NIL; 1.455 + ucs4de_t de = (flags & U8T_DECOMPOSE) ? ucs4_decompose_recursive : NIL; 1.456 + const CHARSET *cs = (charset && *charset) ? 1.457 + utf8_charset (charset) : utf8_infercharset (text); 1.458 + if (cs) return (text && ret) ? utf8_text_cs (text,cs,ret,cv,de) : LONGT; 1.459 + if (ret) { /* no conversion possible */ 1.460 + ret->data = text->data; /* so return source */ 1.461 + ret->size = text->size; 1.462 + } 1.463 + return NIL; /* failure */ 1.464 +} 1.465 + 1.466 + 1.467 +/* Operations used in converting data */ 1.468 + 1.469 +#define UTF8_COUNT_BMP(count,c,cv,de) { \ 1.470 + void *more = NIL; \ 1.471 + if (cv) c = (*cv) (c); \ 1.472 + if (de) c = (*de) (c,&more); \ 1.473 + do count += UTF8_SIZE_BMP(c); \ 1.474 + while (more && (c = (*de) (U8G_ERROR,&more)));\ 1.475 +} 1.476 + 1.477 +#define UTF8_WRITE_BMP(b,c,cv,de) { \ 1.478 + void *more = NIL; \ 1.479 + if (cv) c = (*cv) (c); \ 1.480 + if (de) c = (*de) (c,&more); \ 1.481 + do UTF8_PUT_BMP (b,c) \ 1.482 + while (more && (c = (*de) (U8G_ERROR,&more)));\ 1.483 +} 1.484 + 1.485 +#define UTF8_COUNT(count,c,cv,de) { \ 1.486 + void *more = NIL; \ 1.487 + if (cv) c = (*cv) (c); \ 1.488 + if (de) c = (*de) (c,&more); \ 1.489 + do count += utf8_size (c); \ 1.490 + while (more && (c = (*de) (U8G_ERROR,&more)));\ 1.491 +} 1.492 + 1.493 +#define UTF8_WRITE(b,c,cv,de) { \ 1.494 + void *more = NIL; \ 1.495 + if (cv) c = (*cv) (c); \ 1.496 + if (de) c = (*de) (c,&more); \ 1.497 + do b = utf8_put (b,c); \ 1.498 + while (more && (c = (*de) (U8G_ERROR,&more)));\ 1.499 +} 1.500 + 1.501 +/* Convert sized text to UTF-8 given CHARSET block 1.502 + * Accepts: source sized text 1.503 + * CHARSET block 1.504 + * pointer to returned sized text 1.505 + * canonicalization function 1.506 + * decomposition function 1.507 + * Returns: T if successful, NIL if failure 1.508 + */ 1.509 + 1.510 +long utf8_text_cs (SIZEDTEXT *text,const CHARSET *cs,SIZEDTEXT *ret, 1.511 + ucs4cn_t cv,ucs4de_t de) 1.512 +{ 1.513 + ret->data = text->data; /* default to source */ 1.514 + ret->size = text->size; 1.515 + switch (cs->type) { /* convert if type known */ 1.516 + case CT_ASCII: /* 7-bit ASCII no table */ 1.517 + case CT_UTF8: /* variable UTF-8 encoded Unicode no table */ 1.518 + if (cv || de) utf8_text_utf8 (text,ret,cv,de); 1.519 + break; 1.520 + case CT_1BYTE0: /* 1 byte no table */ 1.521 + utf8_text_1byte0 (text,ret,cv,de); 1.522 + break; 1.523 + case CT_1BYTE: /* 1 byte ASCII + table 0x80-0xff */ 1.524 + utf8_text_1byte (text,ret,cs->tab,cv,de); 1.525 + break; 1.526 + case CT_1BYTE8: /* 1 byte table 0x00 - 0xff */ 1.527 + utf8_text_1byte8 (text,ret,cs->tab,cv,de); 1.528 + break; 1.529 + case CT_EUC: /* 2 byte ASCII + utf8_eucparam base/CS2/CS3 */ 1.530 + utf8_text_euc (text,ret,cs->tab,cv,de); 1.531 + break; 1.532 + case CT_DBYTE: /* 2 byte ASCII + utf8_eucparam */ 1.533 + utf8_text_dbyte (text,ret,cs->tab,cv,de); 1.534 + break; 1.535 + case CT_DBYTE2: /* 2 byte ASCII + utf8_eucparam plane1/2 */ 1.536 + utf8_text_dbyte2 (text,ret,cs->tab,cv,de); 1.537 + break; 1.538 + case CT_UTF7: /* variable UTF-7 encoded Unicode no table */ 1.539 + utf8_text_utf7 (text,ret,cv,de); 1.540 + break; 1.541 + case CT_UCS2: /* 2 byte 16-bit Unicode no table */ 1.542 + utf8_text_ucs2 (text,ret,cv,de); 1.543 + break; 1.544 + case CT_UCS4: /* 4 byte 32-bit Unicode no table */ 1.545 + utf8_text_ucs4 (text,ret,cv,de); 1.546 + break; 1.547 + case CT_UTF16: /* variable UTF-16 encoded Unicode no table */ 1.548 + utf8_text_utf16 (text,ret,cv,de); 1.549 + break; 1.550 + case CT_2022: /* variable ISO-2022 encoded no table*/ 1.551 + utf8_text_2022 (text,ret,cv,de); 1.552 + break; 1.553 + case CT_SJIS: /* 2 byte Shift-JIS encoded JIS no table */ 1.554 + utf8_text_sjis (text,ret,cv,de); 1.555 + break; 1.556 + default: /* unknown character set type */ 1.557 + return NIL; 1.558 + } 1.559 + return LONGT; /* return success */ 1.560 +} 1.561 + 1.562 +/* Reverse mapping routines 1.563 + * 1.564 + * These routines only support character sets, not all possible charsets. In 1.565 + * particular, they do not support any Unicode encodings or ISO 2022. 1.566 + * 1.567 + * As a special dispensation, utf8_cstext() and utf8_cstocstext() support 1.568 + * support ISO-2022-JP if EUC-JP can be reverse mapped; and utf8_rmaptext() 1.569 + * will generated ISO-2022-JP using an EUC-JP rmap if flagged to do so. 1.570 + * 1.571 + * No attempt is made to map "equivalent" Unicode characters or Unicode 1.572 + * characters that have the same glyph; nor is there any attempt to handle 1.573 + * combining characters or otherwise do any stringprep. Maybe later. 1.574 + */ 1.575 + 1.576 + 1.577 +/* Convert UTF-8 sized text to charset 1.578 + * Accepts: source sized text 1.579 + * destination charset 1.580 + * pointer to returned sized text 1.581 + * substitute character if not in cs, else NIL to return failure 1.582 + * Returns: T if successful, NIL if failure 1.583 + */ 1.584 + 1.585 + 1.586 +long utf8_cstext (SIZEDTEXT *text,char *charset,SIZEDTEXT *ret, 1.587 + unsigned long errch) 1.588 +{ 1.589 + short iso2022jp = !compare_cstring (charset,"ISO-2022-JP"); 1.590 + unsigned short *rmap = utf8_rmap (iso2022jp ? "EUC-JP" : charset); 1.591 + return rmap ? utf8_rmaptext (text,rmap,ret,errch,iso2022jp) : NIL; 1.592 +} 1.593 + 1.594 +/* Convert charset labelled sized text to another charset 1.595 + * Accepts: source sized text 1.596 + * source charset 1.597 + * pointer to returned sized text 1.598 + * destination charset 1.599 + * substitute character if not in dest cs, else NIL to return failure 1.600 + * Returns: T if successful, NIL if failure 1.601 + * 1.602 + * This routine has the same restricts as utf8_cstext(). 1.603 + */ 1.604 + 1.605 +long utf8_cstocstext (SIZEDTEXT *src,char *sc,SIZEDTEXT *dst,char *dc, 1.606 + unsigned long errch) 1.607 +{ 1.608 + SIZEDTEXT utf8; 1.609 + const CHARSET *scs,*dcs; 1.610 + unsigned short *rmap; 1.611 + long ret = NIL; 1.612 + long iso2022jp; 1.613 + /* lookup charsets and reverse map */ 1.614 + if ((dc && (dcs = utf8_charset (dc))) && 1.615 + (rmap = (iso2022jp = ((dcs->type == CT_2022) && 1.616 + !compare_cstring (dcs->name,"ISO-2022-JP"))) ? 1.617 + utf8_rmap ("EUC-JP") : utf8_rmap_cs (dcs)) && 1.618 + (scs = (sc && *sc) ? utf8_charset (sc) : utf8_infercharset (src))) { 1.619 + /* init temporary buffer */ 1.620 + memset (&utf8,NIL,sizeof (SIZEDTEXT)); 1.621 + /* source cs equivalent to dest cs? */ 1.622 + if ((scs->type == dcs->type) && (scs->tab == dcs->tab)) { 1.623 + dst->data = src->data; /* yes, just copy pointers */ 1.624 + dst->size = src->size; 1.625 + ret = LONGT; 1.626 + } 1.627 + /* otherwise do the conversion */ 1.628 + else ret = (utf8_text_cs (src,scs,&utf8,NIL,NIL) && 1.629 + utf8_rmaptext (&utf8,rmap,dst,errch,iso2022jp)); 1.630 + /* flush temporary buffer */ 1.631 + if (utf8.data && (utf8.data != src->data) && (utf8.data != dst->data)) 1.632 + fs_give ((void **) &utf8.data); 1.633 + } 1.634 + return ret; 1.635 +} 1.636 + 1.637 +/* Cached rmap */ 1.638 + 1.639 +static const CHARSET *currmapcs = NIL; 1.640 +static unsigned short *currmap = NIL; 1.641 + 1.642 + 1.643 +/* Cache and return map for UTF-8 -> character set 1.644 + * Accepts: character set name 1.645 + * Returns: cached map if character set found, else NIL 1.646 + */ 1.647 + 1.648 +unsigned short *utf8_rmap (char *charset) 1.649 +{ 1.650 + return (currmapcs && !compare_cstring (charset,currmapcs->name)) ? currmap : 1.651 + utf8_rmap_cs (utf8_charset (charset)); 1.652 +} 1.653 + 1.654 + 1.655 +/* Cache and return map for UTF-8 -> character set given CHARSET block 1.656 + * Accepts: CHARSET block 1.657 + * Returns: cached map if character set found, else NIL 1.658 + */ 1.659 + 1.660 +unsigned short *utf8_rmap_cs (const CHARSET *cs) 1.661 +{ 1.662 + unsigned short *ret = NIL; 1.663 + if (!cs); /* have charset? */ 1.664 + else if (cs == currmapcs) ret = currmap; 1.665 + else if (ret = utf8_rmap_gen (cs,currmap)) { 1.666 + currmapcs = cs; 1.667 + currmap = ret; 1.668 + } 1.669 + return ret; 1.670 +} 1.671 + 1.672 +/* Return map for UTF-8 -> character set given CHARSET block 1.673 + * Accepts: CHARSET block 1.674 + * old map to recycle 1.675 + * Returns: map if character set found, else NIL 1.676 + */ 1.677 + 1.678 +unsigned short *utf8_rmap_gen (const CHARSET *cs,unsigned short *oldmap) 1.679 +{ 1.680 + unsigned short u,*tab,*rmap; 1.681 + unsigned int i,m,ku,ten; 1.682 + struct utf8_eucparam *param,*p2; 1.683 + switch (cs->type) { /* is a character set? */ 1.684 + case CT_ASCII: /* 7-bit ASCII no table */ 1.685 + case CT_1BYTE0: /* 1 byte no table */ 1.686 + case CT_1BYTE: /* 1 byte ASCII + table 0x80-0xff */ 1.687 + case CT_1BYTE8: /* 1 byte table 0x00 - 0xff */ 1.688 + case CT_EUC: /* 2 byte ASCII + utf8_eucparam base/CS2/CS3 */ 1.689 + case CT_DBYTE: /* 2 byte ASCII + utf8_eucparam */ 1.690 + case CT_DBYTE2: /* 2 byte ASCII + utf8_eucparam plane1/2 */ 1.691 + case CT_SJIS: /* 2 byte Shift-JIS */ 1.692 + rmap = oldmap ? oldmap : /* recycle old map if supplied else make new */ 1.693 + (unsigned short *) fs_get (65536 * sizeof (unsigned short)); 1.694 + /* initialize table for ASCII */ 1.695 + for (i = 0; i < 128; i++) rmap[i] = (unsigned short) i; 1.696 + /* populate remainder of table with NOCHAR */ 1.697 +#define NOCHARBYTE (NOCHAR & 0xff) 1.698 +#if NOCHAR - ((NOCHARBYTE << 8) | NOCHARBYTE) 1.699 + while (i < 65536) rmap[i++] = NOCHAR; 1.700 +#else 1.701 + memset (rmap + 128,NOCHARBYTE,(65536 - 128) * sizeof (unsigned short)); 1.702 +#endif 1.703 + break; 1.704 + default: /* unsupported charset type */ 1.705 + rmap = NIL; /* no map possible */ 1.706 + } 1.707 + if (rmap) { /* have a map? */ 1.708 + switch (cs->type) { /* additional reverse map actions */ 1.709 + case CT_1BYTE0: /* 1 byte no table */ 1.710 + for (i = 128; i < 256; i++) rmap[i] = (unsigned short) i; 1.711 + break; 1.712 + case CT_1BYTE: /* 1 byte ASCII + table 0x80-0xff */ 1.713 + for (tab = (unsigned short *) cs->tab,i = 128; i < 256; i++) 1.714 + if (tab[i & BITS7] != UBOGON) rmap[tab[i & BITS7]] = (unsigned short)i; 1.715 + break; 1.716 + case CT_1BYTE8: /* 1 byte table 0x00 - 0xff */ 1.717 + for (tab = (unsigned short *) cs->tab,i = 0; i < 256; i++) 1.718 + if (tab[i] != UBOGON) rmap[tab[i]] = (unsigned short) i; 1.719 + break; 1.720 + case CT_EUC: /* 2 byte ASCII + utf8_eucparam base/CS2/CS3 */ 1.721 + for (param = (struct utf8_eucparam *) cs->tab, 1.722 + tab = (unsigned short *) param->tab, ku = 0; 1.723 + ku < param->max_ku; ku++) 1.724 + for (ten = 0; ten < param->max_ten; ten++) 1.725 + if ((u = tab[(ku * param->max_ten) + ten]) != UBOGON) 1.726 + rmap[u] = ((ku + param->base_ku) << 8) + 1.727 + (ten + param->base_ten) + 0x8080; 1.728 + break; 1.729 + 1.730 + case CT_DBYTE: /* 2 byte ASCII + utf8_eucparam */ 1.731 + for (param = (struct utf8_eucparam *) cs->tab, 1.732 + tab = (unsigned short *) param->tab, ku = 0; 1.733 + ku < param->max_ku; ku++) 1.734 + for (ten = 0; ten < param->max_ten; ten++) 1.735 + if ((u = tab[(ku * param->max_ten) + ten]) != UBOGON) 1.736 + rmap[u] = ((ku + param->base_ku) << 8) + (ten + param->base_ten); 1.737 + break; 1.738 + case CT_DBYTE2: /* 2 byte ASCII + utf8_eucparam plane1/2 */ 1.739 + param = (struct utf8_eucparam *) cs->tab; 1.740 + p2 = param + 1; /* plane 2 parameters */ 1.741 + /* only ten parameters should differ */ 1.742 + if ((param->base_ku != p2->base_ku) || (param->max_ku != p2->max_ku)) 1.743 + fatal ("ku definition error for CT_DBYTE2 charset"); 1.744 + /* total codepoints in each ku */ 1.745 + m = param->max_ten + p2->max_ten; 1.746 + tab = (unsigned short *) param->tab; 1.747 + for (ku = 0; ku < param->max_ku; ku++) { 1.748 + for (ten = 0; ten < param->max_ten; ten++) 1.749 + if ((u = tab[(ku * m) + ten]) != UBOGON) 1.750 + rmap[u] = ((ku + param->base_ku) << 8) + (ten + param->base_ten); 1.751 + for (ten = 0; ten < p2->max_ten; ten++) 1.752 + if ((u = tab[(ku * m) + param->max_ten + ten]) != UBOGON) 1.753 + rmap[u] = ((ku + param->base_ku) << 8) + (ten + p2->base_ten); 1.754 + } 1.755 + break; 1.756 + case CT_SJIS: /* 2 byte Shift-JIS */ 1.757 + for (ku = 0; ku < MAX_JIS0208_KU; ku++) 1.758 + for (ten = 0; ten < MAX_JIS0208_TEN; ten++) 1.759 + if ((u = jis0208tab[ku][ten]) != UBOGON) { 1.760 + int sku = ku + BASE_JIS0208_KU; 1.761 + int sten = ten + BASE_JIS0208_TEN; 1.762 + rmap[u] = ((((sku + 1) >> 1) + ((sku < 95) ? 112 : 176)) << 8) + 1.763 + sten + ((sku % 2) ? ((sten > 95) ? 32 : 31) : 126); 1.764 + } 1.765 + /* JIS Roman */ 1.766 + rmap[UCS2_YEN] = JISROMAN_YEN; 1.767 + rmap[UCS2_OVERLINE] = JISROMAN_OVERLINE; 1.768 + /* JIS hankaku katakana */ 1.769 + for (u = 0; u < (MAX_KANA_8 - MIN_KANA_8); u++) 1.770 + rmap[UCS2_KATAKANA + u] = MIN_KANA_8 + u; 1.771 + break; 1.772 + } 1.773 + /* hack: map NBSP to SP if otherwise no map */ 1.774 + if (rmap[0x00a0] == NOCHAR) rmap[0x00a0] = rmap[0x0020]; 1.775 + } 1.776 + return rmap; /* return map */ 1.777 +} 1.778 + 1.779 +/* Convert UTF-8 sized text to charset using rmap 1.780 + * Accepts: source sized text 1.781 + * conversion rmap 1.782 + * pointer to returned sized text 1.783 + * substitute character if not in rmap, else NIL to return failure 1.784 + * ISO-2022-JP conversion flag 1.785 + * Returns T if successful, NIL if failure 1.786 + * 1.787 + * This routine doesn't try to convert to all possible charsets; in particular 1.788 + * it doesn't support other Unicode encodings or any ISO 2022 other than 1.789 + * ISO-2022-JP. 1.790 + */ 1.791 + 1.792 +long utf8_rmaptext (SIZEDTEXT *text,unsigned short *rmap,SIZEDTEXT *ret, 1.793 + unsigned long errch,long iso2022jp) 1.794 +{ 1.795 + unsigned long i,u,c; 1.796 + /* get size of buffer */ 1.797 + if (i = utf8_rmapsize (text,rmap,errch,iso2022jp)) { 1.798 + unsigned char *s = text->data; 1.799 + unsigned char *t = ret->data = (unsigned char *) fs_get (i); 1.800 + ret->size = i - 1; /* number of octets in destination buffer */ 1.801 + /* start non-zero ISO-2022-JP state at 1 */ 1.802 + if (iso2022jp) iso2022jp = 1; 1.803 + /* convert string, ignore BOM */ 1.804 + for (i = text->size; i;) if ((u = utf8_get (&s,&i)) != UCS2_BOM) { 1.805 + /* substitute error character for NOCHAR */ 1.806 + if ((u & U8GM_NONBMP) || ((c = rmap[u]) == NOCHAR)) c = errch; 1.807 + switch (iso2022jp) { /* depends upon ISO 2022 mode */ 1.808 + case 0: /* ISO 2022 not in effect */ 1.809 + /* two-byte character */ 1.810 + if (c > 0xff) *t++ = (unsigned char) (c >> 8); 1.811 + /* single-byte or low-byte of two-byte */ 1.812 + *t++ = (unsigned char) (c & 0xff); 1.813 + break; 1.814 + case 1: /* ISO 2022 Roman */ 1.815 + /* <ch> */ 1.816 + if (c < 0x80) *t++ = (unsigned char) c; 1.817 + else { /* JIS character */ 1.818 + *t++ = I2C_ESC; /* ESC $ B <hi> <lo> */ 1.819 + *t++ = I2C_MULTI; 1.820 + *t++ = I2CS_94x94_JIS_NEW; 1.821 + *t++ = (unsigned char) (c >> 8) & 0x7f; 1.822 + *t++ = (unsigned char) c & 0x7f; 1.823 + iso2022jp = 2; /* shift to ISO 2022 JIS */ 1.824 + } 1.825 + break; 1.826 + case 2: /* ISO 2022 JIS */ 1.827 + if (c > 0x7f) { /* <hi> <lo> */ 1.828 + *t++ = (unsigned char) (c >> 8) & 0x7f; 1.829 + *t++ = (unsigned char) c & 0x7f; 1.830 + } 1.831 + else { /* ASCII character */ 1.832 + *t++ = I2C_ESC; /* ESC ( J <ch> */ 1.833 + *t++ = I2C_G0_94; 1.834 + *t++ = I2CS_94_JIS_ROMAN; 1.835 + *t++ = (unsigned char) c; 1.836 + iso2022jp = 1; /* shift to ISO 2022 Roman */ 1.837 + } 1.838 + break; 1.839 + } 1.840 + } 1.841 + if (iso2022jp == 2) { /* ISO-2022-JP string must end in Roman */ 1.842 + *t++ = I2C_ESC; /* ESC ( J */ 1.843 + *t++ = I2C_G0_94; 1.844 + *t++ = I2CS_94_JIS_ROMAN; 1.845 + } 1.846 + *t++ = NIL; /* tie off returned data */ 1.847 + return LONGT; /* return success */ 1.848 + } 1.849 + ret->data = NIL; 1.850 + ret->size = 0; 1.851 + return NIL; /* failure */ 1.852 +} 1.853 + 1.854 +/* Calculate size of convertsion of UTF-8 sized text to charset using rmap 1.855 + * Accepts: source sized text 1.856 + * conversion rmap 1.857 + * pointer to returned sized text 1.858 + * substitute character if not in rmap, else NIL to return failure 1.859 + * ISO-2022-JP conversion flag 1.860 + * Returns size+1 if successful, NIL if failure 1.861 + * 1.862 + * This routine doesn't try to handle to all possible charsets; in particular 1.863 + * it doesn't support other Unicode encodings or any ISO 2022 other than 1.864 + * ISO-2022-JP. 1.865 + */ 1.866 + 1.867 +unsigned long utf8_rmapsize (SIZEDTEXT *text,unsigned short *rmap, 1.868 + unsigned long errch,long iso2022jp) 1.869 +{ 1.870 + unsigned long i,u,c; 1.871 + unsigned long ret = 1; /* terminating NUL */ 1.872 + unsigned char *s = text->data; 1.873 + if (iso2022jp) iso2022jp = 1; /* start non-zero ISO-2022-JP state at 1 */ 1.874 + for (i = text->size; i;) if ((u = utf8_get (&s,&i)) != UCS2_BOM) { 1.875 + if ((u & U8GM_NONBMP) || (((c = rmap[u]) == NOCHAR) && !(c = errch))) 1.876 + return NIL; /* not in BMP, or NOCHAR and no err char */ 1.877 + switch (iso2022jp) { /* depends upon ISO 2022 mode */ 1.878 + case 0: /* ISO 2022 not in effect */ 1.879 + ret += (c > 0xff) ? 2 : 1; 1.880 + break; 1.881 + case 1: /* ISO 2022 Roman */ 1.882 + if (c < 0x80) ret += 1; /* <ch> */ 1.883 + else { /* JIS character */ 1.884 + ret += 5; /* ESC $ B <hi> <lo> */ 1.885 + iso2022jp = 2; /* shift to ISO 2022 JIS */ 1.886 + } 1.887 + break; 1.888 + case 2: /* ISO 2022 JIS */ 1.889 + if (c > 0x7f) ret += 2; /* <hi> <lo> */ 1.890 + else { /* ASCII character */ 1.891 + ret += 4; /* ESC ( J <ch> */ 1.892 + iso2022jp = 1; /* shift to ISO 2022 Roman */ 1.893 + } 1.894 + break; 1.895 + } 1.896 + } 1.897 + if (iso2022jp == 2) { /* ISO-2022-JP string must end in Roman */ 1.898 + ret += 3; /* ESC ( J */ 1.899 + iso2022jp = 1; /* reset state to Roman */ 1.900 + } 1.901 + return ret; 1.902 +} 1.903 + 1.904 +/* Convert UCS-4 to charset using rmap 1.905 + * Accepts: source UCS-4 character(s) 1.906 + * numver of UCS-4 characters 1.907 + * conversion rmap 1.908 + * pointer to returned sized text 1.909 + * substitute character if not in rmap, else NIL to return failure 1.910 + * Returns T if successful, NIL if failure 1.911 + * 1.912 + * Currently only supports BMP characters, and does not support ISO-2022 1.913 + */ 1.914 + 1.915 +long ucs4_rmaptext (unsigned long *ucs4,unsigned long len,unsigned short *rmap, 1.916 + SIZEDTEXT *ret,unsigned long errch) 1.917 +{ 1.918 + long size = ucs4_rmaplen (ucs4,len,rmap,errch); 1.919 + return (size >= 0) ? /* build in newly-created buffer */ 1.920 + ucs4_rmapbuf (ret->data = (unsigned char *) fs_get ((ret->size = size) +1), 1.921 + ucs4,len,rmap,errch) : NIL; 1.922 +} 1.923 + 1.924 +/* Return size of UCS-4 string converted to other CS via rmap 1.925 + * Accepts: source UCS-4 character(s) 1.926 + * numver of UCS-4 characters 1.927 + * conversion rmap 1.928 + * substitute character if not in rmap, else NIL to return failure 1.929 + * Returns: length if success, negative if failure (no-convert) 1.930 + */ 1.931 + 1.932 +long ucs4_rmaplen (unsigned long *ucs4,unsigned long len,unsigned short *rmap, 1.933 + unsigned long errch) 1.934 +{ 1.935 + long ret; 1.936 + unsigned long i,u,c; 1.937 + /* count non-BOM characters */ 1.938 + for (ret = 0,i = 0; i < len; ++i) if ((u = ucs4[i]) != UCS2_BOM) { 1.939 + if ((u & U8GM_NONBMP) || (((c = rmap[u]) == NOCHAR) && !(c = errch))) 1.940 + return -1; /* not in BMP, or NOCHAR and no err char? */ 1.941 + ret += (c > 0xff) ? 2 : 1; 1.942 + } 1.943 + return ret; 1.944 +} 1.945 + 1.946 + 1.947 +/* Stuff buffer with UCS-4 string converted to other CS via rmap 1.948 + * Accepts: destination buffer 1.949 + * source UCS-4 character(s) 1.950 + * number of UCS-4 characters 1.951 + * conversion rmap 1.952 + * substitute character if not in rmap, else NIL to return failure 1.953 + * Returns: T, always 1.954 + */ 1.955 + 1.956 +long ucs4_rmapbuf (unsigned char *t,unsigned long *ucs4,unsigned long len, 1.957 + unsigned short *rmap,unsigned long errch) 1.958 +{ 1.959 + unsigned long i,u,c; 1.960 + /* convert non-BOM characters */ 1.961 + for (i = 0; i < len; ++i) if ((u = ucs4[i]) != UCS2_BOM) { 1.962 + /* substitute error character for NOCHAR */ 1.963 + if ((u & U8GM_NONBMP) || ((c = rmap[u]) == NOCHAR)) c = errch; 1.964 + /* two-byte character? */ 1.965 + if (c > 0xff) *t++ = (unsigned char) (c >> 8); 1.966 + /* single-byte or low-byte of two-byte */ 1.967 + *t++ = (unsigned char) (c & 0xff); 1.968 + } 1.969 + *t++ = NIL; /* tie off returned data */ 1.970 + return LONGT; 1.971 +} 1.972 + 1.973 +/* Return UCS-4 Unicode character from UTF-8 string 1.974 + * Accepts: pointer to string 1.975 + * remaining octets in string 1.976 + * Returns: UCS-4 character with pointer and count updated 1.977 + * or error code with pointer and count unchanged 1.978 + */ 1.979 + 1.980 +unsigned long utf8_get (unsigned char **s,unsigned long *i) 1.981 +{ 1.982 + unsigned char *t = *s; 1.983 + unsigned long j = *i; 1.984 + /* decode raw UTF-8 string */ 1.985 + unsigned long ret = utf8_get_raw (&t,&j); 1.986 + if (ret & U8G_ERROR); /* invalid raw UTF-8 decoding? */ 1.987 + /* no, is it surrogate? */ 1.988 + else if ((ret >= UTF16_SURR) && (ret <= UTF16_MAXSURR)) ret = U8G_SURROGA; 1.989 + /* or in non-Unicode ISO 10646 space? */ 1.990 + else if (ret > UCS4_MAXUNICODE) ret = U8G_NOTUNIC; 1.991 + else { 1.992 + *s = t; /* all is well, update pointer */ 1.993 + *i = j; /* and counter */ 1.994 + } 1.995 + return ret; /* return value */ 1.996 +} 1.997 + 1.998 +/* Return raw (including non-Unicode) UCS-4 character from UTF-8 string 1.999 + * Accepts: pointer to string 1.1000 + * remaining octets in string 1.1001 + * Returns: UCS-4 character with pointer and count updated 1.1002 + * or error code with pointer and count unchanged 1.1003 + */ 1.1004 + 1.1005 +unsigned long utf8_get_raw (unsigned char **s,unsigned long *i) 1.1006 +{ 1.1007 + unsigned char c,c1; 1.1008 + unsigned char *t = *s; 1.1009 + unsigned long j = *i; 1.1010 + unsigned long ret = U8G_NOTUTF8; 1.1011 + int more = 0; 1.1012 + do { /* make sure have source octets available */ 1.1013 + if (!j--) return more ? U8G_ENDSTRI : U8G_ENDSTRG; 1.1014 + /* UTF-8 continuation? */ 1.1015 + else if (((c = *t++) > 0x7f) && (c < 0xc0)) { 1.1016 + /* continuation when not in progress */ 1.1017 + if (!more) return U8G_BADCONT; 1.1018 + --more; /* found a continuation octet */ 1.1019 + ret <<= 6; /* shift current value by 6 bits */ 1.1020 + ret |= c & 0x3f; /* merge continuation octet */ 1.1021 + } 1.1022 + /* incomplete UTF-8 character */ 1.1023 + else if (more) return U8G_INCMPLT; 1.1024 + else { /* start of sequence */ 1.1025 + c1 = j ? *t : 0xbf; /* assume valid continuation if incomplete */ 1.1026 + if (c < 0x80) ret = c; /* U+0000 - U+007f */ 1.1027 + else if (c < 0xc2); /* c0 and c1 never valid */ 1.1028 + else if (c < 0xe0) { /* U+0080 - U+07ff */ 1.1029 + if (c &= 0x1f) more = 1; 1.1030 + } 1.1031 + else if (c < 0xf0) { /* U+0800 - U+ffff */ 1.1032 + if ((c &= 0x0f) || (c1 >= 0xa0)) more = 2; 1.1033 + } 1.1034 + else if (c < 0xf8) { /* U+10000 - U+10ffff (and 110000 - 1fffff) */ 1.1035 + if ((c &= 0x07) || (c1 >= 0x90)) more = 3; 1.1036 + } 1.1037 + else if (c < 0xfc) { /* ISO 10646 200000 - 3ffffff */ 1.1038 + if ((c &= 0x03) || (c1 >= 0x88)) more = 4; 1.1039 + } 1.1040 + else if (c < 0xfe) { /* ISO 10646 4000000 - 7fffffff */ 1.1041 + if ((c &= 0x01) || (c1 >= 0x84)) more = 5; 1.1042 + } 1.1043 + /* fe and ff never valid */ 1.1044 + if (more) { /* multi-octet, make sure more to come */ 1.1045 + if (!j) return U8G_ENDSTRI; 1.1046 + ret = c; /* continuation needed, save start bits */ 1.1047 + } 1.1048 + } 1.1049 + } while (more); 1.1050 + if (!(ret & U8G_ERROR)) { /* success return? */ 1.1051 + *s = t; /* yes, update pointer */ 1.1052 + *i = j; /* and counter */ 1.1053 + } 1.1054 + return ret; /* return value */ 1.1055 +} 1.1056 + 1.1057 +/* Return UCS-4 character from named charset string 1.1058 + * Accepts: charset 1.1059 + * pointer to string 1.1060 + * remaining octets in string 1.1061 + * Returns: UCS-4 character with pointer and count updated, negative if error 1.1062 + * 1.1063 + * Error codes are the same as utf8_get(). 1.1064 + */ 1.1065 + 1.1066 +unsigned long ucs4_cs_get (CHARSET *cs,unsigned char **s,unsigned long *i) 1.1067 +{ 1.1068 + unsigned char c,c1,ku,ten; 1.1069 + unsigned long ret,d; 1.1070 + unsigned char *t = *s; 1.1071 + unsigned long j = *i; 1.1072 + struct utf8_eucparam *p1,*p2,*p3; 1.1073 + if (j--) c = *t++; /* get first octet */ 1.1074 + else return U8G_ENDSTRG; /* empty string */ 1.1075 + switch (cs->type) { /* convert if type known */ 1.1076 + case CT_UTF8: /* variable UTF-8 encoded Unicode no table */ 1.1077 + return utf8_get (s,i); 1.1078 + case CT_ASCII: /* 7-bit ASCII no table */ 1.1079 + if (c >= 0x80) return U8G_NOTUTF8; 1.1080 + case CT_1BYTE0: /* 1 byte no table */ 1.1081 + ret = c; /* identity */ 1.1082 + break; 1.1083 + case CT_1BYTE: /* 1 byte ASCII + table 0x80-0xff */ 1.1084 + ret = (c > 0x80) ? ((unsigned short *) cs->tab)[c & BITS7] : c; 1.1085 + break; 1.1086 + case CT_1BYTE8: /* 1 byte table 0x00 - 0xff */ 1.1087 + ret = ((unsigned short *) cs->tab)[c]; 1.1088 + break; 1.1089 + 1.1090 + case CT_EUC: /* 2 byte ASCII + utf8_eucparam base/CS2/CS3 */ 1.1091 + if (c & BIT8) { 1.1092 + p1 = (struct utf8_eucparam *) cs->tab; 1.1093 + p2 = p1 + 1; 1.1094 + p3 = p1 + 2; 1.1095 + if (j--) c1 = *t++; /* get second octet */ 1.1096 + else return U8G_ENDSTRI; 1.1097 + if (!(c1 & BIT8)) return U8G_NOTUTF8; 1.1098 + switch (c) { /* check 8bit code set */ 1.1099 + case EUC_CS2: /* CS2 */ 1.1100 + if (p2->base_ku) { /* CS2 set up? */ 1.1101 + if (p2->base_ten) { /* yes, multibyte? */ 1.1102 + if (j--) c = *t++; /* get second octet */ 1.1103 + else return U8G_ENDSTRI; 1.1104 + if ((c & BIT8) && 1.1105 + ((ku = (c1 & BITS7) - p2->base_ku) < p2->max_ku) && 1.1106 + ((ten = (c & BITS7) - p2->base_ten) < p2->max_ten)) { 1.1107 + ret = ((unsigned short *) p2->tab)[(ku*p2->max_ten) + ten]; 1.1108 + break; 1.1109 + } 1.1110 + } 1.1111 + else if ((c1 >= p2->base_ku) && (c1 < p2->max_ku)) { 1.1112 + ret = c1 + ((unsigned long) p2->tab); 1.1113 + break; 1.1114 + } 1.1115 + } 1.1116 + return U8G_NOTUTF8; /* CS2 not set up or bogus */ 1.1117 + case EUC_CS3: /* CS3 */ 1.1118 + if (p3->base_ku) { /* CS3 set up? */ 1.1119 + if (p3->base_ten) { /* yes, multibyte? */ 1.1120 + if (j--) c = *t++; /* get second octet */ 1.1121 + else return U8G_ENDSTRI; 1.1122 + if ((c & BIT8) && 1.1123 + ((ku = (c1 & BITS7) - p3->base_ku) < p3->max_ku) && 1.1124 + ((ten = (c & BITS7) - p3->base_ten) < p3->max_ten)) { 1.1125 + ret = ((unsigned short *) p3->tab)[(ku*p3->max_ten) + ten]; 1.1126 + break; 1.1127 + } 1.1128 + } 1.1129 + else if ((c1 >= p3->base_ku) && (c1 < p3->max_ku)) { 1.1130 + ret = c1 + ((unsigned long) p3->tab); 1.1131 + break; 1.1132 + } 1.1133 + } 1.1134 + return U8G_NOTUTF8; /* CS3 not set up or bogus */ 1.1135 + default: 1.1136 + if (((ku = (c & BITS7) - p1->base_ku) >= p1->max_ku) || 1.1137 + ((ten = (c1 & BITS7) - p1->base_ten) >= p1->max_ten)) 1.1138 + return U8G_NOTUTF8; 1.1139 + ret = ((unsigned short *) p1->tab)[(ku*p1->max_ten) + ten]; 1.1140 + /* special hack for JIS X 0212: merge rows less than 10 */ 1.1141 + if ((ret == UBOGON) && ku && (ku < 10) && p3->tab && p3->base_ten) 1.1142 + ret = ((unsigned short *) p3->tab) 1.1143 + [((ku - (p3->base_ku - p1->base_ku))*p3->max_ten) + ten]; 1.1144 + break; 1.1145 + } 1.1146 + } 1.1147 + else ret = c; /* ASCII character */ 1.1148 + break; 1.1149 + 1.1150 + case CT_DBYTE: /* 2 byte ASCII + utf8_eucparam */ 1.1151 + if (c & BIT8) { /* double-byte character? */ 1.1152 + p1 = (struct utf8_eucparam *) cs->tab; 1.1153 + if (j--) c1 = *t++; /* get second octet */ 1.1154 + else return U8G_ENDSTRI; 1.1155 + if (((ku = c - p1->base_ku) < p1->max_ku) && 1.1156 + ((ten = c1 - p1->base_ten) < p1->max_ten)) 1.1157 + ret = ((unsigned short *) p1->tab)[(ku*p1->max_ten) + ten]; 1.1158 + else return U8G_NOTUTF8; 1.1159 + } 1.1160 + else ret = c; /* ASCII character */ 1.1161 + break; 1.1162 + case CT_DBYTE2: /* 2 byte ASCII + utf8_eucparam plane1/2 */ 1.1163 + if (c & BIT8) { /* double-byte character? */ 1.1164 + p1 = (struct utf8_eucparam *) cs->tab; 1.1165 + p2 = p1 + 1; 1.1166 + if (j--) c1 = *t++; /* get second octet */ 1.1167 + else return U8G_ENDSTRI; 1.1168 + if (c1 & BIT8) { /* high vs. low plane */ 1.1169 + if ((ku = c - p2->base_ku) < p2->max_ku && 1.1170 + ((ten = c1 - p2->base_ten) < p2->max_ten)) 1.1171 + ret = ((unsigned short *) p1->tab) 1.1172 + [(ku*(p1->max_ten + p2->max_ten)) + p1->max_ten + ten]; 1.1173 + else return U8G_NOTUTF8; 1.1174 + } 1.1175 + else if ((ku = c - p1->base_ku) < p1->max_ku && 1.1176 + ((ten = c1 - p1->base_ten) < p1->max_ten)) 1.1177 + ret = ((unsigned short *) p1->tab) 1.1178 + [(ku*(p1->max_ten + p2->max_ten)) + ten]; 1.1179 + else return U8G_NOTUTF8; 1.1180 + } 1.1181 + else ret = c; /* ASCII character */ 1.1182 + break; 1.1183 + case CT_SJIS: /* 2 byte Shift-JIS encoded JIS no table */ 1.1184 + /* compromise - do yen sign but not overline */ 1.1185 + if (!(c & BIT8)) ret = (c == JISROMAN_YEN) ? UCS2_YEN : c; 1.1186 + /* half-width katakana? */ 1.1187 + else if ((c >= MIN_KANA_8) && (c < MAX_KANA_8)) ret = c + KANA_8; 1.1188 + else { /* Shift-JIS */ 1.1189 + if (j--) c1 = *t++; /* get second octet */ 1.1190 + else return U8G_ENDSTRI; 1.1191 + SJISTOJIS (c,c1); 1.1192 + c = JISTOUNICODE (c,c1,ku,ten); 1.1193 + } 1.1194 + break; 1.1195 + 1.1196 + case CT_UCS2: /* 2 byte 16-bit Unicode no table */ 1.1197 + ret = c << 8; 1.1198 + if (j--) c = *t++; /* get second octet */ 1.1199 + else return U8G_ENDSTRI; /* empty string */ 1.1200 + ret |= c; 1.1201 + break; 1.1202 + case CT_UCS4: /* 4 byte 32-bit Unicode no table */ 1.1203 + if (c & 0x80) return U8G_NOTUTF8; 1.1204 + if (j < 3) return U8G_ENDSTRI; 1.1205 + j -= 3; /* count three octets */ 1.1206 + ret = c << 24; 1.1207 + ret |= (*t++) << 16; 1.1208 + ret |= (*t++) << 8; 1.1209 + ret |= (*t++); 1.1210 + break; 1.1211 + case CT_UTF16: /* variable UTF-16 encoded Unicode no table */ 1.1212 + ret = c << 8; 1.1213 + if (j--) c = *t++; /* get second octet */ 1.1214 + else return U8G_ENDSTRI; /* empty string */ 1.1215 + ret |= c; 1.1216 + /* surrogate? */ 1.1217 + if ((ret >= UTF16_SURR) && (ret <= UTF16_MAXSURR)) { 1.1218 + /* invalid first surrogate */ 1.1219 + if ((ret > UTF16_SURRHEND) || (j < 2)) return U8G_NOTUTF8; 1.1220 + j -= 2; /* count two octets */ 1.1221 + d = (*t++) << 8; /* first octet of second surrogate */ 1.1222 + d |= *t++; /* second octet of second surrogate */ 1.1223 + if ((d < UTF16_SURRL) || (d > UTF16_SURRLEND)) return U8G_NOTUTF8; 1.1224 + ret = UTF16_BASE + ((ret & UTF16_MASK) << UTF16_SHIFT) + 1.1225 + (d & UTF16_MASK); 1.1226 + } 1.1227 + break; 1.1228 + default: /* unknown/unsupported character set type */ 1.1229 + return U8G_NOTUTF8; 1.1230 + } 1.1231 + *s = t; /* update pointer and counter */ 1.1232 + *i = j; 1.1233 + return ret; 1.1234 +} 1.1235 + 1.1236 +/* Produce charset validity map for BMP 1.1237 + * Accepts: list of charsets to map 1.1238 + * Returns: validity map, indexed by BMP codepoint 1.1239 + * 1.1240 + * Bit 0x1 is the "not-CJK" character bit 1.1241 + */ 1.1242 + 1.1243 +unsigned long *utf8_csvalidmap (char *charsets[]) 1.1244 +{ 1.1245 + unsigned short u,*tab; 1.1246 + unsigned int m,ku,ten; 1.1247 + unsigned long i,csi,csb; 1.1248 + struct utf8_eucparam *param,*p2; 1.1249 + char *s; 1.1250 + const CHARSET *cs; 1.1251 + unsigned long *ret = (unsigned long *) 1.1252 + fs_get (i = 0x10000 * sizeof (unsigned long)); 1.1253 + memset (ret,0,i); /* zero the entire vector */ 1.1254 + /* mark all the non-CJK codepoints */ 1.1255 + /* U+0000 - U+2E7F non-CJK */ 1.1256 + for (i = 0; i < 0x2E7F; ++i) ret[i] = 0x1; 1.1257 + /* U+2E80 - U+2EFF CJK Radicals Supplement 1.1258 + * U+2F00 - U+2FDF Kangxi Radicals 1.1259 + * U+2FE0 - U+2FEF unassigned 1.1260 + * U+2FF0 - U+2FFF Ideographic Description Characters 1.1261 + * U+3000 - U+303F CJK Symbols and Punctuation 1.1262 + * U+3040 - U+309F Hiragana 1.1263 + * U+30A0 - U+30FF Katakana 1.1264 + * U+3100 - U+312F BoPoMoFo 1.1265 + * U+3130 - U+318F Hangul Compatibility Jamo 1.1266 + * U+3190 - U+319F Kanbun 1.1267 + * U+31A0 - U+31BF BoPoMoFo Extended 1.1268 + * U+31C0 - U+31EF CJK Strokes 1.1269 + * U+31F0 - U+31FF Katakana Phonetic Extensions 1.1270 + * U+3200 - U+32FF Enclosed CJK Letters and Months 1.1271 + * U+3300 - U+33FF CJK Compatibility 1.1272 + * U+3400 - U+4DBF CJK Unified Ideographs Extension A 1.1273 + * U+4DC0 - U+4DFF Yijing Hexagram Symbols 1.1274 + * U+4E00 - U+9FFF CJK Unified Ideographs 1.1275 + * U+A000 - U+A48F Yi Syllables 1.1276 + * U+A490 - U+A4CF Yi Radicals 1.1277 + * U+A700 - U+A71F Modifier Tone Letters 1.1278 + */ 1.1279 + for (i = 0xa720; i < 0xabff; ++i) ret[i] = 0x1; 1.1280 + /* U+AC00 - U+D7FF Hangul Syllables */ 1.1281 + for (i = 0xd800; i < 0xf8ff; ++i) ret[i] = 0x1; 1.1282 + /* U+F900 - U+FAFF CJK Compatibility Ideographs */ 1.1283 + for (i = 0xfb00; i < 0xfe2f; ++i) ret[i] = 0x1; 1.1284 + /* U+FE30 - U+FE4F CJK Compatibility Forms 1.1285 + * U+FE50 - U+FE6F Small Form Variants (for CNS 11643) 1.1286 + */ 1.1287 + for (i = 0xfe70; i < 0xfeff; ++i) ret[i] = 0x1; 1.1288 + /* U+FF00 - U+FFEF CJK Compatibility Ideographs */ 1.1289 + for (i = 0xfff0; i < 0x10000; ++i) ret[i] = 0x1; 1.1290 + 1.1291 + /* for each supplied charset */ 1.1292 + for (csi = 1; ret && charsets && (s = charsets[csi - 1]); ++csi) { 1.1293 + /* substitute EUC-JP for ISO-2022-JP */ 1.1294 + if (!compare_cstring (s,"ISO-2022-JP")) s = "EUC-JP"; 1.1295 + /* look up charset */ 1.1296 + if (cs = utf8_charset (s)) { 1.1297 + csb = 1 << csi; /* charset bit */ 1.1298 + switch (cs->type) { 1.1299 + case CT_ASCII: /* 7-bit ASCII no table */ 1.1300 + case CT_1BYTE0: /* 1 byte no table */ 1.1301 + case CT_1BYTE: /* 1 byte ASCII + table 0x80-0xff */ 1.1302 + case CT_1BYTE8: /* 1 byte table 0x00 - 0xff */ 1.1303 + case CT_EUC: /* 2 byte ASCII + utf8_eucparam base/CS2/CS3 */ 1.1304 + case CT_DBYTE: /* 2 byte ASCII + utf8_eucparam */ 1.1305 + case CT_DBYTE2: /* 2 byte ASCII + utf8_eucparam plane1/2 */ 1.1306 + case CT_SJIS: /* 2 byte Shift-JIS */ 1.1307 + /* supported charset type, all ASCII is OK */ 1.1308 + for (i = 0; i < 128; ++i) ret[i] |= csb; 1.1309 + break; 1.1310 + default: /* unsupported charset type */ 1.1311 + fs_give ((void **) &ret); 1.1312 + break; 1.1313 + } 1.1314 + /* now do additional operations */ 1.1315 + if (ret) switch (cs->type) { 1.1316 + case CT_1BYTE0: /* 1 byte no table */ 1.1317 + for (i = 128; i < 256; i++) ret[i] |= csb; 1.1318 + break; 1.1319 + case CT_1BYTE: /* 1 byte ASCII + table 0x80-0xff */ 1.1320 + for (tab = (unsigned short *) cs->tab,i = 128; i < 256; i++) 1.1321 + if (tab[i & BITS7] != UBOGON) ret[tab[i & BITS7]] |= csb; 1.1322 + break; 1.1323 + case CT_1BYTE8: /* 1 byte table 0x00 - 0xff */ 1.1324 + for (tab = (unsigned short *) cs->tab,i = 0; i < 256; i++) 1.1325 + if (tab[i] != UBOGON) ret[tab[i]] |= csb; 1.1326 + break; 1.1327 + case CT_EUC: /* 2 byte ASCII + utf8_eucparam base/CS2/CS3 */ 1.1328 + for (param = (struct utf8_eucparam *) cs->tab, 1.1329 + tab = (unsigned short *) param->tab, ku = 0; 1.1330 + ku < param->max_ku; ku++) 1.1331 + for (ten = 0; ten < param->max_ten; ten++) 1.1332 + if ((u = tab[(ku * param->max_ten) + ten]) != UBOGON) 1.1333 + ret[u] |= csb; 1.1334 + break; 1.1335 + 1.1336 + case CT_DBYTE: /* 2 byte ASCII + utf8_eucparam */ 1.1337 + for (param = (struct utf8_eucparam *) cs->tab, 1.1338 + tab = (unsigned short *) param->tab, ku = 0; 1.1339 + ku < param->max_ku; ku++) 1.1340 + for (ten = 0; ten < param->max_ten; ten++) 1.1341 + if ((u = tab[(ku * param->max_ten) + ten]) != UBOGON) 1.1342 + ret[u] |= csb; 1.1343 + break; 1.1344 + case CT_DBYTE2: /* 2 byte ASCII + utf8_eucparam plane1/2 */ 1.1345 + param = (struct utf8_eucparam *) cs->tab; 1.1346 + p2 = param + 1; /* plane 2 parameters */ 1.1347 + /* only ten parameters should differ */ 1.1348 + if ((param->base_ku != p2->base_ku) || (param->max_ku != p2->max_ku)) 1.1349 + fatal ("ku definition error for CT_DBYTE2 charset"); 1.1350 + /* total codepoints in each ku */ 1.1351 + m = param->max_ten + p2->max_ten; 1.1352 + tab = (unsigned short *) param->tab; 1.1353 + for (ku = 0; ku < param->max_ku; ku++) { 1.1354 + for (ten = 0; ten < param->max_ten; ten++) 1.1355 + if ((u = tab[(ku * m) + ten]) != UBOGON) 1.1356 + ret[u] |= csb; 1.1357 + for (ten = 0; ten < p2->max_ten; ten++) 1.1358 + if ((u = tab[(ku * m) + param->max_ten + ten]) != UBOGON) 1.1359 + ret[u] |= csb; 1.1360 + } 1.1361 + break; 1.1362 + case CT_SJIS: /* 2 byte Shift-JIS */ 1.1363 + for (ku = 0; ku < MAX_JIS0208_KU; ku++) 1.1364 + for (ten = 0; ten < MAX_JIS0208_TEN; ten++) 1.1365 + if ((u = jis0208tab[ku][ten]) != UBOGON) ret[u] |= csb; 1.1366 + /* JIS hankaku katakana */ 1.1367 + for (u = 0; u < (MAX_KANA_8 - MIN_KANA_8); u++) 1.1368 + ret[UCS2_KATAKANA + u] |= csb; 1.1369 + break; 1.1370 + } 1.1371 + } 1.1372 + /* invalid charset, punt */ 1.1373 + else fs_give ((void **) &ret); 1.1374 + } 1.1375 + return ret; 1.1376 +} 1.1377 + 1.1378 +/* Infer charset from unlabelled sized text 1.1379 + * Accepts: sized text 1.1380 + * Returns: charset if one inferred, or NIL if unknown 1.1381 + */ 1.1382 + 1.1383 +const CHARSET *utf8_infercharset (SIZEDTEXT *src) 1.1384 +{ 1.1385 + long iso2022jp = NIL; 1.1386 + long eightbit = NIL; 1.1387 + unsigned long i; 1.1388 + /* look for ISO 2022 */ 1.1389 + if (src) for (i = 0; i < src->size; i++) { 1.1390 + /* ESC sequence? */ 1.1391 + if ((src->data[i] == I2C_ESC) && (++i < src->size)) switch (src->data[i]) { 1.1392 + case I2C_MULTI: /* yes, multibyte? */ 1.1393 + if (++i < src->size) switch (src->data[i]) { 1.1394 + case I2CS_94x94_JIS_OLD: /* JIS X 0208-1978 */ 1.1395 + case I2CS_94x94_JIS_NEW: /* JIS X 0208-1983 */ 1.1396 + case I2CS_94x94_JIS_EXT: /* JIS X 0212-1990 (kludge...) */ 1.1397 + iso2022jp = T; /* found an ISO-2022-JP sequence */ 1.1398 + break; 1.1399 + default: /* other multibyte */ 1.1400 + return NIL; /* definitely invalid */ 1.1401 + } 1.1402 + break; 1.1403 + case I2C_G0_94: /* single byte */ 1.1404 + if (++i < src->size) switch (src->data[i]) { 1.1405 + case I2CS_94_JIS_BUGROM: /* in case old buggy software */ 1.1406 + case I2CS_94_JIS_ROMAN: /* JIS X 0201-1976 left half */ 1.1407 + case I2CS_94_ASCII: /* ASCII */ 1.1408 + case I2CS_94_BRITISH: /* good enough for gov't work */ 1.1409 + break; 1.1410 + default: /* other 94 single byte */ 1.1411 + return NIL; /* definitely invalid */ 1.1412 + } 1.1413 + } 1.1414 + /* if possible UTF-8 and not ISO-2022-JP */ 1.1415 + else if (!iso2022jp && (eightbit >= 0) && (src->data[i] & BIT8) && 1.1416 + (eightbit = utf8_validate (src->data + i,src->size - i)) > 0) 1.1417 + i += eightbit - 1; /* skip past all but last of UTF-8 char */ 1.1418 + } 1.1419 + /* ISO-2022-JP overrides other guesses */ 1.1420 + if (iso2022jp) return utf8_charset ("ISO-2022-JP"); 1.1421 + if (eightbit > 0) return utf8_charset ("UTF-8"); 1.1422 + return eightbit ? NIL : utf8_charset ("US-ASCII"); 1.1423 +} 1.1424 + 1.1425 + 1.1426 +/* Validate that character at this position is UTF-8 1.1427 + * Accepts: string pointer 1.1428 + * size of remaining string 1.1429 + * Returns: size of UTF-8 character in octets or -1 if not UTF-8 1.1430 + */ 1.1431 + 1.1432 +long utf8_validate (unsigned char *s,unsigned long i) 1.1433 +{ 1.1434 + unsigned long j = i; 1.1435 + return (utf8_get (&s,&i) & U8G_ERROR) ? -1 : j - i; 1.1436 +} 1.1437 + 1.1438 +/* Convert ISO 8859-1 to UTF-8 1.1439 + * Accepts: source sized text 1.1440 + * pointer to return sized text 1.1441 + * canonicalization function 1.1442 + */ 1.1443 + 1.1444 +void utf8_text_1byte0 (SIZEDTEXT *text,SIZEDTEXT *ret,ucs4cn_t cv,ucs4de_t de) 1.1445 +{ 1.1446 + unsigned long i; 1.1447 + unsigned char *s; 1.1448 + unsigned int c; 1.1449 + for (ret->size = i = 0; i < text->size;) { 1.1450 + c = text->data[i++]; 1.1451 + UTF8_COUNT_BMP (ret->size,c,cv,de) 1.1452 + } 1.1453 + (s = ret->data = (unsigned char *) fs_get (ret->size + 1))[ret->size] =NIL; 1.1454 + for (i = 0; i < text->size;) { 1.1455 + c = text->data[i++]; 1.1456 + UTF8_WRITE_BMP (s,c,cv,de) /* convert UCS-2 to UTF-8 */ 1.1457 + } 1.1458 +} 1.1459 + 1.1460 + 1.1461 +/* Convert single byte ASCII+8bit character set sized text to UTF-8 1.1462 + * Accepts: source sized text 1.1463 + * pointer to return sized text 1.1464 + * conversion table 1.1465 + * canonicalization function 1.1466 + */ 1.1467 + 1.1468 +void utf8_text_1byte (SIZEDTEXT *text,SIZEDTEXT *ret,void *tab,ucs4cn_t cv, 1.1469 + ucs4de_t de) 1.1470 +{ 1.1471 + unsigned long i; 1.1472 + unsigned char *s; 1.1473 + unsigned int c; 1.1474 + unsigned short *tbl = (unsigned short *) tab; 1.1475 + for (ret->size = i = 0; i < text->size;) { 1.1476 + if ((c = text->data[i++]) & BIT8) c = tbl[c & BITS7]; 1.1477 + UTF8_COUNT_BMP (ret->size,c,cv,de) 1.1478 + } 1.1479 + (s = ret->data = (unsigned char *) fs_get (ret->size + 1))[ret->size] =NIL; 1.1480 + for (i = 0; i < text->size;) { 1.1481 + if ((c = text->data[i++]) & BIT8) c = tbl[c & BITS7]; 1.1482 + UTF8_WRITE_BMP (s,c,cv,de) /* convert UCS-2 to UTF-8 */ 1.1483 + } 1.1484 +} 1.1485 + 1.1486 +/* Convert single byte 8bit character set sized text to UTF-8 1.1487 + * Accepts: source sized text 1.1488 + * pointer to return sized text 1.1489 + * conversion table 1.1490 + * canonicalization function 1.1491 + */ 1.1492 + 1.1493 +void utf8_text_1byte8 (SIZEDTEXT *text,SIZEDTEXT *ret,void *tab,ucs4cn_t cv, 1.1494 + ucs4de_t de) 1.1495 +{ 1.1496 + unsigned long i; 1.1497 + unsigned char *s; 1.1498 + unsigned int c; 1.1499 + unsigned short *tbl = (unsigned short *) tab; 1.1500 + for (ret->size = i = 0; i < text->size;) { 1.1501 + c = tbl[text->data[i++]]; 1.1502 + UTF8_COUNT_BMP (ret->size,c,cv,de) 1.1503 + } 1.1504 + (s = ret->data = (unsigned char *) fs_get (ret->size + 1))[ret->size] =NIL; 1.1505 + for (i = 0; i < text->size;) { 1.1506 + c = tbl[text->data[i++]]; 1.1507 + UTF8_WRITE_BMP (s,c,cv,de) /* convert UCS-2 to UTF-8 */ 1.1508 + } 1.1509 +} 1.1510 + 1.1511 +/* Convert EUC sized text to UTF-8 1.1512 + * Accepts: source sized text 1.1513 + * pointer to return sized text 1.1514 + * EUC parameter table 1.1515 + * canonicalization function 1.1516 + */ 1.1517 + 1.1518 +void utf8_text_euc (SIZEDTEXT *text,SIZEDTEXT *ret,void *tab,ucs4cn_t cv, 1.1519 + ucs4de_t de) 1.1520 +{ 1.1521 + unsigned long i; 1.1522 + unsigned char *s; 1.1523 + unsigned int pass,c,c1,ku,ten; 1.1524 + struct utf8_eucparam *p1 = (struct utf8_eucparam *) tab; 1.1525 + struct utf8_eucparam *p2 = p1 + 1; 1.1526 + struct utf8_eucparam *p3 = p1 + 2; 1.1527 + unsigned short *t1 = (unsigned short *) p1->tab; 1.1528 + unsigned short *t2 = (unsigned short *) p2->tab; 1.1529 + unsigned short *t3 = (unsigned short *) p3->tab; 1.1530 + for (pass = 0,s = NIL,ret->size = 0; pass <= 1; pass++) { 1.1531 + for (i = 0; i < text->size;) { 1.1532 + /* not CS0? */ 1.1533 + if ((c = text->data[i++]) & BIT8) { 1.1534 + /* yes, must have another high byte */ 1.1535 + if ((i >= text->size) || !((c1 = text->data[i++]) & BIT8)) 1.1536 + c = UBOGON; /* out of space or bogon */ 1.1537 + else switch (c) { /* check 8bit code set */ 1.1538 + case EUC_CS2: /* CS2 */ 1.1539 + if (p2->base_ku) { /* CS2 set up? */ 1.1540 + if (p2->base_ten) /* yes, multibyte? */ 1.1541 + c = ((i < text->size) && ((c = text->data[i++]) & BIT8) && 1.1542 + ((ku = (c1 & BITS7) - p2->base_ku) < p2->max_ku) && 1.1543 + ((ten = (c & BITS7) - p2->base_ten) < p2->max_ten)) ? 1.1544 + t2[(ku*p2->max_ten) + ten] : UBOGON; 1.1545 + else c = ((c1 >= p2->base_ku) && (c1 < p2->max_ku)) ? 1.1546 + c1 + ((unsigned long) p2->tab) : UBOGON; 1.1547 + } 1.1548 + else { /* CS2 not set up */ 1.1549 + c = UBOGON; /* swallow byte, say bogon */ 1.1550 + if (i < text->size) i++; 1.1551 + } 1.1552 + break; 1.1553 + case EUC_CS3: /* CS3 */ 1.1554 + if (p3->base_ku) { /* CS3 set up? */ 1.1555 + if (p3->base_ten) /* yes, multibyte? */ 1.1556 + c = ((i < text->size) && ((c = text->data[i++]) & BIT8) && 1.1557 + ((ku = (c1 & BITS7) - p3->base_ku) < p3->max_ku) && 1.1558 + ((ten = (c & BITS7) - p3->base_ten) < p3->max_ten)) ? 1.1559 + t3[(ku*p3->max_ten) + ten] : UBOGON; 1.1560 + else c = ((c1 >= p3->base_ku) && (c1 < p3->max_ku)) ? 1.1561 + c1 + ((unsigned long) p3->tab) : UBOGON; 1.1562 + } 1.1563 + else { /* CS3 not set up */ 1.1564 + c = UBOGON; /* swallow byte, say bogon */ 1.1565 + if (i < text->size) i++; 1.1566 + } 1.1567 + break; 1.1568 + 1.1569 + default: 1.1570 + if (((ku = (c & BITS7) - p1->base_ku) >= p1->max_ku) || 1.1571 + ((ten = (c1 & BITS7) - p1->base_ten) >= p1->max_ten)) c = UBOGON; 1.1572 + else if (((c = t1[(ku*p1->max_ten) + ten]) == UBOGON) && 1.1573 + /* special hack for JIS X 0212: merge rows less than 10 */ 1.1574 + ku && (ku < 10) && t3 && p3->base_ten) 1.1575 + c = t3[((ku - (p3->base_ku - p1->base_ku))*p3->max_ten) + ten]; 1.1576 + } 1.1577 + } 1.1578 + /* convert if second pass */ 1.1579 + if (pass) UTF8_WRITE_BMP (s,c,cv,de) 1.1580 + else UTF8_COUNT_BMP (ret->size,c,cv,de); 1.1581 + } 1.1582 + if (!pass) (s = ret->data = (unsigned char *) 1.1583 + fs_get (ret->size + 1))[ret->size] =NIL; 1.1584 + } 1.1585 +} 1.1586 + 1.1587 + 1.1588 +/* Convert ASCII + double-byte sized text to UTF-8 1.1589 + * Accepts: source sized text 1.1590 + * pointer to return sized text 1.1591 + * conversion table 1.1592 + * canonicalization function 1.1593 + */ 1.1594 + 1.1595 +void utf8_text_dbyte (SIZEDTEXT *text,SIZEDTEXT *ret,void *tab,ucs4cn_t cv, 1.1596 + ucs4de_t de) 1.1597 +{ 1.1598 + unsigned long i; 1.1599 + unsigned char *s; 1.1600 + unsigned int c,c1,ku,ten; 1.1601 + struct utf8_eucparam *p1 = (struct utf8_eucparam *) tab; 1.1602 + unsigned short *t1 = (unsigned short *) p1->tab; 1.1603 + for (ret->size = i = 0; i < text->size;) { 1.1604 + if ((c = text->data[i++]) & BIT8) { 1.1605 + /* special hack for GBK: 0x80 is Euro */ 1.1606 + if ((c == 0x80) && (t1 == (unsigned short *) gb2312tab)) c = UCS2_EURO; 1.1607 + else c = ((i < text->size) && (c1 = text->data[i++]) && 1.1608 + ((ku = c - p1->base_ku) < p1->max_ku) && 1.1609 + ((ten = c1 - p1->base_ten) < p1->max_ten)) ? 1.1610 + t1[(ku*p1->max_ten) + ten] : UBOGON; 1.1611 + } 1.1612 + UTF8_COUNT_BMP (ret->size,c,cv,de) 1.1613 + } 1.1614 + (s = ret->data = (unsigned char *) fs_get (ret->size + 1))[ret->size] = NIL; 1.1615 + for (i = 0; i < text->size;) { 1.1616 + if ((c = text->data[i++]) & BIT8) { 1.1617 + /* special hack for GBK: 0x80 is Euro */ 1.1618 + if ((c == 0x80) && (t1 == (unsigned short *) gb2312tab)) c = UCS2_EURO; 1.1619 + else c = ((i < text->size) && (c1 = text->data[i++]) && 1.1620 + ((ku = c - p1->base_ku) < p1->max_ku) && 1.1621 + ((ten = c1 - p1->base_ten) < p1->max_ten)) ? 1.1622 + t1[(ku*p1->max_ten) + ten] : UBOGON; 1.1623 + } 1.1624 + UTF8_WRITE_BMP (s,c,cv,de) /* convert UCS-2 to UTF-8 */ 1.1625 + } 1.1626 +} 1.1627 + 1.1628 +/* Convert ASCII + double byte 2 plane sized text to UTF-8 1.1629 + * Accepts: source sized text 1.1630 + * pointer to return sized text 1.1631 + * conversion table 1.1632 + * canonicalization function 1.1633 + */ 1.1634 + 1.1635 +void utf8_text_dbyte2 (SIZEDTEXT *text,SIZEDTEXT *ret,void *tab,ucs4cn_t cv, 1.1636 + ucs4de_t de) 1.1637 +{ 1.1638 + unsigned long i; 1.1639 + unsigned char *s; 1.1640 + unsigned int c,c1,ku,ten; 1.1641 + struct utf8_eucparam *p1 = (struct utf8_eucparam *) tab; 1.1642 + struct utf8_eucparam *p2 = p1 + 1; 1.1643 + unsigned short *t = (unsigned short *) p1->tab; 1.1644 + for (ret->size = i = 0; i < text->size;) { 1.1645 + if ((c = text->data[i++]) & BIT8) { 1.1646 + if ((i >= text->size) || !(c1 = text->data[i++])) 1.1647 + c = UBOGON; /* out of space or bogon */ 1.1648 + else if (c1 & BIT8) /* high vs. low plane */ 1.1649 + c = ((ku = c - p2->base_ku) < p2->max_ku && 1.1650 + ((ten = c1 - p2->base_ten) < p2->max_ten)) ? 1.1651 + t[(ku*(p1->max_ten + p2->max_ten)) + p1->max_ten + ten] :UBOGON; 1.1652 + else c = ((ku = c - p1->base_ku) < p1->max_ku && 1.1653 + ((ten = c1 - p1->base_ten) < p1->max_ten)) ? 1.1654 + t[(ku*(p1->max_ten + p2->max_ten)) + ten] : UBOGON; 1.1655 + } 1.1656 + UTF8_COUNT_BMP (ret->size,c,cv,de) 1.1657 + } 1.1658 + (s = ret->data = (unsigned char *) fs_get (ret->size + 1))[ret->size] = NIL; 1.1659 + for (i = 0; i < text->size;) { 1.1660 + if ((c = text->data[i++]) & BIT8) { 1.1661 + if ((i >= text->size) || !(c1 = text->data[i++])) 1.1662 + c = UBOGON; /* out of space or bogon */ 1.1663 + else if (c1 & BIT8) /* high vs. low plane */ 1.1664 + c = ((ku = c - p2->base_ku) < p2->max_ku && 1.1665 + ((ten = c1 - p2->base_ten) < p2->max_ten)) ? 1.1666 + t[(ku*(p1->max_ten + p2->max_ten)) + p1->max_ten + ten] :UBOGON; 1.1667 + else c = ((ku = c - p1->base_ku) < p1->max_ku && 1.1668 + ((ten = c1 - p1->base_ten) < p1->max_ten)) ? 1.1669 + t[(ku*(p1->max_ten + p2->max_ten)) + ten] : UBOGON; 1.1670 + } 1.1671 + UTF8_WRITE_BMP (s,c,cv,de) /* convert UCS-2 to UTF-8 */ 1.1672 + } 1.1673 +} 1.1674 + 1.1675 +#ifdef JISTOUNICODE /* Japanese */ 1.1676 +/* Convert Shift JIS sized text to UTF-8 1.1677 + * Accepts: source sized text 1.1678 + * pointer to return sized text 1.1679 + * canonicalization function 1.1680 + */ 1.1681 + 1.1682 +void utf8_text_sjis (SIZEDTEXT *text,SIZEDTEXT *ret,ucs4cn_t cv, 1.1683 + ucs4de_t de) 1.1684 +{ 1.1685 + unsigned long i; 1.1686 + unsigned char *s; 1.1687 + unsigned int c,c1,ku,ten; 1.1688 + for (ret->size = i = 0; i < text->size;) { 1.1689 + if ((c = text->data[i++]) & BIT8) { 1.1690 + /* half-width katakana */ 1.1691 + if ((c >= MIN_KANA_8) && (c < MAX_KANA_8)) c += KANA_8; 1.1692 + else if (i >= text->size) c = UBOGON; 1.1693 + else { /* Shift-JIS */ 1.1694 + c1 = text->data[i++]; 1.1695 + SJISTOJIS (c,c1); 1.1696 + c = JISTOUNICODE (c,c1,ku,ten); 1.1697 + } 1.1698 + } 1.1699 + /* compromise - do yen sign but not overline */ 1.1700 + else if (c == JISROMAN_YEN) c = UCS2_YEN; 1.1701 + UTF8_COUNT_BMP (ret->size,c,cv,de) 1.1702 + } 1.1703 + (s = ret->data = (unsigned char *) fs_get (ret->size + 1))[ret->size] = NIL; 1.1704 + for (i = 0; i < text->size;) { 1.1705 + if ((c = text->data[i++]) & BIT8) { 1.1706 + /* half-width katakana */ 1.1707 + if ((c >= MIN_KANA_8) && (c < MAX_KANA_8)) c += KANA_8; 1.1708 + else { /* Shift-JIS */ 1.1709 + c1 = text->data[i++]; 1.1710 + SJISTOJIS (c,c1); 1.1711 + c = JISTOUNICODE (c,c1,ku,ten); 1.1712 + } 1.1713 + } 1.1714 + /* compromise - do yen sign but not overline */ 1.1715 + else if (c == JISROMAN_YEN) c = UCS2_YEN; 1.1716 + UTF8_WRITE_BMP (s,c,cv,de) /* convert UCS-2 to UTF-8 */ 1.1717 + } 1.1718 +} 1.1719 +#endif 1.1720 + 1.1721 +/* Convert ISO-2022 sized text to UTF-8 1.1722 + * Accepts: source sized text 1.1723 + * pointer to returned sized text 1.1724 + * canonicalization function 1.1725 + */ 1.1726 + 1.1727 +void utf8_text_2022 (SIZEDTEXT *text,SIZEDTEXT *ret,ucs4cn_t cv,ucs4de_t de) 1.1728 +{ 1.1729 + unsigned long i; 1.1730 + unsigned char *s; 1.1731 + unsigned int pass,state,c,co,gi,gl,gr,g[4],ku,ten; 1.1732 + for (pass = 0,s = NIL,ret->size = 0; pass <= 1; pass++) { 1.1733 + gi = 0; /* quell compiler warnings */ 1.1734 + state = I2S_CHAR; /* initialize engine */ 1.1735 + g[0]= g[2] = I2CS_ASCII; /* G0 and G2 are ASCII */ 1.1736 + g[1]= g[3] = I2CS_ISO8859_1;/* G1 and G3 are ISO-8850-1 */ 1.1737 + gl = I2C_G0; gr = I2C_G1; /* left is G0, right is G1 */ 1.1738 + for (i = 0; i < text->size;) { 1.1739 + c = text->data[i++]; 1.1740 + switch (state) { /* dispatch based upon engine state */ 1.1741 + case I2S_ESC: /* ESC seen */ 1.1742 + switch (c) { /* process intermediate character */ 1.1743 + case I2C_MULTI: /* multibyte character? */ 1.1744 + state = I2S_MUL; /* mark multibyte flag seen */ 1.1745 + break; 1.1746 + case I2C_SS2: /* single shift GL to G2 */ 1.1747 + case I2C_SS2_ALT: /* Taiwan SeedNet */ 1.1748 + gl |= I2C_SG2; 1.1749 + break; 1.1750 + case I2C_SS3: /* single shift GL to G3 */ 1.1751 + case I2C_SS3_ALT: /* Taiwan SeedNet */ 1.1752 + gl |= I2C_SG3; 1.1753 + break; 1.1754 + case I2C_LS2: /* shift GL to G2 */ 1.1755 + gl = I2C_G2; 1.1756 + break; 1.1757 + case I2C_LS3: /* shift GL to G3 */ 1.1758 + gl = I2C_G3; 1.1759 + break; 1.1760 + case I2C_LS1R: /* shift GR to G1 */ 1.1761 + gr = I2C_G1; 1.1762 + break; 1.1763 + case I2C_LS2R: /* shift GR to G2 */ 1.1764 + gr = I2C_G2; 1.1765 + break; 1.1766 + case I2C_LS3R: /* shift GR to G3 */ 1.1767 + gr = I2C_G3; 1.1768 + break; 1.1769 + case I2C_G0_94: case I2C_G1_94: case I2C_G2_94: case I2C_G3_94: 1.1770 + g[gi = c - I2C_G0_94] = (state == I2S_MUL) ? I2CS_94x94 : I2CS_94; 1.1771 + state = I2S_INT; /* ready for character set */ 1.1772 + break; 1.1773 + case I2C_G0_96: case I2C_G1_96: case I2C_G2_96: case I2C_G3_96: 1.1774 + g[gi = c - I2C_G0_96] = (state == I2S_MUL) ? I2CS_96x96 : I2CS_96; 1.1775 + state = I2S_INT; /* ready for character set */ 1.1776 + break; 1.1777 + default: /* bogon */ 1.1778 + if (pass) *s++ = I2C_ESC,*s++ = c; 1.1779 + else ret->size += 2; 1.1780 + state = I2S_CHAR; /* return to previous state */ 1.1781 + } 1.1782 + break; 1.1783 + 1.1784 + case I2S_MUL: /* ESC $ */ 1.1785 + switch (c) { /* process multibyte intermediate character */ 1.1786 + case I2C_G0_94: case I2C_G1_94: case I2C_G2_94: case I2C_G3_94: 1.1787 + g[gi = c - I2C_G0_94] = I2CS_94x94; 1.1788 + state = I2S_INT; /* ready for character set */ 1.1789 + break; 1.1790 + case I2C_G0_96: case I2C_G1_96: case I2C_G2_96: case I2C_G3_96: 1.1791 + g[gi = c - I2C_G0_96] = I2CS_96x96; 1.1792 + state = I2S_INT; /* ready for character set */ 1.1793 + break; 1.1794 + default: /* probably omitted I2CS_94x94 */ 1.1795 + g[gi = I2C_G0] = I2CS_94x94 | c; 1.1796 + state = I2S_CHAR; /* return to character state */ 1.1797 + } 1.1798 + break; 1.1799 + case I2S_INT: 1.1800 + state = I2S_CHAR; /* return to character state */ 1.1801 + g[gi] |= c; /* set character set */ 1.1802 + break; 1.1803 + 1.1804 + case I2S_CHAR: /* character data */ 1.1805 + switch (c) { 1.1806 + case I2C_ESC: /* ESC character */ 1.1807 + state = I2S_ESC; /* see if ISO-2022 prefix */ 1.1808 + break; 1.1809 + case I2C_SI: /* shift GL to G0 */ 1.1810 + gl = I2C_G0; 1.1811 + break; 1.1812 + case I2C_SO: /* shift GL to G1 */ 1.1813 + gl = I2C_G1; 1.1814 + break; 1.1815 + case I2C_SS2_ALT: /* single shift GL to G2 */ 1.1816 + case I2C_SS2_ALT_7: 1.1817 + gl |= I2C_SG2; 1.1818 + break; 1.1819 + case I2C_SS3_ALT: /* single shift GL to G3 */ 1.1820 + case I2C_SS3_ALT_7: 1.1821 + gl |= I2C_SG3; 1.1822 + break; 1.1823 + 1.1824 + default: /* ordinary character */ 1.1825 + co = c; /* note original character */ 1.1826 + if (gl & (3 << 2)) { /* single shifted? */ 1.1827 + gi = g[gl >> 2]; /* get shifted character set */ 1.1828 + gl &= 0x3; /* cancel shift */ 1.1829 + } 1.1830 + /* select left or right half */ 1.1831 + else gi = (c & BIT8) ? g[gr] : g[gl]; 1.1832 + c &= BITS7; /* make 7-bit */ 1.1833 + switch (gi) { /* interpret in character set */ 1.1834 + case I2CS_ASCII: /* ASCII */ 1.1835 + break; /* easy! */ 1.1836 + case I2CS_BRITISH: /* British ASCII */ 1.1837 + /* Pound sterling sign */ 1.1838 + if (c == BRITISH_POUNDSTERLING) c = UCS2_POUNDSTERLING; 1.1839 + break; 1.1840 + case I2CS_JIS_ROMAN: /* JIS Roman */ 1.1841 + case I2CS_JIS_BUGROM: /* old bugs */ 1.1842 + switch (c) { /* two exceptions to ASCII */ 1.1843 + case JISROMAN_YEN: /* Yen sign */ 1.1844 + c = UCS2_YEN; 1.1845 + break; 1.1846 + /* overline */ 1.1847 + case JISROMAN_OVERLINE: 1.1848 + c = UCS2_OVERLINE; 1.1849 + break; 1.1850 + } 1.1851 + break; 1.1852 + case I2CS_JIS_KANA: /* JIS hankaku katakana */ 1.1853 + if ((c >= MIN_KANA_7) && (c < MAX_KANA_7)) c += KANA_7; 1.1854 + break; 1.1855 + 1.1856 + case I2CS_ISO8859_1: /* Latin-1 (West European) */ 1.1857 + c |= BIT8; /* just turn on high bit */ 1.1858 + break; 1.1859 + case I2CS_ISO8859_2: /* Latin-2 (Czech, Slovak) */ 1.1860 + c = iso8859_2tab[c]; 1.1861 + break; 1.1862 + case I2CS_ISO8859_3: /* Latin-3 (Dutch, Turkish) */ 1.1863 + c = iso8859_3tab[c]; 1.1864 + break; 1.1865 + case I2CS_ISO8859_4: /* Latin-4 (Scandinavian) */ 1.1866 + c = iso8859_4tab[c]; 1.1867 + break; 1.1868 + case I2CS_ISO8859_5: /* Cyrillic */ 1.1869 + c = iso8859_5tab[c]; 1.1870 + break; 1.1871 + case I2CS_ISO8859_6: /* Arabic */ 1.1872 + c = iso8859_6tab[c]; 1.1873 + break; 1.1874 + case I2CS_ISO8859_7: /* Greek */ 1.1875 + c = iso8859_7tab[c]; 1.1876 + break; 1.1877 + case I2CS_ISO8859_8: /* Hebrew */ 1.1878 + c = iso8859_8tab[c]; 1.1879 + break; 1.1880 + case I2CS_ISO8859_9: /* Latin-5 (Finnish, Portuguese) */ 1.1881 + c = iso8859_9tab[c]; 1.1882 + break; 1.1883 + case I2CS_TIS620: /* Thai */ 1.1884 + c = tis620tab[c]; 1.1885 + break; 1.1886 + case I2CS_ISO8859_10: /* Latin-6 (Northern Europe) */ 1.1887 + c = iso8859_10tab[c]; 1.1888 + break; 1.1889 + case I2CS_ISO8859_13: /* Latin-7 (Baltic) */ 1.1890 + c = iso8859_13tab[c]; 1.1891 + break; 1.1892 + case I2CS_VSCII: /* Vietnamese */ 1.1893 + c = visciitab[c]; 1.1894 + break; 1.1895 + case I2CS_ISO8859_14: /* Latin-8 (Celtic) */ 1.1896 + c = iso8859_14tab[c]; 1.1897 + break; 1.1898 + case I2CS_ISO8859_15: /* Latin-9 (Euro) */ 1.1899 + c = iso8859_15tab[c]; 1.1900 + break; 1.1901 + case I2CS_ISO8859_16: /* Latin-10 (Baltic) */ 1.1902 + c = iso8859_16tab[c]; 1.1903 + break; 1.1904 + 1.1905 + default: /* all other character sets */ 1.1906 + /* multibyte character set */ 1.1907 + if ((gi & I2CS_MUL) && !(c & BIT8) && isgraph (c)) { 1.1908 + c = (i < text->size) ? text->data[i++] : 0; 1.1909 + switch (gi) { 1.1910 +#ifdef GBTOUNICODE 1.1911 + case I2CS_GB: /* GB 2312 */ 1.1912 + co |= BIT8; /* make into EUC */ 1.1913 + c |= BIT8; 1.1914 + c = GBTOUNICODE (co,c,ku,ten); 1.1915 + break; 1.1916 +#endif 1.1917 +#ifdef JISTOUNICODE 1.1918 + case I2CS_JIS_OLD:/* JIS X 0208-1978 */ 1.1919 + case I2CS_JIS_NEW:/* JIS X 0208-1983 */ 1.1920 + c = JISTOUNICODE (co,c,ku,ten); 1.1921 + break; 1.1922 +#endif 1.1923 +#ifdef JIS0212TOUNICODE 1.1924 + case I2CS_JIS_EXT:/* JIS X 0212-1990 */ 1.1925 + c = JIS0212TOUNICODE (co,c,ku,ten); 1.1926 + break; 1.1927 +#endif 1.1928 +#ifdef KSCTOUNICODE 1.1929 + case I2CS_KSC: /* KSC 5601 */ 1.1930 + co |= BIT8; /* make into EUC */ 1.1931 + c |= BIT8; 1.1932 + c = KSCTOUNICODE (co,c,ku,ten); 1.1933 + break; 1.1934 +#endif 1.1935 +#ifdef CNS1TOUNICODE 1.1936 + case I2CS_CNS1: /* CNS 11643 plane 1 */ 1.1937 + c = CNS1TOUNICODE (co,c,ku,ten); 1.1938 + break; 1.1939 +#endif 1.1940 +#ifdef CNS2TOUNICODE 1.1941 + case I2CS_CNS2: /* CNS 11643 plane 2 */ 1.1942 + c = CNS2TOUNICODE (co,c,ku,ten); 1.1943 + break; 1.1944 +#endif 1.1945 +#ifdef CNS3TOUNICODE 1.1946 + case I2CS_CNS3: /* CNS 11643 plane 3 */ 1.1947 + c = CNS3TOUNICODE (co,c,ku,ten); 1.1948 + break; 1.1949 +#endif 1.1950 +#ifdef CNS4TOUNICODE 1.1951 + case I2CS_CNS4: /* CNS 11643 plane 4 */ 1.1952 + c = CNS4TOUNICODE (co,c,ku,ten); 1.1953 + break; 1.1954 +#endif 1.1955 +#ifdef CNS5TOUNICODE 1.1956 + case I2CS_CNS5: /* CNS 11643 plane 5 */ 1.1957 + c = CNS5TOUNICODE (co,c,ku,ten); 1.1958 + break; 1.1959 +#endif 1.1960 +#ifdef CNS6TOUNICODE 1.1961 + case I2CS_CNS6: /* CNS 11643 plane 6 */ 1.1962 + c = CNS6TOUNICODE (co,c,ku,ten); 1.1963 + break; 1.1964 +#endif 1.1965 +#ifdef CNS7TOUNICODE 1.1966 + case I2CS_CNS7: /* CNS 11643 plane 7 */ 1.1967 + c = CNS7TOUNICODE (co,c,ku,ten); 1.1968 + break; 1.1969 +#endif 1.1970 + default: /* unknown multibyte, treat as UCS-2 */ 1.1971 + c |= (co << 8); /* wrong, but nothing else to do */ 1.1972 + break; 1.1973 + } 1.1974 + } 1.1975 + else c = co; /* unknown single byte, treat as 8859-1 */ 1.1976 + } 1.1977 + /* convert if second pass */ 1.1978 + if (pass) UTF8_WRITE_BMP (s,c,cv,de) 1.1979 + else UTF8_COUNT_BMP (ret->size,c,cv,de); 1.1980 + } 1.1981 + } 1.1982 + } 1.1983 + if (!pass) (s = ret->data = (unsigned char *) 1.1984 + fs_get (ret->size + 1))[ret->size] = NIL; 1.1985 + else if (((unsigned long) (s - ret->data)) != ret->size) 1.1986 + fatal ("ISO-2022 to UTF-8 botch"); 1.1987 + } 1.1988 +} 1.1989 + 1.1990 +/* Convert UTF-7 sized text to UTF-8 1.1991 + * Accepts: source sized text 1.1992 + * pointer to returned sized text 1.1993 + * canonicalization function 1.1994 + */ 1.1995 + 1.1996 +void utf8_text_utf7 (SIZEDTEXT *text,SIZEDTEXT *ret,ucs4cn_t cv,ucs4de_t de) 1.1997 +{ 1.1998 + unsigned long i; 1.1999 + unsigned char *s; 1.2000 + unsigned int c,c1,d,uc,pass,e,e1,state,surrh; 1.2001 + for (pass = 0,s = NIL,ret->size = 0; pass <= 1; pass++) { 1.2002 + c1 = d = uc = e = e1 = 0; 1.2003 + for (i = 0,state = NIL; i < text->size;) { 1.2004 + c = text->data[i++]; /* get next byte */ 1.2005 + switch (state) { 1.2006 + case U7_PLUS: /* previous character was + */ 1.2007 + if (c == '-') { /* +- means textual + */ 1.2008 + c = '+'; 1.2009 + state = U7_ASCII; /* revert to ASCII */ 1.2010 + break; 1.2011 + } 1.2012 + state = U7_UNICODE; /* enter Unicode state */ 1.2013 + e = e1 = 0; /* initialize Unicode quantum position */ 1.2014 + case U7_UNICODE: /* Unicode state */ 1.2015 + if (c == '-') state = U7_MINUS; 1.2016 + else { /* decode Unicode */ 1.2017 + /* don't use isupper/islower since this is ASCII only */ 1.2018 + if ((c >= 'A') && (c <= 'Z')) c -= 'A'; 1.2019 + else if ((c >= 'a') && (c <= 'z')) c -= 'a' - 26; 1.2020 + else if (isdigit (c)) c -= '0' - 52; 1.2021 + else if (c == '+') c = 62; 1.2022 + else if (c == '/') c = 63; 1.2023 + else state = U7_ASCII;/* end of modified BASE64 */ 1.2024 + } 1.2025 + break; 1.2026 + case U7_MINUS: /* previous character was absorbed - */ 1.2027 + state = U7_ASCII; /* revert to ASCII */ 1.2028 + case U7_ASCII: /* ASCII state */ 1.2029 + if (c == '+') state = U7_PLUS; 1.2030 + break; 1.2031 + } 1.2032 + 1.2033 + switch (state) { /* store character if in character mode */ 1.2034 + case U7_UNICODE: /* Unicode */ 1.2035 + switch (e++) { /* install based on BASE64 state */ 1.2036 + case 0: 1.2037 + c1 = c << 2; /* byte 1: high 6 bits */ 1.2038 + break; 1.2039 + case 1: 1.2040 + d = c1 | (c >> 4); /* byte 1: low 2 bits */ 1.2041 + c1 = c << 4; /* byte 2: high 4 bits */ 1.2042 + break; 1.2043 + case 2: 1.2044 + d = c1 | (c >> 2); /* byte 2: low 4 bits */ 1.2045 + c1 = c << 6; /* byte 3: high 2 bits */ 1.2046 + break; 1.2047 + case 3: 1.2048 + d = c | c1; /* byte 3: low 6 bits */ 1.2049 + e = 0; /* reinitialize mechanism */ 1.2050 + break; 1.2051 + } 1.2052 + if (e == 1) break; /* done if first BASE64 state */ 1.2053 + if (!e1) { /* first byte of UCS-2 character */ 1.2054 + uc = (d & 0xff) << 8; /* note first byte */ 1.2055 + e1 = T; /* enter second UCS-2 state */ 1.2056 + break; /* done */ 1.2057 + } 1.2058 + c = uc | (d & 0xff); /* build UCS-2 character */ 1.2059 + e1 = NIL; /* back to first UCS-2 state, drop in */ 1.2060 + /* surrogate pair? */ 1.2061 + if ((c >= UTF16_SURR) && (c <= UTF16_MAXSURR)) { 1.2062 + /* save high surrogate for later */ 1.2063 + if (c < UTF16_SURRL) surrh = c; 1.2064 + else c = UTF16_BASE + ((surrh & UTF16_MASK) << UTF16_SHIFT) + 1.2065 + (c & UTF16_MASK); 1.2066 + break; /* either way with surrogates, we're done */ 1.2067 + } 1.2068 + case U7_ASCII: /* just install if ASCII */ 1.2069 + /* convert if second pass */ 1.2070 + if (pass) UTF8_WRITE_BMP (s,c,cv,de) 1.2071 + else UTF8_COUNT_BMP (ret->size,c,cv,de); 1.2072 + } 1.2073 + } 1.2074 + if (!pass) (s = ret->data = (unsigned char *) 1.2075 + fs_get (ret->size + 1))[ret->size] = NIL; 1.2076 + else if (((unsigned long) (s - ret->data)) != ret->size) 1.2077 + fatal ("UTF-7 to UTF-8 botch"); 1.2078 + } 1.2079 +} 1.2080 + 1.2081 + 1.2082 +/* Convert UTF-8 sized text to UTF-8 1.2083 + * Accepts: source sized text 1.2084 + * pointer to returned sized text 1.2085 + * canonicalization function 1.2086 + */ 1.2087 + 1.2088 +void utf8_text_utf8 (SIZEDTEXT *text,SIZEDTEXT *ret,ucs4cn_t cv,ucs4de_t de) 1.2089 +{ 1.2090 + unsigned long i,c; 1.2091 + unsigned char *s,*t; 1.2092 + for (ret->size = 0, t = text->data, i = text->size; i;) { 1.2093 + if ((c = utf8_get (&t,&i)) & U8G_ERROR) { 1.2094 + ret->data = text->data; /* conversion failed */ 1.2095 + ret->size = text->size; 1.2096 + return; 1.2097 + } 1.2098 + UTF8_COUNT (ret->size,c,cv,de) 1.2099 + } 1.2100 + (s = ret->data = (unsigned char *) fs_get (ret->size + 1))[ret->size] =NIL; 1.2101 + for (t = text->data, i = text->size; i;) { 1.2102 + c = utf8_get (&t,&i); 1.2103 + UTF8_WRITE (s,c,cv,de) /* convert UCS-4 to UTF-8 */ 1.2104 + } 1.2105 + if (((unsigned long) (s - ret->data)) != ret->size) 1.2106 + fatal ("UTF-8 to UTF-8 botch"); 1.2107 +} 1.2108 + 1.2109 +/* Convert UCS-2 sized text to UTF-8 1.2110 + * Accepts: source sized text 1.2111 + * pointer to returned sized text 1.2112 + * canonicalization function 1.2113 + */ 1.2114 + 1.2115 +void utf8_text_ucs2 (SIZEDTEXT *text,SIZEDTEXT *ret,ucs4cn_t cv,ucs4de_t de) 1.2116 +{ 1.2117 + unsigned long i; 1.2118 + unsigned char *s,*t; 1.2119 + unsigned int c; 1.2120 + for (ret->size = 0, t = text->data, i = text->size / 2; i; --i) { 1.2121 + c = *t++ << 8; 1.2122 + c |= *t++; 1.2123 + UTF8_COUNT_BMP (ret->size,c,cv,de); 1.2124 + } 1.2125 + (s = ret->data = (unsigned char *) fs_get (ret->size + 1))[ret->size] = NIL; 1.2126 + for (t = text->data, i = text->size / 2; i; --i) { 1.2127 + c = *t++ << 8; 1.2128 + c |= *t++; 1.2129 + UTF8_WRITE_BMP (s,c,cv,de) /* convert UCS-2 to UTF-8 */ 1.2130 + } 1.2131 + if (((unsigned long) (s - ret->data)) != ret->size) 1.2132 + fatal ("UCS-2 to UTF-8 botch"); 1.2133 +} 1.2134 + 1.2135 + 1.2136 +/* Convert UCS-4 sized text to UTF-8 1.2137 + * Accepts: source sized text 1.2138 + * pointer to returned sized text 1.2139 + * canonicalization function 1.2140 + */ 1.2141 + 1.2142 +void utf8_text_ucs4 (SIZEDTEXT *text,SIZEDTEXT *ret,ucs4cn_t cv,ucs4de_t de) 1.2143 +{ 1.2144 + unsigned long i; 1.2145 + unsigned char *s,*t; 1.2146 + unsigned long c; 1.2147 + for (ret->size = 0, t = text->data, i = text->size / 4; i; --i) { 1.2148 + c = *t++ << 24; c |= *t++ << 16; c |= *t++ << 8; c |= *t++; 1.2149 + UTF8_COUNT (ret->size,c,cv,de); 1.2150 + } 1.2151 + (s = ret->data = (unsigned char *) fs_get (ret->size + 1))[ret->size] = NIL; 1.2152 + for (t = text->data, i = text->size / 2; i; --i) { 1.2153 + c = *t++ << 24; c |= *t++ << 16; c |= *t++ << 8; c |= *t++; 1.2154 + UTF8_WRITE (s,c,cv,de) /* convert UCS-4 to UTF-8 */ 1.2155 + } 1.2156 + if (((unsigned long) (s - ret->data)) != ret->size) 1.2157 + fatal ("UCS-4 to UTF-8 botch"); 1.2158 +} 1.2159 + 1.2160 +/* Convert UTF-16 sized text to UTF-8 1.2161 + * Accepts: source sized text 1.2162 + * pointer to returned sized text 1.2163 + * canonicalization function 1.2164 + */ 1.2165 + 1.2166 +void utf8_text_utf16 (SIZEDTEXT *text,SIZEDTEXT *ret,ucs4cn_t cv,ucs4de_t de) 1.2167 +{ 1.2168 + unsigned long i; 1.2169 + unsigned char *s,*t; 1.2170 + unsigned long c,d; 1.2171 + for (ret->size = 0, t = text->data, i = text->size / 2; i; --i) { 1.2172 + c = *t++ << 8; 1.2173 + c |= *t++; 1.2174 + /* possible surrogate? */ 1.2175 + if ((c >= UTF16_SURR) && (c <= UTF16_MAXSURR)) { 1.2176 + /* invalid first surrogate */ 1.2177 + if ((c > UTF16_SURRHEND) || !i) c = UBOGON; 1.2178 + else { /* get second surrogate */ 1.2179 + d = *t++ << 8; 1.2180 + d |= *t++; 1.2181 + --i; /* swallowed another 16-bits */ 1.2182 + /* invalid second surrogate */ 1.2183 + if ((d < UTF16_SURRL) || (d > UTF16_SURRLEND)) c = UBOGON; 1.2184 + else c = UTF16_BASE + ((c & UTF16_MASK) << UTF16_SHIFT) + 1.2185 + (d & UTF16_MASK); 1.2186 + } 1.2187 + } 1.2188 + UTF8_COUNT (ret->size,c,cv,de); 1.2189 + } 1.2190 + (s = ret->data = (unsigned char *) fs_get (ret->size + 1))[ret->size] = NIL; 1.2191 + for (t = text->data, i = text->size / 2; i; --i) { 1.2192 + c = *t++ << 8; 1.2193 + c |= *t++; 1.2194 + /* possible surrogate? */ 1.2195 + if ((c >= UTF16_SURR) && (c <= UTF16_MAXSURR)) { 1.2196 + /* invalid first surrogate */ 1.2197 + if ((c > UTF16_SURRHEND) || !i) c = UBOGON; 1.2198 + else { /* get second surrogate */ 1.2199 + d = *t++ << 8; 1.2200 + d |= *t++; 1.2201 + --i; /* swallowed another 16-bits */ 1.2202 + /* invalid second surrogate */ 1.2203 + if ((d < UTF16_SURRL) || (d > UTF16_SURRLEND)) c = UBOGON; 1.2204 + else c = UTF16_BASE + ((c & UTF16_MASK) << UTF16_SHIFT) + 1.2205 + (d & UTF16_MASK); 1.2206 + } 1.2207 + } 1.2208 + UTF8_WRITE (s,c,cv,de) /* convert UCS-4 to UTF-8 */ 1.2209 + } 1.2210 + if (((unsigned long) (s - ret->data)) != ret->size) 1.2211 + fatal ("UTF-16 to UTF-8 botch"); 1.2212 +} 1.2213 + 1.2214 +/* Size of UCS-4 character, possibly not in BMP, as UTF-8 octets 1.2215 + * Accepts: character 1.2216 + * Returns: size (0 means bogon) 1.2217 + * 1.2218 + * Use UTF8_SIZE macro if known to be in the BMP 1.2219 + */ 1.2220 + 1.2221 +unsigned long utf8_size (unsigned long c) 1.2222 +{ 1.2223 + if (c < 0x80) return 1; 1.2224 + else if (c < 0x800) return 2; 1.2225 + else if (c < 0x10000) return 3; 1.2226 + else if (c < 0x200000) return 4; 1.2227 + else if (c < 0x4000000) return 5; 1.2228 + else if (c < 0x80000000) return 6; 1.2229 + return 0; 1.2230 +} 1.2231 + 1.2232 + 1.2233 +/* Put UCS-4 character, possibly not in BMP, as UTF-8 octets 1.2234 + * Accepts: destination string pointer 1.2235 + * character 1.2236 + * Returns: updated destination pointer 1.2237 + * 1.2238 + * Use UTF8_PUT_BMP macro if known to be in the BMP 1.2239 + */ 1.2240 + 1.2241 +unsigned char *utf8_put (unsigned char *s,unsigned long c) 1.2242 +{ 1.2243 + unsigned char mark[6] = {0x00,0xc0,0xe0,0xf0,0xf8,0xfc}; 1.2244 + unsigned long size = utf8_size (c); 1.2245 + switch (size) { 1.2246 + case 6: 1.2247 + s[5] = 0x80 | (unsigned char) (c & 0x3f); 1.2248 + c >>= 6; 1.2249 + case 5: 1.2250 + s[4] = 0x80 | (unsigned char) (c & 0x3f); 1.2251 + c >>= 6; 1.2252 + case 4: 1.2253 + s[3] = 0x80 | (unsigned char) (c & 0x3f); 1.2254 + c >>= 6; 1.2255 + case 3: 1.2256 + s[2] = 0x80 | (unsigned char) (c & 0x3f); 1.2257 + c >>= 6; 1.2258 + case 2: 1.2259 + s[1] = 0x80 | (unsigned char) (c & 0x3f); 1.2260 + c >>= 6; 1.2261 + case 1: 1.2262 + *s = mark[size-1] | (unsigned char) (c & 0x7f); 1.2263 + break; 1.2264 + } 1.2265 + return s + size; 1.2266 +} 1.2267 + 1.2268 +/* Return title case of a fixed-width UCS-4 character 1.2269 + * Accepts: character 1.2270 + * Returns: title case of character 1.2271 + */ 1.2272 + 1.2273 +unsigned long ucs4_titlecase (unsigned long c) 1.2274 +{ 1.2275 + if (c <= UCS4_TMAPMAX) return ucs4_tmaptab[c]; 1.2276 + if (c < UCS4_TMAPHIMIN) return c; 1.2277 + if (c <= UCS4_TMAPHIMAX) return c - UCS4_TMAPHIMAP; 1.2278 + if (c < UCS4_TMAPDESERETMIN) return c; 1.2279 + if (c <= UCS4_TMAPDESERETMAX) return c - UCS4_TMAPDESERETMAP; 1.2280 + return c; 1.2281 +} 1.2282 + 1.2283 + 1.2284 +/* Return width of a fixed-width UCS-4 character in planes 0-2 1.2285 + * Accepts: character 1.2286 + * Returns: width (0, 1, 2) or negative error condition if not valid 1.2287 + */ 1.2288 + 1.2289 +long ucs4_width (unsigned long c) 1.2290 +{ 1.2291 + long ret; 1.2292 + /* out of range, not-a-char, or surrogates */ 1.2293 + if ((c > UCS4_MAXUNICODE) || ((c & 0xfffe) == 0xfffe) || 1.2294 + ((c >= UTF16_SURR) && (c <= UTF16_MAXSURR))) ret = U4W_NOTUNCD; 1.2295 + /* private-use */ 1.2296 + else if (c >= UCS4_PVTBASE) ret = U4W_PRIVATE; 1.2297 + /* SSP are not printing characters */ 1.2298 + else if (c >= UCS4_SSPBASE) ret = U4W_SSPCHAR; 1.2299 + /* unassigned planes */ 1.2300 + else if (c >= UCS4_UNABASE) ret = U4W_UNASSGN; 1.2301 + /* SIP and reserved plane 3 are wide */ 1.2302 + else if (c >= UCS4_SIPBASE) ret = 2; 1.2303 +#if (UCS4_WIDLEN != UCS4_SIPBASE) 1.2304 +#error "UCS4_WIDLEN != UCS4_SIPBASE" 1.2305 +#endif 1.2306 + /* C0/C1 controls */ 1.2307 + else if ((c <= UCS2_C0CONTROLEND) || 1.2308 + ((c >= UCS2_C1CONTROL) && (c <= UCS2_C1CONTROLEND))) 1.2309 + ret = U4W_CONTROL; 1.2310 + /* BMP and SMP get value from table */ 1.2311 + else switch (ret = (ucs4_widthtab[(c >> 2)] >> ((3 - (c & 0x3)) << 1)) &0x3){ 1.2312 + case 0: /* zero-width */ 1.2313 + if (c == 0x00ad) ret = 1; /* force U+00ad (SOFT HYPHEN) to width 1 */ 1.2314 + case 1: /* single-width */ 1.2315 + case 2: /* double-width */ 1.2316 + break; 1.2317 + case 3: /* ambiguous width */ 1.2318 + ret = (c >= 0x2100) ? 2 : 1;/* need to do something better than this */ 1.2319 + break; 1.2320 + } 1.2321 + return ret; 1.2322 +} 1.2323 + 1.2324 +/* Return screen width of UTF-8 string 1.2325 + * Accepts: string 1.2326 + * Returns: width or negative if not valid UTF-8 1.2327 + */ 1.2328 + 1.2329 +long utf8_strwidth (unsigned char *s) 1.2330 +{ 1.2331 + unsigned long c,i,ret; 1.2332 + /* go through string */ 1.2333 + for (ret = 0; *s; ret += ucs4_width (c)) { 1.2334 + /* It's alright to give a fake value for the byte count to utf8_get() 1.2335 + * since the null of a null-terminated string will stop processing anyway. 1.2336 + */ 1.2337 + i = 6; /* fake value */ 1.2338 + if ((c = utf8_get (&s,&i)) & U8G_ERROR) return -1; 1.2339 + } 1.2340 + return ret; 1.2341 +} 1.2342 + 1.2343 + 1.2344 +/* Return screen width of UTF-8 text 1.2345 + * Accepts: SIZEDTEXT to string 1.2346 + * Returns: width or negative if not valid UTF-8 1.2347 + */ 1.2348 + 1.2349 +long utf8_textwidth (SIZEDTEXT *utf8) 1.2350 +{ 1.2351 + unsigned long c; 1.2352 + unsigned char *s = utf8->data; 1.2353 + unsigned long i = utf8->size; 1.2354 + unsigned long ret = 0; 1.2355 + while (i) { /* while there's a string to process */ 1.2356 + if ((c = utf8_get (&s,&i)) & U8G_ERROR) return -1; 1.2357 + ret += ucs4_width (c); 1.2358 + } 1.2359 + return ret; 1.2360 +} 1.2361 + 1.2362 +/* Decomposition (phew!) */ 1.2363 + 1.2364 +#define MORESINGLE 1 /* single UCS-4 tail value */ 1.2365 +#define MOREMULTIPLE 2 /* multiple UCS-2 tail values */ 1.2366 + 1.2367 +struct decomposemore { 1.2368 + short type; /* type of more */ 1.2369 + union { 1.2370 + unsigned long single; /* single decomposed value */ 1.2371 + struct { /* multiple BMP values */ 1.2372 + unsigned short *next; 1.2373 + unsigned long count; 1.2374 + } multiple; 1.2375 + } data; 1.2376 +}; 1.2377 + 1.2378 +#define RECURSIVEMORE struct recursivemore 1.2379 + 1.2380 +RECURSIVEMORE { 1.2381 + struct decomposemore *more; 1.2382 + RECURSIVEMORE *next; 1.2383 +}; 1.2384 + 1.2385 + 1.2386 +/* Return decomposition of a UCS-4 character 1.2387 + * Accepts: character or U8G_ERROR to return next from "more" 1.2388 + * pointer to returned more 1.2389 + * Returns: [next] decomposed value, more set if still more decomposition 1.2390 + */ 1.2391 + 1.2392 +unsigned long ucs4_decompose (unsigned long c,void **more) 1.2393 +{ 1.2394 + unsigned long i,ix,ret; 1.2395 + struct decomposemore *m; 1.2396 + if (c & U8G_ERROR) { /* want to chase more? */ 1.2397 + /* do sanity check */ 1.2398 + if (m = (struct decomposemore *) *more) switch (m->type) { 1.2399 + case MORESINGLE: /* single value */ 1.2400 + ret = m->data.single; 1.2401 + fs_give (more); /* no more decomposition */ 1.2402 + break; 1.2403 + case MOREMULTIPLE: /* multiple value */ 1.2404 + ret = *m->data.multiple.next++; 1.2405 + if (!--m->data.multiple.count) fs_give (more); 1.2406 + break; 1.2407 + default: /* uh-oh */ 1.2408 + fatal ("invalid more block argument to ucs4_decompose!"); 1.2409 + } 1.2410 + else fatal ("no more block provided to ucs4_decompose!"); 1.2411 + } 1.2412 + 1.2413 + else { /* start decomposition */ 1.2414 + *more = NIL; /* initially set no more */ 1.2415 + /* BMP low decompositions */ 1.2416 + if (c < UCS4_BMPLOMIN) ret = c; 1.2417 + /* fix this someday */ 1.2418 + else if (c == UCS4_BMPLOMIN) ret = ucs4_dbmplotab[0]; 1.2419 + else if (c <= UCS4_BMPLOMAX) { 1.2420 + /* within range - have a decomposition? */ 1.2421 + if (i = ucs4_dbmploixtab[c - UCS4_BMPLOMIN]) { 1.2422 + /* get first value of decomposition */ 1.2423 + ret = ucs4_dbmplotab[ix = i & UCS4_BMPLOIXMASK]; 1.2424 + /* has continuation? */ 1.2425 + if (i & UCS4_BMPLOSIZEMASK) { 1.2426 + m = (struct decomposemore *) 1.2427 + (*more = memset (fs_get (sizeof (struct decomposemore)),0, 1.2428 + sizeof (struct decomposemore))); 1.2429 + m->type = MOREMULTIPLE; 1.2430 + m->data.multiple.next = &ucs4_dbmplotab[++ix]; 1.2431 + m->data.multiple.count = i >> UCS4_BMPLOSIZESHIFT; 1.2432 + } 1.2433 + } 1.2434 + else ret = c; /* in range but doesn't decompose */ 1.2435 + } 1.2436 + /* BMP CJK compatibility */ 1.2437 + else if (c < UCS4_BMPCJKMIN) ret = c; 1.2438 + else if (c <= UCS4_BMPCJKMAX) { 1.2439 + if (!(ret = ucs4_bmpcjk1decomptab[c - UCS4_BMPCJKMIN])) ret = c; 1.2440 + } 1.2441 + /* BMP CJK compatibility - some not in BMP */ 1.2442 +#if UCS4_BMPCJK2MIN - (UCS4_BMPCJKMAX + 1) 1.2443 + else if (c < UCS4_BMPCJK2MIN) ret = c; 1.2444 +#endif 1.2445 + else if (c <= UCS4_BMPCJK2MAX) 1.2446 + ret = ucs4_bmpcjk2decomptab[c - UCS4_BMPCJK2MIN]; 1.2447 + /* BMP high decompositions */ 1.2448 + else if (c < UCS4_BMPHIMIN) ret = c; 1.2449 + else if (c <= UCS4_BMPHIMAX) { 1.2450 + /* within range - have a decomposition? */ 1.2451 + if (i = ucs4_dbmphiixtab[c - UCS4_BMPHIMIN]) { 1.2452 + /* get first value of decomposition */ 1.2453 + ret = ucs4_dbmphitab[ix = i & UCS4_BMPHIIXMASK]; 1.2454 + /* has continuation? */ 1.2455 + if (i & UCS4_BMPHISIZEMASK) { 1.2456 + m = (struct decomposemore *) 1.2457 + (*more = memset (fs_get (sizeof (struct decomposemore)),0, 1.2458 + sizeof (struct decomposemore))); 1.2459 + m->type = MOREMULTIPLE; 1.2460 + m->data.multiple.next = &ucs4_dbmphitab[++ix]; 1.2461 + m->data.multiple.count = i >> UCS4_BMPHISIZESHIFT; 1.2462 + } 1.2463 + } 1.2464 + else ret = c; /* in range but doesn't decompose */ 1.2465 + } 1.2466 + 1.2467 + /* BMP half and full width forms */ 1.2468 + else if (c < UCS4_BMPHALFFULLMIN) ret = c; 1.2469 + else if (c <= UCS4_BMPHALFFULLMAX) { 1.2470 + if (!(ret = ucs4_bmphalffulldecomptab[c - UCS4_BMPHALFFULLMIN])) ret = c; 1.2471 + } 1.2472 + /* SMP music */ 1.2473 + else if (c < UCS4_SMPMUSIC1MIN) ret = c; 1.2474 + else if (c <= UCS4_SMPMUSIC1MAX) { 1.2475 + ret = ucs4_smpmusic1decomptab[c -= UCS4_SMPMUSIC1MIN][0]; 1.2476 + m = (struct decomposemore *) 1.2477 + (*more = memset (fs_get (sizeof (struct decomposemore)),0, 1.2478 + sizeof (struct decomposemore))); 1.2479 + m->type = MORESINGLE; 1.2480 + m->data.single = ucs4_smpmusic1decomptab[c][1]; 1.2481 + } 1.2482 + else if (c < UCS4_SMPMUSIC2MIN) ret = c; 1.2483 + else if (c <= UCS4_SMPMUSIC2MAX) { 1.2484 + ret = ucs4_smpmusic2decomptab[c -= UCS4_SMPMUSIC2MIN][0]; 1.2485 + m = (struct decomposemore *) 1.2486 + (*more = memset (fs_get (sizeof (struct decomposemore)),0, 1.2487 + sizeof (struct decomposemore))); 1.2488 + m->type = MORESINGLE; 1.2489 + m->data.single = ucs4_smpmusic2decomptab[c][1]; 1.2490 + } 1.2491 + /* SMP mathematical forms */ 1.2492 + else if (c < UCS4_SMPMATHMIN) ret = c; 1.2493 + else if (c <= UCS4_SMPMATHMAX) { 1.2494 + if (!(ret = ucs4_smpmathdecomptab[c - UCS4_SMPMATHMIN])) ret = c; 1.2495 + } 1.2496 + /* CJK compatibility ideographs in SIP */ 1.2497 + else if (!(ret = ((c >= UCS4_SIPMIN) && (c <= UCS4_SIPMAX)) ? 1.2498 + ucs4_sipdecomptab[c - UCS4_SIPMIN] : c)) ret = c; 1.2499 + } 1.2500 + return ret; 1.2501 +} 1.2502 + 1.2503 +/* Return recursive decomposition of a UCS-4 character 1.2504 + * Accepts: character or U8G_ERROR to return next from "more" 1.2505 + * pointer to returned more 1.2506 + * Returns: [next] decomposed value, more set if still more decomposition 1.2507 + */ 1.2508 + 1.2509 +unsigned long ucs4_decompose_recursive (unsigned long c,void **more) 1.2510 +{ 1.2511 + unsigned long c1; 1.2512 + void *m,*mn; 1.2513 + RECURSIVEMORE *mr; 1.2514 + if (c & U8G_ERROR) { /* want to chase more? */ 1.2515 + mn = NIL; 1.2516 + if (mr = (RECURSIVEMORE *) *more) switch (mr->more->type) { 1.2517 + case MORESINGLE: /* decompose single value */ 1.2518 + c = ucs4_decompose_recursive (mr->more->data.single,&mn); 1.2519 + *more = mr->next; /* done with this more, remove it */ 1.2520 + fs_give ((void **) &mr->more); 1.2521 + fs_give ((void **) &mr); 1.2522 + break; 1.2523 + case MOREMULTIPLE: /* decompose current value in multiple */ 1.2524 + c = ucs4_decompose_recursive (*mr->more->data.multiple.next++,&mn); 1.2525 + /* if done with this multiple decomposition */ 1.2526 + if (!--mr->more->data.multiple.count) { 1.2527 + *more = mr->next; /* done with this more, remove it */ 1.2528 + fs_give ((void **) &mr->more); 1.2529 + fs_give ((void **) &mr); 1.2530 + } 1.2531 + break; 1.2532 + default: /* uh-oh */ 1.2533 + fatal ("invalid more block argument to ucs4_decompose_recursive!"); 1.2534 + } 1.2535 + else fatal ("no more block provided to ucs4_decompose_recursive!"); 1.2536 + if (mr = mn) { /* did this value recurse on us? */ 1.2537 + mr->next = *more; /* yes, insert new more at head */ 1.2538 + *more = mr; 1.2539 + } 1.2540 + } 1.2541 + else { /* start decomposition */ 1.2542 + *more = NIL; /* initially set no more */ 1.2543 + mr = NIL; 1.2544 + do { /* repeatedly decompose this codepoint */ 1.2545 + c = ucs4_decompose (c1 = c,&m); 1.2546 + if (m) { /* multi-byte decomposition */ 1.2547 + if (c1 == c) fatal ("endless multiple decomposition!"); 1.2548 + /* create a block to stash this more */ 1.2549 + mr = memset (fs_get (sizeof (RECURSIVEMORE)),0,sizeof (RECURSIVEMORE)); 1.2550 + mr->more = m; /* note the expansion */ 1.2551 + mr->next = *more; /* old list is the tail */ 1.2552 + *more = mr; /* and this is the new head */ 1.2553 + } 1.2554 + } while (c1 != c); /* until nothing more to decompose */ 1.2555 + } 1.2556 + return c; 1.2557 +}