imapext-2007

annotate src/c-client/utf8.c @ 0:ada5e610ab86

imap-2007e
author yuuji@gentei.org
date Mon, 14 Sep 2009 15:17:45 +0900
parents
children
rev   line source
yuuji@0 1 /* ========================================================================
yuuji@0 2 * Copyright 1988-2008 University of Washington
yuuji@0 3 *
yuuji@0 4 * Licensed under the Apache License, Version 2.0 (the "License");
yuuji@0 5 * you may not use this file except in compliance with the License.
yuuji@0 6 * You may obtain a copy of the License at
yuuji@0 7 *
yuuji@0 8 * http://www.apache.org/licenses/LICENSE-2.0
yuuji@0 9 *
yuuji@0 10 *
yuuji@0 11 * ========================================================================
yuuji@0 12 */
yuuji@0 13
yuuji@0 14 /*
yuuji@0 15 * Program: UTF-8 routines
yuuji@0 16 *
yuuji@0 17 * Author: Mark Crispin
yuuji@0 18 * Networks and Distributed Computing
yuuji@0 19 * Computing & Communications
yuuji@0 20 * University of Washington
yuuji@0 21 * Administration Building, AG-44
yuuji@0 22 * Seattle, WA 98195
yuuji@0 23 * Internet: MRC@CAC.Washington.EDU
yuuji@0 24 *
yuuji@0 25 * Date: 11 June 1997
yuuji@0 26 * Last Edited: 17 January 2008
yuuji@0 27 */
yuuji@0 28
yuuji@0 29
yuuji@0 30 #include <stdio.h>
yuuji@0 31 #include <ctype.h>
yuuji@0 32 #include "c-client.h"
yuuji@0 33
yuuji@0 34 /* *** IMPORTANT ***
yuuji@0 35 *
yuuji@0 36 * There is a very important difference between "character set" and "charset",
yuuji@0 37 * and the comments in this file reflect these differences. A "character set"
yuuji@0 38 * (also known as "coded character set") is a mapping between codepoints and
yuuji@0 39 * characters. A "charset" is as defined in MIME, and incorporates one or more
yuuji@0 40 * coded character sets in a character encoding scheme. See RFC 2130 for more
yuuji@0 41 * details.
yuuji@0 42 */
yuuji@0 43
yuuji@0 44
yuuji@0 45 /* Character set conversion tables */
yuuji@0 46
yuuji@0 47 #include "iso_8859.c" /* 8-bit single-byte coded graphic */
yuuji@0 48 #include "koi8_r.c" /* Cyrillic - Russia */
yuuji@0 49 #include "koi8_u.c" /* Cyrillic - Ukraine */
yuuji@0 50 #include "tis_620.c" /* Thai */
yuuji@0 51 #include "viscii.c" /* Vietnamese */
yuuji@0 52 #include "windows.c" /* Windows */
yuuji@0 53 #include "ibm.c" /* IBM */
yuuji@0 54 #include "gb_2312.c" /* Chinese (PRC) - simplified */
yuuji@0 55 #include "gb_12345.c" /* Chinese (PRC) - traditional */
yuuji@0 56 #include "jis_0208.c" /* Japanese - basic */
yuuji@0 57 #include "jis_0212.c" /* Japanese - supplementary */
yuuji@0 58 #include "ksc_5601.c" /* Korean */
yuuji@0 59 #include "big5.c" /* Taiwanese (ROC) - industrial standard */
yuuji@0 60 #include "cns11643.c" /* Taiwanese (ROC) - national standard */
yuuji@0 61
yuuji@0 62
yuuji@0 63 #include "widths.c" /* Unicode character widths */
yuuji@0 64 #include "tmap.c" /* Unicode titlecase mapping */
yuuji@0 65 #include "decomtab.c" /* Unicode decompositions */
yuuji@0 66
yuuji@0 67 /* EUC parameters
 *
 * One utf8_eucparam block (or array of blocks) per double-byte character set.
 * Each definition is compiled only when the corresponding xxxTOUNICODE macro
 * is defined; the charset table entries that point at these blocks are
 * guarded by the same macros.
 *
 * Judging by the macro names the initializers are, in order: base ku,
 * base ten, max ku, max ten, conversion table -- TODO confirm against the
 * struct utf8_eucparam declaration, which is not part of this file.
 */
yuuji@0 68
yuuji@0 69 #ifdef GBTOUNICODE /* PRC simplified Chinese */
yuuji@0 70 static const struct utf8_eucparam gb_param = {
yuuji@0 71 BASE_GB2312_KU,BASE_GB2312_TEN,MAX_GB2312_KU,MAX_GB2312_TEN,
yuuji@0 72 (void *) gb2312tab};
yuuji@0 73 #endif
yuuji@0 74
yuuji@0 75
yuuji@0 76 #ifdef GB12345TOUNICODE /* PRC traditional Chinese */
yuuji@0 77 static const struct utf8_eucparam gbt_param = {
yuuji@0 78 BASE_GB12345_KU,BASE_GB12345_TEN,MAX_GB12345_KU,MAX_GB12345_TEN,
yuuji@0 79 (void *) gb12345tab};
yuuji@0 80 #endif
yuuji@0 81
yuuji@0 82
yuuji@0 83 #ifdef BIG5TOUNICODE /* ROC traditional Chinese */
/* Two blocks: both share the same ku values and differ in their ten values.
 * Only the first block carries the table; the second has a NIL table.
 */
yuuji@0 84 static const struct utf8_eucparam big5_param[] = {
yuuji@0 85 {BASE_BIG5_KU,BASE_BIG5_TEN_0,MAX_BIG5_KU,MAX_BIG5_TEN_0,(void *) big5tab},
yuuji@0 86 {BASE_BIG5_KU,BASE_BIG5_TEN_1,MAX_BIG5_KU,MAX_BIG5_TEN_1,NIL}
yuuji@0 87 };
yuuji@0 88 #endif
yuuji@0 89
yuuji@0 90
yuuji@0 91 #ifdef JISTOUNICODE /* Japanese */
/* Three blocks: JIS X 0208, the 8-bit kana range, and either JIS X 0212 or
 * an all-zero placeholder when JIS0212TOUNICODE is not defined.
 */
yuuji@0 92 static const struct utf8_eucparam jis_param[] = {
yuuji@0 93 {BASE_JIS0208_KU,BASE_JIS0208_TEN,MAX_JIS0208_KU,MAX_JIS0208_TEN,
yuuji@0 94 (void *) jis0208tab},
yuuji@0 95 {MIN_KANA_8,0,MAX_KANA_8,0,(void *) KANA_8},
yuuji@0 96 #ifdef JIS0212TOUNICODE /* Japanese extended */
yuuji@0 97 {BASE_JIS0212_KU,BASE_JIS0212_TEN,MAX_JIS0212_KU,MAX_JIS0212_TEN,
yuuji@0 98 (void *) jis0212tab}
yuuji@0 99 #else
yuuji@0 100 {0,0,0,0,NIL}
yuuji@0 101 #endif
yuuji@0 102 };
yuuji@0 103 #endif
yuuji@0 104
yuuji@0 105
yuuji@0 106 #ifdef KSCTOUNICODE /* Korean */
yuuji@0 107 static const struct utf8_eucparam ksc_param = {
yuuji@0 108 BASE_KSC5601_KU,BASE_KSC5601_TEN,MAX_KSC5601_KU,MAX_KSC5601_TEN,
yuuji@0 109 (void *) ksc5601tab};
yuuji@0 110 #endif
yuuji@0 111
yuuji@0 112 /* List of supported charsets
 *
 * Each entry gives, in order: the charset name, its CT_xxx conversion type,
 * CF_xxx flags, a conversion table (or EUC parameter block) pointer or NIL,
 * an SC_xxx script mask, and a final string which appears to be the name of
 * the preferred charset to use instead -- TODO confirm the last field against
 * the CHARSET declaration, which is not part of this file.
 *
 * The table ends with an entry whose name is NIL; utf8_charset() and
 * utf8_badcharset() walk the table until they reach that entry.  Entries for
 * double-byte character sets exist only when the matching xxxTOUNICODE macro
 * is defined.
 */
yuuji@0 113
yuuji@0 114 static const CHARSET utf8_csvalid[] = {
yuuji@0 115 {"US-ASCII",CT_ASCII,CF_PRIMARY | CF_DISPLAY | CF_POSTING,
yuuji@0 116 NIL,NIL,NIL},
yuuji@0 117 {"UTF-8",CT_UTF8,CF_PRIMARY | CF_DISPLAY | CF_POSTING,
yuuji@0 118 NIL,SC_UNICODE,NIL},
yuuji@0 119 {"UTF-7",CT_UTF7,CF_PRIMARY | CF_POSTING | CF_UNSUPRT,
yuuji@0 120 NIL,SC_UNICODE,"UTF-8"},
yuuji@0 121 {"ISO-8859-1",CT_1BYTE0,CF_PRIMARY | CF_DISPLAY | CF_POSTING,
yuuji@0 122 NIL,SC_LATIN_1,NIL},
yuuji@0 123 {"ISO-8859-2",CT_1BYTE,CF_PRIMARY | CF_DISPLAY | CF_POSTING,
yuuji@0 124 (void *) iso8859_2tab,SC_LATIN_2,NIL},
yuuji@0 125 {"ISO-8859-3",CT_1BYTE,CF_PRIMARY | CF_DISPLAY | CF_POSTING,
yuuji@0 126 (void *) iso8859_3tab,SC_LATIN_3,NIL},
yuuji@0 127 {"ISO-8859-4",CT_1BYTE,CF_PRIMARY | CF_DISPLAY | CF_POSTING,
yuuji@0 128 (void *) iso8859_4tab,SC_LATIN_4,NIL},
yuuji@0 129 {"ISO-8859-5",CT_1BYTE,CF_PRIMARY | CF_DISPLAY | CF_POSTING,
yuuji@0 130 (void *) iso8859_5tab,SC_CYRILLIC,"KOI8-R"},
yuuji@0 131 {"ISO-8859-6",CT_1BYTE,CF_PRIMARY | CF_DISPLAY | CF_POSTING,
yuuji@0 132 (void *) iso8859_6tab,SC_ARABIC,NIL},
yuuji@0 133 {"ISO-8859-7",CT_1BYTE,CF_PRIMARY | CF_DISPLAY | CF_POSTING,
yuuji@0 134 (void *) iso8859_7tab,SC_GREEK,NIL},
yuuji@0 135 {"ISO-8859-8",CT_1BYTE,CF_PRIMARY | CF_DISPLAY | CF_POSTING,
yuuji@0 136 (void *) iso8859_8tab,SC_HEBREW,NIL},
yuuji@0 137 {"ISO-8859-9",CT_1BYTE,CF_PRIMARY | CF_DISPLAY | CF_POSTING,
yuuji@0 138 (void *) iso8859_9tab,SC_LATIN_5,NIL},
yuuji@0 139 {"ISO-8859-10",CT_1BYTE,CF_PRIMARY | CF_DISPLAY | CF_POSTING,
yuuji@0 140 (void *) iso8859_10tab,SC_LATIN_6,NIL},
yuuji@0 141 {"ISO-8859-11",CT_1BYTE,CF_PRIMARY | CF_DISPLAY | CF_POSTING,
yuuji@0 142 (void *) iso8859_11tab,SC_THAI,NIL},
yuuji@0 143 #if 0 /* ISO 8859-12 reserved for ISCII(?) */
yuuji@0 144 {"ISO-8859-12",CT_1BYTE,CF_PRIMARY | CF_DISPLAY | CF_POSTING,
yuuji@0 145 (void *) iso8859_12tab,NIL,NIL},
yuuji@0 146 #endif
yuuji@0 147 {"ISO-8859-13",CT_1BYTE,CF_PRIMARY | CF_DISPLAY | CF_POSTING,
yuuji@0 148 (void *) iso8859_13tab,SC_LATIN_7,NIL},
yuuji@0 149 {"ISO-8859-14",CT_1BYTE,CF_PRIMARY | CF_DISPLAY | CF_POSTING,
yuuji@0 150 (void *) iso8859_14tab,SC_LATIN_8,NIL},
yuuji@0 151 {"ISO-8859-15",CT_1BYTE,CF_PRIMARY | CF_DISPLAY | CF_POSTING,
yuuji@0 152 (void *) iso8859_15tab,SC_LATIN_9,NIL},
yuuji@0 153 {"ISO-8859-16",CT_1BYTE,CF_PRIMARY | CF_DISPLAY | CF_POSTING,
yuuji@0 154 (void *) iso8859_16tab,SC_LATIN_10,NIL},
yuuji@0 155 {"KOI8-R",CT_1BYTE,CF_PRIMARY | CF_DISPLAY | CF_POSTING,
yuuji@0 156 (void *) koi8rtab,SC_CYRILLIC,NIL},
yuuji@0 157 {"KOI8-U",CT_1BYTE,CF_PRIMARY | CF_DISPLAY | CF_POSTING,
yuuji@0 158 (void *) koi8utab,SC_CYRILLIC | SC_UKRANIAN,NIL},
yuuji@0 159 {"KOI8-RU",CT_1BYTE,CF_DISPLAY,
yuuji@0 160 (void *) koi8utab,SC_CYRILLIC | SC_UKRANIAN,"KOI8-U"},
yuuji@0 161 {"TIS-620",CT_1BYTE,CF_PRIMARY | CF_DISPLAY | CF_POSTING,
yuuji@0 162 (void *) tis620tab,SC_THAI,"ISO-8859-11"},
yuuji@0 163 {"VISCII",CT_1BYTE8,CF_PRIMARY | CF_DISPLAY | CF_POSTING,
yuuji@0 164 (void *) visciitab,SC_VIETNAMESE,NIL},
yuuji@0 165
yuuji@0 166 #ifdef GBTOUNICODE
yuuji@0 167 {"GBK",CT_DBYTE,CF_PRIMARY | CF_DISPLAY | CF_POSTING,
yuuji@0 168 (void *) &gb_param,SC_CHINESE_SIMPLIFIED,NIL},
yuuji@0 169 {"GB2312",CT_DBYTE,CF_PRIMARY | CF_DISPLAY | CF_POSTING,
yuuji@0 170 (void *) &gb_param,SC_CHINESE_SIMPLIFIED,"GBK"},
yuuji@0 171 {"CN-GB",CT_DBYTE,CF_DISPLAY,
yuuji@0 172 (void *) &gb_param,SC_CHINESE_SIMPLIFIED,"GBK"},
yuuji@0 173 #ifdef CNS1TOUNICODE
yuuji@0 174 {"ISO-2022-CN",CT_2022,CF_PRIMARY | CF_UNSUPRT,
yuuji@0 175 NIL,SC_CHINESE_SIMPLIFIED | SC_CHINESE_TRADITIONAL,
yuuji@0 176 NIL},
yuuji@0 177 #endif
yuuji@0 178 #endif
yuuji@0 179 #ifdef GB12345TOUNICODE
yuuji@0 180 {"CN-GB-12345",CT_DBYTE,CF_PRIMARY | CF_DISPLAY,
yuuji@0 181 (void *) &gbt_param,SC_CHINESE_TRADITIONAL,"BIG5"},
yuuji@0 182 #endif
yuuji@0 183 #ifdef BIG5TOUNICODE
yuuji@0 184 {"BIG5",CT_DBYTE2,CF_PRIMARY | CF_DISPLAY | CF_POSTING,
yuuji@0 185 (void *) big5_param,SC_CHINESE_TRADITIONAL,NIL},
yuuji@0 186 {"CN-BIG5",CT_DBYTE2,CF_DISPLAY,
yuuji@0 187 (void *) big5_param,SC_CHINESE_TRADITIONAL,"BIG5"},
yuuji@0 188 {"BIG-5",CT_DBYTE2,CF_DISPLAY,
yuuji@0 189 (void *) big5_param,SC_CHINESE_TRADITIONAL,"BIG5"},
yuuji@0 190 #endif
yuuji@0 191 #ifdef JISTOUNICODE
yuuji@0 192 {"ISO-2022-JP",CT_2022,CF_PRIMARY | CF_DISPLAY | CF_POSTING,
yuuji@0 193 NIL,SC_JAPANESE,NIL},
yuuji@0 194 {"EUC-JP",CT_EUC,CF_PRIMARY | CF_DISPLAY,
yuuji@0 195 (void *) jis_param,SC_JAPANESE,"ISO-2022-JP"},
yuuji@0 196 {"SHIFT_JIS",CT_SJIS,CF_PRIMARY | CF_DISPLAY,
yuuji@0 197 NIL,SC_JAPANESE,"ISO-2022-JP"},
yuuji@0 198 {"SHIFT-JIS",CT_SJIS,CF_PRIMARY | CF_DISPLAY,
yuuji@0 199 NIL,SC_JAPANESE,"ISO-2022-JP"},
yuuji@0 200 #ifdef JIS0212TOUNICODE
yuuji@0 201 {"ISO-2022-JP-1",CT_2022,CF_UNSUPRT,
yuuji@0 202 NIL,SC_JAPANESE,"ISO-2022-JP"},
yuuji@0 203 #ifdef GBTOUNICODE
yuuji@0 204 #ifdef KSCTOUNICODE
yuuji@0 205 {"ISO-2022-JP-2",CT_2022,CF_UNSUPRT,
yuuji@0 206 NIL,
yuuji@0 207 SC_LATIN_1 | SC_LATIN_2 | SC_LATIN_3 | SC_LATIN_4 | SC_LATIN_5 |
yuuji@0 208 SC_LATIN_6 | SC_LATIN_7 | SC_LATIN_8 | SC_LATIN_9 | SC_LATIN_10 |
yuuji@0 209 SC_ARABIC | SC_CYRILLIC | SC_GREEK | SC_HEBREW | SC_THAI |
yuuji@0 210 SC_VIETNAMESE | SC_CHINESE_SIMPLIFIED | SC_JAPANESE | SC_KOREAN
yuuji@0 211 #ifdef CNS1TOUNICODE
yuuji@0 212 | SC_CHINESE_TRADITIONAL
yuuji@0 213 #endif
yuuji@0 214 ,"UTF-8"},
yuuji@0 215 #endif
yuuji@0 216 #endif
yuuji@0 217 #endif
yuuji@0 218 #endif
yuuji@0 219
yuuji@0 220 #ifdef KSCTOUNICODE
yuuji@0 221 {"ISO-2022-KR",CT_2022,CF_PRIMARY | CF_DISPLAY | CF_UNSUPRT,
yuuji@0 222 NIL,SC_KOREAN,"EUC-KR"},
yuuji@0 223 {"EUC-KR",CT_DBYTE,CF_PRIMARY | CF_DISPLAY | CF_POSTING,
yuuji@0 224 (void *) &ksc_param,SC_KOREAN,NIL},
yuuji@0 225 {"KSC5601",CT_DBYTE,CF_PRIMARY | CF_DISPLAY,
yuuji@0 226 (void *) &ksc_param,SC_KOREAN,"EUC-KR"},
yuuji@0 227 {"KSC_5601",CT_DBYTE,CF_PRIMARY | CF_DISPLAY,
yuuji@0 228 (void *) &ksc_param,SC_KOREAN,"EUC-KR"},
yuuji@0 229 {"KS_C_5601-1987",CT_DBYTE,CF_DISPLAY,
yuuji@0 230 (void *) &ksc_param,SC_KOREAN,"EUC-KR"},
yuuji@0 231 {"KS_C_5601-1989",CT_DBYTE,CF_DISPLAY,
yuuji@0 232 (void *) &ksc_param,SC_KOREAN,"EUC-KR"},
yuuji@0 233 {"KS_C_5601-1992",CT_DBYTE,CF_DISPLAY,
yuuji@0 234 (void *) &ksc_param,SC_KOREAN,"EUC-KR"},
yuuji@0 235 {"KS_C_5601-1997",CT_DBYTE,CF_DISPLAY,
yuuji@0 236 (void *) &ksc_param,SC_KOREAN,"EUC-KR"},
yuuji@0 237 #endif
yuuji@0 238
yuuji@0 239 /* deep sigh: Windows code page names and their CPnnn aliases */
yuuji@0 240 {"WINDOWS-874",CT_1BYTE,CF_PRIMARY | CF_DISPLAY,
yuuji@0 241 (void *) windows_874tab,SC_THAI,"ISO-8859-11"},
yuuji@0 242 {"CP874",CT_1BYTE,CF_DISPLAY,
yuuji@0 243 (void *) windows_874tab,SC_THAI,"ISO-8859-11"},
yuuji@0 244 #ifdef GBTOUNICODE
yuuji@0 245 {"WINDOWS-936",CT_DBYTE,CF_PRIMARY | CF_DISPLAY,
yuuji@0 246 (void *) &gb_param,SC_CHINESE_SIMPLIFIED,"GBK"},
yuuji@0 247 {"CP936",CT_DBYTE,CF_DISPLAY,
yuuji@0 248 (void *) &gb_param,SC_CHINESE_SIMPLIFIED,"GBK"},
yuuji@0 249 #endif
yuuji@0 250 #ifdef KSCTOUNICODE
yuuji@0 251 {"WINDOWS-949",CT_DBYTE,CF_PRIMARY | CF_DISPLAY,
yuuji@0 252 (void *) &ksc_param,SC_KOREAN,"EUC-KR"},
yuuji@0 253 {"CP949",CT_DBYTE,CF_DISPLAY,
yuuji@0 254 (void *) &ksc_param,SC_KOREAN,"EUC-KR"},
yuuji@0 255 {"X-WINDOWS-949",CT_DBYTE,CF_PRIMARY | CF_DISPLAY,
yuuji@0 256 (void *) &ksc_param,SC_KOREAN,"EUC-KR"},
yuuji@0 257 #endif
yuuji@0 258 {"WINDOWS-1250",CT_1BYTE,CF_PRIMARY | CF_DISPLAY,
yuuji@0 259 (void *) windows_1250tab,SC_LATIN_2,"ISO-8859-2"},
yuuji@0 260 {"CP1250",CT_1BYTE,CF_DISPLAY,
yuuji@0 261 (void *) windows_1250tab,SC_LATIN_2,"ISO-8859-2"},
yuuji@0 262 {"WINDOWS-1251",CT_1BYTE,CF_PRIMARY | CF_DISPLAY | CF_POSTING,
yuuji@0 263 (void *) windows_1251tab,SC_CYRILLIC,"KOI8-R"},
yuuji@0 264 {"CP1251",CT_1BYTE,CF_DISPLAY,
yuuji@0 265 (void *) windows_1251tab,SC_CYRILLIC,"KOI8-R"},
yuuji@0 266 {"WINDOWS-1252",CT_1BYTE,CF_PRIMARY | CF_DISPLAY,
yuuji@0 267 (void *) windows_1252tab,SC_LATIN_1,"ISO-8859-1"},
yuuji@0 268 {"CP1252",CT_1BYTE,CF_DISPLAY,
yuuji@0 269 (void *) windows_1252tab,SC_LATIN_1,"ISO-8859-1"},
yuuji@0 270 {"WINDOWS-1253",CT_1BYTE,CF_PRIMARY | CF_DISPLAY,
yuuji@0 271 (void *) windows_1253tab,SC_GREEK,"ISO-8859-7"},
yuuji@0 272 {"CP1253",CT_1BYTE,CF_DISPLAY,
yuuji@0 273 (void *) windows_1253tab,SC_GREEK,"ISO-8859-7"},
yuuji@0 274 {"WINDOWS-1254",CT_1BYTE,CF_PRIMARY | CF_DISPLAY,
yuuji@0 275 (void *) windows_1254tab,SC_LATIN_5,"ISO-8859-9"},
yuuji@0 276 {"CP1254",CT_1BYTE,CF_DISPLAY,
yuuji@0 277 (void *) windows_1254tab,SC_LATIN_5,"ISO-8859-9"},
yuuji@0 278 {"WINDOWS-1255",CT_1BYTE,CF_PRIMARY | CF_DISPLAY,
yuuji@0 279 (void *) windows_1255tab,SC_HEBREW,"ISO-8859-8"},
yuuji@0 280 {"CP1255",CT_1BYTE,CF_DISPLAY,
yuuji@0 281 (void *) windows_1255tab,SC_HEBREW,"ISO-8859-8"},
yuuji@0 282 {"WINDOWS-1256",CT_1BYTE,CF_PRIMARY | CF_DISPLAY,
yuuji@0 283 (void *) windows_1256tab,SC_ARABIC,"ISO-8859-6"},
yuuji@0 284 {"CP1256",CT_1BYTE,CF_DISPLAY,
yuuji@0 285 (void *) windows_1256tab,SC_ARABIC,"ISO-8859-6"},
yuuji@0 286 {"WINDOWS-1257",CT_1BYTE,CF_PRIMARY | CF_DISPLAY,
yuuji@0 287 (void *) windows_1257tab,SC_LATIN_7,"ISO-8859-13"},
yuuji@0 288 {"CP1257",CT_1BYTE,CF_DISPLAY,
yuuji@0 289 (void *) windows_1257tab,SC_LATIN_7,"ISO-8859-13"},
yuuji@0 290 {"WINDOWS-1258",CT_1BYTE,CF_PRIMARY | CF_DISPLAY,
yuuji@0 291 (void *) windows_1258tab,SC_VIETNAMESE,"VISCII"},
yuuji@0 292 {"CP1258",CT_1BYTE,CF_DISPLAY,
yuuji@0 293 (void *) windows_1258tab,SC_VIETNAMESE,"VISCII"},
yuuji@0 294
yuuji@0 295 /* deeper sigh: IBM code page names */
yuuji@0 296 {"IBM367",CT_ASCII,CF_PRIMARY | CF_DISPLAY,
yuuji@0 297 NIL,NIL,"US-ASCII"},
yuuji@0 298 {"IBM437",CT_1BYTE,CF_PRIMARY | CF_DISPLAY,
yuuji@0 299 (void *) ibm_437tab,SC_LATIN_1,"ISO-8859-1"},
yuuji@0 300 {"IBM737",CT_1BYTE,CF_PRIMARY | CF_DISPLAY,
yuuji@0 301 (void *) ibm_737tab,SC_GREEK,"ISO-8859-7"},
yuuji@0 302 {"IBM775",CT_1BYTE,CF_PRIMARY | CF_DISPLAY,
yuuji@0 303 (void *) ibm_775tab,SC_LATIN_7,"ISO-8859-13"},
yuuji@0 304 {"IBM850",CT_1BYTE,CF_PRIMARY | CF_DISPLAY,
yuuji@0 305 (void *) ibm_850tab,SC_LATIN_1,"ISO-8859-1"},
yuuji@0 306 {"IBM852",CT_1BYTE,CF_PRIMARY | CF_DISPLAY,
yuuji@0 307 (void *) ibm_852tab,SC_LATIN_2,"ISO-8859-2"},
yuuji@0 308 {"IBM855",CT_1BYTE,CF_PRIMARY | CF_DISPLAY,
yuuji@0 309 (void *) ibm_855tab,SC_CYRILLIC,"ISO-8859-5"},
yuuji@0 310 {"IBM857",CT_1BYTE,CF_PRIMARY | CF_DISPLAY,
yuuji@0 311 (void *) ibm_857tab,SC_LATIN_5,"ISO-8859-9"},
yuuji@0 312 {"IBM860",CT_1BYTE,CF_PRIMARY | CF_DISPLAY,
yuuji@0 313 (void *) ibm_860tab,SC_LATIN_1,"ISO-8859-1"},
yuuji@0 314 {"IBM861",CT_1BYTE,CF_PRIMARY | CF_DISPLAY,
yuuji@0 315 (void *) ibm_861tab,SC_LATIN_6,"ISO-8859-10"},
yuuji@0 316 {"IBM862",CT_1BYTE,CF_PRIMARY | CF_DISPLAY,
yuuji@0 317 (void *) ibm_862tab,SC_HEBREW,"ISO-8859-8"},
yuuji@0 318 {"IBM863",CT_1BYTE,CF_PRIMARY | CF_DISPLAY,
yuuji@0 319 (void *) ibm_863tab,SC_LATIN_1,"ISO-8859-1"},
yuuji@0 320 {"IBM864",CT_1BYTE,CF_PRIMARY | CF_DISPLAY,
yuuji@0 321 (void *) ibm_864tab,SC_ARABIC,"ISO-8859-6"},
yuuji@0 322 {"IBM865",CT_1BYTE,CF_PRIMARY | CF_DISPLAY,
yuuji@0 323 (void *) ibm_865tab,SC_LATIN_6,"ISO-8859-10"},
yuuji@0 324 {"IBM866",CT_1BYTE,CF_PRIMARY | CF_DISPLAY,
yuuji@0 325 (void *) ibm_866tab,SC_CYRILLIC,"KOI8-R"},
yuuji@0 326 {"IBM869",CT_1BYTE,CF_PRIMARY | CF_DISPLAY,
yuuji@0 327 (void *) ibm_869tab,SC_GREEK,"ISO-8859-7"},
yuuji@0 328 {"IBM874",CT_1BYTE,CF_PRIMARY | CF_DISPLAY,
yuuji@0 329 (void *) ibm_874tab,SC_THAI,"ISO-8859-11"},
yuuji@0 330 /* deepest sigh: other names for US-ASCII and UTF-7 */
yuuji@0 331 {"ANSI_X3.4-1968",CT_ASCII,CF_DISPLAY,
yuuji@0 332 NIL,NIL,"US-ASCII"},
yuuji@0 333 {"UNICODE-1-1-UTF-7",CT_UTF7,CF_UNSUPRT,
yuuji@0 334 NIL,SC_UNICODE,"UTF-8"},
yuuji@0 335 /* these should never appear in email */
yuuji@0 336 {"UCS-2",CT_UCS2,CF_PRIMARY | CF_DISPLAY | CF_NOEMAIL,
yuuji@0 337 NIL,SC_UNICODE,"UTF-8"},
yuuji@0 338 {"UCS-4",CT_UCS4,CF_PRIMARY | CF_DISPLAY | CF_NOEMAIL,
yuuji@0 339 NIL,SC_UNICODE,"UTF-8"},
yuuji@0 340 {"UTF-16",CT_UTF16,CF_PRIMARY | CF_DISPLAY | CF_NOEMAIL,
yuuji@0 341 NIL,SC_UNICODE,"UTF-8"},
yuuji@0 342 NIL
yuuji@0 343 };
yuuji@0 344
yuuji@0 345 /* Non-Unicode Script table
 *
 * Each entry gives a script name, an optional second string (NIL when
 * absent) which looks like a human-readable description, and the SC_xxx
 * script bit -- TODO confirm the field meanings against the SCRIPT
 * declaration, which is not part of this file.
 *
 * The table ends with an entry whose name is NIL; utf8_script() walks the
 * table until it reaches that entry.
 */
yuuji@0 346
yuuji@0 347 static const SCRIPT utf8_scvalid[] = {
yuuji@0 348 {"Arabic",NIL,SC_ARABIC},
yuuji@0 349 {"Chinese Simplified","China, Singapore",SC_CHINESE_SIMPLIFIED},
yuuji@0 350 {"Chinese Traditional","Taiwan, Hong Kong, Macao",SC_CHINESE_TRADITIONAL},
yuuji@0 351 {"Cyrillic",NIL,SC_CYRILLIC},
yuuji@0 352 {"Cyrillic Ukranian",NIL,SC_UKRANIAN},
yuuji@0 353 {"Greek",NIL,SC_GREEK},
yuuji@0 354 {"Hebrew",NIL,SC_HEBREW},
yuuji@0 355 {"Japanese",NIL,SC_JAPANESE},
yuuji@0 356 {"Korean",NIL,SC_KOREAN},
yuuji@0 357 {"Latin-1","Western Europe",SC_LATIN_1},
yuuji@0 358 {"Latin-2","Eastern Europe",SC_LATIN_2},
yuuji@0 359 {"Latin-3","Southern Europe",SC_LATIN_3},
yuuji@0 360 {"Latin-4","Northern Europe",SC_LATIN_4},
yuuji@0 361 {"Latin-5","Turkish",SC_LATIN_5},
yuuji@0 362 {"Latin-6","Nordic",SC_LATIN_6},
yuuji@0 363 {"Latin-7","Baltic",SC_LATIN_7},
yuuji@0 364 {"Latin-8","Celtic",SC_LATIN_8},
yuuji@0 365 {"Latin-9","Euro",SC_LATIN_9},
yuuji@0 366 {"Latin-10","Balkan",SC_LATIN_10},
yuuji@0 367 {"Thai",NIL,SC_THAI},
yuuji@0 368 {"Vietnamese",NIL,SC_VIETNAMESE},
yuuji@0 369 NIL
yuuji@0 370 };
yuuji@0 371
yuuji@0 372 /* Look up script name or return entire table
yuuji@0 373 * Accepts: script name or NIL
yuuji@0 374 * Returns: pointer to script table entry or NIL if unknown
yuuji@0 375 */
yuuji@0 376
yuuji@0 377 SCRIPT *utf8_script (char *script)
yuuji@0 378 {
yuuji@0 379 unsigned long i;
yuuji@0 380 if (!script) return (SCRIPT *) &utf8_scvalid[0];
yuuji@0 381 else if (*script && (strlen (script) < 128))
yuuji@0 382 for (i = 0; utf8_scvalid[i].name; i++)
yuuji@0 383 if (!compare_cstring (script,utf8_scvalid[i].name))
yuuji@0 384 return (SCRIPT *) &utf8_scvalid[i];
yuuji@0 385 return NIL; /* failed */
yuuji@0 386 }
yuuji@0 387
yuuji@0 388
yuuji@0 389 /* Look up charset name or return entire table
yuuji@0 390 * Accepts: charset name or NIL
yuuji@0 391 * Returns: charset table entry or NIL if unknown
yuuji@0 392 */
yuuji@0 393
yuuji@0 394 const CHARSET *utf8_charset (char *charset)
yuuji@0 395 {
yuuji@0 396 unsigned long i;
yuuji@0 397 if (!charset) return (CHARSET *) &utf8_csvalid[0];
yuuji@0 398 else if (*charset && (strlen (charset) < 128))
yuuji@0 399 for (i = 0; utf8_csvalid[i].name; i++)
yuuji@0 400 if (!compare_cstring (charset,utf8_csvalid[i].name))
yuuji@0 401 return (CHARSET *) &utf8_csvalid[i];
yuuji@0 402 return NIL; /* failed */
yuuji@0 403 }
yuuji@0 404
yuuji@0 405 /* Validate charset and generate error message if invalid
yuuji@0 406 * Accepts: bad character set
yuuji@0 407 * Returns: NIL if good charset, else error message string
yuuji@0 408 */
yuuji@0 409
yuuji@0 410 #define BADCSS "[BADCHARSET ("
yuuji@0 411 #define BADCSE ")] Unknown charset: "
yuuji@0 412
yuuji@0 413 char *utf8_badcharset (char *charset)
yuuji@0 414 {
yuuji@0 415 char *msg = NIL;
yuuji@0 416 if (!utf8_charset (charset)) {
yuuji@0 417 char *s,*t;
yuuji@0 418 unsigned long i,j;
yuuji@0 419 /* calculate size of header, trailer, and bad
yuuji@0 420 * charset plus charset names */
yuuji@0 421 for (i = 0, j = sizeof (BADCSS) + sizeof (BADCSE) + strlen (charset) - 2;
yuuji@0 422 utf8_csvalid[i].name; i++)
yuuji@0 423 j += strlen (utf8_csvalid[i].name) + 1;
yuuji@0 424 /* not built right */
yuuji@0 425 if (!i) fatal ("No valid charsets!");
yuuji@0 426 /* header */
yuuji@0 427 for (s = msg = (char *) fs_get (j), t = BADCSS; *t; *s++ = *t++);
yuuji@0 428 /* each charset */
yuuji@0 429 for (i = 0; utf8_csvalid[i].name; *s++ = ' ', i++)
yuuji@0 430 for (t = utf8_csvalid[i].name; *t; *s++ = *t++);
yuuji@0 431 /* back over last space, trailer */
yuuji@0 432 for (t = BADCSE, --s; *t; *s++ = *t++);
yuuji@0 433 /* finally bogus charset */
yuuji@0 434 for (t = charset; *t; *s++ = *t++);
yuuji@0 435 *s++ = '\0'; /* finally tie off string */
yuuji@0 436 if (s != (msg + j)) fatal ("charset msg botch");
yuuji@0 437 }
yuuji@0 438 return msg;
yuuji@0 439 }
yuuji@0 440
yuuji@0 441 /* Convert charset labelled sized text to UTF-8
yuuji@0 442 * Accepts: source sized text
yuuji@0 443 * charset
yuuji@0 444 * pointer to returned sized text if non-NIL
yuuji@0 445 * flags
yuuji@0 446 * Returns: T if successful, NIL if failure
yuuji@0 447 */
yuuji@0 448
yuuji@0 449 long utf8_text (SIZEDTEXT *text,char *charset,SIZEDTEXT *ret,long flags)
yuuji@0 450 {
yuuji@0 451 ucs4cn_t cv = (flags & U8T_CASECANON) ? ucs4_titlecase : NIL;
yuuji@0 452 ucs4de_t de = (flags & U8T_DECOMPOSE) ? ucs4_decompose_recursive : NIL;
yuuji@0 453 const CHARSET *cs = (charset && *charset) ?
yuuji@0 454 utf8_charset (charset) : utf8_infercharset (text);
yuuji@0 455 if (cs) return (text && ret) ? utf8_text_cs (text,cs,ret,cv,de) : LONGT;
yuuji@0 456 if (ret) { /* no conversion possible */
yuuji@0 457 ret->data = text->data; /* so return source */
yuuji@0 458 ret->size = text->size;
yuuji@0 459 }
yuuji@0 460 return NIL; /* failure */
yuuji@0 461 }
yuuji@0 462
yuuji@0 463
yuuji@0 464 /* Operations used in converting data
 *
 * All four macros share one shape:
 *  - if cv is non-NIL, c is replaced by (*cv) (c)
 *  - if de is non-NIL, c is replaced by (*de) (c,&more), where "more" is a
 *    local continuation pointer which de may set
 *  - the current c is then counted or written, and while "more" is non-NIL
 *    de is called again with U8G_ERROR to fetch the next value; the loop
 *    stops when de returns zero or "more" is NIL
 *
 * UTF8_COUNT_BMP and UTF8_COUNT add the encoded size of each value to count
 * (via UTF8_SIZE_BMP and utf8_size() respectively).  UTF8_WRITE_BMP and
 * UTF8_WRITE store each value through b (via UTF8_PUT_BMP and utf8_put()
 * respectively).
 *
 * Caveats: c is assigned to, so it must be a modifiable lvalue; the cv, de
 * and c arguments are expanded more than once; and each macro expands to a
 * bare brace block, not a do { } while (0) statement.
 */
yuuji@0 465
yuuji@0 466 #define UTF8_COUNT_BMP(count,c,cv,de) { \
yuuji@0 467 void *more = NIL; \
yuuji@0 468 if (cv) c = (*cv) (c); \
yuuji@0 469 if (de) c = (*de) (c,&more); \
yuuji@0 470 do count += UTF8_SIZE_BMP(c); \
yuuji@0 471 while (more && (c = (*de) (U8G_ERROR,&more)));\
yuuji@0 472 }
yuuji@0 473
yuuji@0 474 #define UTF8_WRITE_BMP(b,c,cv,de) { \
yuuji@0 475 void *more = NIL; \
yuuji@0 476 if (cv) c = (*cv) (c); \
yuuji@0 477 if (de) c = (*de) (c,&more); \
yuuji@0 478 do UTF8_PUT_BMP (b,c) \
yuuji@0 479 while (more && (c = (*de) (U8G_ERROR,&more)));\
yuuji@0 480 }
yuuji@0 481
yuuji@0 482 #define UTF8_COUNT(count,c,cv,de) { \
yuuji@0 483 void *more = NIL; \
yuuji@0 484 if (cv) c = (*cv) (c); \
yuuji@0 485 if (de) c = (*de) (c,&more); \
yuuji@0 486 do count += utf8_size (c); \
yuuji@0 487 while (more && (c = (*de) (U8G_ERROR,&more)));\
yuuji@0 488 }
yuuji@0 489
yuuji@0 490 #define UTF8_WRITE(b,c,cv,de) { \
yuuji@0 491 void *more = NIL; \
yuuji@0 492 if (cv) c = (*cv) (c); \
yuuji@0 493 if (de) c = (*de) (c,&more); \
yuuji@0 494 do b = utf8_put (b,c); \
yuuji@0 495 while (more && (c = (*de) (U8G_ERROR,&more)));\
yuuji@0 496 }
yuuji@0 497
yuuji@0 498 /* Convert sized text to UTF-8 given CHARSET block
yuuji@0 499 * Accepts: source sized text
yuuji@0 500 * CHARSET block
yuuji@0 501 * pointer to returned sized text
yuuji@0 502 * canonicalization function
yuuji@0 503 * decomposition function
yuuji@0 504 * Returns: T if successful, NIL if failure
yuuji@0 505 */
yuuji@0 506
yuuji@0 507 long utf8_text_cs (SIZEDTEXT *text,const CHARSET *cs,SIZEDTEXT *ret,
yuuji@0 508 ucs4cn_t cv,ucs4de_t de)
yuuji@0 509 {
yuuji@0 510 ret->data = text->data; /* default to source */
yuuji@0 511 ret->size = text->size;
yuuji@0 512 switch (cs->type) { /* convert if type known */
yuuji@0 513 case CT_ASCII: /* 7-bit ASCII no table */
yuuji@0 514 case CT_UTF8: /* variable UTF-8 encoded Unicode no table */
yuuji@0 515 if (cv || de) utf8_text_utf8 (text,ret,cv,de);
yuuji@0 516 break;
yuuji@0 517 case CT_1BYTE0: /* 1 byte no table */
yuuji@0 518 utf8_text_1byte0 (text,ret,cv,de);
yuuji@0 519 break;
yuuji@0 520 case CT_1BYTE: /* 1 byte ASCII + table 0x80-0xff */
yuuji@0 521 utf8_text_1byte (text,ret,cs->tab,cv,de);
yuuji@0 522 break;
yuuji@0 523 case CT_1BYTE8: /* 1 byte table 0x00 - 0xff */
yuuji@0 524 utf8_text_1byte8 (text,ret,cs->tab,cv,de);
yuuji@0 525 break;
yuuji@0 526 case CT_EUC: /* 2 byte ASCII + utf8_eucparam base/CS2/CS3 */
yuuji@0 527 utf8_text_euc (text,ret,cs->tab,cv,de);
yuuji@0 528 break;
yuuji@0 529 case CT_DBYTE: /* 2 byte ASCII + utf8_eucparam */
yuuji@0 530 utf8_text_dbyte (text,ret,cs->tab,cv,de);
yuuji@0 531 break;
yuuji@0 532 case CT_DBYTE2: /* 2 byte ASCII + utf8_eucparam plane1/2 */
yuuji@0 533 utf8_text_dbyte2 (text,ret,cs->tab,cv,de);
yuuji@0 534 break;
yuuji@0 535 case CT_UTF7: /* variable UTF-7 encoded Unicode no table */
yuuji@0 536 utf8_text_utf7 (text,ret,cv,de);
yuuji@0 537 break;
yuuji@0 538 case CT_UCS2: /* 2 byte 16-bit Unicode no table */
yuuji@0 539 utf8_text_ucs2 (text,ret,cv,de);
yuuji@0 540 break;
yuuji@0 541 case CT_UCS4: /* 4 byte 32-bit Unicode no table */
yuuji@0 542 utf8_text_ucs4 (text,ret,cv,de);
yuuji@0 543 break;
yuuji@0 544 case CT_UTF16: /* variable UTF-16 encoded Unicode no table */
yuuji@0 545 utf8_text_utf16 (text,ret,cv,de);
yuuji@0 546 break;
yuuji@0 547 case CT_2022: /* variable ISO-2022 encoded no table*/
yuuji@0 548 utf8_text_2022 (text,ret,cv,de);
yuuji@0 549 break;
yuuji@0 550 case CT_SJIS: /* 2 byte Shift-JIS encoded JIS no table */
yuuji@0 551 utf8_text_sjis (text,ret,cv,de);
yuuji@0 552 break;
yuuji@0 553 default: /* unknown character set type */
yuuji@0 554 return NIL;
yuuji@0 555 }
yuuji@0 556 return LONGT; /* return success */
yuuji@0 557 }
yuuji@0 558
yuuji@0 559 /* Reverse mapping routines
yuuji@0 560 *
yuuji@0 561 * These routines only support character sets, not all possible charsets. In
yuuji@0 562 * particular, they do not support any Unicode encodings or ISO 2022.
yuuji@0 563 *
yuuji@0 564 * As a special dispensation, utf8_cstext() and utf8_cstocstext() support
yuuji@0 565 * ISO-2022-JP if EUC-JP can be reverse mapped; and utf8_rmaptext()
yuuji@0 566 * will generate ISO-2022-JP using an EUC-JP rmap if flagged to do so.
yuuji@0 567 *
yuuji@0 568 * No attempt is made to map "equivalent" Unicode characters or Unicode
yuuji@0 569 * characters that have the same glyph; nor is there any attempt to handle
yuuji@0 570 * combining characters or otherwise do any stringprep. Maybe later.
yuuji@0 571 */
yuuji@0 572
yuuji@0 573
yuuji@0 574 /* Convert UTF-8 sized text to charset
yuuji@0 575 * Accepts: source sized text
yuuji@0 576 * destination charset
yuuji@0 577 * pointer to returned sized text
yuuji@0 578 * substitute character if not in cs, else NIL to return failure
yuuji@0 579 * Returns: T if successful, NIL if failure
yuuji@0 580 */
yuuji@0 581
yuuji@0 582
yuuji@0 583 long utf8_cstext (SIZEDTEXT *text,char *charset,SIZEDTEXT *ret,
yuuji@0 584 unsigned long errch)
yuuji@0 585 {
yuuji@0 586 short iso2022jp = !compare_cstring (charset,"ISO-2022-JP");
yuuji@0 587 unsigned short *rmap = utf8_rmap (iso2022jp ? "EUC-JP" : charset);
yuuji@0 588 return rmap ? utf8_rmaptext (text,rmap,ret,errch,iso2022jp) : NIL;
yuuji@0 589 }
yuuji@0 590
yuuji@0 591 /* Convert charset labelled sized text to another charset
yuuji@0 592 * Accepts: source sized text
yuuji@0 593 * source charset
yuuji@0 594 * pointer to returned sized text
yuuji@0 595 * destination charset
yuuji@0 596 * substitute character if not in dest cs, else NIL to return failure
yuuji@0 597 * Returns: T if successful, NIL if failure
yuuji@0 598 *
yuuji@0 599 * This routine has the same restricts as utf8_cstext().
yuuji@0 600 */
yuuji@0 601
yuuji@0 602 long utf8_cstocstext (SIZEDTEXT *src,char *sc,SIZEDTEXT *dst,char *dc,
yuuji@0 603 unsigned long errch)
yuuji@0 604 {
yuuji@0 605 SIZEDTEXT utf8;
yuuji@0 606 const CHARSET *scs,*dcs;
yuuji@0 607 unsigned short *rmap;
yuuji@0 608 long ret = NIL;
yuuji@0 609 long iso2022jp;
yuuji@0 610 /* lookup charsets and reverse map */
yuuji@0 611 if ((dc && (dcs = utf8_charset (dc))) &&
yuuji@0 612 (rmap = (iso2022jp = ((dcs->type == CT_2022) &&
yuuji@0 613 !compare_cstring (dcs->name,"ISO-2022-JP"))) ?
yuuji@0 614 utf8_rmap ("EUC-JP") : utf8_rmap_cs (dcs)) &&
yuuji@0 615 (scs = (sc && *sc) ? utf8_charset (sc) : utf8_infercharset (src))) {
yuuji@0 616 /* init temporary buffer */
yuuji@0 617 memset (&utf8,NIL,sizeof (SIZEDTEXT));
yuuji@0 618 /* source cs equivalent to dest cs? */
yuuji@0 619 if ((scs->type == dcs->type) && (scs->tab == dcs->tab)) {
yuuji@0 620 dst->data = src->data; /* yes, just copy pointers */
yuuji@0 621 dst->size = src->size;
yuuji@0 622 ret = LONGT;
yuuji@0 623 }
yuuji@0 624 /* otherwise do the conversion */
yuuji@0 625 else ret = (utf8_text_cs (src,scs,&utf8,NIL,NIL) &&
yuuji@0 626 utf8_rmaptext (&utf8,rmap,dst,errch,iso2022jp));
yuuji@0 627 /* flush temporary buffer */
yuuji@0 628 if (utf8.data && (utf8.data != src->data) && (utf8.data != dst->data))
yuuji@0 629 fs_give ((void **) &utf8.data);
yuuji@0 630 }
yuuji@0 631 return ret;
yuuji@0 632 }
yuuji@0 633
yuuji@0 634 /* Cached rmap
 *
 * A single-entry cache: currmapcs is the charset table entry that the
 * reverse map in currmap was last generated for.  Both are NIL until
 * utf8_rmap_cs() successfully generates a map.
 */
yuuji@0 635
yuuji@0 636 static const CHARSET *currmapcs = NIL;
yuuji@0 637 static unsigned short *currmap = NIL;
yuuji@0 638
yuuji@0 639
yuuji@0 640 /* Cache and return map for UTF-8 -> character set
yuuji@0 641 * Accepts: character set name
yuuji@0 642 * Returns: cached map if character set found, else NIL
yuuji@0 643 */
yuuji@0 644
yuuji@0 645 unsigned short *utf8_rmap (char *charset)
yuuji@0 646 {
yuuji@0 647 return (currmapcs && !compare_cstring (charset,currmapcs->name)) ? currmap :
yuuji@0 648 utf8_rmap_cs (utf8_charset (charset));
yuuji@0 649 }
yuuji@0 650
yuuji@0 651
yuuji@0 652 /* Cache and return map for UTF-8 -> character set given CHARSET block
yuuji@0 653 * Accepts: CHARSET block
yuuji@0 654 * Returns: cached map if character set found, else NIL
yuuji@0 655 */
yuuji@0 656
yuuji@0 657 unsigned short *utf8_rmap_cs (const CHARSET *cs)
yuuji@0 658 {
yuuji@0 659 unsigned short *ret = NIL;
yuuji@0 660 if (!cs); /* have charset? */
yuuji@0 661 else if (cs == currmapcs) ret = currmap;
yuuji@0 662 else if (ret = utf8_rmap_gen (cs,currmap)) {
yuuji@0 663 currmapcs = cs;
yuuji@0 664 currmap = ret;
yuuji@0 665 }
yuuji@0 666 return ret;
yuuji@0 667 }
yuuji@0 668
yuuji@0 669 /* Return map for UTF-8 -> character set given CHARSET block
yuuji@0 670 * Accepts: CHARSET block
yuuji@0 671 * old map to recycle
yuuji@0 672 * Returns: map if character set found, else NIL
yuuji@0 673 */
yuuji@0 674
yuuji@0 675 unsigned short *utf8_rmap_gen (const CHARSET *cs,unsigned short *oldmap)
yuuji@0 676 {
yuuji@0 677 unsigned short u,*tab,*rmap;
yuuji@0 678 unsigned int i,m,ku,ten;
yuuji@0 679 struct utf8_eucparam *param,*p2;
yuuji@0 680 switch (cs->type) { /* is a character set? */
yuuji@0 681 case CT_ASCII: /* 7-bit ASCII no table */
yuuji@0 682 case CT_1BYTE0: /* 1 byte no table */
yuuji@0 683 case CT_1BYTE: /* 1 byte ASCII + table 0x80-0xff */
yuuji@0 684 case CT_1BYTE8: /* 1 byte table 0x00 - 0xff */
yuuji@0 685 case CT_EUC: /* 2 byte ASCII + utf8_eucparam base/CS2/CS3 */
yuuji@0 686 case CT_DBYTE: /* 2 byte ASCII + utf8_eucparam */
yuuji@0 687 case CT_DBYTE2: /* 2 byte ASCII + utf8_eucparam plane1/2 */
yuuji@0 688 case CT_SJIS: /* 2 byte Shift-JIS */
yuuji@0 689 rmap = oldmap ? oldmap : /* recycle old map if supplied else make new */
yuuji@0 690 (unsigned short *) fs_get (65536 * sizeof (unsigned short));
yuuji@0 691 /* initialize table for ASCII */
yuuji@0 692 for (i = 0; i < 128; i++) rmap[i] = (unsigned short) i;
yuuji@0 693 /* populate remainder of table with NOCHAR */
yuuji@0 694 #define NOCHARBYTE (NOCHAR & 0xff)
yuuji@0 695 #if NOCHAR - ((NOCHARBYTE << 8) | NOCHARBYTE)
yuuji@0 696 while (i < 65536) rmap[i++] = NOCHAR;
yuuji@0 697 #else
yuuji@0 698 memset (rmap + 128,NOCHARBYTE,(65536 - 128) * sizeof (unsigned short));
yuuji@0 699 #endif
yuuji@0 700 break;
yuuji@0 701 default: /* unsupported charset type */
yuuji@0 702 rmap = NIL; /* no map possible */
yuuji@0 703 }
yuuji@0 704 if (rmap) { /* have a map? */
yuuji@0 705 switch (cs->type) { /* additional reverse map actions */
yuuji@0 706 case CT_1BYTE0: /* 1 byte no table */
yuuji@0 707 for (i = 128; i < 256; i++) rmap[i] = (unsigned short) i;
yuuji@0 708 break;
yuuji@0 709 case CT_1BYTE: /* 1 byte ASCII + table 0x80-0xff */
yuuji@0 710 for (tab = (unsigned short *) cs->tab,i = 128; i < 256; i++)
yuuji@0 711 if (tab[i & BITS7] != UBOGON) rmap[tab[i & BITS7]] = (unsigned short)i;
yuuji@0 712 break;
yuuji@0 713 case CT_1BYTE8: /* 1 byte table 0x00 - 0xff */
yuuji@0 714 for (tab = (unsigned short *) cs->tab,i = 0; i < 256; i++)
yuuji@0 715 if (tab[i] != UBOGON) rmap[tab[i]] = (unsigned short) i;
yuuji@0 716 break;
yuuji@0 717 case CT_EUC: /* 2 byte ASCII + utf8_eucparam base/CS2/CS3 */
yuuji@0 718 for (param = (struct utf8_eucparam *) cs->tab,
yuuji@0 719 tab = (unsigned short *) param->tab, ku = 0;
yuuji@0 720 ku < param->max_ku; ku++)
yuuji@0 721 for (ten = 0; ten < param->max_ten; ten++)
yuuji@0 722 if ((u = tab[(ku * param->max_ten) + ten]) != UBOGON)
yuuji@0 723 rmap[u] = ((ku + param->base_ku) << 8) +
yuuji@0 724 (ten + param->base_ten) + 0x8080;
yuuji@0 725 break;
yuuji@0 726
yuuji@0 727 case CT_DBYTE: /* 2 byte ASCII + utf8_eucparam */
yuuji@0 728 for (param = (struct utf8_eucparam *) cs->tab,
yuuji@0 729 tab = (unsigned short *) param->tab, ku = 0;
yuuji@0 730 ku < param->max_ku; ku++)
yuuji@0 731 for (ten = 0; ten < param->max_ten; ten++)
yuuji@0 732 if ((u = tab[(ku * param->max_ten) + ten]) != UBOGON)
yuuji@0 733 rmap[u] = ((ku + param->base_ku) << 8) + (ten + param->base_ten);
yuuji@0 734 break;
yuuji@0 735 case CT_DBYTE2: /* 2 byte ASCII + utf8_eucparam plane1/2 */
yuuji@0 736 param = (struct utf8_eucparam *) cs->tab;
yuuji@0 737 p2 = param + 1; /* plane 2 parameters */
yuuji@0 738 /* only ten parameters should differ */
yuuji@0 739 if ((param->base_ku != p2->base_ku) || (param->max_ku != p2->max_ku))
yuuji@0 740 fatal ("ku definition error for CT_DBYTE2 charset");
yuuji@0 741 /* total codepoints in each ku */
yuuji@0 742 m = param->max_ten + p2->max_ten;
yuuji@0 743 tab = (unsigned short *) param->tab;
yuuji@0 744 for (ku = 0; ku < param->max_ku; ku++) {
yuuji@0 745 for (ten = 0; ten < param->max_ten; ten++)
yuuji@0 746 if ((u = tab[(ku * m) + ten]) != UBOGON)
yuuji@0 747 rmap[u] = ((ku + param->base_ku) << 8) + (ten + param->base_ten);
yuuji@0 748 for (ten = 0; ten < p2->max_ten; ten++)
yuuji@0 749 if ((u = tab[(ku * m) + param->max_ten + ten]) != UBOGON)
yuuji@0 750 rmap[u] = ((ku + param->base_ku) << 8) + (ten + p2->base_ten);
yuuji@0 751 }
yuuji@0 752 break;
yuuji@0 753 case CT_SJIS: /* 2 byte Shift-JIS */
yuuji@0 754 for (ku = 0; ku < MAX_JIS0208_KU; ku++)
yuuji@0 755 for (ten = 0; ten < MAX_JIS0208_TEN; ten++)
yuuji@0 756 if ((u = jis0208tab[ku][ten]) != UBOGON) {
yuuji@0 757 int sku = ku + BASE_JIS0208_KU;
yuuji@0 758 int sten = ten + BASE_JIS0208_TEN;
yuuji@0 759 rmap[u] = ((((sku + 1) >> 1) + ((sku < 95) ? 112 : 176)) << 8) +
yuuji@0 760 sten + ((sku % 2) ? ((sten > 95) ? 32 : 31) : 126);
yuuji@0 761 }
yuuji@0 762 /* JIS Roman */
yuuji@0 763 rmap[UCS2_YEN] = JISROMAN_YEN;
yuuji@0 764 rmap[UCS2_OVERLINE] = JISROMAN_OVERLINE;
yuuji@0 765 /* JIS hankaku katakana */
yuuji@0 766 for (u = 0; u < (MAX_KANA_8 - MIN_KANA_8); u++)
yuuji@0 767 rmap[UCS2_KATAKANA + u] = MIN_KANA_8 + u;
yuuji@0 768 break;
yuuji@0 769 }
yuuji@0 770 /* hack: map NBSP to SP if otherwise no map */
yuuji@0 771 if (rmap[0x00a0] == NOCHAR) rmap[0x00a0] = rmap[0x0020];
yuuji@0 772 }
yuuji@0 773 return rmap; /* return map */
yuuji@0 774 }
yuuji@0 775
yuuji@0 776 /* Convert UTF-8 sized text to charset using rmap
yuuji@0 777 * Accepts: source sized text
yuuji@0 778 * conversion rmap
yuuji@0 779 * pointer to returned sized text
yuuji@0 780 * substitute character if not in rmap, else NIL to return failure
yuuji@0 781 * ISO-2022-JP conversion flag
yuuji@0 782 * Returns T if successful, NIL if failure
yuuji@0 783 *
yuuji@0 784 * This routine doesn't try to convert to all possible charsets; in particular
yuuji@0 785 * it doesn't support other Unicode encodings or any ISO 2022 other than
yuuji@0 786 * ISO-2022-JP.
yuuji@0 787 */
yuuji@0 788
yuuji@0 789 long utf8_rmaptext (SIZEDTEXT *text,unsigned short *rmap,SIZEDTEXT *ret,
yuuji@0 790 unsigned long errch,long iso2022jp)
yuuji@0 791 {
yuuji@0 792 unsigned long i,u,c;
yuuji@0 793 /* get size of buffer */
yuuji@0 794 if (i = utf8_rmapsize (text,rmap,errch,iso2022jp)) {
yuuji@0 795 unsigned char *s = text->data;
yuuji@0 796 unsigned char *t = ret->data = (unsigned char *) fs_get (i);
yuuji@0 797 ret->size = i - 1; /* number of octets in destination buffer */
yuuji@0 798 /* start non-zero ISO-2022-JP state at 1 */
yuuji@0 799 if (iso2022jp) iso2022jp = 1;
yuuji@0 800 /* convert string, ignore BOM */
yuuji@0 801 for (i = text->size; i;) if ((u = utf8_get (&s,&i)) != UCS2_BOM) {
yuuji@0 802 /* substitute error character for NOCHAR */
yuuji@0 803 if ((u & U8GM_NONBMP) || ((c = rmap[u]) == NOCHAR)) c = errch;
yuuji@0 804 switch (iso2022jp) { /* depends upon ISO 2022 mode */
yuuji@0 805 case 0: /* ISO 2022 not in effect */
yuuji@0 806 /* two-byte character */
yuuji@0 807 if (c > 0xff) *t++ = (unsigned char) (c >> 8);
yuuji@0 808 /* single-byte or low-byte of two-byte */
yuuji@0 809 *t++ = (unsigned char) (c & 0xff);
yuuji@0 810 break;
yuuji@0 811 case 1: /* ISO 2022 Roman */
yuuji@0 812 /* <ch> */
yuuji@0 813 if (c < 0x80) *t++ = (unsigned char) c;
yuuji@0 814 else { /* JIS character */
yuuji@0 815 *t++ = I2C_ESC; /* ESC $ B <hi> <lo> */
yuuji@0 816 *t++ = I2C_MULTI;
yuuji@0 817 *t++ = I2CS_94x94_JIS_NEW;
yuuji@0 818 *t++ = (unsigned char) (c >> 8) & 0x7f;
yuuji@0 819 *t++ = (unsigned char) c & 0x7f;
yuuji@0 820 iso2022jp = 2; /* shift to ISO 2022 JIS */
yuuji@0 821 }
yuuji@0 822 break;
yuuji@0 823 case 2: /* ISO 2022 JIS */
yuuji@0 824 if (c > 0x7f) { /* <hi> <lo> */
yuuji@0 825 *t++ = (unsigned char) (c >> 8) & 0x7f;
yuuji@0 826 *t++ = (unsigned char) c & 0x7f;
yuuji@0 827 }
yuuji@0 828 else { /* ASCII character */
yuuji@0 829 *t++ = I2C_ESC; /* ESC ( J <ch> */
yuuji@0 830 *t++ = I2C_G0_94;
yuuji@0 831 *t++ = I2CS_94_JIS_ROMAN;
yuuji@0 832 *t++ = (unsigned char) c;
yuuji@0 833 iso2022jp = 1; /* shift to ISO 2022 Roman */
yuuji@0 834 }
yuuji@0 835 break;
yuuji@0 836 }
yuuji@0 837 }
yuuji@0 838 if (iso2022jp == 2) { /* ISO-2022-JP string must end in Roman */
yuuji@0 839 *t++ = I2C_ESC; /* ESC ( J */
yuuji@0 840 *t++ = I2C_G0_94;
yuuji@0 841 *t++ = I2CS_94_JIS_ROMAN;
yuuji@0 842 }
yuuji@0 843 *t++ = NIL; /* tie off returned data */
yuuji@0 844 return LONGT; /* return success */
yuuji@0 845 }
yuuji@0 846 ret->data = NIL;
yuuji@0 847 ret->size = 0;
yuuji@0 848 return NIL; /* failure */
yuuji@0 849 }
yuuji@0 850
yuuji@0 851 /* Calculate size of convertsion of UTF-8 sized text to charset using rmap
yuuji@0 852 * Accepts: source sized text
yuuji@0 853 * conversion rmap
yuuji@0 854 * pointer to returned sized text
yuuji@0 855 * substitute character if not in rmap, else NIL to return failure
yuuji@0 856 * ISO-2022-JP conversion flag
yuuji@0 857 * Returns size+1 if successful, NIL if failure
yuuji@0 858 *
yuuji@0 859 * This routine doesn't try to handle to all possible charsets; in particular
yuuji@0 860 * it doesn't support other Unicode encodings or any ISO 2022 other than
yuuji@0 861 * ISO-2022-JP.
yuuji@0 862 */
yuuji@0 863
yuuji@0 864 unsigned long utf8_rmapsize (SIZEDTEXT *text,unsigned short *rmap,
yuuji@0 865 unsigned long errch,long iso2022jp)
yuuji@0 866 {
yuuji@0 867 unsigned long i,u,c;
yuuji@0 868 unsigned long ret = 1; /* terminating NUL */
yuuji@0 869 unsigned char *s = text->data;
yuuji@0 870 if (iso2022jp) iso2022jp = 1; /* start non-zero ISO-2022-JP state at 1 */
yuuji@0 871 for (i = text->size; i;) if ((u = utf8_get (&s,&i)) != UCS2_BOM) {
yuuji@0 872 if ((u & U8GM_NONBMP) || (((c = rmap[u]) == NOCHAR) && !(c = errch)))
yuuji@0 873 return NIL; /* not in BMP, or NOCHAR and no err char */
yuuji@0 874 switch (iso2022jp) { /* depends upon ISO 2022 mode */
yuuji@0 875 case 0: /* ISO 2022 not in effect */
yuuji@0 876 ret += (c > 0xff) ? 2 : 1;
yuuji@0 877 break;
yuuji@0 878 case 1: /* ISO 2022 Roman */
yuuji@0 879 if (c < 0x80) ret += 1; /* <ch> */
yuuji@0 880 else { /* JIS character */
yuuji@0 881 ret += 5; /* ESC $ B <hi> <lo> */
yuuji@0 882 iso2022jp = 2; /* shift to ISO 2022 JIS */
yuuji@0 883 }
yuuji@0 884 break;
yuuji@0 885 case 2: /* ISO 2022 JIS */
yuuji@0 886 if (c > 0x7f) ret += 2; /* <hi> <lo> */
yuuji@0 887 else { /* ASCII character */
yuuji@0 888 ret += 4; /* ESC ( J <ch> */
yuuji@0 889 iso2022jp = 1; /* shift to ISO 2022 Roman */
yuuji@0 890 }
yuuji@0 891 break;
yuuji@0 892 }
yuuji@0 893 }
yuuji@0 894 if (iso2022jp == 2) { /* ISO-2022-JP string must end in Roman */
yuuji@0 895 ret += 3; /* ESC ( J */
yuuji@0 896 iso2022jp = 1; /* reset state to Roman */
yuuji@0 897 }
yuuji@0 898 return ret;
yuuji@0 899 }
yuuji@0 900
yuuji@0 901 /* Convert UCS-4 to charset using rmap
yuuji@0 902 * Accepts: source UCS-4 character(s)
yuuji@0 903 * numver of UCS-4 characters
yuuji@0 904 * conversion rmap
yuuji@0 905 * pointer to returned sized text
yuuji@0 906 * substitute character if not in rmap, else NIL to return failure
yuuji@0 907 * Returns T if successful, NIL if failure
yuuji@0 908 *
yuuji@0 909 * Currently only supports BMP characters, and does not support ISO-2022
yuuji@0 910 */
yuuji@0 911
yuuji@0 912 long ucs4_rmaptext (unsigned long *ucs4,unsigned long len,unsigned short *rmap,
yuuji@0 913 SIZEDTEXT *ret,unsigned long errch)
yuuji@0 914 {
yuuji@0 915 long size = ucs4_rmaplen (ucs4,len,rmap,errch);
yuuji@0 916 return (size >= 0) ? /* build in newly-created buffer */
yuuji@0 917 ucs4_rmapbuf (ret->data = (unsigned char *) fs_get ((ret->size = size) +1),
yuuji@0 918 ucs4,len,rmap,errch) : NIL;
yuuji@0 919 }
yuuji@0 920
yuuji@0 921 /* Return size of UCS-4 string converted to other CS via rmap
yuuji@0 922 * Accepts: source UCS-4 character(s)
yuuji@0 923 * numver of UCS-4 characters
yuuji@0 924 * conversion rmap
yuuji@0 925 * substitute character if not in rmap, else NIL to return failure
yuuji@0 926 * Returns: length if success, negative if failure (no-convert)
yuuji@0 927 */
yuuji@0 928
yuuji@0 929 long ucs4_rmaplen (unsigned long *ucs4,unsigned long len,unsigned short *rmap,
yuuji@0 930 unsigned long errch)
yuuji@0 931 {
yuuji@0 932 long ret;
yuuji@0 933 unsigned long i,u,c;
yuuji@0 934 /* count non-BOM characters */
yuuji@0 935 for (ret = 0,i = 0; i < len; ++i) if ((u = ucs4[i]) != UCS2_BOM) {
yuuji@0 936 if ((u & U8GM_NONBMP) || (((c = rmap[u]) == NOCHAR) && !(c = errch)))
yuuji@0 937 return -1; /* not in BMP, or NOCHAR and no err char? */
yuuji@0 938 ret += (c > 0xff) ? 2 : 1;
yuuji@0 939 }
yuuji@0 940 return ret;
yuuji@0 941 }
yuuji@0 942
yuuji@0 943
yuuji@0 944 /* Stuff buffer with UCS-4 string converted to other CS via rmap
yuuji@0 945 * Accepts: destination buffer
yuuji@0 946 * source UCS-4 character(s)
yuuji@0 947 * number of UCS-4 characters
yuuji@0 948 * conversion rmap
yuuji@0 949 * substitute character if not in rmap, else NIL to return failure
yuuji@0 950 * Returns: T, always
yuuji@0 951 */
yuuji@0 952
yuuji@0 953 long ucs4_rmapbuf (unsigned char *t,unsigned long *ucs4,unsigned long len,
yuuji@0 954 unsigned short *rmap,unsigned long errch)
yuuji@0 955 {
yuuji@0 956 unsigned long i,u,c;
yuuji@0 957 /* convert non-BOM characters */
yuuji@0 958 for (i = 0; i < len; ++i) if ((u = ucs4[i]) != UCS2_BOM) {
yuuji@0 959 /* substitute error character for NOCHAR */
yuuji@0 960 if ((u & U8GM_NONBMP) || ((c = rmap[u]) == NOCHAR)) c = errch;
yuuji@0 961 /* two-byte character? */
yuuji@0 962 if (c > 0xff) *t++ = (unsigned char) (c >> 8);
yuuji@0 963 /* single-byte or low-byte of two-byte */
yuuji@0 964 *t++ = (unsigned char) (c & 0xff);
yuuji@0 965 }
yuuji@0 966 *t++ = NIL; /* tie off returned data */
yuuji@0 967 return LONGT;
yuuji@0 968 }
yuuji@0 969
yuuji@0 970 /* Return UCS-4 Unicode character from UTF-8 string
yuuji@0 971 * Accepts: pointer to string
yuuji@0 972 * remaining octets in string
yuuji@0 973 * Returns: UCS-4 character with pointer and count updated
yuuji@0 974 * or error code with pointer and count unchanged
yuuji@0 975 */
yuuji@0 976
yuuji@0 977 unsigned long utf8_get (unsigned char **s,unsigned long *i)
yuuji@0 978 {
yuuji@0 979 unsigned char *t = *s;
yuuji@0 980 unsigned long j = *i;
yuuji@0 981 /* decode raw UTF-8 string */
yuuji@0 982 unsigned long ret = utf8_get_raw (&t,&j);
yuuji@0 983 if (ret & U8G_ERROR); /* invalid raw UTF-8 decoding? */
yuuji@0 984 /* no, is it surrogate? */
yuuji@0 985 else if ((ret >= UTF16_SURR) && (ret <= UTF16_MAXSURR)) ret = U8G_SURROGA;
yuuji@0 986 /* or in non-Unicode ISO 10646 space? */
yuuji@0 987 else if (ret > UCS4_MAXUNICODE) ret = U8G_NOTUNIC;
yuuji@0 988 else {
yuuji@0 989 *s = t; /* all is well, update pointer */
yuuji@0 990 *i = j; /* and counter */
yuuji@0 991 }
yuuji@0 992 return ret; /* return value */
yuuji@0 993 }
yuuji@0 994
yuuji@0 995 /* Return raw (including non-Unicode) UCS-4 character from UTF-8 string
yuuji@0 996 * Accepts: pointer to string
yuuji@0 997 * remaining octets in string
yuuji@0 998 * Returns: UCS-4 character with pointer and count updated
yuuji@0 999 * or error code with pointer and count unchanged
yuuji@0 1000 */
yuuji@0 1001
yuuji@0 1002 unsigned long utf8_get_raw (unsigned char **s,unsigned long *i)
yuuji@0 1003 {
yuuji@0 1004 unsigned char c,c1;
yuuji@0 1005 unsigned char *t = *s;
yuuji@0 1006 unsigned long j = *i;
yuuji@0 1007 unsigned long ret = U8G_NOTUTF8;
yuuji@0 1008 int more = 0;
yuuji@0 1009 do { /* make sure have source octets available */
yuuji@0 1010 if (!j--) return more ? U8G_ENDSTRI : U8G_ENDSTRG;
yuuji@0 1011 /* UTF-8 continuation? */
yuuji@0 1012 else if (((c = *t++) > 0x7f) && (c < 0xc0)) {
yuuji@0 1013 /* continuation when not in progress */
yuuji@0 1014 if (!more) return U8G_BADCONT;
yuuji@0 1015 --more; /* found a continuation octet */
yuuji@0 1016 ret <<= 6; /* shift current value by 6 bits */
yuuji@0 1017 ret |= c & 0x3f; /* merge continuation octet */
yuuji@0 1018 }
yuuji@0 1019 /* incomplete UTF-8 character */
yuuji@0 1020 else if (more) return U8G_INCMPLT;
yuuji@0 1021 else { /* start of sequence */
yuuji@0 1022 c1 = j ? *t : 0xbf; /* assume valid continuation if incomplete */
yuuji@0 1023 if (c < 0x80) ret = c; /* U+0000 - U+007f */
yuuji@0 1024 else if (c < 0xc2); /* c0 and c1 never valid */
yuuji@0 1025 else if (c < 0xe0) { /* U+0080 - U+07ff */
yuuji@0 1026 if (c &= 0x1f) more = 1;
yuuji@0 1027 }
yuuji@0 1028 else if (c < 0xf0) { /* U+0800 - U+ffff */
yuuji@0 1029 if ((c &= 0x0f) || (c1 >= 0xa0)) more = 2;
yuuji@0 1030 }
yuuji@0 1031 else if (c < 0xf8) { /* U+10000 - U+10ffff (and 110000 - 1fffff) */
yuuji@0 1032 if ((c &= 0x07) || (c1 >= 0x90)) more = 3;
yuuji@0 1033 }
yuuji@0 1034 else if (c < 0xfc) { /* ISO 10646 200000 - 3ffffff */
yuuji@0 1035 if ((c &= 0x03) || (c1 >= 0x88)) more = 4;
yuuji@0 1036 }
yuuji@0 1037 else if (c < 0xfe) { /* ISO 10646 4000000 - 7fffffff */
yuuji@0 1038 if ((c &= 0x01) || (c1 >= 0x84)) more = 5;
yuuji@0 1039 }
yuuji@0 1040 /* fe and ff never valid */
yuuji@0 1041 if (more) { /* multi-octet, make sure more to come */
yuuji@0 1042 if (!j) return U8G_ENDSTRI;
yuuji@0 1043 ret = c; /* continuation needed, save start bits */
yuuji@0 1044 }
yuuji@0 1045 }
yuuji@0 1046 } while (more);
yuuji@0 1047 if (!(ret & U8G_ERROR)) { /* success return? */
yuuji@0 1048 *s = t; /* yes, update pointer */
yuuji@0 1049 *i = j; /* and counter */
yuuji@0 1050 }
yuuji@0 1051 return ret; /* return value */
yuuji@0 1052 }
yuuji@0 1053
yuuji@0 1054 /* Return UCS-4 character from named charset string
yuuji@0 1055 * Accepts: charset
yuuji@0 1056 * pointer to string
yuuji@0 1057 * remaining octets in string
yuuji@0 1058 * Returns: UCS-4 character with pointer and count updated, negative if error
yuuji@0 1059 *
yuuji@0 1060 * Error codes are the same as utf8_get().
yuuji@0 1061 */
yuuji@0 1062
yuuji@0 1063 unsigned long ucs4_cs_get (CHARSET *cs,unsigned char **s,unsigned long *i)
yuuji@0 1064 {
yuuji@0 1065 unsigned char c,c1,ku,ten;
yuuji@0 1066 unsigned long ret,d;
yuuji@0 1067 unsigned char *t = *s;
yuuji@0 1068 unsigned long j = *i;
yuuji@0 1069 struct utf8_eucparam *p1,*p2,*p3;
yuuji@0 1070 if (j--) c = *t++; /* get first octet */
yuuji@0 1071 else return U8G_ENDSTRG; /* empty string */
yuuji@0 1072 switch (cs->type) { /* convert if type known */
yuuji@0 1073 case CT_UTF8: /* variable UTF-8 encoded Unicode no table */
yuuji@0 1074 return utf8_get (s,i);
yuuji@0 1075 case CT_ASCII: /* 7-bit ASCII no table */
yuuji@0 1076 if (c >= 0x80) return U8G_NOTUTF8;
yuuji@0 1077 case CT_1BYTE0: /* 1 byte no table */
yuuji@0 1078 ret = c; /* identity */
yuuji@0 1079 break;
yuuji@0 1080 case CT_1BYTE: /* 1 byte ASCII + table 0x80-0xff */
yuuji@0 1081 ret = (c > 0x80) ? ((unsigned short *) cs->tab)[c & BITS7] : c;
yuuji@0 1082 break;
yuuji@0 1083 case CT_1BYTE8: /* 1 byte table 0x00 - 0xff */
yuuji@0 1084 ret = ((unsigned short *) cs->tab)[c];
yuuji@0 1085 break;
yuuji@0 1086
yuuji@0 1087 case CT_EUC: /* 2 byte ASCII + utf8_eucparam base/CS2/CS3 */
yuuji@0 1088 if (c & BIT8) {
yuuji@0 1089 p1 = (struct utf8_eucparam *) cs->tab;
yuuji@0 1090 p2 = p1 + 1;
yuuji@0 1091 p3 = p1 + 2;
yuuji@0 1092 if (j--) c1 = *t++; /* get second octet */
yuuji@0 1093 else return U8G_ENDSTRI;
yuuji@0 1094 if (!(c1 & BIT8)) return U8G_NOTUTF8;
yuuji@0 1095 switch (c) { /* check 8bit code set */
yuuji@0 1096 case EUC_CS2: /* CS2 */
yuuji@0 1097 if (p2->base_ku) { /* CS2 set up? */
yuuji@0 1098 if (p2->base_ten) { /* yes, multibyte? */
yuuji@0 1099 if (j--) c = *t++; /* get second octet */
yuuji@0 1100 else return U8G_ENDSTRI;
yuuji@0 1101 if ((c & BIT8) &&
yuuji@0 1102 ((ku = (c1 & BITS7) - p2->base_ku) < p2->max_ku) &&
yuuji@0 1103 ((ten = (c & BITS7) - p2->base_ten) < p2->max_ten)) {
yuuji@0 1104 ret = ((unsigned short *) p2->tab)[(ku*p2->max_ten) + ten];
yuuji@0 1105 break;
yuuji@0 1106 }
yuuji@0 1107 }
yuuji@0 1108 else if ((c1 >= p2->base_ku) && (c1 < p2->max_ku)) {
yuuji@0 1109 ret = c1 + ((unsigned long) p2->tab);
yuuji@0 1110 break;
yuuji@0 1111 }
yuuji@0 1112 }
yuuji@0 1113 return U8G_NOTUTF8; /* CS2 not set up or bogus */
yuuji@0 1114 case EUC_CS3: /* CS3 */
yuuji@0 1115 if (p3->base_ku) { /* CS3 set up? */
yuuji@0 1116 if (p3->base_ten) { /* yes, multibyte? */
yuuji@0 1117 if (j--) c = *t++; /* get second octet */
yuuji@0 1118 else return U8G_ENDSTRI;
yuuji@0 1119 if ((c & BIT8) &&
yuuji@0 1120 ((ku = (c1 & BITS7) - p3->base_ku) < p3->max_ku) &&
yuuji@0 1121 ((ten = (c & BITS7) - p3->base_ten) < p3->max_ten)) {
yuuji@0 1122 ret = ((unsigned short *) p3->tab)[(ku*p3->max_ten) + ten];
yuuji@0 1123 break;
yuuji@0 1124 }
yuuji@0 1125 }
yuuji@0 1126 else if ((c1 >= p3->base_ku) && (c1 < p3->max_ku)) {
yuuji@0 1127 ret = c1 + ((unsigned long) p3->tab);
yuuji@0 1128 break;
yuuji@0 1129 }
yuuji@0 1130 }
yuuji@0 1131 return U8G_NOTUTF8; /* CS3 not set up or bogus */
yuuji@0 1132 default:
yuuji@0 1133 if (((ku = (c & BITS7) - p1->base_ku) >= p1->max_ku) ||
yuuji@0 1134 ((ten = (c1 & BITS7) - p1->base_ten) >= p1->max_ten))
yuuji@0 1135 return U8G_NOTUTF8;
yuuji@0 1136 ret = ((unsigned short *) p1->tab)[(ku*p1->max_ten) + ten];
yuuji@0 1137 /* special hack for JIS X 0212: merge rows less than 10 */
yuuji@0 1138 if ((ret == UBOGON) && ku && (ku < 10) && p3->tab && p3->base_ten)
yuuji@0 1139 ret = ((unsigned short *) p3->tab)
yuuji@0 1140 [((ku - (p3->base_ku - p1->base_ku))*p3->max_ten) + ten];
yuuji@0 1141 break;
yuuji@0 1142 }
yuuji@0 1143 }
yuuji@0 1144 else ret = c; /* ASCII character */
yuuji@0 1145 break;
yuuji@0 1146
yuuji@0 1147 case CT_DBYTE: /* 2 byte ASCII + utf8_eucparam */
yuuji@0 1148 if (c & BIT8) { /* double-byte character? */
yuuji@0 1149 p1 = (struct utf8_eucparam *) cs->tab;
yuuji@0 1150 if (j--) c1 = *t++; /* get second octet */
yuuji@0 1151 else return U8G_ENDSTRI;
yuuji@0 1152 if (((ku = c - p1->base_ku) < p1->max_ku) &&
yuuji@0 1153 ((ten = c1 - p1->base_ten) < p1->max_ten))
yuuji@0 1154 ret = ((unsigned short *) p1->tab)[(ku*p1->max_ten) + ten];
yuuji@0 1155 else return U8G_NOTUTF8;
yuuji@0 1156 }
yuuji@0 1157 else ret = c; /* ASCII character */
yuuji@0 1158 break;
yuuji@0 1159 case CT_DBYTE2: /* 2 byte ASCII + utf8_eucparam plane1/2 */
yuuji@0 1160 if (c & BIT8) { /* double-byte character? */
yuuji@0 1161 p1 = (struct utf8_eucparam *) cs->tab;
yuuji@0 1162 p2 = p1 + 1;
yuuji@0 1163 if (j--) c1 = *t++; /* get second octet */
yuuji@0 1164 else return U8G_ENDSTRI;
yuuji@0 1165 if (c1 & BIT8) { /* high vs. low plane */
yuuji@0 1166 if ((ku = c - p2->base_ku) < p2->max_ku &&
yuuji@0 1167 ((ten = c1 - p2->base_ten) < p2->max_ten))
yuuji@0 1168 ret = ((unsigned short *) p1->tab)
yuuji@0 1169 [(ku*(p1->max_ten + p2->max_ten)) + p1->max_ten + ten];
yuuji@0 1170 else return U8G_NOTUTF8;
yuuji@0 1171 }
yuuji@0 1172 else if ((ku = c - p1->base_ku) < p1->max_ku &&
yuuji@0 1173 ((ten = c1 - p1->base_ten) < p1->max_ten))
yuuji@0 1174 ret = ((unsigned short *) p1->tab)
yuuji@0 1175 [(ku*(p1->max_ten + p2->max_ten)) + ten];
yuuji@0 1176 else return U8G_NOTUTF8;
yuuji@0 1177 }
yuuji@0 1178 else ret = c; /* ASCII character */
yuuji@0 1179 break;
yuuji@0 1180 case CT_SJIS: /* 2 byte Shift-JIS encoded JIS no table */
yuuji@0 1181 /* compromise - do yen sign but not overline */
yuuji@0 1182 if (!(c & BIT8)) ret = (c == JISROMAN_YEN) ? UCS2_YEN : c;
yuuji@0 1183 /* half-width katakana? */
yuuji@0 1184 else if ((c >= MIN_KANA_8) && (c < MAX_KANA_8)) ret = c + KANA_8;
yuuji@0 1185 else { /* Shift-JIS */
yuuji@0 1186 if (j--) c1 = *t++; /* get second octet */
yuuji@0 1187 else return U8G_ENDSTRI;
yuuji@0 1188 SJISTOJIS (c,c1);
yuuji@0 1189 c = JISTOUNICODE (c,c1,ku,ten);
yuuji@0 1190 }
yuuji@0 1191 break;
yuuji@0 1192
yuuji@0 1193 case CT_UCS2: /* 2 byte 16-bit Unicode no table */
yuuji@0 1194 ret = c << 8;
yuuji@0 1195 if (j--) c = *t++; /* get second octet */
yuuji@0 1196 else return U8G_ENDSTRI; /* empty string */
yuuji@0 1197 ret |= c;
yuuji@0 1198 break;
yuuji@0 1199 case CT_UCS4: /* 4 byte 32-bit Unicode no table */
yuuji@0 1200 if (c & 0x80) return U8G_NOTUTF8;
yuuji@0 1201 if (j < 3) return U8G_ENDSTRI;
yuuji@0 1202 j -= 3; /* count three octets */
yuuji@0 1203 ret = c << 24;
yuuji@0 1204 ret |= (*t++) << 16;
yuuji@0 1205 ret |= (*t++) << 8;
yuuji@0 1206 ret |= (*t++);
yuuji@0 1207 break;
yuuji@0 1208 case CT_UTF16: /* variable UTF-16 encoded Unicode no table */
yuuji@0 1209 ret = c << 8;
yuuji@0 1210 if (j--) c = *t++; /* get second octet */
yuuji@0 1211 else return U8G_ENDSTRI; /* empty string */
yuuji@0 1212 ret |= c;
yuuji@0 1213 /* surrogate? */
yuuji@0 1214 if ((ret >= UTF16_SURR) && (ret <= UTF16_MAXSURR)) {
yuuji@0 1215 /* invalid first surrogate */
yuuji@0 1216 if ((ret > UTF16_SURRHEND) || (j < 2)) return U8G_NOTUTF8;
yuuji@0 1217 j -= 2; /* count two octets */
yuuji@0 1218 d = (*t++) << 8; /* first octet of second surrogate */
yuuji@0 1219 d |= *t++; /* second octet of second surrogate */
yuuji@0 1220 if ((d < UTF16_SURRL) || (d > UTF16_SURRLEND)) return U8G_NOTUTF8;
yuuji@0 1221 ret = UTF16_BASE + ((ret & UTF16_MASK) << UTF16_SHIFT) +
yuuji@0 1222 (d & UTF16_MASK);
yuuji@0 1223 }
yuuji@0 1224 break;
yuuji@0 1225 default: /* unknown/unsupported character set type */
yuuji@0 1226 return U8G_NOTUTF8;
yuuji@0 1227 }
yuuji@0 1228 *s = t; /* update pointer and counter */
yuuji@0 1229 *i = j;
yuuji@0 1230 return ret;
yuuji@0 1231 }
yuuji@0 1232
yuuji@0 1233 /* Produce charset validity map for BMP
yuuji@0 1234 * Accepts: list of charsets to map
yuuji@0 1235 * Returns: validity map, indexed by BMP codepoint
yuuji@0 1236 *
yuuji@0 1237 * Bit 0x1 is the "not-CJK" character bit
yuuji@0 1238 */
yuuji@0 1239
yuuji@0 1240 unsigned long *utf8_csvalidmap (char *charsets[])
yuuji@0 1241 {
yuuji@0 1242 unsigned short u,*tab;
yuuji@0 1243 unsigned int m,ku,ten;
yuuji@0 1244 unsigned long i,csi,csb;
yuuji@0 1245 struct utf8_eucparam *param,*p2;
yuuji@0 1246 char *s;
yuuji@0 1247 const CHARSET *cs;
yuuji@0 1248 unsigned long *ret = (unsigned long *)
yuuji@0 1249 fs_get (i = 0x10000 * sizeof (unsigned long));
yuuji@0 1250 memset (ret,0,i); /* zero the entire vector */
yuuji@0 1251 /* mark all the non-CJK codepoints */
yuuji@0 1252 /* U+0000 - U+2E7F non-CJK */
yuuji@0 1253 for (i = 0; i < 0x2E7F; ++i) ret[i] = 0x1;
yuuji@0 1254 /* U+2E80 - U+2EFF CJK Radicals Supplement
yuuji@0 1255 * U+2F00 - U+2FDF Kangxi Radicals
yuuji@0 1256 * U+2FE0 - U+2FEF unassigned
yuuji@0 1257 * U+2FF0 - U+2FFF Ideographic Description Characters
yuuji@0 1258 * U+3000 - U+303F CJK Symbols and Punctuation
yuuji@0 1259 * U+3040 - U+309F Hiragana
yuuji@0 1260 * U+30A0 - U+30FF Katakana
yuuji@0 1261 * U+3100 - U+312F BoPoMoFo
yuuji@0 1262 * U+3130 - U+318F Hangul Compatibility Jamo
yuuji@0 1263 * U+3190 - U+319F Kanbun
yuuji@0 1264 * U+31A0 - U+31BF BoPoMoFo Extended
yuuji@0 1265 * U+31C0 - U+31EF CJK Strokes
yuuji@0 1266 * U+31F0 - U+31FF Katakana Phonetic Extensions
yuuji@0 1267 * U+3200 - U+32FF Enclosed CJK Letters and Months
yuuji@0 1268 * U+3300 - U+33FF CJK Compatibility
yuuji@0 1269 * U+3400 - U+4DBF CJK Unified Ideographs Extension A
yuuji@0 1270 * U+4DC0 - U+4DFF Yijing Hexagram Symbols
yuuji@0 1271 * U+4E00 - U+9FFF CJK Unified Ideographs
yuuji@0 1272 * U+A000 - U+A48F Yi Syllables
yuuji@0 1273 * U+A490 - U+A4CF Yi Radicals
yuuji@0 1274 * U+A700 - U+A71F Modifier Tone Letters
yuuji@0 1275 */
yuuji@0 1276 for (i = 0xa720; i < 0xabff; ++i) ret[i] = 0x1;
yuuji@0 1277 /* U+AC00 - U+D7FF Hangul Syllables */
yuuji@0 1278 for (i = 0xd800; i < 0xf8ff; ++i) ret[i] = 0x1;
yuuji@0 1279 /* U+F900 - U+FAFF CJK Compatibility Ideographs */
yuuji@0 1280 for (i = 0xfb00; i < 0xfe2f; ++i) ret[i] = 0x1;
yuuji@0 1281 /* U+FE30 - U+FE4F CJK Compatibility Forms
yuuji@0 1282 * U+FE50 - U+FE6F Small Form Variants (for CNS 11643)
yuuji@0 1283 */
yuuji@0 1284 for (i = 0xfe70; i < 0xfeff; ++i) ret[i] = 0x1;
yuuji@0 1285 /* U+FF00 - U+FFEF CJK Compatibility Ideographs */
yuuji@0 1286 for (i = 0xfff0; i < 0x10000; ++i) ret[i] = 0x1;
yuuji@0 1287
yuuji@0 1288 /* for each supplied charset */
yuuji@0 1289 for (csi = 1; ret && charsets && (s = charsets[csi - 1]); ++csi) {
yuuji@0 1290 /* substitute EUC-JP for ISO-2022-JP */
yuuji@0 1291 if (!compare_cstring (s,"ISO-2022-JP")) s = "EUC-JP";
yuuji@0 1292 /* look up charset */
yuuji@0 1293 if (cs = utf8_charset (s)) {
yuuji@0 1294 csb = 1 << csi; /* charset bit */
yuuji@0 1295 switch (cs->type) {
yuuji@0 1296 case CT_ASCII: /* 7-bit ASCII no table */
yuuji@0 1297 case CT_1BYTE0: /* 1 byte no table */
yuuji@0 1298 case CT_1BYTE: /* 1 byte ASCII + table 0x80-0xff */
yuuji@0 1299 case CT_1BYTE8: /* 1 byte table 0x00 - 0xff */
yuuji@0 1300 case CT_EUC: /* 2 byte ASCII + utf8_eucparam base/CS2/CS3 */
yuuji@0 1301 case CT_DBYTE: /* 2 byte ASCII + utf8_eucparam */
yuuji@0 1302 case CT_DBYTE2: /* 2 byte ASCII + utf8_eucparam plane1/2 */
yuuji@0 1303 case CT_SJIS: /* 2 byte Shift-JIS */
yuuji@0 1304 /* supported charset type, all ASCII is OK */
yuuji@0 1305 for (i = 0; i < 128; ++i) ret[i] |= csb;
yuuji@0 1306 break;
yuuji@0 1307 default: /* unsupported charset type */
yuuji@0 1308 fs_give ((void **) &ret);
yuuji@0 1309 break;
yuuji@0 1310 }
yuuji@0 1311 /* now do additional operations */
yuuji@0 1312 if (ret) switch (cs->type) {
yuuji@0 1313 case CT_1BYTE0: /* 1 byte no table */
yuuji@0 1314 for (i = 128; i < 256; i++) ret[i] |= csb;
yuuji@0 1315 break;
yuuji@0 1316 case CT_1BYTE: /* 1 byte ASCII + table 0x80-0xff */
yuuji@0 1317 for (tab = (unsigned short *) cs->tab,i = 128; i < 256; i++)
yuuji@0 1318 if (tab[i & BITS7] != UBOGON) ret[tab[i & BITS7]] |= csb;
yuuji@0 1319 break;
yuuji@0 1320 case CT_1BYTE8: /* 1 byte table 0x00 - 0xff */
yuuji@0 1321 for (tab = (unsigned short *) cs->tab,i = 0; i < 256; i++)
yuuji@0 1322 if (tab[i] != UBOGON) ret[tab[i]] |= csb;
yuuji@0 1323 break;
yuuji@0 1324 case CT_EUC: /* 2 byte ASCII + utf8_eucparam base/CS2/CS3 */
yuuji@0 1325 for (param = (struct utf8_eucparam *) cs->tab,
yuuji@0 1326 tab = (unsigned short *) param->tab, ku = 0;
yuuji@0 1327 ku < param->max_ku; ku++)
yuuji@0 1328 for (ten = 0; ten < param->max_ten; ten++)
yuuji@0 1329 if ((u = tab[(ku * param->max_ten) + ten]) != UBOGON)
yuuji@0 1330 ret[u] |= csb;
yuuji@0 1331 break;
yuuji@0 1332
yuuji@0 1333 case CT_DBYTE: /* 2 byte ASCII + utf8_eucparam */
yuuji@0 1334 for (param = (struct utf8_eucparam *) cs->tab,
yuuji@0 1335 tab = (unsigned short *) param->tab, ku = 0;
yuuji@0 1336 ku < param->max_ku; ku++)
yuuji@0 1337 for (ten = 0; ten < param->max_ten; ten++)
yuuji@0 1338 if ((u = tab[(ku * param->max_ten) + ten]) != UBOGON)
yuuji@0 1339 ret[u] |= csb;
yuuji@0 1340 break;
yuuji@0 1341 case CT_DBYTE2: /* 2 byte ASCII + utf8_eucparam plane1/2 */
yuuji@0 1342 param = (struct utf8_eucparam *) cs->tab;
yuuji@0 1343 p2 = param + 1; /* plane 2 parameters */
yuuji@0 1344 /* only ten parameters should differ */
yuuji@0 1345 if ((param->base_ku != p2->base_ku) || (param->max_ku != p2->max_ku))
yuuji@0 1346 fatal ("ku definition error for CT_DBYTE2 charset");
yuuji@0 1347 /* total codepoints in each ku */
yuuji@0 1348 m = param->max_ten + p2->max_ten;
yuuji@0 1349 tab = (unsigned short *) param->tab;
yuuji@0 1350 for (ku = 0; ku < param->max_ku; ku++) {
yuuji@0 1351 for (ten = 0; ten < param->max_ten; ten++)
yuuji@0 1352 if ((u = tab[(ku * m) + ten]) != UBOGON)
yuuji@0 1353 ret[u] |= csb;
yuuji@0 1354 for (ten = 0; ten < p2->max_ten; ten++)
yuuji@0 1355 if ((u = tab[(ku * m) + param->max_ten + ten]) != UBOGON)
yuuji@0 1356 ret[u] |= csb;
yuuji@0 1357 }
yuuji@0 1358 break;
yuuji@0 1359 case CT_SJIS: /* 2 byte Shift-JIS */
yuuji@0 1360 for (ku = 0; ku < MAX_JIS0208_KU; ku++)
yuuji@0 1361 for (ten = 0; ten < MAX_JIS0208_TEN; ten++)
yuuji@0 1362 if ((u = jis0208tab[ku][ten]) != UBOGON) ret[u] |= csb;
yuuji@0 1363 /* JIS hankaku katakana */
yuuji@0 1364 for (u = 0; u < (MAX_KANA_8 - MIN_KANA_8); u++)
yuuji@0 1365 ret[UCS2_KATAKANA + u] |= csb;
yuuji@0 1366 break;
yuuji@0 1367 }
yuuji@0 1368 }
yuuji@0 1369 /* invalid charset, punt */
yuuji@0 1370 else fs_give ((void **) &ret);
yuuji@0 1371 }
yuuji@0 1372 return ret;
yuuji@0 1373 }
yuuji@0 1374
yuuji@0 1375 /* Infer charset from unlabelled sized text
yuuji@0 1376 * Accepts: sized text
yuuji@0 1377 * Returns: charset if one inferred, or NIL if unknown
yuuji@0 1378 */
yuuji@0 1379
yuuji@0 1380 const CHARSET *utf8_infercharset (SIZEDTEXT *src)
yuuji@0 1381 {
yuuji@0 1382 long iso2022jp = NIL;
yuuji@0 1383 long eightbit = NIL;
yuuji@0 1384 unsigned long i;
yuuji@0 1385 /* look for ISO 2022 */
yuuji@0 1386 if (src) for (i = 0; i < src->size; i++) {
yuuji@0 1387 /* ESC sequence? */
yuuji@0 1388 if ((src->data[i] == I2C_ESC) && (++i < src->size)) switch (src->data[i]) {
yuuji@0 1389 case I2C_MULTI: /* yes, multibyte? */
yuuji@0 1390 if (++i < src->size) switch (src->data[i]) {
yuuji@0 1391 case I2CS_94x94_JIS_OLD: /* JIS X 0208-1978 */
yuuji@0 1392 case I2CS_94x94_JIS_NEW: /* JIS X 0208-1983 */
yuuji@0 1393 case I2CS_94x94_JIS_EXT: /* JIS X 0212-1990 (kludge...) */
yuuji@0 1394 iso2022jp = T; /* found an ISO-2022-JP sequence */
yuuji@0 1395 break;
yuuji@0 1396 default: /* other multibyte */
yuuji@0 1397 return NIL; /* definitely invalid */
yuuji@0 1398 }
yuuji@0 1399 break;
yuuji@0 1400 case I2C_G0_94: /* single byte */
yuuji@0 1401 if (++i < src->size) switch (src->data[i]) {
yuuji@0 1402 case I2CS_94_JIS_BUGROM: /* in case old buggy software */
yuuji@0 1403 case I2CS_94_JIS_ROMAN: /* JIS X 0201-1976 left half */
yuuji@0 1404 case I2CS_94_ASCII: /* ASCII */
yuuji@0 1405 case I2CS_94_BRITISH: /* good enough for gov't work */
yuuji@0 1406 break;
yuuji@0 1407 default: /* other 94 single byte */
yuuji@0 1408 return NIL; /* definitely invalid */
yuuji@0 1409 }
yuuji@0 1410 }
yuuji@0 1411 /* if possible UTF-8 and not ISO-2022-JP */
yuuji@0 1412 else if (!iso2022jp && (eightbit >= 0) && (src->data[i] & BIT8) &&
yuuji@0 1413 (eightbit = utf8_validate (src->data + i,src->size - i)) > 0)
yuuji@0 1414 i += eightbit - 1; /* skip past all but last of UTF-8 char */
yuuji@0 1415 }
yuuji@0 1416 /* ISO-2022-JP overrides other guesses */
yuuji@0 1417 if (iso2022jp) return utf8_charset ("ISO-2022-JP");
yuuji@0 1418 if (eightbit > 0) return utf8_charset ("UTF-8");
yuuji@0 1419 return eightbit ? NIL : utf8_charset ("US-ASCII");
yuuji@0 1420 }
yuuji@0 1421
yuuji@0 1422
yuuji@0 1423 /* Validate that character at this position is UTF-8
yuuji@0 1424 * Accepts: string pointer
yuuji@0 1425 * size of remaining string
yuuji@0 1426 * Returns: size of UTF-8 character in octets or -1 if not UTF-8
yuuji@0 1427 */
yuuji@0 1428
yuuji@0 1429 long utf8_validate (unsigned char *s,unsigned long i)
yuuji@0 1430 {
yuuji@0 1431 unsigned long j = i;
yuuji@0 1432 return (utf8_get (&s,&i) & U8G_ERROR) ? -1 : j - i;
yuuji@0 1433 }
yuuji@0 1434
yuuji@0 1435 /* Convert ISO 8859-1 to UTF-8
yuuji@0 1436 * Accepts: source sized text
yuuji@0 1437 * pointer to return sized text
yuuji@0 1438 * canonicalization function
yuuji@0 1439 */
yuuji@0 1440
yuuji@0 1441 void utf8_text_1byte0 (SIZEDTEXT *text,SIZEDTEXT *ret,ucs4cn_t cv,ucs4de_t de)
yuuji@0 1442 {
yuuji@0 1443 unsigned long i;
yuuji@0 1444 unsigned char *s;
yuuji@0 1445 unsigned int c;
yuuji@0 1446 for (ret->size = i = 0; i < text->size;) {
yuuji@0 1447 c = text->data[i++];
yuuji@0 1448 UTF8_COUNT_BMP (ret->size,c,cv,de)
yuuji@0 1449 }
yuuji@0 1450 (s = ret->data = (unsigned char *) fs_get (ret->size + 1))[ret->size] =NIL;
yuuji@0 1451 for (i = 0; i < text->size;) {
yuuji@0 1452 c = text->data[i++];
yuuji@0 1453 UTF8_WRITE_BMP (s,c,cv,de) /* convert UCS-2 to UTF-8 */
yuuji@0 1454 }
yuuji@0 1455 }
yuuji@0 1456
yuuji@0 1457
yuuji@0 1458 /* Convert single byte ASCII+8bit character set sized text to UTF-8
yuuji@0 1459 * Accepts: source sized text
yuuji@0 1460 * pointer to return sized text
yuuji@0 1461 * conversion table
yuuji@0 1462 * canonicalization function
yuuji@0 1463 */
yuuji@0 1464
yuuji@0 1465 void utf8_text_1byte (SIZEDTEXT *text,SIZEDTEXT *ret,void *tab,ucs4cn_t cv,
yuuji@0 1466 ucs4de_t de)
yuuji@0 1467 {
yuuji@0 1468 unsigned long i;
yuuji@0 1469 unsigned char *s;
yuuji@0 1470 unsigned int c;
yuuji@0 1471 unsigned short *tbl = (unsigned short *) tab;
yuuji@0 1472 for (ret->size = i = 0; i < text->size;) {
yuuji@0 1473 if ((c = text->data[i++]) & BIT8) c = tbl[c & BITS7];
yuuji@0 1474 UTF8_COUNT_BMP (ret->size,c,cv,de)
yuuji@0 1475 }
yuuji@0 1476 (s = ret->data = (unsigned char *) fs_get (ret->size + 1))[ret->size] =NIL;
yuuji@0 1477 for (i = 0; i < text->size;) {
yuuji@0 1478 if ((c = text->data[i++]) & BIT8) c = tbl[c & BITS7];
yuuji@0 1479 UTF8_WRITE_BMP (s,c,cv,de) /* convert UCS-2 to UTF-8 */
yuuji@0 1480 }
yuuji@0 1481 }
yuuji@0 1482
yuuji@0 1483 /* Convert single byte 8bit character set sized text to UTF-8
yuuji@0 1484 * Accepts: source sized text
yuuji@0 1485 * pointer to return sized text
yuuji@0 1486 * conversion table
yuuji@0 1487 * canonicalization function
yuuji@0 1488 */
yuuji@0 1489
yuuji@0 1490 void utf8_text_1byte8 (SIZEDTEXT *text,SIZEDTEXT *ret,void *tab,ucs4cn_t cv,
yuuji@0 1491 ucs4de_t de)
yuuji@0 1492 {
yuuji@0 1493 unsigned long i;
yuuji@0 1494 unsigned char *s;
yuuji@0 1495 unsigned int c;
yuuji@0 1496 unsigned short *tbl = (unsigned short *) tab;
yuuji@0 1497 for (ret->size = i = 0; i < text->size;) {
yuuji@0 1498 c = tbl[text->data[i++]];
yuuji@0 1499 UTF8_COUNT_BMP (ret->size,c,cv,de)
yuuji@0 1500 }
yuuji@0 1501 (s = ret->data = (unsigned char *) fs_get (ret->size + 1))[ret->size] =NIL;
yuuji@0 1502 for (i = 0; i < text->size;) {
yuuji@0 1503 c = tbl[text->data[i++]];
yuuji@0 1504 UTF8_WRITE_BMP (s,c,cv,de) /* convert UCS-2 to UTF-8 */
yuuji@0 1505 }
yuuji@0 1506 }
yuuji@0 1507
yuuji@0 1508 /* Convert EUC sized text to UTF-8
yuuji@0 1509 * Accepts: source sized text
yuuji@0 1510 * pointer to return sized text
yuuji@0 1511 * EUC parameter table
yuuji@0 1512 * canonicalization function
yuuji@0 1513 */
yuuji@0 1514
yuuji@0 1515 void utf8_text_euc (SIZEDTEXT *text,SIZEDTEXT *ret,void *tab,ucs4cn_t cv,
yuuji@0 1516 ucs4de_t de)
yuuji@0 1517 {
yuuji@0 1518 unsigned long i;
yuuji@0 1519 unsigned char *s;
yuuji@0 1520 unsigned int pass,c,c1,ku,ten;
yuuji@0 1521 struct utf8_eucparam *p1 = (struct utf8_eucparam *) tab;
yuuji@0 1522 struct utf8_eucparam *p2 = p1 + 1;
yuuji@0 1523 struct utf8_eucparam *p3 = p1 + 2;
yuuji@0 1524 unsigned short *t1 = (unsigned short *) p1->tab;
yuuji@0 1525 unsigned short *t2 = (unsigned short *) p2->tab;
yuuji@0 1526 unsigned short *t3 = (unsigned short *) p3->tab;
yuuji@0 1527 for (pass = 0,s = NIL,ret->size = 0; pass <= 1; pass++) {
yuuji@0 1528 for (i = 0; i < text->size;) {
yuuji@0 1529 /* not CS0? */
yuuji@0 1530 if ((c = text->data[i++]) & BIT8) {
yuuji@0 1531 /* yes, must have another high byte */
yuuji@0 1532 if ((i >= text->size) || !((c1 = text->data[i++]) & BIT8))
yuuji@0 1533 c = UBOGON; /* out of space or bogon */
yuuji@0 1534 else switch (c) { /* check 8bit code set */
yuuji@0 1535 case EUC_CS2: /* CS2 */
yuuji@0 1536 if (p2->base_ku) { /* CS2 set up? */
yuuji@0 1537 if (p2->base_ten) /* yes, multibyte? */
yuuji@0 1538 c = ((i < text->size) && ((c = text->data[i++]) & BIT8) &&
yuuji@0 1539 ((ku = (c1 & BITS7) - p2->base_ku) < p2->max_ku) &&
yuuji@0 1540 ((ten = (c & BITS7) - p2->base_ten) < p2->max_ten)) ?
yuuji@0 1541 t2[(ku*p2->max_ten) + ten] : UBOGON;
yuuji@0 1542 else c = ((c1 >= p2->base_ku) && (c1 < p2->max_ku)) ?
yuuji@0 1543 c1 + ((unsigned long) p2->tab) : UBOGON;
yuuji@0 1544 }
yuuji@0 1545 else { /* CS2 not set up */
yuuji@0 1546 c = UBOGON; /* swallow byte, say bogon */
yuuji@0 1547 if (i < text->size) i++;
yuuji@0 1548 }
yuuji@0 1549 break;
yuuji@0 1550 case EUC_CS3: /* CS3 */
yuuji@0 1551 if (p3->base_ku) { /* CS3 set up? */
yuuji@0 1552 if (p3->base_ten) /* yes, multibyte? */
yuuji@0 1553 c = ((i < text->size) && ((c = text->data[i++]) & BIT8) &&
yuuji@0 1554 ((ku = (c1 & BITS7) - p3->base_ku) < p3->max_ku) &&
yuuji@0 1555 ((ten = (c & BITS7) - p3->base_ten) < p3->max_ten)) ?
yuuji@0 1556 t3[(ku*p3->max_ten) + ten] : UBOGON;
yuuji@0 1557 else c = ((c1 >= p3->base_ku) && (c1 < p3->max_ku)) ?
yuuji@0 1558 c1 + ((unsigned long) p3->tab) : UBOGON;
yuuji@0 1559 }
yuuji@0 1560 else { /* CS3 not set up */
yuuji@0 1561 c = UBOGON; /* swallow byte, say bogon */
yuuji@0 1562 if (i < text->size) i++;
yuuji@0 1563 }
yuuji@0 1564 break;
yuuji@0 1565
yuuji@0 1566 default:
yuuji@0 1567 if (((ku = (c & BITS7) - p1->base_ku) >= p1->max_ku) ||
yuuji@0 1568 ((ten = (c1 & BITS7) - p1->base_ten) >= p1->max_ten)) c = UBOGON;
yuuji@0 1569 else if (((c = t1[(ku*p1->max_ten) + ten]) == UBOGON) &&
yuuji@0 1570 /* special hack for JIS X 0212: merge rows less than 10 */
yuuji@0 1571 ku && (ku < 10) && t3 && p3->base_ten)
yuuji@0 1572 c = t3[((ku - (p3->base_ku - p1->base_ku))*p3->max_ten) + ten];
yuuji@0 1573 }
yuuji@0 1574 }
yuuji@0 1575 /* convert if second pass */
yuuji@0 1576 if (pass) UTF8_WRITE_BMP (s,c,cv,de)
yuuji@0 1577 else UTF8_COUNT_BMP (ret->size,c,cv,de);
yuuji@0 1578 }
yuuji@0 1579 if (!pass) (s = ret->data = (unsigned char *)
yuuji@0 1580 fs_get (ret->size + 1))[ret->size] =NIL;
yuuji@0 1581 }
yuuji@0 1582 }
yuuji@0 1583
yuuji@0 1584
yuuji@0 1585 /* Convert ASCII + double-byte sized text to UTF-8
yuuji@0 1586 * Accepts: source sized text
yuuji@0 1587 * pointer to return sized text
yuuji@0 1588 * conversion table
yuuji@0 1589 * canonicalization function
yuuji@0 1590 */
yuuji@0 1591
yuuji@0 1592 void utf8_text_dbyte (SIZEDTEXT *text,SIZEDTEXT *ret,void *tab,ucs4cn_t cv,
yuuji@0 1593 ucs4de_t de)
yuuji@0 1594 {
yuuji@0 1595 unsigned long i;
yuuji@0 1596 unsigned char *s;
yuuji@0 1597 unsigned int c,c1,ku,ten;
yuuji@0 1598 struct utf8_eucparam *p1 = (struct utf8_eucparam *) tab;
yuuji@0 1599 unsigned short *t1 = (unsigned short *) p1->tab;
yuuji@0 1600 for (ret->size = i = 0; i < text->size;) {
yuuji@0 1601 if ((c = text->data[i++]) & BIT8) {
yuuji@0 1602 /* special hack for GBK: 0x80 is Euro */
yuuji@0 1603 if ((c == 0x80) && (t1 == (unsigned short *) gb2312tab)) c = UCS2_EURO;
yuuji@0 1604 else c = ((i < text->size) && (c1 = text->data[i++]) &&
yuuji@0 1605 ((ku = c - p1->base_ku) < p1->max_ku) &&
yuuji@0 1606 ((ten = c1 - p1->base_ten) < p1->max_ten)) ?
yuuji@0 1607 t1[(ku*p1->max_ten) + ten] : UBOGON;
yuuji@0 1608 }
yuuji@0 1609 UTF8_COUNT_BMP (ret->size,c,cv,de)
yuuji@0 1610 }
yuuji@0 1611 (s = ret->data = (unsigned char *) fs_get (ret->size + 1))[ret->size] = NIL;
yuuji@0 1612 for (i = 0; i < text->size;) {
yuuji@0 1613 if ((c = text->data[i++]) & BIT8) {
yuuji@0 1614 /* special hack for GBK: 0x80 is Euro */
yuuji@0 1615 if ((c == 0x80) && (t1 == (unsigned short *) gb2312tab)) c = UCS2_EURO;
yuuji@0 1616 else c = ((i < text->size) && (c1 = text->data[i++]) &&
yuuji@0 1617 ((ku = c - p1->base_ku) < p1->max_ku) &&
yuuji@0 1618 ((ten = c1 - p1->base_ten) < p1->max_ten)) ?
yuuji@0 1619 t1[(ku*p1->max_ten) + ten] : UBOGON;
yuuji@0 1620 }
yuuji@0 1621 UTF8_WRITE_BMP (s,c,cv,de) /* convert UCS-2 to UTF-8 */
yuuji@0 1622 }
yuuji@0 1623 }
yuuji@0 1624
yuuji@0 1625 /* Convert ASCII + double byte 2 plane sized text to UTF-8
yuuji@0 1626 * Accepts: source sized text
yuuji@0 1627 * pointer to return sized text
yuuji@0 1628 * conversion table
yuuji@0 1629 * canonicalization function
yuuji@0 1630 */
yuuji@0 1631
yuuji@0 1632 void utf8_text_dbyte2 (SIZEDTEXT *text,SIZEDTEXT *ret,void *tab,ucs4cn_t cv,
yuuji@0 1633 ucs4de_t de)
yuuji@0 1634 {
yuuji@0 1635 unsigned long i;
yuuji@0 1636 unsigned char *s;
yuuji@0 1637 unsigned int c,c1,ku,ten;
yuuji@0 1638 struct utf8_eucparam *p1 = (struct utf8_eucparam *) tab;
yuuji@0 1639 struct utf8_eucparam *p2 = p1 + 1;
yuuji@0 1640 unsigned short *t = (unsigned short *) p1->tab;
yuuji@0 1641 for (ret->size = i = 0; i < text->size;) {
yuuji@0 1642 if ((c = text->data[i++]) & BIT8) {
yuuji@0 1643 if ((i >= text->size) || !(c1 = text->data[i++]))
yuuji@0 1644 c = UBOGON; /* out of space or bogon */
yuuji@0 1645 else if (c1 & BIT8) /* high vs. low plane */
yuuji@0 1646 c = ((ku = c - p2->base_ku) < p2->max_ku &&
yuuji@0 1647 ((ten = c1 - p2->base_ten) < p2->max_ten)) ?
yuuji@0 1648 t[(ku*(p1->max_ten + p2->max_ten)) + p1->max_ten + ten] :UBOGON;
yuuji@0 1649 else c = ((ku = c - p1->base_ku) < p1->max_ku &&
yuuji@0 1650 ((ten = c1 - p1->base_ten) < p1->max_ten)) ?
yuuji@0 1651 t[(ku*(p1->max_ten + p2->max_ten)) + ten] : UBOGON;
yuuji@0 1652 }
yuuji@0 1653 UTF8_COUNT_BMP (ret->size,c,cv,de)
yuuji@0 1654 }
yuuji@0 1655 (s = ret->data = (unsigned char *) fs_get (ret->size + 1))[ret->size] = NIL;
yuuji@0 1656 for (i = 0; i < text->size;) {
yuuji@0 1657 if ((c = text->data[i++]) & BIT8) {
yuuji@0 1658 if ((i >= text->size) || !(c1 = text->data[i++]))
yuuji@0 1659 c = UBOGON; /* out of space or bogon */
yuuji@0 1660 else if (c1 & BIT8) /* high vs. low plane */
yuuji@0 1661 c = ((ku = c - p2->base_ku) < p2->max_ku &&
yuuji@0 1662 ((ten = c1 - p2->base_ten) < p2->max_ten)) ?
yuuji@0 1663 t[(ku*(p1->max_ten + p2->max_ten)) + p1->max_ten + ten] :UBOGON;
yuuji@0 1664 else c = ((ku = c - p1->base_ku) < p1->max_ku &&
yuuji@0 1665 ((ten = c1 - p1->base_ten) < p1->max_ten)) ?
yuuji@0 1666 t[(ku*(p1->max_ten + p2->max_ten)) + ten] : UBOGON;
yuuji@0 1667 }
yuuji@0 1668 UTF8_WRITE_BMP (s,c,cv,de) /* convert UCS-2 to UTF-8 */
yuuji@0 1669 }
yuuji@0 1670 }
yuuji@0 1671
#ifdef JISTOUNICODE             /* Japanese */
/* Convert Shift JIS sized text to UTF-8
 * Accepts: source sized text
 *          pointer to return sized text (data is allocated with fs_get()
 *           and NUL-terminated; size excludes the terminator)
 *          canonicalization function
 *          decomposition function
 *
 * The text is scanned twice by the same decoding loop: the first pass
 * counts the UTF-8 octets needed, the second pass writes them.  Sharing the
 * loop guarantees that a lead octet at the very end of the text is treated
 * as UBOGON in both passes; previously only the counting pass checked for
 * this, and the writing pass fetched an octet beyond the end of the source.
 */

void utf8_text_sjis (SIZEDTEXT *text,SIZEDTEXT *ret,ucs4cn_t cv,
                     ucs4de_t de)
{
  unsigned long i;
  unsigned char *s = NIL;
  unsigned int pass,c,c1,ku,ten;
  ret->size = 0;
  for (pass = 0; pass <= 1; pass++) {
    for (i = 0; i < text->size;) {
      if ((c = text->data[i++]) & BIT8) {
                                /* half-width katakana */
        if ((c >= MIN_KANA_8) && (c < MAX_KANA_8)) c += KANA_8;
                                /* lead octet with nothing after it */
        else if (i >= text->size) c = UBOGON;
        else {                  /* Shift-JIS */
          c1 = text->data[i++];
          SJISTOJIS (c,c1);
          c = JISTOUNICODE (c,c1,ku,ten);
        }
      }
                                /* compromise - do yen sign but not overline */
      else if (c == JISROMAN_YEN) c = UCS2_YEN;
      if (pass) {               /* second pass converts */
        UTF8_WRITE_BMP (s,c,cv,de)
      }
      else {                    /* first pass only counts */
        UTF8_COUNT_BMP (ret->size,c,cv,de)
      }
    }
    if (!pass) {                /* size known, get the buffer */
      s = ret->data = (unsigned char *) fs_get (ret->size + 1);
      s[ret->size] = NIL;
    }
  }
}
#endif
yuuji@0 1717
yuuji@0 1718 /* Convert ISO-2022 sized text to UTF-8
yuuji@0 1719 * Accepts: source sized text
yuuji@0 1720 * pointer to returned sized text
yuuji@0 1721 * canonicalization function
yuuji@0 1722 */
yuuji@0 1723
yuuji@0 1724 void utf8_text_2022 (SIZEDTEXT *text,SIZEDTEXT *ret,ucs4cn_t cv,ucs4de_t de)
yuuji@0 1725 {
yuuji@0 1726 unsigned long i;
yuuji@0 1727 unsigned char *s;
yuuji@0 1728 unsigned int pass,state,c,co,gi,gl,gr,g[4],ku,ten;
yuuji@0 1729 for (pass = 0,s = NIL,ret->size = 0; pass <= 1; pass++) {
yuuji@0 1730 gi = 0; /* quell compiler warnings */
yuuji@0 1731 state = I2S_CHAR; /* initialize engine */
yuuji@0 1732 g[0]= g[2] = I2CS_ASCII; /* G0 and G2 are ASCII */
yuuji@0 1733 g[1]= g[3] = I2CS_ISO8859_1;/* G1 and G3 are ISO-8850-1 */
yuuji@0 1734 gl = I2C_G0; gr = I2C_G1; /* left is G0, right is G1 */
yuuji@0 1735 for (i = 0; i < text->size;) {
yuuji@0 1736 c = text->data[i++];
yuuji@0 1737 switch (state) { /* dispatch based upon engine state */
yuuji@0 1738 case I2S_ESC: /* ESC seen */
yuuji@0 1739 switch (c) { /* process intermediate character */
yuuji@0 1740 case I2C_MULTI: /* multibyte character? */
yuuji@0 1741 state = I2S_MUL; /* mark multibyte flag seen */
yuuji@0 1742 break;
yuuji@0 1743 case I2C_SS2: /* single shift GL to G2 */
yuuji@0 1744 case I2C_SS2_ALT: /* Taiwan SeedNet */
yuuji@0 1745 gl |= I2C_SG2;
yuuji@0 1746 break;
yuuji@0 1747 case I2C_SS3: /* single shift GL to G3 */
yuuji@0 1748 case I2C_SS3_ALT: /* Taiwan SeedNet */
yuuji@0 1749 gl |= I2C_SG3;
yuuji@0 1750 break;
yuuji@0 1751 case I2C_LS2: /* shift GL to G2 */
yuuji@0 1752 gl = I2C_G2;
yuuji@0 1753 break;
yuuji@0 1754 case I2C_LS3: /* shift GL to G3 */
yuuji@0 1755 gl = I2C_G3;
yuuji@0 1756 break;
yuuji@0 1757 case I2C_LS1R: /* shift GR to G1 */
yuuji@0 1758 gr = I2C_G1;
yuuji@0 1759 break;
yuuji@0 1760 case I2C_LS2R: /* shift GR to G2 */
yuuji@0 1761 gr = I2C_G2;
yuuji@0 1762 break;
yuuji@0 1763 case I2C_LS3R: /* shift GR to G3 */
yuuji@0 1764 gr = I2C_G3;
yuuji@0 1765 break;
yuuji@0 1766 case I2C_G0_94: case I2C_G1_94: case I2C_G2_94: case I2C_G3_94:
yuuji@0 1767 g[gi = c - I2C_G0_94] = (state == I2S_MUL) ? I2CS_94x94 : I2CS_94;
yuuji@0 1768 state = I2S_INT; /* ready for character set */
yuuji@0 1769 break;
yuuji@0 1770 case I2C_G0_96: case I2C_G1_96: case I2C_G2_96: case I2C_G3_96:
yuuji@0 1771 g[gi = c - I2C_G0_96] = (state == I2S_MUL) ? I2CS_96x96 : I2CS_96;
yuuji@0 1772 state = I2S_INT; /* ready for character set */
yuuji@0 1773 break;
yuuji@0 1774 default: /* bogon */
yuuji@0 1775 if (pass) *s++ = I2C_ESC,*s++ = c;
yuuji@0 1776 else ret->size += 2;
yuuji@0 1777 state = I2S_CHAR; /* return to previous state */
yuuji@0 1778 }
yuuji@0 1779 break;
yuuji@0 1780
yuuji@0 1781 case I2S_MUL: /* ESC $ */
yuuji@0 1782 switch (c) { /* process multibyte intermediate character */
yuuji@0 1783 case I2C_G0_94: case I2C_G1_94: case I2C_G2_94: case I2C_G3_94:
yuuji@0 1784 g[gi = c - I2C_G0_94] = I2CS_94x94;
yuuji@0 1785 state = I2S_INT; /* ready for character set */
yuuji@0 1786 break;
yuuji@0 1787 case I2C_G0_96: case I2C_G1_96: case I2C_G2_96: case I2C_G3_96:
yuuji@0 1788 g[gi = c - I2C_G0_96] = I2CS_96x96;
yuuji@0 1789 state = I2S_INT; /* ready for character set */
yuuji@0 1790 break;
yuuji@0 1791 default: /* probably omitted I2CS_94x94 */
yuuji@0 1792 g[gi = I2C_G0] = I2CS_94x94 | c;
yuuji@0 1793 state = I2S_CHAR; /* return to character state */
yuuji@0 1794 }
yuuji@0 1795 break;
yuuji@0 1796 case I2S_INT:
yuuji@0 1797 state = I2S_CHAR; /* return to character state */
yuuji@0 1798 g[gi] |= c; /* set character set */
yuuji@0 1799 break;
yuuji@0 1800
yuuji@0 1801 case I2S_CHAR: /* character data */
yuuji@0 1802 switch (c) {
yuuji@0 1803 case I2C_ESC: /* ESC character */
yuuji@0 1804 state = I2S_ESC; /* see if ISO-2022 prefix */
yuuji@0 1805 break;
yuuji@0 1806 case I2C_SI: /* shift GL to G0 */
yuuji@0 1807 gl = I2C_G0;
yuuji@0 1808 break;
yuuji@0 1809 case I2C_SO: /* shift GL to G1 */
yuuji@0 1810 gl = I2C_G1;
yuuji@0 1811 break;
yuuji@0 1812 case I2C_SS2_ALT: /* single shift GL to G2 */
yuuji@0 1813 case I2C_SS2_ALT_7:
yuuji@0 1814 gl |= I2C_SG2;
yuuji@0 1815 break;
yuuji@0 1816 case I2C_SS3_ALT: /* single shift GL to G3 */
yuuji@0 1817 case I2C_SS3_ALT_7:
yuuji@0 1818 gl |= I2C_SG3;
yuuji@0 1819 break;
yuuji@0 1820
yuuji@0 1821 default: /* ordinary character */
yuuji@0 1822 co = c; /* note original character */
yuuji@0 1823 if (gl & (3 << 2)) { /* single shifted? */
yuuji@0 1824 gi = g[gl >> 2]; /* get shifted character set */
yuuji@0 1825 gl &= 0x3; /* cancel shift */
yuuji@0 1826 }
yuuji@0 1827 /* select left or right half */
yuuji@0 1828 else gi = (c & BIT8) ? g[gr] : g[gl];
yuuji@0 1829 c &= BITS7; /* make 7-bit */
yuuji@0 1830 switch (gi) { /* interpret in character set */
yuuji@0 1831 case I2CS_ASCII: /* ASCII */
yuuji@0 1832 break; /* easy! */
yuuji@0 1833 case I2CS_BRITISH: /* British ASCII */
yuuji@0 1834 /* Pound sterling sign */
yuuji@0 1835 if (c == BRITISH_POUNDSTERLING) c = UCS2_POUNDSTERLING;
yuuji@0 1836 break;
yuuji@0 1837 case I2CS_JIS_ROMAN: /* JIS Roman */
yuuji@0 1838 case I2CS_JIS_BUGROM: /* old bugs */
yuuji@0 1839 switch (c) { /* two exceptions to ASCII */
yuuji@0 1840 case JISROMAN_YEN: /* Yen sign */
yuuji@0 1841 c = UCS2_YEN;
yuuji@0 1842 break;
yuuji@0 1843 /* overline */
yuuji@0 1844 case JISROMAN_OVERLINE:
yuuji@0 1845 c = UCS2_OVERLINE;
yuuji@0 1846 break;
yuuji@0 1847 }
yuuji@0 1848 break;
yuuji@0 1849 case I2CS_JIS_KANA: /* JIS hankaku katakana */
yuuji@0 1850 if ((c >= MIN_KANA_7) && (c < MAX_KANA_7)) c += KANA_7;
yuuji@0 1851 break;
yuuji@0 1852
yuuji@0 1853 case I2CS_ISO8859_1: /* Latin-1 (West European) */
yuuji@0 1854 c |= BIT8; /* just turn on high bit */
yuuji@0 1855 break;
yuuji@0 1856 case I2CS_ISO8859_2: /* Latin-2 (Czech, Slovak) */
yuuji@0 1857 c = iso8859_2tab[c];
yuuji@0 1858 break;
yuuji@0 1859 case I2CS_ISO8859_3: /* Latin-3 (Dutch, Turkish) */
yuuji@0 1860 c = iso8859_3tab[c];
yuuji@0 1861 break;
yuuji@0 1862 case I2CS_ISO8859_4: /* Latin-4 (Scandinavian) */
yuuji@0 1863 c = iso8859_4tab[c];
yuuji@0 1864 break;
yuuji@0 1865 case I2CS_ISO8859_5: /* Cyrillic */
yuuji@0 1866 c = iso8859_5tab[c];
yuuji@0 1867 break;
yuuji@0 1868 case I2CS_ISO8859_6: /* Arabic */
yuuji@0 1869 c = iso8859_6tab[c];
yuuji@0 1870 break;
yuuji@0 1871 case I2CS_ISO8859_7: /* Greek */
yuuji@0 1872 c = iso8859_7tab[c];
yuuji@0 1873 break;
yuuji@0 1874 case I2CS_ISO8859_8: /* Hebrew */
yuuji@0 1875 c = iso8859_8tab[c];
yuuji@0 1876 break;
yuuji@0 1877 case I2CS_ISO8859_9: /* Latin-5 (Finnish, Portuguese) */
yuuji@0 1878 c = iso8859_9tab[c];
yuuji@0 1879 break;
yuuji@0 1880 case I2CS_TIS620: /* Thai */
yuuji@0 1881 c = tis620tab[c];
yuuji@0 1882 break;
yuuji@0 1883 case I2CS_ISO8859_10: /* Latin-6 (Northern Europe) */
yuuji@0 1884 c = iso8859_10tab[c];
yuuji@0 1885 break;
yuuji@0 1886 case I2CS_ISO8859_13: /* Latin-7 (Baltic) */
yuuji@0 1887 c = iso8859_13tab[c];
yuuji@0 1888 break;
yuuji@0 1889 case I2CS_VSCII: /* Vietnamese */
yuuji@0 1890 c = visciitab[c];
yuuji@0 1891 break;
yuuji@0 1892 case I2CS_ISO8859_14: /* Latin-8 (Celtic) */
yuuji@0 1893 c = iso8859_14tab[c];
yuuji@0 1894 break;
yuuji@0 1895 case I2CS_ISO8859_15: /* Latin-9 (Euro) */
yuuji@0 1896 c = iso8859_15tab[c];
yuuji@0 1897 break;
yuuji@0 1898 case I2CS_ISO8859_16: /* Latin-10 (Baltic) */
yuuji@0 1899 c = iso8859_16tab[c];
yuuji@0 1900 break;
yuuji@0 1901
yuuji@0 1902 default: /* all other character sets */
yuuji@0 1903 /* multibyte character set */
yuuji@0 1904 if ((gi & I2CS_MUL) && !(c & BIT8) && isgraph (c)) {
yuuji@0 1905 c = (i < text->size) ? text->data[i++] : 0;
yuuji@0 1906 switch (gi) {
yuuji@0 1907 #ifdef GBTOUNICODE
yuuji@0 1908 case I2CS_GB: /* GB 2312 */
yuuji@0 1909 co |= BIT8; /* make into EUC */
yuuji@0 1910 c |= BIT8;
yuuji@0 1911 c = GBTOUNICODE (co,c,ku,ten);
yuuji@0 1912 break;
yuuji@0 1913 #endif
yuuji@0 1914 #ifdef JISTOUNICODE
yuuji@0 1915 case I2CS_JIS_OLD:/* JIS X 0208-1978 */
yuuji@0 1916 case I2CS_JIS_NEW:/* JIS X 0208-1983 */
yuuji@0 1917 c = JISTOUNICODE (co,c,ku,ten);
yuuji@0 1918 break;
yuuji@0 1919 #endif
yuuji@0 1920 #ifdef JIS0212TOUNICODE
yuuji@0 1921 case I2CS_JIS_EXT:/* JIS X 0212-1990 */
yuuji@0 1922 c = JIS0212TOUNICODE (co,c,ku,ten);
yuuji@0 1923 break;
yuuji@0 1924 #endif
yuuji@0 1925 #ifdef KSCTOUNICODE
yuuji@0 1926 case I2CS_KSC: /* KSC 5601 */
yuuji@0 1927 co |= BIT8; /* make into EUC */
yuuji@0 1928 c |= BIT8;
yuuji@0 1929 c = KSCTOUNICODE (co,c,ku,ten);
yuuji@0 1930 break;
yuuji@0 1931 #endif
yuuji@0 1932 #ifdef CNS1TOUNICODE
yuuji@0 1933 case I2CS_CNS1: /* CNS 11643 plane 1 */
yuuji@0 1934 c = CNS1TOUNICODE (co,c,ku,ten);
yuuji@0 1935 break;
yuuji@0 1936 #endif
yuuji@0 1937 #ifdef CNS2TOUNICODE
yuuji@0 1938 case I2CS_CNS2: /* CNS 11643 plane 2 */
yuuji@0 1939 c = CNS2TOUNICODE (co,c,ku,ten);
yuuji@0 1940 break;
yuuji@0 1941 #endif
yuuji@0 1942 #ifdef CNS3TOUNICODE
yuuji@0 1943 case I2CS_CNS3: /* CNS 11643 plane 3 */
yuuji@0 1944 c = CNS3TOUNICODE (co,c,ku,ten);
yuuji@0 1945 break;
yuuji@0 1946 #endif
yuuji@0 1947 #ifdef CNS4TOUNICODE
yuuji@0 1948 case I2CS_CNS4: /* CNS 11643 plane 4 */
yuuji@0 1949 c = CNS4TOUNICODE (co,c,ku,ten);
yuuji@0 1950 break;
yuuji@0 1951 #endif
yuuji@0 1952 #ifdef CNS5TOUNICODE
yuuji@0 1953 case I2CS_CNS5: /* CNS 11643 plane 5 */
yuuji@0 1954 c = CNS5TOUNICODE (co,c,ku,ten);
yuuji@0 1955 break;
yuuji@0 1956 #endif
yuuji@0 1957 #ifdef CNS6TOUNICODE
yuuji@0 1958 case I2CS_CNS6: /* CNS 11643 plane 6 */
yuuji@0 1959 c = CNS6TOUNICODE (co,c,ku,ten);
yuuji@0 1960 break;
yuuji@0 1961 #endif
yuuji@0 1962 #ifdef CNS7TOUNICODE
yuuji@0 1963 case I2CS_CNS7: /* CNS 11643 plane 7 */
yuuji@0 1964 c = CNS7TOUNICODE (co,c,ku,ten);
yuuji@0 1965 break;
yuuji@0 1966 #endif
yuuji@0 1967 default: /* unknown multibyte, treat as UCS-2 */
yuuji@0 1968 c |= (co << 8); /* wrong, but nothing else to do */
yuuji@0 1969 break;
yuuji@0 1970 }
yuuji@0 1971 }
yuuji@0 1972 else c = co; /* unknown single byte, treat as 8859-1 */
yuuji@0 1973 }
yuuji@0 1974 /* convert if second pass */
yuuji@0 1975 if (pass) UTF8_WRITE_BMP (s,c,cv,de)
yuuji@0 1976 else UTF8_COUNT_BMP (ret->size,c,cv,de);
yuuji@0 1977 }
yuuji@0 1978 }
yuuji@0 1979 }
yuuji@0 1980 if (!pass) (s = ret->data = (unsigned char *)
yuuji@0 1981 fs_get (ret->size + 1))[ret->size] = NIL;
yuuji@0 1982 else if (((unsigned long) (s - ret->data)) != ret->size)
yuuji@0 1983 fatal ("ISO-2022 to UTF-8 botch");
yuuji@0 1984 }
yuuji@0 1985 }
yuuji@0 1986
yuuji@0 1987 /* Convert UTF-7 sized text to UTF-8
yuuji@0 1988 * Accepts: source sized text
yuuji@0 1989 * pointer to returned sized text
yuuji@0 1990 * canonicalization function
yuuji@0 1991 */
yuuji@0 1992
yuuji@0 1993 void utf8_text_utf7 (SIZEDTEXT *text,SIZEDTEXT *ret,ucs4cn_t cv,ucs4de_t de)
yuuji@0 1994 {
yuuji@0 1995 unsigned long i;
yuuji@0 1996 unsigned char *s;
yuuji@0 1997 unsigned int c,c1,d,uc,pass,e,e1,state,surrh;
yuuji@0 1998 for (pass = 0,s = NIL,ret->size = 0; pass <= 1; pass++) {
yuuji@0 1999 c1 = d = uc = e = e1 = 0;
yuuji@0 2000 for (i = 0,state = NIL; i < text->size;) {
yuuji@0 2001 c = text->data[i++]; /* get next byte */
yuuji@0 2002 switch (state) {
yuuji@0 2003 case U7_PLUS: /* previous character was + */
yuuji@0 2004 if (c == '-') { /* +- means textual + */
yuuji@0 2005 c = '+';
yuuji@0 2006 state = U7_ASCII; /* revert to ASCII */
yuuji@0 2007 break;
yuuji@0 2008 }
yuuji@0 2009 state = U7_UNICODE; /* enter Unicode state */
yuuji@0 2010 e = e1 = 0; /* initialize Unicode quantum position */
yuuji@0 2011 case U7_UNICODE: /* Unicode state */
yuuji@0 2012 if (c == '-') state = U7_MINUS;
yuuji@0 2013 else { /* decode Unicode */
yuuji@0 2014 /* don't use isupper/islower since this is ASCII only */
yuuji@0 2015 if ((c >= 'A') && (c <= 'Z')) c -= 'A';
yuuji@0 2016 else if ((c >= 'a') && (c <= 'z')) c -= 'a' - 26;
yuuji@0 2017 else if (isdigit (c)) c -= '0' - 52;
yuuji@0 2018 else if (c == '+') c = 62;
yuuji@0 2019 else if (c == '/') c = 63;
yuuji@0 2020 else state = U7_ASCII;/* end of modified BASE64 */
yuuji@0 2021 }
yuuji@0 2022 break;
yuuji@0 2023 case U7_MINUS: /* previous character was absorbed - */
yuuji@0 2024 state = U7_ASCII; /* revert to ASCII */
yuuji@0 2025 case U7_ASCII: /* ASCII state */
yuuji@0 2026 if (c == '+') state = U7_PLUS;
yuuji@0 2027 break;
yuuji@0 2028 }
yuuji@0 2029
yuuji@0 2030 switch (state) { /* store character if in character mode */
yuuji@0 2031 case U7_UNICODE: /* Unicode */
yuuji@0 2032 switch (e++) { /* install based on BASE64 state */
yuuji@0 2033 case 0:
yuuji@0 2034 c1 = c << 2; /* byte 1: high 6 bits */
yuuji@0 2035 break;
yuuji@0 2036 case 1:
yuuji@0 2037 d = c1 | (c >> 4); /* byte 1: low 2 bits */
yuuji@0 2038 c1 = c << 4; /* byte 2: high 4 bits */
yuuji@0 2039 break;
yuuji@0 2040 case 2:
yuuji@0 2041 d = c1 | (c >> 2); /* byte 2: low 4 bits */
yuuji@0 2042 c1 = c << 6; /* byte 3: high 2 bits */
yuuji@0 2043 break;
yuuji@0 2044 case 3:
yuuji@0 2045 d = c | c1; /* byte 3: low 6 bits */
yuuji@0 2046 e = 0; /* reinitialize mechanism */
yuuji@0 2047 break;
yuuji@0 2048 }
yuuji@0 2049 if (e == 1) break; /* done if first BASE64 state */
yuuji@0 2050 if (!e1) { /* first byte of UCS-2 character */
yuuji@0 2051 uc = (d & 0xff) << 8; /* note first byte */
yuuji@0 2052 e1 = T; /* enter second UCS-2 state */
yuuji@0 2053 break; /* done */
yuuji@0 2054 }
yuuji@0 2055 c = uc | (d & 0xff); /* build UCS-2 character */
yuuji@0 2056 e1 = NIL; /* back to first UCS-2 state, drop in */
yuuji@0 2057 /* surrogate pair? */
yuuji@0 2058 if ((c >= UTF16_SURR) && (c <= UTF16_MAXSURR)) {
yuuji@0 2059 /* save high surrogate for later */
yuuji@0 2060 if (c < UTF16_SURRL) surrh = c;
yuuji@0 2061 else c = UTF16_BASE + ((surrh & UTF16_MASK) << UTF16_SHIFT) +
yuuji@0 2062 (c & UTF16_MASK);
yuuji@0 2063 break; /* either way with surrogates, we're done */
yuuji@0 2064 }
yuuji@0 2065 case U7_ASCII: /* just install if ASCII */
yuuji@0 2066 /* convert if second pass */
yuuji@0 2067 if (pass) UTF8_WRITE_BMP (s,c,cv,de)
yuuji@0 2068 else UTF8_COUNT_BMP (ret->size,c,cv,de);
yuuji@0 2069 }
yuuji@0 2070 }
yuuji@0 2071 if (!pass) (s = ret->data = (unsigned char *)
yuuji@0 2072 fs_get (ret->size + 1))[ret->size] = NIL;
yuuji@0 2073 else if (((unsigned long) (s - ret->data)) != ret->size)
yuuji@0 2074 fatal ("UTF-7 to UTF-8 botch");
yuuji@0 2075 }
yuuji@0 2076 }
yuuji@0 2077
yuuji@0 2078
yuuji@0 2079 /* Convert UTF-8 sized text to UTF-8
yuuji@0 2080 * Accepts: source sized text
yuuji@0 2081 * pointer to returned sized text
yuuji@0 2082 * canonicalization function
yuuji@0 2083 */
yuuji@0 2084
yuuji@0 2085 void utf8_text_utf8 (SIZEDTEXT *text,SIZEDTEXT *ret,ucs4cn_t cv,ucs4de_t de)
yuuji@0 2086 {
yuuji@0 2087 unsigned long i,c;
yuuji@0 2088 unsigned char *s,*t;
yuuji@0 2089 for (ret->size = 0, t = text->data, i = text->size; i;) {
yuuji@0 2090 if ((c = utf8_get (&t,&i)) & U8G_ERROR) {
yuuji@0 2091 ret->data = text->data; /* conversion failed */
yuuji@0 2092 ret->size = text->size;
yuuji@0 2093 return;
yuuji@0 2094 }
yuuji@0 2095 UTF8_COUNT (ret->size,c,cv,de)
yuuji@0 2096 }
yuuji@0 2097 (s = ret->data = (unsigned char *) fs_get (ret->size + 1))[ret->size] =NIL;
yuuji@0 2098 for (t = text->data, i = text->size; i;) {
yuuji@0 2099 c = utf8_get (&t,&i);
yuuji@0 2100 UTF8_WRITE (s,c,cv,de) /* convert UCS-4 to UTF-8 */
yuuji@0 2101 }
yuuji@0 2102 if (((unsigned long) (s - ret->data)) != ret->size)
yuuji@0 2103 fatal ("UTF-8 to UTF-8 botch");
yuuji@0 2104 }
yuuji@0 2105
yuuji@0 2106 /* Convert UCS-2 sized text to UTF-8
yuuji@0 2107 * Accepts: source sized text
yuuji@0 2108 * pointer to returned sized text
yuuji@0 2109 * canonicalization function
yuuji@0 2110 */
yuuji@0 2111
yuuji@0 2112 void utf8_text_ucs2 (SIZEDTEXT *text,SIZEDTEXT *ret,ucs4cn_t cv,ucs4de_t de)
yuuji@0 2113 {
yuuji@0 2114 unsigned long i;
yuuji@0 2115 unsigned char *s,*t;
yuuji@0 2116 unsigned int c;
yuuji@0 2117 for (ret->size = 0, t = text->data, i = text->size / 2; i; --i) {
yuuji@0 2118 c = *t++ << 8;
yuuji@0 2119 c |= *t++;
yuuji@0 2120 UTF8_COUNT_BMP (ret->size,c,cv,de);
yuuji@0 2121 }
yuuji@0 2122 (s = ret->data = (unsigned char *) fs_get (ret->size + 1))[ret->size] = NIL;
yuuji@0 2123 for (t = text->data, i = text->size / 2; i; --i) {
yuuji@0 2124 c = *t++ << 8;
yuuji@0 2125 c |= *t++;
yuuji@0 2126 UTF8_WRITE_BMP (s,c,cv,de) /* convert UCS-2 to UTF-8 */
yuuji@0 2127 }
yuuji@0 2128 if (((unsigned long) (s - ret->data)) != ret->size)
yuuji@0 2129 fatal ("UCS-2 to UTF-8 botch");
yuuji@0 2130 }
yuuji@0 2131
yuuji@0 2132
yuuji@0 2133 /* Convert UCS-4 sized text to UTF-8
yuuji@0 2134 * Accepts: source sized text
yuuji@0 2135 * pointer to returned sized text
yuuji@0 2136 * canonicalization function
yuuji@0 2137 */
yuuji@0 2138
yuuji@0 2139 void utf8_text_ucs4 (SIZEDTEXT *text,SIZEDTEXT *ret,ucs4cn_t cv,ucs4de_t de)
yuuji@0 2140 {
yuuji@0 2141 unsigned long i;
yuuji@0 2142 unsigned char *s,*t;
yuuji@0 2143 unsigned long c;
yuuji@0 2144 for (ret->size = 0, t = text->data, i = text->size / 4; i; --i) {
yuuji@0 2145 c = *t++ << 24; c |= *t++ << 16; c |= *t++ << 8; c |= *t++;
yuuji@0 2146 UTF8_COUNT (ret->size,c,cv,de);
yuuji@0 2147 }
yuuji@0 2148 (s = ret->data = (unsigned char *) fs_get (ret->size + 1))[ret->size] = NIL;
yuuji@0 2149 for (t = text->data, i = text->size / 2; i; --i) {
yuuji@0 2150 c = *t++ << 24; c |= *t++ << 16; c |= *t++ << 8; c |= *t++;
yuuji@0 2151 UTF8_WRITE (s,c,cv,de) /* convert UCS-4 to UTF-8 */
yuuji@0 2152 }
yuuji@0 2153 if (((unsigned long) (s - ret->data)) != ret->size)
yuuji@0 2154 fatal ("UCS-4 to UTF-8 botch");
yuuji@0 2155 }
yuuji@0 2156
yuuji@0 2157 /* Convert UTF-16 sized text to UTF-8
yuuji@0 2158 * Accepts: source sized text
yuuji@0 2159 * pointer to returned sized text
yuuji@0 2160 * canonicalization function
yuuji@0 2161 */
yuuji@0 2162
yuuji@0 2163 void utf8_text_utf16 (SIZEDTEXT *text,SIZEDTEXT *ret,ucs4cn_t cv,ucs4de_t de)
yuuji@0 2164 {
yuuji@0 2165 unsigned long i;
yuuji@0 2166 unsigned char *s,*t;
yuuji@0 2167 unsigned long c,d;
yuuji@0 2168 for (ret->size = 0, t = text->data, i = text->size / 2; i; --i) {
yuuji@0 2169 c = *t++ << 8;
yuuji@0 2170 c |= *t++;
yuuji@0 2171 /* possible surrogate? */
yuuji@0 2172 if ((c >= UTF16_SURR) && (c <= UTF16_MAXSURR)) {
yuuji@0 2173 /* invalid first surrogate */
yuuji@0 2174 if ((c > UTF16_SURRHEND) || !i) c = UBOGON;
yuuji@0 2175 else { /* get second surrogate */
yuuji@0 2176 d = *t++ << 8;
yuuji@0 2177 d |= *t++;
yuuji@0 2178 --i; /* swallowed another 16-bits */
yuuji@0 2179 /* invalid second surrogate */
yuuji@0 2180 if ((d < UTF16_SURRL) || (d > UTF16_SURRLEND)) c = UBOGON;
yuuji@0 2181 else c = UTF16_BASE + ((c & UTF16_MASK) << UTF16_SHIFT) +
yuuji@0 2182 (d & UTF16_MASK);
yuuji@0 2183 }
yuuji@0 2184 }
yuuji@0 2185 UTF8_COUNT (ret->size,c,cv,de);
yuuji@0 2186 }
yuuji@0 2187 (s = ret->data = (unsigned char *) fs_get (ret->size + 1))[ret->size] = NIL;
yuuji@0 2188 for (t = text->data, i = text->size / 2; i; --i) {
yuuji@0 2189 c = *t++ << 8;
yuuji@0 2190 c |= *t++;
yuuji@0 2191 /* possible surrogate? */
yuuji@0 2192 if ((c >= UTF16_SURR) && (c <= UTF16_MAXSURR)) {
yuuji@0 2193 /* invalid first surrogate */
yuuji@0 2194 if ((c > UTF16_SURRHEND) || !i) c = UBOGON;
yuuji@0 2195 else { /* get second surrogate */
yuuji@0 2196 d = *t++ << 8;
yuuji@0 2197 d |= *t++;
yuuji@0 2198 --i; /* swallowed another 16-bits */
yuuji@0 2199 /* invalid second surrogate */
yuuji@0 2200 if ((d < UTF16_SURRL) || (d > UTF16_SURRLEND)) c = UBOGON;
yuuji@0 2201 else c = UTF16_BASE + ((c & UTF16_MASK) << UTF16_SHIFT) +
yuuji@0 2202 (d & UTF16_MASK);
yuuji@0 2203 }
yuuji@0 2204 }
yuuji@0 2205 UTF8_WRITE (s,c,cv,de) /* convert UCS-4 to UTF-8 */
yuuji@0 2206 }
yuuji@0 2207 if (((unsigned long) (s - ret->data)) != ret->size)
yuuji@0 2208 fatal ("UTF-16 to UTF-8 botch");
yuuji@0 2209 }
yuuji@0 2210
/* Number of UTF-8 octets needed for a UCS-4 character (BMP or beyond)
 * Accepts: character
 * Returns: octet count 1 through 6, or 0 if the value is 0x80000000 or
 *           greater (bogon)
 *
 * Use UTF8_SIZE macro if known to be in the BMP
 */

unsigned long utf8_size (unsigned long c)
{
                                /* first value too large for n+1 octets */
  static const unsigned long limit[6] = {
    0x80,0x800,0x10000,0x200000,0x4000000,0x80000000
  };
  unsigned long n;
  for (n = 0; n < 6; ++n) if (c < limit[n]) return n + 1;
  return 0;                     /* not representable */
}
yuuji@0 2228
yuuji@0 2229
/* Store a UCS-4 character (BMP or beyond) as UTF-8 octets
 * Accepts: destination string pointer; must have room for utf8_size (c)
 *           octets
 *          character
 * Returns: destination pointer advanced past the octets written; unchanged
 *           if utf8_size() reports the character as a bogon
 *
 * Use UTF8_PUT_BMP macro if known to be in the BMP
 */

unsigned char *utf8_put (unsigned char *s,unsigned long c)
{
                                /* lead octet marker, indexed by size - 1 */
  static const unsigned char lead[6] = {0x00,0xc0,0xe0,0xf0,0xf8,0xfc};
  unsigned long len = utf8_size (c);
  unsigned long k;
  if (len) {
                                /* continuation octets, last one first, each
                                 * carrying the low six bits still in c
                                 */
    for (k = len - 1; k; --k) {
      s[k] = 0x80 | (unsigned char) (c & 0x3f);
      c >>= 6;
    }
                                /* what remains of c goes in the lead octet */
    s[0] = lead[len - 1] | (unsigned char) (c & 0x7f);
  }
  return s + len;
}
yuuji@0 2264
yuuji@0 2265 /* Return title case of a fixed-width UCS-4 character
yuuji@0 2266 * Accepts: character
yuuji@0 2267 * Returns: title case of character
yuuji@0 2268 */
yuuji@0 2269
yuuji@0 2270 unsigned long ucs4_titlecase (unsigned long c)
yuuji@0 2271 {
yuuji@0 2272 if (c <= UCS4_TMAPMAX) return ucs4_tmaptab[c];
yuuji@0 2273 if (c < UCS4_TMAPHIMIN) return c;
yuuji@0 2274 if (c <= UCS4_TMAPHIMAX) return c - UCS4_TMAPHIMAP;
yuuji@0 2275 if (c < UCS4_TMAPDESERETMIN) return c;
yuuji@0 2276 if (c <= UCS4_TMAPDESERETMAX) return c - UCS4_TMAPDESERETMAP;
yuuji@0 2277 return c;
yuuji@0 2278 }
yuuji@0 2279
yuuji@0 2280
yuuji@0 2281 /* Return width of a fixed-width UCS-4 character in planes 0-2
yuuji@0 2282 * Accepts: character
yuuji@0 2283 * Returns: width (0, 1, 2) or negative error condition if not valid
yuuji@0 2284 */
yuuji@0 2285
yuuji@0 2286 long ucs4_width (unsigned long c)
yuuji@0 2287 {
yuuji@0 2288 long ret;
yuuji@0 2289 /* out of range, not-a-char, or surrogates */
yuuji@0 2290 if ((c > UCS4_MAXUNICODE) || ((c & 0xfffe) == 0xfffe) ||
yuuji@0 2291 ((c >= UTF16_SURR) && (c <= UTF16_MAXSURR))) ret = U4W_NOTUNCD;
yuuji@0 2292 /* private-use */
yuuji@0 2293 else if (c >= UCS4_PVTBASE) ret = U4W_PRIVATE;
yuuji@0 2294 /* SSP are not printing characters */
yuuji@0 2295 else if (c >= UCS4_SSPBASE) ret = U4W_SSPCHAR;
yuuji@0 2296 /* unassigned planes */
yuuji@0 2297 else if (c >= UCS4_UNABASE) ret = U4W_UNASSGN;
yuuji@0 2298 /* SIP and reserved plane 3 are wide */
yuuji@0 2299 else if (c >= UCS4_SIPBASE) ret = 2;
yuuji@0 2300 #if (UCS4_WIDLEN != UCS4_SIPBASE)
yuuji@0 2301 #error "UCS4_WIDLEN != UCS4_SIPBASE"
yuuji@0 2302 #endif
yuuji@0 2303 /* C0/C1 controls */
yuuji@0 2304 else if ((c <= UCS2_C0CONTROLEND) ||
yuuji@0 2305 ((c >= UCS2_C1CONTROL) && (c <= UCS2_C1CONTROLEND)))
yuuji@0 2306 ret = U4W_CONTROL;
yuuji@0 2307 /* BMP and SMP get value from table */
yuuji@0 2308 else switch (ret = (ucs4_widthtab[(c >> 2)] >> ((3 - (c & 0x3)) << 1)) &0x3){
yuuji@0 2309 case 0: /* zero-width */
yuuji@0 2310 if (c == 0x00ad) ret = 1; /* force U+00ad (SOFT HYPHEN) to width 1 */
yuuji@0 2311 case 1: /* single-width */
yuuji@0 2312 case 2: /* double-width */
yuuji@0 2313 break;
yuuji@0 2314 case 3: /* ambiguous width */
yuuji@0 2315 ret = (c >= 0x2100) ? 2 : 1;/* need to do something better than this */
yuuji@0 2316 break;
yuuji@0 2317 }
yuuji@0 2318 return ret;
yuuji@0 2319 }
yuuji@0 2320
yuuji@0 2321 /* Return screen width of UTF-8 string
yuuji@0 2322 * Accepts: string
yuuji@0 2323 * Returns: width or negative if not valid UTF-8
yuuji@0 2324 */
yuuji@0 2325
yuuji@0 2326 long utf8_strwidth (unsigned char *s)
yuuji@0 2327 {
yuuji@0 2328 unsigned long c,i,ret;
yuuji@0 2329 /* go through string */
yuuji@0 2330 for (ret = 0; *s; ret += ucs4_width (c)) {
yuuji@0 2331 /* It's alright to give a fake value for the byte count to utf8_get()
yuuji@0 2332 * since the null of a null-terminated string will stop processing anyway.
yuuji@0 2333 */
yuuji@0 2334 i = 6; /* fake value */
yuuji@0 2335 if ((c = utf8_get (&s,&i)) & U8G_ERROR) return -1;
yuuji@0 2336 }
yuuji@0 2337 return ret;
yuuji@0 2338 }
yuuji@0 2339
yuuji@0 2340
yuuji@0 2341 /* Return screen width of UTF-8 text
yuuji@0 2342 * Accepts: SIZEDTEXT to string
yuuji@0 2343 * Returns: width or negative if not valid UTF-8
yuuji@0 2344 */
yuuji@0 2345
yuuji@0 2346 long utf8_textwidth (SIZEDTEXT *utf8)
yuuji@0 2347 {
yuuji@0 2348 unsigned long c;
yuuji@0 2349 unsigned char *s = utf8->data;
yuuji@0 2350 unsigned long i = utf8->size;
yuuji@0 2351 unsigned long ret = 0;
yuuji@0 2352 while (i) { /* while there's a string to process */
yuuji@0 2353 if ((c = utf8_get (&s,&i)) & U8G_ERROR) return -1;
yuuji@0 2354 ret += ucs4_width (c);
yuuji@0 2355 }
yuuji@0 2356 return ret;
yuuji@0 2357 }
yuuji@0 2358
/* Decomposition state ("more" blocks) */

                                /* values of decomposemore.type */
#define MORESINGLE 1            /* one UCS-4 value is still pending */
#define MOREMULTIPLE 2          /* a run of UCS-2 values is still pending */

/* What remains of a decomposition after its first value was returned */

struct decomposemore {
  short type;                   /* MORESINGLE or MOREMULTIPLE */
  union {
    unsigned long single;       /* MORESINGLE: the pending value */
    struct {                    /* MOREMULTIPLE: the pending BMP values */
      unsigned short *next;     /* next value to return */
      unsigned long count;      /* how many values are left */
    } multiple;
  } data;
};

#define RECURSIVEMORE struct recursivemore

/* List node used by recursive decomposition to stack pending more blocks */

struct recursivemore {
  struct decomposemore *more;   /* pending block for this level */
  RECURSIVEMORE *next;          /* rest of the list */
};
yuuji@0 2381
yuuji@0 2382
yuuji@0 2383 /* Return decomposition of a UCS-4 character
yuuji@0 2384 * Accepts: character or U8G_ERROR to return next from "more"
yuuji@0 2385 * pointer to returned more
yuuji@0 2386 * Returns: [next] decomposed value, more set if still more decomposition
yuuji@0 2387 */
yuuji@0 2388
yuuji@0 2389 unsigned long ucs4_decompose (unsigned long c,void **more)
yuuji@0 2390 {
yuuji@0 2391 unsigned long i,ix,ret;
yuuji@0 2392 struct decomposemore *m;
yuuji@0 2393 if (c & U8G_ERROR) { /* want to chase more? */
yuuji@0 2394 /* do sanity check */
yuuji@0 2395 if (m = (struct decomposemore *) *more) switch (m->type) {
yuuji@0 2396 case MORESINGLE: /* single value */
yuuji@0 2397 ret = m->data.single;
yuuji@0 2398 fs_give (more); /* no more decomposition */
yuuji@0 2399 break;
yuuji@0 2400 case MOREMULTIPLE: /* multiple value */
yuuji@0 2401 ret = *m->data.multiple.next++;
yuuji@0 2402 if (!--m->data.multiple.count) fs_give (more);
yuuji@0 2403 break;
yuuji@0 2404 default: /* uh-oh */
yuuji@0 2405 fatal ("invalid more block argument to ucs4_decompose!");
yuuji@0 2406 }
yuuji@0 2407 else fatal ("no more block provided to ucs4_decompose!");
yuuji@0 2408 }
yuuji@0 2409
yuuji@0 2410 else { /* start decomposition */
yuuji@0 2411 *more = NIL; /* initially set no more */
yuuji@0 2412 /* BMP low decompositions */
yuuji@0 2413 if (c < UCS4_BMPLOMIN) ret = c;
yuuji@0 2414 /* fix this someday */
yuuji@0 2415 else if (c == UCS4_BMPLOMIN) ret = ucs4_dbmplotab[0];
yuuji@0 2416 else if (c <= UCS4_BMPLOMAX) {
yuuji@0 2417 /* within range - have a decomposition? */
yuuji@0 2418 if (i = ucs4_dbmploixtab[c - UCS4_BMPLOMIN]) {
yuuji@0 2419 /* get first value of decomposition */
yuuji@0 2420 ret = ucs4_dbmplotab[ix = i & UCS4_BMPLOIXMASK];
yuuji@0 2421 /* has continuation? */
yuuji@0 2422 if (i & UCS4_BMPLOSIZEMASK) {
yuuji@0 2423 m = (struct decomposemore *)
yuuji@0 2424 (*more = memset (fs_get (sizeof (struct decomposemore)),0,
yuuji@0 2425 sizeof (struct decomposemore)));
yuuji@0 2426 m->type = MOREMULTIPLE;
yuuji@0 2427 m->data.multiple.next = &ucs4_dbmplotab[++ix];
yuuji@0 2428 m->data.multiple.count = i >> UCS4_BMPLOSIZESHIFT;
yuuji@0 2429 }
yuuji@0 2430 }
yuuji@0 2431 else ret = c; /* in range but doesn't decompose */
yuuji@0 2432 }
yuuji@0 2433 /* BMP CJK compatibility */
yuuji@0 2434 else if (c < UCS4_BMPCJKMIN) ret = c;
yuuji@0 2435 else if (c <= UCS4_BMPCJKMAX) {
yuuji@0 2436 if (!(ret = ucs4_bmpcjk1decomptab[c - UCS4_BMPCJKMIN])) ret = c;
yuuji@0 2437 }
yuuji@0 2438 /* BMP CJK compatibility - some not in BMP */
yuuji@0 2439 #if UCS4_BMPCJK2MIN - (UCS4_BMPCJKMAX + 1)
yuuji@0 2440 else if (c < UCS4_BMPCJK2MIN) ret = c;
yuuji@0 2441 #endif
yuuji@0 2442 else if (c <= UCS4_BMPCJK2MAX)
yuuji@0 2443 ret = ucs4_bmpcjk2decomptab[c - UCS4_BMPCJK2MIN];
yuuji@0 2444 /* BMP high decompositions */
yuuji@0 2445 else if (c < UCS4_BMPHIMIN) ret = c;
yuuji@0 2446 else if (c <= UCS4_BMPHIMAX) {
yuuji@0 2447 /* within range - have a decomposition? */
yuuji@0 2448 if (i = ucs4_dbmphiixtab[c - UCS4_BMPHIMIN]) {
yuuji@0 2449 /* get first value of decomposition */
yuuji@0 2450 ret = ucs4_dbmphitab[ix = i & UCS4_BMPHIIXMASK];
yuuji@0 2451 /* has continuation? */
yuuji@0 2452 if (i & UCS4_BMPHISIZEMASK) {
yuuji@0 2453 m = (struct decomposemore *)
yuuji@0 2454 (*more = memset (fs_get (sizeof (struct decomposemore)),0,
yuuji@0 2455 sizeof (struct decomposemore)));
yuuji@0 2456 m->type = MOREMULTIPLE;
yuuji@0 2457 m->data.multiple.next = &ucs4_dbmphitab[++ix];
yuuji@0 2458 m->data.multiple.count = i >> UCS4_BMPHISIZESHIFT;
yuuji@0 2459 }
yuuji@0 2460 }
yuuji@0 2461 else ret = c; /* in range but doesn't decompose */
yuuji@0 2462 }
yuuji@0 2463
yuuji@0 2464 /* BMP half and full width forms */
yuuji@0 2465 else if (c < UCS4_BMPHALFFULLMIN) ret = c;
yuuji@0 2466 else if (c <= UCS4_BMPHALFFULLMAX) {
yuuji@0 2467 if (!(ret = ucs4_bmphalffulldecomptab[c - UCS4_BMPHALFFULLMIN])) ret = c;
yuuji@0 2468 }
yuuji@0 2469 /* SMP music */
yuuji@0 2470 else if (c < UCS4_SMPMUSIC1MIN) ret = c;
yuuji@0 2471 else if (c <= UCS4_SMPMUSIC1MAX) {
yuuji@0 2472 ret = ucs4_smpmusic1decomptab[c -= UCS4_SMPMUSIC1MIN][0];
yuuji@0 2473 m = (struct decomposemore *)
yuuji@0 2474 (*more = memset (fs_get (sizeof (struct decomposemore)),0,
yuuji@0 2475 sizeof (struct decomposemore)));
yuuji@0 2476 m->type = MORESINGLE;
yuuji@0 2477 m->data.single = ucs4_smpmusic1decomptab[c][1];
yuuji@0 2478 }
yuuji@0 2479 else if (c < UCS4_SMPMUSIC2MIN) ret = c;
yuuji@0 2480 else if (c <= UCS4_SMPMUSIC2MAX) {
yuuji@0 2481 ret = ucs4_smpmusic2decomptab[c -= UCS4_SMPMUSIC2MIN][0];
yuuji@0 2482 m = (struct decomposemore *)
yuuji@0 2483 (*more = memset (fs_get (sizeof (struct decomposemore)),0,
yuuji@0 2484 sizeof (struct decomposemore)));
yuuji@0 2485 m->type = MORESINGLE;
yuuji@0 2486 m->data.single = ucs4_smpmusic2decomptab[c][1];
yuuji@0 2487 }
yuuji@0 2488 /* SMP mathematical forms */
yuuji@0 2489 else if (c < UCS4_SMPMATHMIN) ret = c;
yuuji@0 2490 else if (c <= UCS4_SMPMATHMAX) {
yuuji@0 2491 if (!(ret = ucs4_smpmathdecomptab[c - UCS4_SMPMATHMIN])) ret = c;
yuuji@0 2492 }
yuuji@0 2493 /* CJK compatibility ideographs in SIP */
yuuji@0 2494 else if (!(ret = ((c >= UCS4_SIPMIN) && (c <= UCS4_SIPMAX)) ?
yuuji@0 2495 ucs4_sipdecomptab[c - UCS4_SIPMIN] : c)) ret = c;
yuuji@0 2496 }
yuuji@0 2497 return ret;
yuuji@0 2498 }
yuuji@0 2499
yuuji@0 2500 /* Return recursive decomposition of a UCS-4 character
yuuji@0 2501 * Accepts: character or U8G_ERROR to return next from "more"
yuuji@0 2502 * pointer to returned more
yuuji@0 2503 * Returns: [next] decomposed value, more set if still more decomposition
yuuji@0 2504 */
yuuji@0 2505
yuuji@0 2506 unsigned long ucs4_decompose_recursive (unsigned long c,void **more)
yuuji@0 2507 {
yuuji@0 2508 unsigned long c1;
yuuji@0 2509 void *m,*mn;
yuuji@0 2510 RECURSIVEMORE *mr;
yuuji@0 2511 if (c & U8G_ERROR) { /* want to chase more? */
yuuji@0 2512 mn = NIL;
yuuji@0 2513 if (mr = (RECURSIVEMORE *) *more) switch (mr->more->type) {
yuuji@0 2514 case MORESINGLE: /* decompose single value */
yuuji@0 2515 c = ucs4_decompose_recursive (mr->more->data.single,&mn);
yuuji@0 2516 *more = mr->next; /* done with this more, remove it */
yuuji@0 2517 fs_give ((void **) &mr->more);
yuuji@0 2518 fs_give ((void **) &mr);
yuuji@0 2519 break;
yuuji@0 2520 case MOREMULTIPLE: /* decompose current value in multiple */
yuuji@0 2521 c = ucs4_decompose_recursive (*mr->more->data.multiple.next++,&mn);
yuuji@0 2522 /* if done with this multiple decomposition */
yuuji@0 2523 if (!--mr->more->data.multiple.count) {
yuuji@0 2524 *more = mr->next; /* done with this more, remove it */
yuuji@0 2525 fs_give ((void **) &mr->more);
yuuji@0 2526 fs_give ((void **) &mr);
yuuji@0 2527 }
yuuji@0 2528 break;
yuuji@0 2529 default: /* uh-oh */
yuuji@0 2530 fatal ("invalid more block argument to ucs4_decompose_recursive!");
yuuji@0 2531 }
yuuji@0 2532 else fatal ("no more block provided to ucs4_decompose_recursive!");
yuuji@0 2533 if (mr = mn) { /* did this value recurse on us? */
yuuji@0 2534 mr->next = *more; /* yes, insert new more at head */
yuuji@0 2535 *more = mr;
yuuji@0 2536 }
yuuji@0 2537 }
yuuji@0 2538 else { /* start decomposition */
yuuji@0 2539 *more = NIL; /* initially set no more */
yuuji@0 2540 mr = NIL;
yuuji@0 2541 do { /* repeatedly decompose this codepoint */
yuuji@0 2542 c = ucs4_decompose (c1 = c,&m);
yuuji@0 2543 if (m) { /* multi-byte decomposition */
yuuji@0 2544 if (c1 == c) fatal ("endless multiple decomposition!");
yuuji@0 2545 /* create a block to stash this more */
yuuji@0 2546 mr = memset (fs_get (sizeof (RECURSIVEMORE)),0,sizeof (RECURSIVEMORE));
yuuji@0 2547 mr->more = m; /* note the expansion */
yuuji@0 2548 mr->next = *more; /* old list is the tail */
yuuji@0 2549 *more = mr; /* and this is the new head */
yuuji@0 2550 }
yuuji@0 2551 } while (c1 != c); /* until nothing more to decompose */
yuuji@0 2552 }
yuuji@0 2553 return c;
yuuji@0 2554 }

UW-IMAP'd extensions by yuuji