imapext-2007

diff src/c-client/utf8.c @ 0:ada5e610ab86

imap-2007e
author yuuji@gentei.org
date Mon, 14 Sep 2009 15:17:45 +0900
parents
children
line diff
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/src/c-client/utf8.c	Mon Sep 14 15:17:45 2009 +0900
     1.3 @@ -0,0 +1,2554 @@
     1.4 +/* ========================================================================
     1.5 + * Copyright 1988-2008 University of Washington
     1.6 + *
     1.7 + * Licensed under the Apache License, Version 2.0 (the "License");
     1.8 + * you may not use this file except in compliance with the License.
     1.9 + * You may obtain a copy of the License at
    1.10 + *
    1.11 + *     http://www.apache.org/licenses/LICENSE-2.0
    1.12 + *
    1.13 + * 
    1.14 + * ========================================================================
    1.15 + */
    1.16 +
    1.17 +/*
    1.18 + * Program:	UTF-8 routines
    1.19 + *
    1.20 + * Author:	Mark Crispin
    1.21 + *		Networks and Distributed Computing
    1.22 + *		Computing & Communications
    1.23 + *		University of Washington
    1.24 + *		Administration Building, AG-44
    1.25 + *		Seattle, WA  98195
    1.26 + *		Internet: MRC@CAC.Washington.EDU
    1.27 + *
    1.28 + * Date:	11 June 1997
    1.29 + * Last Edited:	17 January 2008
    1.30 + */
    1.31 +
    1.32 +
    1.33 +#include <stdio.h>
    1.34 +#include <ctype.h>
    1.35 +#include "c-client.h"
    1.36 +
    1.37 +/*	*** IMPORTANT ***
    1.38 + *
    1.39 + *  There is a very important difference between "character set" and "charset",
    1.40 + * and the comments in this file reflect these differences.  A "character set"
    1.41 + * (also known as "coded character set") is a mapping between codepoints and
    1.42 + * characters.  A "charset" is as defined in MIME, and incorporates one or more
    1.43 + * coded character sets in a character encoding scheme.  See RFC 2130 for more
    1.44 + * details.
    1.45 + */
    1.46 +
    1.47 +
    1.48 +/* Character set conversion tables */
    1.49 +
    1.50 +#include "iso_8859.c"		/* 8-bit single-byte coded graphic */
    1.51 +#include "koi8_r.c"		/* Cyrillic - Russia */
    1.52 +#include "koi8_u.c"		/* Cyrillic - Ukraine */
    1.53 +#include "tis_620.c"		/* Thai */
    1.54 +#include "viscii.c"		/* Vietnamese */
    1.55 +#include "windows.c"		/* Windows */
    1.56 +#include "ibm.c"		/* IBM */
    1.57 +#include "gb_2312.c"		/* Chinese (PRC) - simplified */
    1.58 +#include "gb_12345.c"		/* Chinese (PRC) - traditional */
    1.59 +#include "jis_0208.c"		/* Japanese - basic */
    1.60 +#include "jis_0212.c"		/* Japanese - supplementary */
    1.61 +#include "ksc_5601.c"		/* Korean */
    1.62 +#include "big5.c"		/* Taiwanese (ROC) - industrial standard */
    1.63 +#include "cns11643.c"		/* Taiwanese (ROC) - national standard */
    1.64 +
    1.65 +
    1.66 +#include "widths.c"		/* Unicode character widths */
    1.67 +#include "tmap.c"		/* Unicode titlecase mapping */
    1.68 +#include "decomtab.c"		/* Unicode decompositions */
    1.69 +
    1.70 +/* EUC parameters */
    1.71 +
    1.72 +#ifdef GBTOUNICODE		/* PRC simplified Chinese (GB 2312); only built when the table is compiled in */
    1.73 +static const struct utf8_eucparam gb_param = {
    1.74 +  BASE_GB2312_KU,BASE_GB2312_TEN,MAX_GB2312_KU,MAX_GB2312_TEN,	/* base ku, base ten, max ku, max ten */
    1.75 +  (void *) gb2312tab};		/* table indexed by ku and ten */
    1.76 +#endif
    1.77 +
    1.78 +
    1.79 +#ifdef GB12345TOUNICODE		/* PRC traditional Chinese (GB 12345); only built when the table is compiled in */
    1.80 +static const struct utf8_eucparam gbt_param = {
    1.81 +  BASE_GB12345_KU,BASE_GB12345_TEN,MAX_GB12345_KU,MAX_GB12345_TEN,	/* base ku, base ten, max ku, max ten */
    1.82 +  (void *) gb12345tab};		/* table indexed by ku and ten */
    1.83 +#endif
    1.84 +
    1.85 +
    1.86 +#ifdef BIG5TOUNICODE		/* ROC traditional Chinese (Big5): two parameter blocks sharing one table */
    1.87 +static const struct utf8_eucparam big5_param[] = {
    1.88 +  {BASE_BIG5_KU,BASE_BIG5_TEN_0,MAX_BIG5_KU,MAX_BIG5_TEN_0,(void *) big5tab},	/* first ten range, owns the table */
    1.89 +  {BASE_BIG5_KU,BASE_BIG5_TEN_1,MAX_BIG5_KU,MAX_BIG5_TEN_1,NIL}	/* second ten range ("plane 2" parameters), no table of its own */
    1.90 +};
    1.91 +#endif
    1.92 +
    1.93 +
    1.94 +#ifdef JISTOUNICODE		/* Japanese: three parameter blocks used by the CT_EUC charset entry */
    1.95 +static const struct utf8_eucparam jis_param[] = {
    1.96 +  {BASE_JIS0208_KU,BASE_JIS0208_TEN,MAX_JIS0208_KU,MAX_JIS0208_TEN,	/* JIS X 0208: base ku, base ten, max ku, max ten */
    1.97 +     (void *) jis0208tab},
    1.98 +  {MIN_KANA_8,0,MAX_KANA_8,0,(void *) KANA_8},	/* 8-bit kana range; presumably the EUC CS2 set -- confirm */
    1.99 +#ifdef JIS0212TOUNICODE		/* Japanese extended */
   1.100 +  {BASE_JIS0212_KU,BASE_JIS0212_TEN,MAX_JIS0212_KU,MAX_JIS0212_TEN,	/* JIS X 0212; presumably the EUC CS3 set -- confirm */
   1.101 +     (void *) jis0212tab}
   1.102 +#else
   1.103 +  {0,0,0,0,NIL}			/* placeholder so the array keeps three entries without JIS X 0212 */
   1.104 +#endif
   1.105 +};
   1.106 +#endif
   1.107 +
   1.108 +
   1.109 +#ifdef KSCTOUNICODE		/* Korean (KS C 5601); only built when the table is compiled in */
   1.110 +static const struct utf8_eucparam ksc_param = {
   1.111 +  BASE_KSC5601_KU,BASE_KSC5601_TEN,MAX_KSC5601_KU,MAX_KSC5601_TEN,	/* base ku, base ten, max ku, max ten */
   1.112 +  (void *) ksc5601tab};		/* table indexed by ku and ten */
   1.113 +#endif
   1.114 +
   1.115 +/* List of supported charsets.  Each entry is {name, CT_ type, CF_ flags, table or parameter block (NIL if none), SC_ script bits, charset name string or NIL}; the last column presumably names the preferred equivalent charset -- confirm against the CHARSET declaration.  The list ends with a NIL entry. */
   1.116 +
   1.117 +static const CHARSET utf8_csvalid[] = {
   1.118 +  {"US-ASCII",CT_ASCII,CF_PRIMARY | CF_DISPLAY | CF_POSTING,
   1.119 +   NIL,NIL,NIL},
   1.120 +  {"UTF-8",CT_UTF8,CF_PRIMARY | CF_DISPLAY | CF_POSTING,
   1.121 +   NIL,SC_UNICODE,NIL},
   1.122 +  {"UTF-7",CT_UTF7,CF_PRIMARY | CF_POSTING | CF_UNSUPRT,
   1.123 +   NIL,SC_UNICODE,"UTF-8"},
   1.124 +  {"ISO-8859-1",CT_1BYTE0,CF_PRIMARY | CF_DISPLAY | CF_POSTING,
   1.125 +   NIL,SC_LATIN_1,NIL},
   1.126 +  {"ISO-8859-2",CT_1BYTE,CF_PRIMARY | CF_DISPLAY | CF_POSTING,
   1.127 +   (void *) iso8859_2tab,SC_LATIN_2,NIL},
   1.128 +  {"ISO-8859-3",CT_1BYTE,CF_PRIMARY | CF_DISPLAY | CF_POSTING,
   1.129 +   (void *) iso8859_3tab,SC_LATIN_3,NIL},
   1.130 +  {"ISO-8859-4",CT_1BYTE,CF_PRIMARY | CF_DISPLAY | CF_POSTING,
   1.131 +   (void *) iso8859_4tab,SC_LATIN_4,NIL},
   1.132 +  {"ISO-8859-5",CT_1BYTE,CF_PRIMARY | CF_DISPLAY | CF_POSTING,
   1.133 +   (void *) iso8859_5tab,SC_CYRILLIC,"KOI8-R"},
   1.134 +  {"ISO-8859-6",CT_1BYTE,CF_PRIMARY | CF_DISPLAY | CF_POSTING,
   1.135 +   (void *) iso8859_6tab,SC_ARABIC,NIL},
   1.136 +  {"ISO-8859-7",CT_1BYTE,CF_PRIMARY | CF_DISPLAY | CF_POSTING,
   1.137 +   (void *) iso8859_7tab,SC_GREEK,NIL},
   1.138 +  {"ISO-8859-8",CT_1BYTE,CF_PRIMARY | CF_DISPLAY | CF_POSTING,
   1.139 +   (void *) iso8859_8tab,SC_HEBREW,NIL},
   1.140 +  {"ISO-8859-9",CT_1BYTE,CF_PRIMARY | CF_DISPLAY | CF_POSTING,
   1.141 +   (void *) iso8859_9tab,SC_LATIN_5,NIL},
   1.142 +  {"ISO-8859-10",CT_1BYTE,CF_PRIMARY | CF_DISPLAY | CF_POSTING,
   1.143 +   (void *) iso8859_10tab,SC_LATIN_6,NIL},
   1.144 +  {"ISO-8859-11",CT_1BYTE,CF_PRIMARY | CF_DISPLAY | CF_POSTING,
   1.145 +   (void *) iso8859_11tab,SC_THAI,NIL},
   1.146 +#if 0				/* ISO 8859-12 reserved for ISCII(?); entry is compiled out */
   1.147 +  {"ISO-8859-12",CT_1BYTE,CF_PRIMARY | CF_DISPLAY | CF_POSTING,
   1.148 +   (void *) iso8859_12tab,NIL,NIL},
   1.149 +#endif
   1.150 +  {"ISO-8859-13",CT_1BYTE,CF_PRIMARY | CF_DISPLAY | CF_POSTING,
   1.151 +   (void *) iso8859_13tab,SC_LATIN_7,NIL},
   1.152 +  {"ISO-8859-14",CT_1BYTE,CF_PRIMARY | CF_DISPLAY | CF_POSTING,
   1.153 +   (void *) iso8859_14tab,SC_LATIN_8,NIL},
   1.154 +  {"ISO-8859-15",CT_1BYTE,CF_PRIMARY | CF_DISPLAY | CF_POSTING,
   1.155 +   (void *) iso8859_15tab,SC_LATIN_9,NIL},
   1.156 +  {"ISO-8859-16",CT_1BYTE,CF_PRIMARY | CF_DISPLAY | CF_POSTING,
   1.157 +   (void *) iso8859_16tab,SC_LATIN_10,NIL},
   1.158 +  {"KOI8-R",CT_1BYTE,CF_PRIMARY | CF_DISPLAY | CF_POSTING,
   1.159 +   (void *) koi8rtab,SC_CYRILLIC,NIL},
   1.160 +  {"KOI8-U",CT_1BYTE,CF_PRIMARY | CF_DISPLAY | CF_POSTING,
   1.161 +   (void *) koi8utab,SC_CYRILLIC | SC_UKRANIAN,NIL},
   1.162 +  {"KOI8-RU",CT_1BYTE,CF_DISPLAY,
   1.163 +   (void *) koi8utab,SC_CYRILLIC | SC_UKRANIAN,"KOI8-U"},
   1.164 +  {"TIS-620",CT_1BYTE,CF_PRIMARY | CF_DISPLAY | CF_POSTING,
   1.165 +   (void *) tis620tab,SC_THAI,"ISO-8859-11"},
   1.166 +  {"VISCII",CT_1BYTE8,CF_PRIMARY | CF_DISPLAY | CF_POSTING,
   1.167 +   (void *) visciitab,SC_VIETNAMESE,NIL},
   1.168 +
   1.169 +#ifdef GBTOUNICODE
   1.170 +  {"GBK",CT_DBYTE,CF_PRIMARY | CF_DISPLAY | CF_POSTING,
   1.171 +     (void *) &gb_param,SC_CHINESE_SIMPLIFIED,NIL},
   1.172 +  {"GB2312",CT_DBYTE,CF_PRIMARY | CF_DISPLAY | CF_POSTING,
   1.173 +   (void *) &gb_param,SC_CHINESE_SIMPLIFIED,"GBK"},
   1.174 +  {"CN-GB",CT_DBYTE,CF_DISPLAY,
   1.175 +     (void *) &gb_param,SC_CHINESE_SIMPLIFIED,"GBK"},
   1.176 +#ifdef CNS1TOUNICODE
   1.177 +  {"ISO-2022-CN",CT_2022,CF_PRIMARY | CF_UNSUPRT,
   1.178 +     NIL,SC_CHINESE_SIMPLIFIED | SC_CHINESE_TRADITIONAL,
   1.179 +   NIL},
   1.180 +#endif
   1.181 +#endif
   1.182 +#ifdef GB12345TOUNICODE
   1.183 +  {"CN-GB-12345",CT_DBYTE,CF_PRIMARY | CF_DISPLAY,
   1.184 +     (void *) &gbt_param,SC_CHINESE_TRADITIONAL,"BIG5"},
   1.185 +#endif
   1.186 +#ifdef BIG5TOUNICODE
   1.187 +  {"BIG5",CT_DBYTE2,CF_PRIMARY | CF_DISPLAY | CF_POSTING,
   1.188 +     (void *) big5_param,SC_CHINESE_TRADITIONAL,NIL},
   1.189 +  {"CN-BIG5",CT_DBYTE2,CF_DISPLAY,
   1.190 +     (void *) big5_param,SC_CHINESE_TRADITIONAL,"BIG5"},
   1.191 +  {"BIG-5",CT_DBYTE2,CF_DISPLAY,
   1.192 +     (void *) big5_param,SC_CHINESE_TRADITIONAL,"BIG5"},
   1.193 +#endif
   1.194 +#ifdef JISTOUNICODE
   1.195 +  {"ISO-2022-JP",CT_2022,CF_PRIMARY | CF_DISPLAY | CF_POSTING,
   1.196 +     NIL,SC_JAPANESE,NIL},
   1.197 +  {"EUC-JP",CT_EUC,CF_PRIMARY | CF_DISPLAY,
   1.198 +     (void *) jis_param,SC_JAPANESE,"ISO-2022-JP"},
   1.199 +  {"SHIFT_JIS",CT_SJIS,CF_PRIMARY | CF_DISPLAY,
   1.200 +     NIL,SC_JAPANESE,"ISO-2022-JP"},
   1.201 +  {"SHIFT-JIS",CT_SJIS,CF_PRIMARY | CF_DISPLAY,
   1.202 +     NIL,SC_JAPANESE,"ISO-2022-JP"},
   1.203 +#ifdef JIS0212TOUNICODE
   1.204 +  {"ISO-2022-JP-1",CT_2022,CF_UNSUPRT,
   1.205 +     NIL,SC_JAPANESE,"ISO-2022-JP"},
   1.206 +#ifdef GBTOUNICODE
   1.207 +#ifdef KSCTOUNICODE
   1.208 +  {"ISO-2022-JP-2",CT_2022,CF_UNSUPRT,
   1.209 +     NIL,
   1.210 +     SC_LATIN_1 | SC_LATIN_2 | SC_LATIN_3 | SC_LATIN_4 | SC_LATIN_5 |
   1.211 +       SC_LATIN_6 | SC_LATIN_7 | SC_LATIN_8 | SC_LATIN_9 | SC_LATIN_10 |
   1.212 +	 SC_ARABIC | SC_CYRILLIC | SC_GREEK | SC_HEBREW | SC_THAI |
   1.213 +	   SC_VIETNAMESE | SC_CHINESE_SIMPLIFIED | SC_JAPANESE | SC_KOREAN
   1.214 +#ifdef CNS1TOUNICODE
   1.215 +	     | SC_CHINESE_TRADITIONAL
   1.216 +#endif
   1.217 +	       ,"UTF-8"},
   1.218 +#endif
   1.219 +#endif
   1.220 +#endif
   1.221 +#endif
   1.222 +
   1.223 +#ifdef KSCTOUNICODE
   1.224 +  {"ISO-2022-KR",CT_2022,CF_PRIMARY | CF_DISPLAY | CF_UNSUPRT,
   1.225 +     NIL,SC_KOREAN,"EUC-KR"},
   1.226 +  {"EUC-KR",CT_DBYTE,CF_PRIMARY | CF_DISPLAY | CF_POSTING,
   1.227 +     (void *) &ksc_param,SC_KOREAN,NIL},
   1.228 +  {"KSC5601",CT_DBYTE,CF_PRIMARY | CF_DISPLAY,
   1.229 +     (void *) &ksc_param,SC_KOREAN,"EUC-KR"},
   1.230 +  {"KSC_5601",CT_DBYTE,CF_PRIMARY | CF_DISPLAY,
   1.231 +     (void *) &ksc_param,SC_KOREAN,"EUC-KR"},
   1.232 +  {"KS_C_5601-1987",CT_DBYTE,CF_DISPLAY,
   1.233 +     (void *) &ksc_param,SC_KOREAN,"EUC-KR"},
   1.234 +  {"KS_C_5601-1989",CT_DBYTE,CF_DISPLAY,
   1.235 +     (void *) &ksc_param,SC_KOREAN,"EUC-KR"},
   1.236 +  {"KS_C_5601-1992",CT_DBYTE,CF_DISPLAY,
   1.237 +     (void *) &ksc_param,SC_KOREAN,"EUC-KR"},
   1.238 +  {"KS_C_5601-1997",CT_DBYTE,CF_DISPLAY,
   1.239 +     (void *) &ksc_param,SC_KOREAN,"EUC-KR"},
   1.240 +#endif
   1.241 +
   1.242 +				/* deep sigh: Windows code page names and their CPnnn aliases */
   1.243 +  {"WINDOWS-874",CT_1BYTE,CF_PRIMARY | CF_DISPLAY,
   1.244 +     (void *) windows_874tab,SC_THAI,"ISO-8859-11"},
   1.245 +  {"CP874",CT_1BYTE,CF_DISPLAY,
   1.246 +     (void *) windows_874tab,SC_THAI,"ISO-8859-11"},
   1.247 +#ifdef GBTOUNICODE
   1.248 +  {"WINDOWS-936",CT_DBYTE,CF_PRIMARY | CF_DISPLAY,
   1.249 +     (void *) &gb_param,SC_CHINESE_SIMPLIFIED,"GBK"},
   1.250 +  {"CP936",CT_DBYTE,CF_DISPLAY,
   1.251 +     (void *) &gb_param,SC_CHINESE_SIMPLIFIED,"GBK"},
   1.252 +#endif
   1.253 +#ifdef KSCTOUNICODE
   1.254 +  {"WINDOWS-949",CT_DBYTE,CF_PRIMARY | CF_DISPLAY,
   1.255 +     (void *) &ksc_param,SC_KOREAN,"EUC-KR"},
   1.256 +  {"CP949",CT_DBYTE,CF_DISPLAY,
   1.257 +     (void *) &ksc_param,SC_KOREAN,"EUC-KR"},
   1.258 +  {"X-WINDOWS-949",CT_DBYTE,CF_PRIMARY | CF_DISPLAY,
   1.259 +     (void *) &ksc_param,SC_KOREAN,"EUC-KR"},
   1.260 +#endif
   1.261 +  {"WINDOWS-1250",CT_1BYTE,CF_PRIMARY | CF_DISPLAY,
   1.262 +     (void *) windows_1250tab,SC_LATIN_2,"ISO-8859-2"},
   1.263 +  {"CP1250",CT_1BYTE,CF_DISPLAY,
   1.264 +     (void *) windows_1250tab,SC_LATIN_2,"ISO-8859-2"},
   1.265 +  {"WINDOWS-1251",CT_1BYTE,CF_PRIMARY | CF_DISPLAY | CF_POSTING,
   1.266 +     (void *) windows_1251tab,SC_CYRILLIC,"KOI8-R"},
   1.267 +  {"CP1251",CT_1BYTE,CF_DISPLAY,
   1.268 +     (void *) windows_1251tab,SC_CYRILLIC,"KOI8-R"},
   1.269 +  {"WINDOWS-1252",CT_1BYTE,CF_PRIMARY | CF_DISPLAY,
   1.270 +     (void *) windows_1252tab,SC_LATIN_1,"ISO-8859-1"},
   1.271 +  {"CP1252",CT_1BYTE,CF_DISPLAY,
   1.272 +     (void *) windows_1252tab,SC_LATIN_1,"ISO-8859-1"},
   1.273 +  {"WINDOWS-1253",CT_1BYTE,CF_PRIMARY | CF_DISPLAY,
   1.274 +     (void *) windows_1253tab,SC_GREEK,"ISO-8859-7"},
   1.275 +  {"CP1253",CT_1BYTE,CF_DISPLAY,
   1.276 +     (void *) windows_1253tab,SC_GREEK,"ISO-8859-7"},
   1.277 +  {"WINDOWS-1254",CT_1BYTE,CF_PRIMARY | CF_DISPLAY,
   1.278 +     (void *) windows_1254tab,SC_LATIN_5,"ISO-8859-9"},
   1.279 +  {"CP1254",CT_1BYTE,CF_DISPLAY,
   1.280 +     (void *) windows_1254tab,SC_LATIN_5,"ISO-8859-9"},
   1.281 +  {"WINDOWS-1255",CT_1BYTE,CF_PRIMARY | CF_DISPLAY,
   1.282 +     (void *) windows_1255tab,SC_HEBREW,"ISO-8859-8"},
   1.283 +  {"CP1255",CT_1BYTE,CF_DISPLAY,
   1.284 +     (void *) windows_1255tab,SC_HEBREW,"ISO-8859-8"},
   1.285 +  {"WINDOWS-1256",CT_1BYTE,CF_PRIMARY | CF_DISPLAY,
   1.286 +     (void *) windows_1256tab,SC_ARABIC,"ISO-8859-6"},
   1.287 +  {"CP1256",CT_1BYTE,CF_DISPLAY,
   1.288 +     (void *) windows_1256tab,SC_ARABIC,"ISO-8859-6"},
   1.289 +  {"WINDOWS-1257",CT_1BYTE,CF_PRIMARY | CF_DISPLAY,
   1.290 +     (void *) windows_1257tab,SC_LATIN_7,"ISO-8859-13"},
   1.291 +  {"CP1257",CT_1BYTE,CF_DISPLAY,
   1.292 +     (void *) windows_1257tab,SC_LATIN_7,"ISO-8859-13"},
   1.293 +  {"WINDOWS-1258",CT_1BYTE,CF_PRIMARY | CF_DISPLAY,
   1.294 +     (void *) windows_1258tab,SC_VIETNAMESE,"VISCII"},
   1.295 +  {"CP1258",CT_1BYTE,CF_DISPLAY,
   1.296 +     (void *) windows_1258tab,SC_VIETNAMESE,"VISCII"},
   1.297 +
   1.298 +				/* deeper sigh: IBM code page names */
   1.299 +  {"IBM367",CT_ASCII,CF_PRIMARY | CF_DISPLAY,
   1.300 +     NIL,NIL,"US-ASCII"},
   1.301 +  {"IBM437",CT_1BYTE,CF_PRIMARY | CF_DISPLAY,
   1.302 +     (void *) ibm_437tab,SC_LATIN_1,"ISO-8859-1"},
   1.303 +  {"IBM737",CT_1BYTE,CF_PRIMARY | CF_DISPLAY,
   1.304 +     (void *) ibm_737tab,SC_GREEK,"ISO-8859-7"},
   1.305 +  {"IBM775",CT_1BYTE,CF_PRIMARY | CF_DISPLAY,
   1.306 +     (void *) ibm_775tab,SC_LATIN_7,"ISO-8859-13"},
   1.307 +  {"IBM850",CT_1BYTE,CF_PRIMARY | CF_DISPLAY,
   1.308 +     (void *) ibm_850tab,SC_LATIN_1,"ISO-8859-1"},
   1.309 +  {"IBM852",CT_1BYTE,CF_PRIMARY | CF_DISPLAY,
   1.310 +     (void *) ibm_852tab,SC_LATIN_2,"ISO-8859-2"},
   1.311 +  {"IBM855",CT_1BYTE,CF_PRIMARY | CF_DISPLAY,
   1.312 +     (void *) ibm_855tab,SC_CYRILLIC,"ISO-8859-5"},
   1.313 +  {"IBM857",CT_1BYTE,CF_PRIMARY | CF_DISPLAY,
   1.314 +     (void *) ibm_857tab,SC_LATIN_5,"ISO-8859-9"},
   1.315 +  {"IBM860",CT_1BYTE,CF_PRIMARY | CF_DISPLAY,
   1.316 +     (void *) ibm_860tab,SC_LATIN_1,"ISO-8859-1"},
   1.317 +  {"IBM861",CT_1BYTE,CF_PRIMARY | CF_DISPLAY,
   1.318 +     (void *) ibm_861tab,SC_LATIN_6,"ISO-8859-10"},
   1.319 +  {"IBM862",CT_1BYTE,CF_PRIMARY | CF_DISPLAY,
   1.320 +     (void *) ibm_862tab,SC_HEBREW,"ISO-8859-8"},
   1.321 +  {"IBM863",CT_1BYTE,CF_PRIMARY | CF_DISPLAY,
   1.322 +     (void *) ibm_863tab,SC_LATIN_1,"ISO-8859-1"},
   1.323 +  {"IBM864",CT_1BYTE,CF_PRIMARY | CF_DISPLAY,
   1.324 +     (void *) ibm_864tab,SC_ARABIC,"ISO-8859-6"},
   1.325 +  {"IBM865",CT_1BYTE,CF_PRIMARY | CF_DISPLAY,
   1.326 +     (void *) ibm_865tab,SC_LATIN_6,"ISO-8859-10"},
   1.327 +  {"IBM866",CT_1BYTE,CF_PRIMARY | CF_DISPLAY,
   1.328 +     (void *) ibm_866tab,SC_CYRILLIC,"KOI8-R"},
   1.329 +  {"IBM869",CT_1BYTE,CF_PRIMARY | CF_DISPLAY,
   1.330 +     (void *) ibm_869tab,SC_GREEK,"ISO-8859-7"},
   1.331 +  {"IBM874",CT_1BYTE,CF_PRIMARY | CF_DISPLAY,
   1.332 +     (void *) ibm_874tab,SC_THAI,"ISO-8859-11"},
   1.333 +				/* deepest sigh: other legacy names for ASCII and UTF-7 */
   1.334 +  {"ANSI_X3.4-1968",CT_ASCII,CF_DISPLAY,
   1.335 +     NIL,NIL,"US-ASCII"},
   1.336 +  {"UNICODE-1-1-UTF-7",CT_UTF7,CF_UNSUPRT,
   1.337 +     NIL,SC_UNICODE,"UTF-8"},
   1.338 +				/* these should never appear in email (flagged CF_NOEMAIL) */
   1.339 +  {"UCS-2",CT_UCS2,CF_PRIMARY | CF_DISPLAY | CF_NOEMAIL,
   1.340 +     NIL,SC_UNICODE,"UTF-8"},
   1.341 +  {"UCS-4",CT_UCS4,CF_PRIMARY | CF_DISPLAY | CF_NOEMAIL,
   1.342 +     NIL,SC_UNICODE,"UTF-8"},
   1.343 +  {"UTF-16",CT_UTF16,CF_PRIMARY | CF_DISPLAY | CF_NOEMAIL,
   1.344 +     NIL,SC_UNICODE,"UTF-8"},
   1.345 +  NIL				/* terminator: table scans stop at an entry whose name is NIL */
   1.346 +};
   1.347 +
   1.348 +/* Non-Unicode Script table.  Each entry is {script name, descriptive text or NIL, SC_ script bit}; the list ends with a NIL entry. */
   1.349 +
   1.350 +static const SCRIPT utf8_scvalid[] = {
   1.351 +  {"Arabic",NIL,SC_ARABIC},
   1.352 +  {"Chinese Simplified","China, Singapore",SC_CHINESE_SIMPLIFIED},
   1.353 +  {"Chinese Traditional","Taiwan, Hong Kong, Macao",SC_CHINESE_TRADITIONAL},
   1.354 +  {"Cyrillic",NIL,SC_CYRILLIC},
   1.355 +  {"Cyrillic Ukranian",NIL,SC_UKRANIAN},
   1.356 +  {"Greek",NIL,SC_GREEK},
   1.357 +  {"Hebrew",NIL,SC_HEBREW},
   1.358 +  {"Japanese",NIL,SC_JAPANESE},
   1.359 +  {"Korean",NIL,SC_KOREAN},
   1.360 +  {"Latin-1","Western Europe",SC_LATIN_1},
   1.361 +  {"Latin-2","Eastern Europe",SC_LATIN_2},
   1.362 +  {"Latin-3","Southern Europe",SC_LATIN_3},
   1.363 +  {"Latin-4","Northern Europe",SC_LATIN_4},
   1.364 +  {"Latin-5","Turkish",SC_LATIN_5},
   1.365 +  {"Latin-6","Nordic",SC_LATIN_6},
   1.366 +  {"Latin-7","Baltic",SC_LATIN_7},
   1.367 +  {"Latin-8","Celtic",SC_LATIN_8},
   1.368 +  {"Latin-9","Euro",SC_LATIN_9},
   1.369 +  {"Latin-10","Balkan",SC_LATIN_10},
   1.370 +  {"Thai",NIL,SC_THAI},
   1.371 +  {"Vietnamese",NIL,SC_VIETNAMESE},
   1.372 +  NIL				/* terminator: table scans stop at an entry whose name is NIL */
   1.373 +};
   1.374 +
   1.375 +/* Look up script name or return entire table
   1.376 + * Accepts: script name, or NIL to get a pointer to the first entry of the whole table
   1.377 + * Returns: pointer to script table entry; NIL if the name is unknown, empty, or 128 or more characters long
   1.378 + */
   1.379 +
   1.380 +SCRIPT *utf8_script (char *script)
   1.381 +{
   1.382 +  unsigned long i;
   1.383 +  if (!script) return (SCRIPT *) &utf8_scvalid[0];	/* NIL name: hand back the table itself (cast drops const) */
   1.384 +  else if (*script && (strlen (script) < 128))	/* only non-empty names shorter than 128 are searched */
   1.385 +    for (i = 0; utf8_scvalid[i].name; i++)	/* walk entries up to the NIL-name terminator */
   1.386 +      if (!compare_cstring (script,utf8_scvalid[i].name))	/* a zero result is treated as a match */
   1.387 +	return (SCRIPT *) &utf8_scvalid[i];
   1.388 +  return NIL;			/* failed */
   1.389 +}
   1.390 +
   1.391 +
   1.392 +/* Look up charset name or return entire table
   1.393 + * Accepts: charset name, or NIL to get a pointer to the first entry of the whole table
   1.394 + * Returns: charset table entry; NIL if the name is unknown, empty, or 128 or more characters long
   1.395 + */
   1.396 +
   1.397 +const CHARSET *utf8_charset (char *charset)
   1.398 +{
   1.399 +  unsigned long i;
   1.400 +  if (!charset) return (CHARSET *) &utf8_csvalid[0];	/* NIL name: hand back the table itself */
   1.401 +  else if (*charset && (strlen (charset) < 128))	/* only non-empty names shorter than 128 are searched */
   1.402 +    for (i = 0; utf8_csvalid[i].name; i++)	/* walk entries up to the NIL-name terminator */
   1.403 +      if (!compare_cstring (charset,utf8_csvalid[i].name))	/* a zero result is treated as a match */
   1.404 +	return (CHARSET *) &utf8_csvalid[i];
   1.405 +  return NIL;			/* failed */
   1.406 +}
   1.407 +
   1.408 +/* Validate charset and generate error message if invalid
   1.409 + * Accepts: charset name to validate (a NIL name counts as good, because utf8_charset() returns the table for NIL)
   1.410 + * Returns: NIL if good charset, else error message string obtained from fs_get(): BADCSS, every known charset name separated by spaces, BADCSE, then the bad name
   1.411 + */
   1.412 +
   1.413 +#define BADCSS "[BADCHARSET ("
   1.414 +#define BADCSE ")] Unknown charset: "
   1.415 +
   1.416 +char *utf8_badcharset (char *charset)
   1.417 +{
   1.418 +  char *msg = NIL;
   1.419 +  if (!utf8_charset (charset)) {
   1.420 +    char *s,*t;
   1.421 +    unsigned long i,j;
   1.422 +				/* calculate size of header, trailer, and bad
   1.423 +				 * charset plus charset names; the - 2 removes the two NULs counted by sizeof, and each name gets + 1 for its separator (the last separator slot ends up holding the final NUL) */
   1.424 +    for (i = 0, j = sizeof (BADCSS) + sizeof (BADCSE) + strlen (charset) - 2;
   1.425 +	 utf8_csvalid[i].name; i++)
   1.426 +      j += strlen (utf8_csvalid[i].name) + 1;
   1.427 +				/* not built right: the table has no entries */
   1.428 +    if (!i) fatal ("No valid charsets!");
   1.429 +				/* header: allocate j bytes and copy BADCSS without its NUL */
   1.430 +    for (s = msg = (char *) fs_get (j), t = BADCSS; *t; *s++ = *t++);
   1.431 +				/* each charset name, followed by one space */
   1.432 +    for (i = 0; utf8_csvalid[i].name; *s++ = ' ', i++)
   1.433 +      for (t = utf8_csvalid[i].name; *t; *s++ = *t++);
   1.434 +				/* back over last space, trailer */
   1.435 +    for (t = BADCSE, --s; *t; *s++ = *t++);
   1.436 +				/* finally bogus charset */
   1.437 +    for (t = charset; *t; *s++ = *t++);
   1.438 +    *s++ = '\0';		/* finally tie off string */
   1.439 +    if (s != (msg + j)) fatal ("charset msg botch");	/* sanity check: bytes written must equal the computed size */
   1.440 +  }
   1.441 +  return msg;
   1.442 +}
   1.443 +
   1.444 +/* Convert charset labelled sized text to UTF-8
   1.445 + * Accepts: source sized text
   1.446 + *	    charset (if NIL or empty, the charset is taken from utf8_infercharset() on the text)
   1.447 + *	    pointer to returned sized text if non-NIL
   1.448 + *	    flags (U8T_CASECANON selects ucs4_titlecase, U8T_DECOMPOSE selects ucs4_decompose_recursive)
   1.449 + * Returns: T if successful (also when the charset is known but text or the return pointer is NIL, in which case nothing is converted), NIL if failure
   1.450 + */
   1.451 +
   1.452 +long utf8_text (SIZEDTEXT *text,char *charset,SIZEDTEXT *ret,long flags)
   1.453 +{
   1.454 +  ucs4cn_t cv = (flags & U8T_CASECANON) ? ucs4_titlecase : NIL;
   1.455 +  ucs4de_t de = (flags & U8T_DECOMPOSE) ? ucs4_decompose_recursive : NIL;
   1.456 +  const CHARSET *cs = (charset && *charset) ?
   1.457 +    utf8_charset (charset) : utf8_infercharset (text);
   1.458 +  if (cs) return (text && ret) ? utf8_text_cs (text,cs,ret,cv,de) : LONGT;
   1.459 +  if (ret) {			/* no conversion possible */
   1.460 +    ret->data = text->data;	/* so return source; NOTE(review): text is not checked for NIL here although the line above allows a NIL text -- confirm callers never pass NIL text with a non-NIL ret */
   1.461 +    ret->size = text->size;
   1.462 +  }
   1.463 +  return NIL;			/* failure */
   1.464 +}
   1.465 +
   1.466 +
   1.467 +/* Operations used in converting data.  Every macro below assigns to its c argument, so c must be a modifiable variable.  Each one applies cv to c when cv is non-NIL, then de when de is non-NIL, and keeps calling de with U8G_ERROR for as long as de leaves more set and yields a non-zero value. */
   1.468 +/* UTF8_COUNT_BMP: add the UTF8_SIZE_BMP() of each resulting value to count */
   1.469 +#define UTF8_COUNT_BMP(count,c,cv,de) {		\
   1.470 +  void *more = NIL;				\
   1.471 +  if (cv) c = (*cv) (c);			\
   1.472 +  if (de) c = (*de) (c,&more);			\
   1.473 +  do count += UTF8_SIZE_BMP(c);			\
   1.474 +  while (more && (c = (*de) (U8G_ERROR,&more)));\
   1.475 +}
   1.476 +/* UTF8_WRITE_BMP: emit each resulting value through UTF8_PUT_BMP() using b */
   1.477 +#define UTF8_WRITE_BMP(b,c,cv,de) {		\
   1.478 +  void *more = NIL;				\
   1.479 +  if (cv) c = (*cv) (c);			\
   1.480 +  if (de) c = (*de) (c,&more);			\
   1.481 +  do UTF8_PUT_BMP (b,c)				\
   1.482 +  while (more && (c = (*de) (U8G_ERROR,&more)));\
   1.483 +}
   1.484 +/* UTF8_COUNT: same as UTF8_COUNT_BMP but sizes each value with utf8_size() */
   1.485 +#define UTF8_COUNT(count,c,cv,de) {		\
   1.486 +  void *more = NIL;				\
   1.487 +  if (cv) c = (*cv) (c);			\
   1.488 +  if (de) c = (*de) (c,&more);			\
   1.489 +  do count += utf8_size (c);			\
   1.490 +  while (more && (c = (*de) (U8G_ERROR,&more)));\
   1.491 +}
   1.492 +/* UTF8_WRITE: same as UTF8_WRITE_BMP but stores via utf8_put(), assigning its result back to b */
   1.493 +#define UTF8_WRITE(b,c,cv,de) {			\
   1.494 +  void *more = NIL;				\
   1.495 +  if (cv) c = (*cv) (c);			\
   1.496 +  if (de) c = (*de) (c,&more);			\
   1.497 +  do b = utf8_put (b,c);			\
   1.498 +  while (more && (c = (*de) (U8G_ERROR,&more)));\
   1.499 +}
   1.500 +
   1.501 +/* Convert sized text to UTF-8 given CHARSET block
   1.502 + * Accepts: source sized text
   1.503 + *	    CHARSET block (its type selects the converter; its tab is passed on to the table-driven converters)
   1.504 + *	    pointer to returned sized text (first set to the source data and size, so it aliases the source whenever no converter runs)
   1.505 + *	    canonicalization function or NIL
   1.506 + *	    decomposition function or NIL
   1.507 + * Returns: T if successful, NIL if the charset type is not one of the cases below
   1.508 + */
   1.509 +
   1.510 +long utf8_text_cs (SIZEDTEXT *text,const CHARSET *cs,SIZEDTEXT *ret,
   1.511 +		   ucs4cn_t cv,ucs4de_t de)
   1.512 +{
   1.513 +  ret->data = text->data;	/* default to source */
   1.514 +  ret->size = text->size;
   1.515 +  switch (cs->type) {		/* convert if type known */
   1.516 +  case CT_ASCII:		/* 7-bit ASCII no table */
   1.517 +  case CT_UTF8:			/* variable UTF-8 encoded Unicode no table */
   1.518 +    if (cv || de) utf8_text_utf8 (text,ret,cv,de);	/* converter is only called when canonicalization or decomposition was requested */
   1.519 +    break;
   1.520 +  case CT_1BYTE0:		/* 1 byte no table */
   1.521 +    utf8_text_1byte0 (text,ret,cv,de);
   1.522 +    break;
   1.523 +  case CT_1BYTE:		/* 1 byte ASCII + table 0x80-0xff */
   1.524 +    utf8_text_1byte (text,ret,cs->tab,cv,de);
   1.525 +    break;
   1.526 +  case CT_1BYTE8:		/* 1 byte table 0x00 - 0xff */
   1.527 +    utf8_text_1byte8 (text,ret,cs->tab,cv,de);
   1.528 +    break;
   1.529 +  case CT_EUC:			/* 2 byte ASCII + utf8_eucparam base/CS2/CS3 */
   1.530 +    utf8_text_euc (text,ret,cs->tab,cv,de);
   1.531 +    break;
   1.532 +  case CT_DBYTE:		/* 2 byte ASCII + utf8_eucparam */
   1.533 +    utf8_text_dbyte (text,ret,cs->tab,cv,de);
   1.534 +    break;
   1.535 +  case CT_DBYTE2:		/* 2 byte ASCII + utf8_eucparam plane1/2 */
   1.536 +    utf8_text_dbyte2 (text,ret,cs->tab,cv,de);
   1.537 +    break;
   1.538 +  case CT_UTF7:			/* variable UTF-7 encoded Unicode no table */
   1.539 +    utf8_text_utf7 (text,ret,cv,de);
   1.540 +    break;
   1.541 +  case CT_UCS2:			/* 2 byte 16-bit Unicode no table */
   1.542 +    utf8_text_ucs2 (text,ret,cv,de);
   1.543 +    break;
   1.544 +  case CT_UCS4:			/* 4 byte 32-bit Unicode no table */
   1.545 +    utf8_text_ucs4 (text,ret,cv,de);
   1.546 +    break;
   1.547 +  case CT_UTF16:		/* variable UTF-16 encoded Unicode no table */
   1.548 +    utf8_text_utf16 (text,ret,cv,de);
   1.549 +    break;
   1.550 +  case CT_2022:			/* variable ISO-2022 encoded no table */
   1.551 +    utf8_text_2022 (text,ret,cv,de);
   1.552 +    break;
   1.553 +  case CT_SJIS:			/* 2 byte Shift-JIS encoded JIS no table */
   1.554 +    utf8_text_sjis (text,ret,cv,de);
   1.555 +    break;
   1.556 +  default:			/* unknown character set type; ret still refers to the source */
   1.557 +    return NIL;
   1.558 +  }
   1.559 +  return LONGT;			/* return success */
   1.560 +}
   1.561 +
   1.562 +/* Reverse mapping routines
   1.563 + *
   1.564 + * These routines only support character sets, not all possible charsets.  In
   1.565 + * particular, they do not support any Unicode encodings or ISO 2022.
   1.566 + *
   1.567 + * As a special dispensation, utf8_cstext() and utf8_cstocstext()
   1.568 + * support ISO-2022-JP if EUC-JP can be reverse mapped; and utf8_rmaptext()
   1.569 + * will generate ISO-2022-JP using an EUC-JP rmap if flagged to do so.
   1.570 + *
   1.571 + * No attempt is made to map "equivalent" Unicode characters or Unicode
   1.572 + * characters that have the same glyph; nor is there any attempt to handle
   1.573 + * combining characters or otherwise do any stringprep.  Maybe later.
   1.574 + */
   1.575 +
   1.576 +
   1.577 +/* Convert UTF-8 sized text to charset
   1.578 + * Accepts: source sized text
   1.579 + *	    destination charset (ISO-2022-JP is handled by using the EUC-JP reverse map and flagging utf8_rmaptext())
   1.580 + *	    pointer to returned sized text
   1.581 + *	    substitute character if not in cs, else NIL to return failure
   1.582 + * Returns: T if successful, NIL if failure (including when no reverse map is available for the charset)
   1.583 + */
   1.584 +
   1.585 +
   1.586 +long utf8_cstext (SIZEDTEXT *text,char *charset,SIZEDTEXT *ret,
   1.587 +		  unsigned long errch)
   1.588 +{
   1.589 +  short iso2022jp = !compare_cstring (charset,"ISO-2022-JP");	/* non-zero when the destination name matches ISO-2022-JP */
   1.590 +  unsigned short *rmap = utf8_rmap (iso2022jp ? "EUC-JP" : charset);	/* cached reverse map, or NIL */
   1.591 +  return rmap ? utf8_rmaptext (text,rmap,ret,errch,iso2022jp) : NIL;
   1.592 +}
   1.593 +
   1.594 +/* Convert charset labelled sized text to another charset
   1.595 + * Accepts: source sized text
   1.596 + *	    source charset (if NIL or empty, the charset is taken from utf8_infercharset() on the source)
   1.597 + *	    pointer to returned sized text
   1.598 + *	    destination charset
   1.599 + *	    substitute character if not in dest cs, else NIL to return failure
   1.600 + * Returns: T if successful, NIL if failure
   1.601 + *
   1.602 + * This routine has the same restrictions as utf8_cstext().
   1.603 + */
   1.604 +
   1.605 +long utf8_cstocstext (SIZEDTEXT *src,char *sc,SIZEDTEXT *dst,char *dc,
   1.606 +		      unsigned long errch)
   1.607 +{
   1.608 +  SIZEDTEXT utf8;
   1.609 +  const CHARSET *scs,*dcs;
   1.610 +  unsigned short *rmap;
   1.611 +  long ret = NIL;
   1.612 +  long iso2022jp;
   1.613 +				/* lookup charsets and reverse map; an ISO-2022-JP destination uses the EUC-JP reverse map */
   1.614 +  if ((dc && (dcs = utf8_charset (dc))) &&
   1.615 +      (rmap = (iso2022jp = ((dcs->type == CT_2022) &&
   1.616 +			    !compare_cstring (dcs->name,"ISO-2022-JP"))) ?
   1.617 +       utf8_rmap ("EUC-JP") : utf8_rmap_cs (dcs)) &&
   1.618 +      (scs = (sc && *sc) ? utf8_charset (sc) : utf8_infercharset (src))) {
   1.619 +				/* init temporary buffer */
   1.620 +    memset (&utf8,NIL,sizeof (SIZEDTEXT));
   1.621 +				/* source cs equivalent to dest cs?  NOTE(review): only type and tab are compared, and every CT_2022 entry in utf8_csvalid has a NIL tab, so a different ISO 2022 source with an ISO-2022-JP destination would also pass this test -- confirm that is intended */
   1.622 +    if ((scs->type == dcs->type) && (scs->tab == dcs->tab)) {
   1.623 +      dst->data = src->data;	/* yes, just copy pointers */
   1.624 +      dst->size = src->size;
   1.625 +      ret = LONGT;
   1.626 +    }
   1.627 +				/* otherwise do the conversion: source to UTF-8, then UTF-8 to destination */
   1.628 +    else ret = (utf8_text_cs (src,scs,&utf8,NIL,NIL) &&
   1.629 +		utf8_rmaptext (&utf8,rmap,dst,errch,iso2022jp));
   1.630 +				/* flush temporary buffer, unless it aliases the source or destination data */
   1.631 +    if (utf8.data && (utf8.data != src->data) && (utf8.data != dst->data))
   1.632 +      fs_give ((void **) &utf8.data);
   1.633 +  }
   1.634 +  return ret;
   1.635 +}
   1.636 +
   1.637 +/* Cached rmap: one reverse map is remembered at a time, together with the CHARSET it was generated for */
   1.638 +
   1.639 +static const CHARSET *currmapcs = NIL;	/* charset of the cached map, NIL if none yet */
   1.640 +static unsigned short *currmap = NIL;	/* the cached map itself */
   1.641 +
   1.642 +
   1.643 +/* Cache and return map for UTF-8 -> character set
   1.644 + * Accepts: character set name
   1.645 + * Returns: cached map if character set found, else NIL; the cached map is returned directly when the name compares equal to the cached charset name, otherwise the name is looked up and handed to utf8_rmap_cs()
   1.646 + */
   1.647 +
   1.648 +unsigned short *utf8_rmap (char *charset)
   1.649 +{
   1.650 +  return (currmapcs && !compare_cstring (charset,currmapcs->name)) ? currmap :
   1.651 +    utf8_rmap_cs (utf8_charset (charset));
   1.652 +}
   1.653 +
   1.654 +
   1.655 +/* Cache and return map for UTF-8 -> character set given CHARSET block
   1.656 + * Accepts: CHARSET block (may be NIL)
   1.657 + * Returns: cached map if character set found, else NIL; NIL is also returned for a NIL block or when utf8_rmap_gen() yields no map
   1.658 + */
   1.659 +
   1.660 +unsigned short *utf8_rmap_cs (const CHARSET *cs)
   1.661 +{
   1.662 +  unsigned short *ret = NIL;
   1.663 +  if (!cs);			/* have charset? (empty statement: a NIL block just falls through to return NIL) */
   1.664 +  else if (cs == currmapcs) ret = currmap;	/* same block as last time: reuse the cached map */
   1.665 +  else if (ret = utf8_rmap_gen (cs,currmap)) {	/* intentional assignment; the old cached map is passed in for recycling. NOTE(review): pointers handed out earlier presumably then refer to the regenerated contents -- confirm */
   1.666 +    currmapcs = cs;
   1.667 +    currmap = ret;
   1.668 +  }
   1.669 +  return ret;
   1.670 +}
   1.671 +
   1.672 +/* Return map for UTF-8 -> character set given CHARSET block
   1.673 + * Accepts: CHARSET block
   1.674 + *	    old map to recycle
   1.675 + * Returns: map if character set found, else NIL
   1.676 + */
   1.677 +
   1.678 +unsigned short *utf8_rmap_gen (const CHARSET *cs,unsigned short *oldmap)
   1.679 +{
   1.680 +  unsigned short u,*tab,*rmap;
   1.681 +  unsigned int i,m,ku,ten;
   1.682 +  struct utf8_eucparam *param,*p2;
   1.683 +  switch (cs->type) {		/* is a character set? */
   1.684 +  case CT_ASCII:		/* 7-bit ASCII no table */
   1.685 +  case CT_1BYTE0:		/* 1 byte no table */
   1.686 +  case CT_1BYTE:		/* 1 byte ASCII + table 0x80-0xff */
   1.687 +  case CT_1BYTE8:		/* 1 byte table 0x00 - 0xff */
   1.688 +  case CT_EUC:			/* 2 byte ASCII + utf8_eucparam base/CS2/CS3 */
   1.689 +  case CT_DBYTE:		/* 2 byte ASCII + utf8_eucparam */
   1.690 +  case CT_DBYTE2:		/* 2 byte ASCII + utf8_eucparam plane1/2 */
   1.691 +  case CT_SJIS:			/* 2 byte Shift-JIS */
   1.692 +    rmap = oldmap ? oldmap :	/* recycle old map if supplied else make new */
   1.693 +      (unsigned short *) fs_get (65536 * sizeof (unsigned short));
   1.694 +				/* initialize table for ASCII */
   1.695 +    for (i = 0; i < 128; i++) rmap[i] = (unsigned short) i;
   1.696 +				/* populate remainder of table with NOCHAR */
   1.697 +#define NOCHARBYTE (NOCHAR & 0xff)
   1.698 +#if NOCHAR - ((NOCHARBYTE << 8) | NOCHARBYTE)
   1.699 +    while (i < 65536) rmap[i++] = NOCHAR;
   1.700 +#else
   1.701 +    memset (rmap + 128,NOCHARBYTE,(65536 - 128) * sizeof (unsigned short));
   1.702 +#endif
   1.703 +    break;
   1.704 +  default:			/* unsupported charset type */
   1.705 +    rmap = NIL;			/* no map possible */
   1.706 +  }
   1.707 +  if (rmap) {			/* have a map? */
   1.708 +    switch (cs->type) {		/* additional reverse map actions */
   1.709 +    case CT_1BYTE0:		/* 1 byte no table */
   1.710 +      for (i = 128; i < 256; i++) rmap[i] = (unsigned short) i;
   1.711 +      break;
   1.712 +    case CT_1BYTE:		/* 1 byte ASCII + table 0x80-0xff */
   1.713 +      for (tab = (unsigned short *) cs->tab,i = 128; i < 256; i++)
   1.714 +	if (tab[i & BITS7] != UBOGON) rmap[tab[i & BITS7]] = (unsigned short)i;
   1.715 +      break;
   1.716 +    case CT_1BYTE8:		/* 1 byte table 0x00 - 0xff */
   1.717 +      for (tab = (unsigned short *) cs->tab,i = 0; i < 256; i++)
   1.718 +	if (tab[i] != UBOGON) rmap[tab[i]] = (unsigned short) i;
   1.719 +      break;
   1.720 +    case CT_EUC:		/* 2 byte ASCII + utf8_eucparam base/CS2/CS3 */
   1.721 +      for (param = (struct utf8_eucparam *) cs->tab,
   1.722 +	     tab = (unsigned short *) param->tab, ku = 0;
   1.723 +	   ku < param->max_ku; ku++)
   1.724 +	for (ten = 0; ten < param->max_ten; ten++)
   1.725 +	  if ((u = tab[(ku * param->max_ten) + ten]) != UBOGON)
   1.726 +	    rmap[u] = ((ku + param->base_ku) << 8) +
   1.727 +	      (ten + param->base_ten) + 0x8080;
   1.728 +      break;
   1.729 +
   1.730 +    case CT_DBYTE:		/* 2 byte ASCII + utf8_eucparam */
   1.731 +      for (param = (struct utf8_eucparam *) cs->tab,
   1.732 +	     tab = (unsigned short *) param->tab, ku = 0;
   1.733 +	   ku < param->max_ku; ku++)
   1.734 +	for (ten = 0; ten < param->max_ten; ten++)
   1.735 +	  if ((u = tab[(ku * param->max_ten) + ten]) != UBOGON)
   1.736 +	    rmap[u] = ((ku + param->base_ku) << 8) + (ten + param->base_ten);
   1.737 +      break;
   1.738 +    case CT_DBYTE2:		/* 2 byte ASCII + utf8_eucparam plane1/2 */
   1.739 +      param = (struct utf8_eucparam *) cs->tab;
   1.740 +      p2 = param + 1;		/* plane 2 parameters */
   1.741 +				/* only ten parameters should differ */
   1.742 +      if ((param->base_ku != p2->base_ku) || (param->max_ku != p2->max_ku))
   1.743 +	fatal ("ku definition error for CT_DBYTE2 charset");
   1.744 +				/* total codepoints in each ku */
   1.745 +      m = param->max_ten + p2->max_ten;
   1.746 +      tab = (unsigned short *) param->tab;
   1.747 +      for (ku = 0; ku < param->max_ku; ku++) {
   1.748 +	for (ten = 0; ten < param->max_ten; ten++)
   1.749 +	  if ((u = tab[(ku * m) + ten]) != UBOGON)
   1.750 +	    rmap[u] = ((ku + param->base_ku) << 8) + (ten + param->base_ten);
   1.751 +	for (ten = 0; ten < p2->max_ten; ten++)
   1.752 +	  if ((u = tab[(ku * m) + param->max_ten + ten]) != UBOGON)
   1.753 +	    rmap[u] = ((ku + param->base_ku) << 8) + (ten + p2->base_ten);
   1.754 +      }
   1.755 +      break;
   1.756 +    case CT_SJIS:		/* 2 byte Shift-JIS */
   1.757 +      for (ku = 0; ku < MAX_JIS0208_KU; ku++)
   1.758 +	for (ten = 0; ten < MAX_JIS0208_TEN; ten++)
   1.759 +	  if ((u = jis0208tab[ku][ten]) != UBOGON) {
   1.760 +	    int sku = ku + BASE_JIS0208_KU;
   1.761 +	    int sten = ten + BASE_JIS0208_TEN;
   1.762 +	    rmap[u] = ((((sku + 1) >> 1) + ((sku < 95) ? 112 : 176)) << 8) +
   1.763 +	      sten + ((sku % 2) ? ((sten > 95) ? 32 : 31) : 126);
   1.764 +	  }
   1.765 +				/* JIS Roman */
   1.766 +      rmap[UCS2_YEN] = JISROMAN_YEN;
   1.767 +      rmap[UCS2_OVERLINE] = JISROMAN_OVERLINE;
   1.768 +				/* JIS hankaku katakana */
   1.769 +      for (u = 0; u < (MAX_KANA_8 - MIN_KANA_8); u++)
   1.770 +	rmap[UCS2_KATAKANA + u] = MIN_KANA_8 + u;
   1.771 +      break;
   1.772 +    }
   1.773 +				/* hack: map NBSP to SP if otherwise no map */
   1.774 +    if (rmap[0x00a0] == NOCHAR) rmap[0x00a0] = rmap[0x0020];
   1.775 +  }
   1.776 +  return rmap;			/* return map */
   1.777 +}
   1.778 +
   1.779 +/* Convert UTF-8 sized text to charset using rmap
   1.780 + * Accepts: source sized text
   1.781 + *	    conversion rmap
   1.782 + *	    pointer to returned sized text
   1.783 + *	    substitute character if not in rmap, else NIL to return failure
   1.784 + *	    ISO-2022-JP conversion flag
   1.785 + * Returns T if successful, NIL if failure
   1.786 + *
   1.787 + * This routine doesn't try to convert to all possible charsets; in particular
   1.788 + * it doesn't support other Unicode encodings or any ISO 2022 other than
   1.789 + * ISO-2022-JP.
   1.790 + */
   1.791 +
   1.792 +long utf8_rmaptext (SIZEDTEXT *text,unsigned short *rmap,SIZEDTEXT *ret,
   1.793 +		    unsigned long errch,long iso2022jp)
   1.794 +{
   1.795 +  unsigned long i,u,c;
   1.796 +				/* get size of buffer */
   1.797 +  if (i = utf8_rmapsize (text,rmap,errch,iso2022jp)) {
   1.798 +    unsigned char *s = text->data;
   1.799 +    unsigned char *t = ret->data = (unsigned char *) fs_get (i);
   1.800 +    ret->size = i - 1;		/* number of octets in destination buffer */
   1.801 +				/* start non-zero ISO-2022-JP state at 1 */
   1.802 +    if (iso2022jp) iso2022jp = 1;
   1.803 +				/* convert string, ignore BOM */
   1.804 +    for (i = text->size; i;) if ((u = utf8_get (&s,&i)) != UCS2_BOM) {
   1.805 +				/* substitute error character for NOCHAR */
   1.806 +      if ((u & U8GM_NONBMP) || ((c = rmap[u]) == NOCHAR)) c = errch;
   1.807 +      switch (iso2022jp) {	/* depends upon ISO 2022 mode */
   1.808 +      case 0:			/* ISO 2022 not in effect */
   1.809 +				/* two-byte character */
   1.810 +	if (c > 0xff) *t++ = (unsigned char) (c >> 8);
   1.811 +				/* single-byte or low-byte of two-byte */
   1.812 +	*t++ = (unsigned char) (c & 0xff);
   1.813 +	break;
   1.814 +      case 1:			/* ISO 2022 Roman */
   1.815 +				/* <ch> */
   1.816 +	if (c < 0x80) *t++ = (unsigned char) c;
   1.817 +	else {			/* JIS character */
   1.818 +	  *t++ = I2C_ESC;	/* ESC $ B <hi> <lo> */
   1.819 +	  *t++ = I2C_MULTI;
   1.820 +	  *t++ = I2CS_94x94_JIS_NEW;
   1.821 +	  *t++ = (unsigned char) (c >> 8) & 0x7f;
   1.822 +	  *t++ = (unsigned char) c & 0x7f;
   1.823 +	  iso2022jp = 2;	/* shift to ISO 2022 JIS */
   1.824 +	}
   1.825 +	break;
   1.826 +      case 2:			/* ISO 2022 JIS */
   1.827 +	if (c > 0x7f) {		/* <hi> <lo> */
   1.828 +	  *t++ = (unsigned char) (c >> 8) & 0x7f;
   1.829 +	  *t++ = (unsigned char) c & 0x7f;
   1.830 +	}
   1.831 +	else {			/* ASCII character */
   1.832 +	  *t++ = I2C_ESC;	/* ESC ( J <ch> */
   1.833 +	  *t++ = I2C_G0_94;
   1.834 +	  *t++ = I2CS_94_JIS_ROMAN;
   1.835 +	  *t++ = (unsigned char) c;
   1.836 +	  iso2022jp = 1;	/* shift to ISO 2022 Roman */
   1.837 +	}
   1.838 +	break;
   1.839 +      }
   1.840 +    }
   1.841 +    if (iso2022jp == 2) {	/* ISO-2022-JP string must end in Roman */
   1.842 +      *t++ = I2C_ESC;		/* ESC ( J */
   1.843 +      *t++ = I2C_G0_94;
   1.844 +      *t++ = I2CS_94_JIS_ROMAN;
   1.845 +    }
   1.846 +    *t++ = NIL;			/* tie off returned data */
   1.847 +    return LONGT;		/* return success */
   1.848 +  }
   1.849 +  ret->data = NIL;
   1.850 +  ret->size = 0;
   1.851 +  return NIL;			/* failure */
   1.852 +}
   1.853 +
   1.854 +/* Calculate size of convertsion of UTF-8 sized text to charset using rmap
   1.855 + * Accepts: source sized text
   1.856 + *	    conversion rmap
   1.857 + *	    pointer to returned sized text
   1.858 + *	    substitute character if not in rmap, else NIL to return failure
   1.859 + *	    ISO-2022-JP conversion flag
   1.860 + * Returns size+1 if successful, NIL if failure
   1.861 + *
   1.862 + * This routine doesn't try to handle to all possible charsets; in particular
   1.863 + * it doesn't support other Unicode encodings or any ISO 2022 other than
   1.864 + * ISO-2022-JP.
   1.865 + */
   1.866 +
   1.867 +unsigned long utf8_rmapsize (SIZEDTEXT *text,unsigned short *rmap,
   1.868 +			     unsigned long errch,long iso2022jp)
   1.869 +{
   1.870 +  unsigned long i,u,c;
   1.871 +  unsigned long ret = 1;	/* terminating NUL */
   1.872 +  unsigned char *s = text->data;
   1.873 +  if (iso2022jp) iso2022jp = 1;	/* start non-zero ISO-2022-JP state at 1 */
   1.874 +  for (i = text->size; i;) if ((u = utf8_get (&s,&i)) != UCS2_BOM) {
   1.875 +    if ((u & U8GM_NONBMP) || (((c = rmap[u]) == NOCHAR) && !(c = errch)))
   1.876 +      return NIL;		/* not in BMP, or NOCHAR and no err char */
   1.877 +    switch (iso2022jp) {	/* depends upon ISO 2022 mode */
   1.878 +    case 0:			/* ISO 2022 not in effect */
   1.879 +      ret += (c > 0xff) ? 2 : 1;
   1.880 +      break;
   1.881 +    case 1:			/* ISO 2022 Roman */
   1.882 +      if (c < 0x80) ret += 1;	/* <ch> */
   1.883 +      else {			/* JIS character */
   1.884 +	ret += 5;		/* ESC $ B <hi> <lo> */
   1.885 +	iso2022jp = 2;		/* shift to ISO 2022 JIS */
   1.886 +      }
   1.887 +      break;
   1.888 +    case 2:			/* ISO 2022 JIS */
   1.889 +      if (c > 0x7f) ret += 2;	/* <hi> <lo> */
   1.890 +      else {			/* ASCII character */
   1.891 +	ret += 4;		/* ESC ( J <ch> */
   1.892 +	iso2022jp = 1;		/* shift to ISO 2022 Roman */
   1.893 +      }
   1.894 +      break;
   1.895 +    }
   1.896 +  }
   1.897 +  if (iso2022jp == 2) {		/* ISO-2022-JP string must end in Roman */
   1.898 +    ret += 3;			/* ESC ( J */
   1.899 +    iso2022jp = 1;		/* reset state to Roman */
   1.900 +  }
   1.901 +  return ret;
   1.902 +}
   1.903 +
   1.904 +/* Convert UCS-4 to charset using rmap
   1.905 + * Accepts: source UCS-4 character(s)
   1.906 + *	    numver of UCS-4 characters
   1.907 + *	    conversion rmap
   1.908 + *	    pointer to returned sized text
   1.909 + *	    substitute character if not in rmap, else NIL to return failure
   1.910 + * Returns T if successful, NIL if failure
   1.911 + *
   1.912 + * Currently only supports BMP characters, and does not support ISO-2022
   1.913 + */
   1.914 +
   1.915 +long ucs4_rmaptext (unsigned long *ucs4,unsigned long len,unsigned short *rmap,
   1.916 +		    SIZEDTEXT *ret,unsigned long errch)
   1.917 +{
   1.918 +  long size = ucs4_rmaplen (ucs4,len,rmap,errch);
   1.919 +  return (size >= 0) ?		/* build in newly-created buffer */
   1.920 +    ucs4_rmapbuf (ret->data = (unsigned char *) fs_get ((ret->size = size) +1),
   1.921 +		  ucs4,len,rmap,errch) : NIL;
   1.922 +}
   1.923 +
   1.924 +/* Return size of UCS-4 string converted to other CS via rmap
   1.925 + * Accepts: source UCS-4 character(s)
   1.926 + *	    numver of UCS-4 characters
   1.927 + *	    conversion rmap
   1.928 + *	    substitute character if not in rmap, else NIL to return failure
   1.929 + * Returns: length if success, negative if failure (no-convert)
   1.930 + */
   1.931 +
   1.932 +long ucs4_rmaplen (unsigned long *ucs4,unsigned long len,unsigned short *rmap,
   1.933 +		   unsigned long errch)
   1.934 +{
   1.935 +  long ret;
   1.936 +  unsigned long i,u,c;
   1.937 +				/* count non-BOM characters */
   1.938 +  for (ret = 0,i = 0; i < len; ++i) if ((u = ucs4[i]) != UCS2_BOM) {
   1.939 +    if ((u & U8GM_NONBMP) || (((c = rmap[u]) == NOCHAR) && !(c = errch)))
   1.940 +      return -1;		/* not in BMP, or NOCHAR and no err char? */
   1.941 +    ret += (c > 0xff) ? 2 : 1;
   1.942 +  }
   1.943 +  return ret;
   1.944 +}
   1.945 +
   1.946 +
   1.947 +/* Stuff buffer with UCS-4 string converted to other CS via rmap
   1.948 + * Accepts: destination buffer
   1.949 + *	    source UCS-4 character(s)
   1.950 + *	    number of UCS-4 characters
   1.951 + *	    conversion rmap
   1.952 + *	    substitute character if not in rmap, else NIL to return failure
   1.953 + * Returns: T, always
   1.954 + */
   1.955 +
   1.956 +long ucs4_rmapbuf (unsigned char *t,unsigned long *ucs4,unsigned long len,
   1.957 +		   unsigned short *rmap,unsigned long errch)
   1.958 +{
   1.959 +  unsigned long i,u,c;
   1.960 +				/* convert non-BOM characters */
   1.961 +  for (i = 0; i < len; ++i) if ((u = ucs4[i]) != UCS2_BOM) {
   1.962 +				/* substitute error character for NOCHAR */
   1.963 +    if ((u & U8GM_NONBMP) || ((c = rmap[u]) == NOCHAR)) c = errch;
   1.964 +				/* two-byte character? */
   1.965 +    if (c > 0xff) *t++ = (unsigned char) (c >> 8);
   1.966 +				/* single-byte or low-byte of two-byte */
   1.967 +    *t++ = (unsigned char) (c & 0xff);
   1.968 +  }
   1.969 +  *t++ = NIL;			/* tie off returned data */
   1.970 +  return LONGT;
   1.971 +}
   1.972 +
   1.973 +/* Return UCS-4 Unicode character from UTF-8 string
   1.974 + * Accepts: pointer to string
   1.975 + *	    remaining octets in string
   1.976 + * Returns: UCS-4 character with pointer and count updated
   1.977 + *	    or error code with pointer and count unchanged
   1.978 + */
   1.979 +
   1.980 +unsigned long utf8_get (unsigned char **s,unsigned long *i)
   1.981 +{
   1.982 +  unsigned char *t = *s;
   1.983 +  unsigned long j = *i;
   1.984 +				/* decode raw UTF-8 string */
   1.985 +  unsigned long ret = utf8_get_raw (&t,&j);
   1.986 +  if (ret & U8G_ERROR);		/* invalid raw UTF-8 decoding? */
   1.987 +				/* no, is it surrogate? */
   1.988 +  else if ((ret >= UTF16_SURR) && (ret <= UTF16_MAXSURR)) ret = U8G_SURROGA;
   1.989 +				/* or in non-Unicode ISO 10646 space? */
   1.990 +  else if (ret > UCS4_MAXUNICODE) ret = U8G_NOTUNIC;
   1.991 +  else {
   1.992 +    *s = t;			/* all is well, update pointer */
   1.993 +    *i = j;			/* and counter */
   1.994 +  }
   1.995 +  return ret;			/* return value */
   1.996 +}
   1.997 +
   1.998 +/* Return raw (including non-Unicode) UCS-4 character from UTF-8 string
   1.999 + * Accepts: pointer to string
  1.1000 + *	    remaining octets in string
  1.1001 + * Returns: UCS-4 character with pointer and count updated
  1.1002 + *	    or error code with pointer and count unchanged
  1.1003 + */
  1.1004 +
  1.1005 +unsigned long utf8_get_raw (unsigned char **s,unsigned long *i)
  1.1006 +{
  1.1007 +  unsigned char c,c1;
  1.1008 +  unsigned char *t = *s;
  1.1009 +  unsigned long j = *i;
  1.1010 +  unsigned long ret = U8G_NOTUTF8;
  1.1011 +  int more = 0;
  1.1012 +  do {				/* make sure have source octets available */
  1.1013 +    if (!j--) return more ? U8G_ENDSTRI : U8G_ENDSTRG;
  1.1014 +				/* UTF-8 continuation? */
  1.1015 +    else if (((c = *t++) > 0x7f) && (c < 0xc0)) {
  1.1016 +				/* continuation when not in progress */
  1.1017 +      if (!more) return U8G_BADCONT;
  1.1018 +      --more;			/* found a continuation octet */
  1.1019 +      ret <<= 6;		/* shift current value by 6 bits */
  1.1020 +      ret |= c & 0x3f;		/* merge continuation octet */
  1.1021 +    }
  1.1022 +				/* incomplete UTF-8 character */
  1.1023 +    else if (more) return U8G_INCMPLT;
  1.1024 +    else {			/* start of sequence */
  1.1025 +      c1 = j ? *t : 0xbf;	/* assume valid continuation if incomplete */
  1.1026 +      if (c < 0x80) ret = c;	/* U+0000 - U+007f */
  1.1027 +      else if (c < 0xc2);	/* c0 and c1 never valid */
  1.1028 +      else if (c < 0xe0) {	/* U+0080 - U+07ff */
  1.1029 +	if (c &= 0x1f) more = 1;
  1.1030 +      }
  1.1031 +      else if (c < 0xf0) {	/* U+0800 - U+ffff */
  1.1032 +	if ((c &= 0x0f) || (c1 >= 0xa0)) more = 2;
  1.1033 +      }
  1.1034 +      else if (c < 0xf8) {	/* U+10000 - U+10ffff (and 110000 - 1fffff) */
  1.1035 +	if ((c &= 0x07) || (c1 >= 0x90)) more = 3;
  1.1036 +      }
  1.1037 +      else if (c < 0xfc) {	/* ISO 10646 200000 - 3ffffff */
  1.1038 +	if ((c &= 0x03) || (c1 >= 0x88)) more = 4;
  1.1039 +      }
  1.1040 +      else if (c < 0xfe) {	/* ISO 10646 4000000 - 7fffffff */
  1.1041 +	if ((c &= 0x01) || (c1 >= 0x84)) more = 5;
  1.1042 +      }
  1.1043 +				/* fe and ff never valid */
  1.1044 +      if (more) {		/* multi-octet, make sure more to come */
  1.1045 +	if (!j) return U8G_ENDSTRI;
  1.1046 +	ret = c;		/* continuation needed, save start bits */
  1.1047 +      }
  1.1048 +    }
  1.1049 +  } while (more);
  1.1050 +  if (!(ret & U8G_ERROR)) {	/* success return? */
  1.1051 +    *s = t;			/* yes, update pointer */
  1.1052 +    *i = j;			/* and counter */
  1.1053 +  }
  1.1054 +  return ret;			/* return value */
  1.1055 +}
  1.1056 +
  1.1057 +/* Return UCS-4 character from named charset string
  1.1058 + * Accepts: charset
  1.1059 + *	    pointer to string
  1.1060 + *	    remaining octets in string
  1.1061 + * Returns: UCS-4 character with pointer and count updated, negative if error
  1.1062 + *
  1.1063 + * Error codes are the same as utf8_get().
  1.1064 + */
  1.1065 +
  1.1066 +unsigned long ucs4_cs_get (CHARSET *cs,unsigned char **s,unsigned long *i)
  1.1067 +{
  1.1068 +  unsigned char c,c1,ku,ten;
  1.1069 +  unsigned long ret,d;
  1.1070 +  unsigned char *t = *s;
  1.1071 +  unsigned long j = *i;
  1.1072 +  struct utf8_eucparam *p1,*p2,*p3;
  1.1073 +  if (j--) c = *t++;		/* get first octet */
  1.1074 +  else return U8G_ENDSTRG;	/* empty string */
  1.1075 +  switch (cs->type) {		/* convert if type known */
  1.1076 +  case CT_UTF8:			/* variable UTF-8 encoded Unicode no table */
  1.1077 +    return utf8_get (s,i);
  1.1078 +  case CT_ASCII:		/* 7-bit ASCII no table */
  1.1079 +    if (c >= 0x80) return U8G_NOTUTF8;
  1.1080 +  case CT_1BYTE0:		/* 1 byte no table */
  1.1081 +    ret = c;			/* identity */
  1.1082 +    break;
  1.1083 +  case CT_1BYTE:		/* 1 byte ASCII + table 0x80-0xff */
  1.1084 +    ret = (c > 0x80) ? ((unsigned short *) cs->tab)[c & BITS7] : c;
  1.1085 +    break;
  1.1086 +  case CT_1BYTE8:		/* 1 byte table 0x00 - 0xff */
  1.1087 +    ret = ((unsigned short *) cs->tab)[c];
  1.1088 +    break;
  1.1089 +
  1.1090 +  case CT_EUC:			/* 2 byte ASCII + utf8_eucparam base/CS2/CS3 */
  1.1091 +    if (c & BIT8) {
  1.1092 +      p1 = (struct utf8_eucparam *) cs->tab;
  1.1093 +      p2 = p1 + 1;
  1.1094 +      p3 = p1 + 2;
  1.1095 +      if (j--) c1 = *t++;	/* get second octet */
  1.1096 +      else return U8G_ENDSTRI;
  1.1097 +      if (!(c1 & BIT8)) return U8G_NOTUTF8;
  1.1098 +      switch (c) {		/* check 8bit code set */
  1.1099 +      case EUC_CS2:		/* CS2 */
  1.1100 +	if (p2->base_ku) {	/* CS2 set up? */
  1.1101 +	  if (p2->base_ten) {	/* yes, multibyte? */
  1.1102 +	    if (j--) c = *t++;	/* get second octet */
  1.1103 +	    else return U8G_ENDSTRI;
  1.1104 +	    if ((c & BIT8) &&
  1.1105 +		((ku = (c1 & BITS7) - p2->base_ku) < p2->max_ku) &&
  1.1106 +		((ten = (c & BITS7) - p2->base_ten) < p2->max_ten)) {
  1.1107 +	      ret = ((unsigned short *) p2->tab)[(ku*p2->max_ten) + ten];
  1.1108 +	      break;
  1.1109 +	    }
  1.1110 +	  }
  1.1111 +	  else if ((c1 >= p2->base_ku) && (c1 < p2->max_ku)) {
  1.1112 +	    ret = c1 + ((unsigned long) p2->tab);
  1.1113 +	    break;
  1.1114 +	  }
  1.1115 +	}
  1.1116 +	return U8G_NOTUTF8;	/* CS2 not set up or bogus */
  1.1117 +      case EUC_CS3:		/* CS3 */
  1.1118 +	if (p3->base_ku) {	/* CS3 set up? */
  1.1119 +	  if (p3->base_ten) {	/* yes, multibyte? */
  1.1120 +	    if (j--) c = *t++;	/* get second octet */
  1.1121 +	    else return U8G_ENDSTRI;
  1.1122 +	    if ((c & BIT8) &&
  1.1123 +		((ku = (c1 & BITS7) - p3->base_ku) < p3->max_ku) &&
  1.1124 +		((ten = (c & BITS7) - p3->base_ten) < p3->max_ten)) {
  1.1125 +	      ret = ((unsigned short *) p3->tab)[(ku*p3->max_ten) + ten];
  1.1126 +	      break;
  1.1127 +	    }
  1.1128 +	  }
  1.1129 +	  else if ((c1 >= p3->base_ku) && (c1 < p3->max_ku)) {
  1.1130 +	    ret = c1 + ((unsigned long) p3->tab);
  1.1131 +	    break;
  1.1132 +	  }
  1.1133 +	}
  1.1134 +	return U8G_NOTUTF8;	/* CS3 not set up or bogus */
  1.1135 +      default:
  1.1136 +	if (((ku = (c & BITS7) - p1->base_ku) >= p1->max_ku) ||
  1.1137 +	    ((ten = (c1 & BITS7) - p1->base_ten) >= p1->max_ten))
  1.1138 +	  return U8G_NOTUTF8;
  1.1139 +	ret = ((unsigned short *) p1->tab)[(ku*p1->max_ten) + ten];
  1.1140 +		/* special hack for JIS X 0212: merge rows less than 10 */
  1.1141 +	if ((ret == UBOGON) && ku && (ku < 10) && p3->tab && p3->base_ten)
  1.1142 +	  ret = ((unsigned short *) p3->tab)
  1.1143 +	    [((ku - (p3->base_ku - p1->base_ku))*p3->max_ten) + ten];
  1.1144 +	break;
  1.1145 +      }
  1.1146 +    }
  1.1147 +    else ret = c;		/* ASCII character */
  1.1148 +    break;
  1.1149 +
  1.1150 +  case CT_DBYTE:		/* 2 byte ASCII + utf8_eucparam */
  1.1151 +    if (c & BIT8) {		/* double-byte character? */
  1.1152 +      p1 = (struct utf8_eucparam *) cs->tab;
  1.1153 +      if (j--) c1 = *t++;	/* get second octet */
  1.1154 +      else return U8G_ENDSTRI;
  1.1155 +      if (((ku = c - p1->base_ku) < p1->max_ku) &&
  1.1156 +	  ((ten = c1 - p1->base_ten) < p1->max_ten))
  1.1157 +	ret = ((unsigned short *) p1->tab)[(ku*p1->max_ten) + ten];
  1.1158 +      else return U8G_NOTUTF8;
  1.1159 +    }
  1.1160 +    else ret = c;		/* ASCII character */
  1.1161 +    break;
  1.1162 +  case CT_DBYTE2:		/* 2 byte ASCII + utf8_eucparam plane1/2 */
  1.1163 +    if (c & BIT8) {		/* double-byte character? */
  1.1164 +      p1 = (struct utf8_eucparam *) cs->tab;
  1.1165 +      p2 = p1 + 1;
  1.1166 +      if (j--) c1 = *t++;	/* get second octet */
  1.1167 +      else return U8G_ENDSTRI;
  1.1168 +      if (c1 & BIT8) {		/* high vs. low plane */
  1.1169 +	if ((ku = c - p2->base_ku) < p2->max_ku &&
  1.1170 +	    ((ten = c1 - p2->base_ten) < p2->max_ten))
  1.1171 +	  ret = ((unsigned short *) p1->tab)
  1.1172 +	    [(ku*(p1->max_ten + p2->max_ten)) + p1->max_ten + ten];
  1.1173 +	else return U8G_NOTUTF8;
  1.1174 +      }
  1.1175 +      else if ((ku = c - p1->base_ku) < p1->max_ku &&
  1.1176 +	       ((ten = c1 - p1->base_ten) < p1->max_ten))
  1.1177 +	  ret = ((unsigned short *) p1->tab)
  1.1178 +	    [(ku*(p1->max_ten + p2->max_ten)) + ten];
  1.1179 +      else return U8G_NOTUTF8;
  1.1180 +    }
  1.1181 +    else ret = c;		/* ASCII character */
  1.1182 +    break;
  1.1183 +  case CT_SJIS:			/* 2 byte Shift-JIS encoded JIS no table */
  1.1184 +				/* compromise - do yen sign but not overline */
  1.1185 +    if (!(c & BIT8)) ret = (c == JISROMAN_YEN) ? UCS2_YEN : c;
  1.1186 +				/* half-width katakana? */
  1.1187 +    else if ((c >= MIN_KANA_8) && (c < MAX_KANA_8)) ret = c + KANA_8;
  1.1188 +    else {			/* Shift-JIS */
  1.1189 +      if (j--) c1 = *t++;	/* get second octet */
  1.1190 +      else return U8G_ENDSTRI;
  1.1191 +      SJISTOJIS (c,c1);
  1.1192 +      c = JISTOUNICODE (c,c1,ku,ten);
  1.1193 +    }
  1.1194 +    break;
  1.1195 +
  1.1196 +  case CT_UCS2:			/* 2 byte 16-bit Unicode no table */
  1.1197 +    ret = c << 8;
  1.1198 +    if (j--) c = *t++;		/* get second octet */
  1.1199 +    else return U8G_ENDSTRI;	/* empty string */
  1.1200 +    ret |= c;
  1.1201 +    break;
  1.1202 +  case CT_UCS4:			/* 4 byte 32-bit Unicode no table */
  1.1203 +    if (c & 0x80) return U8G_NOTUTF8;
  1.1204 +    if (j < 3) return U8G_ENDSTRI;
  1.1205 +    j -= 3;			/* count three octets */
  1.1206 +    ret = c << 24;
  1.1207 +    ret |= (*t++) << 16;
  1.1208 +    ret |= (*t++) << 8;
  1.1209 +    ret |= (*t++);
  1.1210 +    break;
  1.1211 +  case CT_UTF16:		/* variable UTF-16 encoded Unicode no table */
  1.1212 +    ret = c << 8;
  1.1213 +    if (j--) c = *t++;		/* get second octet */
  1.1214 +    else return U8G_ENDSTRI;	/* empty string */
  1.1215 +    ret |= c;
  1.1216 +				/* surrogate? */
  1.1217 +    if ((ret >= UTF16_SURR) && (ret <= UTF16_MAXSURR)) {
  1.1218 +				/* invalid first surrogate */
  1.1219 +      if ((ret > UTF16_SURRHEND) || (j < 2)) return U8G_NOTUTF8;
  1.1220 +      j -= 2;			/* count two octets */
  1.1221 +      d = (*t++) << 8;		/* first octet of second surrogate */
  1.1222 +      d |= *t++;		/* second octet of second surrogate */
  1.1223 +      if ((d < UTF16_SURRL) || (d > UTF16_SURRLEND)) return U8G_NOTUTF8;
  1.1224 +      ret = UTF16_BASE + ((ret & UTF16_MASK) << UTF16_SHIFT) +
  1.1225 +	(d & UTF16_MASK);
  1.1226 +    }
  1.1227 +    break;
  1.1228 +  default:			/* unknown/unsupported character set type */
  1.1229 +    return U8G_NOTUTF8;
  1.1230 +  }
  1.1231 +  *s = t;			/* update pointer and counter */
  1.1232 +  *i = j;
  1.1233 +  return ret;
  1.1234 +}
  1.1235 +
  1.1236 +/* Produce charset validity map for BMP
  1.1237 + * Accepts: list of charsets to map
  1.1238 + * Returns: validity map, indexed by BMP codepoint
  1.1239 + *
  1.1240 + * Bit 0x1 is the "not-CJK" character bit
  1.1241 + */
  1.1242 +
  1.1243 +unsigned long *utf8_csvalidmap (char *charsets[])
  1.1244 +{
  1.1245 +  unsigned short u,*tab;
  1.1246 +  unsigned int m,ku,ten;
  1.1247 +  unsigned long i,csi,csb;
  1.1248 +  struct utf8_eucparam *param,*p2;
  1.1249 +  char *s;
  1.1250 +  const CHARSET *cs;
  1.1251 +  unsigned long *ret = (unsigned long *)
  1.1252 +    fs_get (i = 0x10000 * sizeof (unsigned long));
  1.1253 +  memset (ret,0,i);		/* zero the entire vector */
  1.1254 +				/* mark all the non-CJK codepoints */
  1.1255 +	/* U+0000 - U+2E7F non-CJK */
  1.1256 +  for (i = 0; i < 0x2E7F; ++i) ret[i] = 0x1;
  1.1257 +	/* U+2E80 - U+2EFF CJK Radicals Supplement
  1.1258 +	 * U+2F00 - U+2FDF Kangxi Radicals
  1.1259 +	 * U+2FE0 - U+2FEF unassigned
  1.1260 +	 * U+2FF0 - U+2FFF Ideographic Description Characters
  1.1261 +	 * U+3000 - U+303F CJK Symbols and Punctuation
  1.1262 +	 * U+3040 - U+309F Hiragana
  1.1263 +	 * U+30A0 - U+30FF Katakana
  1.1264 +	 * U+3100 - U+312F BoPoMoFo
  1.1265 +	 * U+3130 - U+318F Hangul Compatibility Jamo
  1.1266 +	 * U+3190 - U+319F Kanbun
  1.1267 +	 * U+31A0 - U+31BF BoPoMoFo Extended
  1.1268 +	 * U+31C0 - U+31EF CJK Strokes
  1.1269 +	 * U+31F0 - U+31FF Katakana Phonetic Extensions
  1.1270 +	 * U+3200 - U+32FF Enclosed CJK Letters and Months
  1.1271 +	 * U+3300 - U+33FF CJK Compatibility
  1.1272 +	 * U+3400 - U+4DBF CJK Unified Ideographs Extension A
  1.1273 +	 * U+4DC0 - U+4DFF Yijing Hexagram Symbols
  1.1274 +	 * U+4E00 - U+9FFF CJK Unified Ideographs
  1.1275 +	 * U+A000 - U+A48F Yi Syllables
  1.1276 +	 * U+A490 - U+A4CF Yi Radicals
  1.1277 +	 * U+A700 - U+A71F Modifier Tone Letters
  1.1278 +	 */
  1.1279 +  for (i = 0xa720; i < 0xabff; ++i) ret[i] = 0x1;
  1.1280 +	/* U+AC00 - U+D7FF Hangul Syllables */
  1.1281 +  for (i = 0xd800; i < 0xf8ff; ++i) ret[i] = 0x1;
  1.1282 +	/* U+F900 - U+FAFF CJK Compatibility Ideographs */
  1.1283 +  for (i = 0xfb00; i < 0xfe2f; ++i) ret[i] = 0x1;
  1.1284 +	/* U+FE30 - U+FE4F CJK Compatibility Forms
  1.1285 +	 * U+FE50 - U+FE6F Small Form Variants (for CNS 11643)
  1.1286 +	 */
  1.1287 +  for (i = 0xfe70; i < 0xfeff; ++i) ret[i] = 0x1;
  1.1288 +	/* U+FF00 - U+FFEF CJK Compatibility Ideographs */
  1.1289 +  for (i = 0xfff0; i < 0x10000; ++i) ret[i] = 0x1;
  1.1290 +
  1.1291 +				/* for each supplied charset */
  1.1292 +  for (csi = 1; ret && charsets && (s = charsets[csi - 1]); ++csi) {
  1.1293 +				/* substitute EUC-JP for ISO-2022-JP */
  1.1294 +    if (!compare_cstring (s,"ISO-2022-JP")) s = "EUC-JP";
  1.1295 +				/* look up charset */
  1.1296 +    if (cs = utf8_charset (s)) {
  1.1297 +      csb = 1 << csi;		/* charset bit */
  1.1298 +      switch (cs->type) {
  1.1299 +      case CT_ASCII:		/* 7-bit ASCII no table */
  1.1300 +      case CT_1BYTE0:		/* 1 byte no table */
  1.1301 +      case CT_1BYTE:		/* 1 byte ASCII + table 0x80-0xff */
  1.1302 +      case CT_1BYTE8:		/* 1 byte table 0x00 - 0xff */
  1.1303 +      case CT_EUC:		/* 2 byte ASCII + utf8_eucparam base/CS2/CS3 */
  1.1304 +      case CT_DBYTE:		/* 2 byte ASCII + utf8_eucparam */
  1.1305 +      case CT_DBYTE2:		/* 2 byte ASCII + utf8_eucparam plane1/2 */
  1.1306 +      case CT_SJIS:		/* 2 byte Shift-JIS */
  1.1307 +				/* supported charset type, all ASCII is OK */
  1.1308 +	for (i = 0; i < 128; ++i) ret[i] |= csb;
  1.1309 +	break;
  1.1310 +      default:			/* unsupported charset type */
  1.1311 +	fs_give ((void **) &ret);
  1.1312 +	break;
  1.1313 +      }
  1.1314 +				/* now do additional operations */
  1.1315 +      if (ret) switch (cs->type) {
  1.1316 +      case CT_1BYTE0:		/* 1 byte no table */
  1.1317 +	for (i = 128; i < 256; i++) ret[i] |= csb;
  1.1318 +	break;
  1.1319 +      case CT_1BYTE:		/* 1 byte ASCII + table 0x80-0xff */
  1.1320 +	for (tab = (unsigned short *) cs->tab,i = 128; i < 256; i++)
  1.1321 +	  if (tab[i & BITS7] != UBOGON) ret[tab[i & BITS7]] |= csb;
  1.1322 +	break;
  1.1323 +      case CT_1BYTE8:		/* 1 byte table 0x00 - 0xff */
  1.1324 +	for (tab = (unsigned short *) cs->tab,i = 0; i < 256; i++)
  1.1325 +	  if (tab[i] != UBOGON) ret[tab[i]] |= csb;
  1.1326 +      break;
  1.1327 +      case CT_EUC:		/* 2 byte ASCII + utf8_eucparam base/CS2/CS3 */
  1.1328 +	for (param = (struct utf8_eucparam *) cs->tab,
  1.1329 +	       tab = (unsigned short *) param->tab, ku = 0;
  1.1330 +	     ku < param->max_ku; ku++)
  1.1331 +	  for (ten = 0; ten < param->max_ten; ten++)
  1.1332 +	    if ((u = tab[(ku * param->max_ten) + ten]) != UBOGON)
  1.1333 +	      ret[u] |= csb;
  1.1334 +	break;
  1.1335 +
  1.1336 +      case CT_DBYTE:		/* 2 byte ASCII + utf8_eucparam */
  1.1337 +	for (param = (struct utf8_eucparam *) cs->tab,
  1.1338 +	       tab = (unsigned short *) param->tab, ku = 0;
  1.1339 +	     ku < param->max_ku; ku++)
  1.1340 +	  for (ten = 0; ten < param->max_ten; ten++)
  1.1341 +	    if ((u = tab[(ku * param->max_ten) + ten]) != UBOGON)
  1.1342 +	      ret[u] |= csb;
  1.1343 +      break;
  1.1344 +      case CT_DBYTE2:		/* 2 byte ASCII + utf8_eucparam plane1/2 */
  1.1345 +	param = (struct utf8_eucparam *) cs->tab;
  1.1346 +	p2 = param + 1;		/* plane 2 parameters */
  1.1347 +				/* only ten parameters should differ */
  1.1348 +	if ((param->base_ku != p2->base_ku) || (param->max_ku != p2->max_ku))
  1.1349 +	  fatal ("ku definition error for CT_DBYTE2 charset");
  1.1350 +				/* total codepoints in each ku */
  1.1351 +	m = param->max_ten + p2->max_ten;
  1.1352 +	tab = (unsigned short *) param->tab;
  1.1353 +	for (ku = 0; ku < param->max_ku; ku++) {
  1.1354 +	  for (ten = 0; ten < param->max_ten; ten++)
  1.1355 +	    if ((u = tab[(ku * m) + ten]) != UBOGON)
  1.1356 +	      ret[u] |= csb;
  1.1357 +	  for (ten = 0; ten < p2->max_ten; ten++)
  1.1358 +	    if ((u = tab[(ku * m) + param->max_ten + ten]) != UBOGON)
  1.1359 +	      ret[u] |= csb;
  1.1360 +	}
  1.1361 +	break;
  1.1362 +      case CT_SJIS:		/* 2 byte Shift-JIS */
  1.1363 +	for (ku = 0; ku < MAX_JIS0208_KU; ku++)
  1.1364 +	  for (ten = 0; ten < MAX_JIS0208_TEN; ten++)
  1.1365 +	    if ((u = jis0208tab[ku][ten]) != UBOGON) ret[u] |= csb;
  1.1366 +				/* JIS hankaku katakana */
  1.1367 +	for (u = 0; u < (MAX_KANA_8 - MIN_KANA_8); u++)
  1.1368 +	  ret[UCS2_KATAKANA + u] |= csb;
  1.1369 +	break;
  1.1370 +      }
  1.1371 +    }
  1.1372 +				/* invalid charset, punt */
  1.1373 +    else fs_give ((void **) &ret);
  1.1374 +  }
  1.1375 +  return ret;
  1.1376 +}
  1.1377 +
  1.1378 +/* Infer charset from unlabelled sized text
  1.1379 + * Accepts: sized text
  1.1380 + * Returns: charset if one inferred, or NIL if unknown
  1.1381 + */
  1.1382 +
  1.1383 +const CHARSET *utf8_infercharset (SIZEDTEXT *src)
  1.1384 +{
  1.1385 +  long iso2022jp = NIL;
  1.1386 +  long eightbit = NIL;
  1.1387 +  unsigned long i;
  1.1388 +				/* look for ISO 2022 */
  1.1389 +  if (src) for (i = 0; i < src->size; i++) {
  1.1390 +				/* ESC sequence? */
  1.1391 +    if ((src->data[i] == I2C_ESC) && (++i < src->size)) switch (src->data[i]) {
  1.1392 +    case I2C_MULTI:		/* yes, multibyte? */
  1.1393 +      if (++i < src->size) switch (src->data[i]) {
  1.1394 +      case I2CS_94x94_JIS_OLD:	/* JIS X 0208-1978 */
  1.1395 +      case I2CS_94x94_JIS_NEW:	/* JIS X 0208-1983 */
  1.1396 +      case I2CS_94x94_JIS_EXT:	/* JIS X 0212-1990 (kludge...) */
  1.1397 +	iso2022jp = T;		/* found an ISO-2022-JP sequence */
  1.1398 +	break;
  1.1399 +      default:			/* other multibyte */
  1.1400 +	return NIL;		/* definitely invalid */
  1.1401 +      }
  1.1402 +      break;
  1.1403 +    case I2C_G0_94:		/* single byte */
  1.1404 +      if (++i < src->size) switch (src->data[i]) {
  1.1405 +      case I2CS_94_JIS_BUGROM:	/* in case old buggy software */
  1.1406 +      case I2CS_94_JIS_ROMAN:	/* JIS X 0201-1976 left half */
  1.1407 +      case I2CS_94_ASCII:	/* ASCII */
  1.1408 +      case I2CS_94_BRITISH:	/* good enough for gov't work */
  1.1409 +	break;
  1.1410 +      default:			/* other 94 single byte */
  1.1411 +	return NIL;		/* definitely invalid */
  1.1412 +      }
  1.1413 +    }
  1.1414 +				/* if possible UTF-8 and not ISO-2022-JP */
  1.1415 +    else if (!iso2022jp && (eightbit >= 0) && (src->data[i] & BIT8) &&
  1.1416 +	     (eightbit = utf8_validate (src->data + i,src->size - i)) > 0)
  1.1417 +      i += eightbit - 1;	/* skip past all but last of UTF-8 char */
  1.1418 +  }
  1.1419 +				/* ISO-2022-JP overrides other guesses */
  1.1420 +  if (iso2022jp) return utf8_charset ("ISO-2022-JP");
  1.1421 +  if (eightbit > 0) return utf8_charset ("UTF-8");
  1.1422 +  return eightbit ? NIL : utf8_charset ("US-ASCII");
  1.1423 +}
  1.1424 +
  1.1425 +
  1.1426 +/* Validate that character at this position is UTF-8
  1.1427 + * Accepts: string pointer
  1.1428 + *	    size of remaining string
  1.1429 + * Returns: size of UTF-8 character in octets or -1 if not UTF-8
  1.1430 + */
  1.1431 +
  1.1432 +long utf8_validate (unsigned char *s,unsigned long i)
  1.1433 +{
  1.1434 +  unsigned long j = i;
  1.1435 +  return (utf8_get (&s,&i) & U8G_ERROR) ? -1 : j - i;
  1.1436 +}
  1.1437 +
  1.1438 +/* Convert ISO 8859-1 to UTF-8
  1.1439 + * Accepts: source sized text
  1.1440 + *	    pointer to return sized text
  1.1441 + *	    canonicalization function
  1.1442 + */
  1.1443 +
  1.1444 +void utf8_text_1byte0 (SIZEDTEXT *text,SIZEDTEXT *ret,ucs4cn_t cv,ucs4de_t de)
  1.1445 +{
  1.1446 +  unsigned long i;
  1.1447 +  unsigned char *s;
  1.1448 +  unsigned int c;
  1.1449 +  for (ret->size = i = 0; i < text->size;) {
  1.1450 +    c = text->data[i++];
  1.1451 +    UTF8_COUNT_BMP (ret->size,c,cv,de)
  1.1452 +  }
  1.1453 +  (s = ret->data = (unsigned char *) fs_get (ret->size + 1))[ret->size] =NIL;
  1.1454 +  for (i = 0; i < text->size;) {
  1.1455 +    c = text->data[i++];
  1.1456 +    UTF8_WRITE_BMP (s,c,cv,de)	/* convert UCS-2 to UTF-8 */
  1.1457 +  }
  1.1458 +}
  1.1459 +
  1.1460 +
  1.1461 +/* Convert single byte ASCII+8bit character set sized text to UTF-8
  1.1462 + * Accepts: source sized text
  1.1463 + *	    pointer to return sized text
  1.1464 + *	    conversion table
  1.1465 + *	    canonicalization function
  1.1466 + */
  1.1467 +
  1.1468 +void utf8_text_1byte (SIZEDTEXT *text,SIZEDTEXT *ret,void *tab,ucs4cn_t cv,
  1.1469 +		      ucs4de_t de)
  1.1470 +{
  1.1471 +  unsigned long i;
  1.1472 +  unsigned char *s;
  1.1473 +  unsigned int c;
  1.1474 +  unsigned short *tbl = (unsigned short *) tab;
  1.1475 +  for (ret->size = i = 0; i < text->size;) {
  1.1476 +    if ((c = text->data[i++]) & BIT8) c = tbl[c & BITS7];
  1.1477 +    UTF8_COUNT_BMP (ret->size,c,cv,de)
  1.1478 +  }
  1.1479 +  (s = ret->data = (unsigned char *) fs_get (ret->size + 1))[ret->size] =NIL;
  1.1480 +  for (i = 0; i < text->size;) {
  1.1481 +    if ((c = text->data[i++]) & BIT8) c = tbl[c & BITS7];
  1.1482 +    UTF8_WRITE_BMP (s,c,cv,de)	/* convert UCS-2 to UTF-8 */
  1.1483 +  }
  1.1484 +}
  1.1485 +
  1.1486 +/* Convert single byte 8bit character set sized text to UTF-8
  1.1487 + * Accepts: source sized text
  1.1488 + *	    pointer to return sized text
  1.1489 + *	    conversion table
  1.1490 + *	    canonicalization function
  1.1491 + */
  1.1492 +
  1.1493 +void utf8_text_1byte8 (SIZEDTEXT *text,SIZEDTEXT *ret,void *tab,ucs4cn_t cv,
  1.1494 +		       ucs4de_t de)
  1.1495 +{
  1.1496 +  unsigned long i;
  1.1497 +  unsigned char *s;
  1.1498 +  unsigned int c;
  1.1499 +  unsigned short *tbl = (unsigned short *) tab;
  1.1500 +  for (ret->size = i = 0; i < text->size;) {
  1.1501 +    c = tbl[text->data[i++]];
  1.1502 +    UTF8_COUNT_BMP (ret->size,c,cv,de)
  1.1503 +  }
  1.1504 +  (s = ret->data = (unsigned char *) fs_get (ret->size + 1))[ret->size] =NIL;
  1.1505 +  for (i = 0; i < text->size;) {
  1.1506 +    c = tbl[text->data[i++]];
  1.1507 +    UTF8_WRITE_BMP (s,c,cv,de)	/* convert UCS-2 to UTF-8 */
  1.1508 +  }
  1.1509 +}
  1.1510 +
  1.1511 +/* Convert EUC sized text to UTF-8
  1.1512 + * Accepts: source sized text
  1.1513 + *	    pointer to return sized text
  1.1514 + *	    EUC parameter table
  1.1515 + *	    canonicalization function
  1.1516 + */
  1.1517 +
  1.1518 +void utf8_text_euc (SIZEDTEXT *text,SIZEDTEXT *ret,void *tab,ucs4cn_t cv,
  1.1519 +		    ucs4de_t de)
  1.1520 +{
  1.1521 +  unsigned long i;
  1.1522 +  unsigned char *s;
  1.1523 +  unsigned int pass,c,c1,ku,ten;
  1.1524 +  struct utf8_eucparam *p1 = (struct utf8_eucparam *) tab;
  1.1525 +  struct utf8_eucparam *p2 = p1 + 1;
  1.1526 +  struct utf8_eucparam *p3 = p1 + 2;
  1.1527 +  unsigned short *t1 = (unsigned short *) p1->tab;
  1.1528 +  unsigned short *t2 = (unsigned short *) p2->tab;
  1.1529 +  unsigned short *t3 = (unsigned short *) p3->tab;
  1.1530 +  for (pass = 0,s = NIL,ret->size = 0; pass <= 1; pass++) {
  1.1531 +    for (i = 0; i < text->size;) {
  1.1532 +				/* not CS0? */
  1.1533 +      if ((c = text->data[i++]) & BIT8) {
  1.1534 +				/* yes, must have another high byte */
  1.1535 +	if ((i >= text->size) || !((c1 = text->data[i++]) & BIT8))
  1.1536 +	  c = UBOGON;		/* out of space or bogon */
  1.1537 +	else switch (c) {	/* check 8bit code set */
  1.1538 +	case EUC_CS2:		/* CS2 */
  1.1539 +	  if (p2->base_ku) {	/* CS2 set up? */
  1.1540 +	    if (p2->base_ten)	/* yes, multibyte? */
  1.1541 +	      c = ((i < text->size) && ((c = text->data[i++]) & BIT8) &&
  1.1542 +		   ((ku = (c1 & BITS7) - p2->base_ku) < p2->max_ku) &&
  1.1543 +		   ((ten = (c & BITS7) - p2->base_ten) < p2->max_ten)) ?
  1.1544 +		     t2[(ku*p2->max_ten) + ten] : UBOGON;
  1.1545 +	    else c = ((c1 >= p2->base_ku) && (c1 < p2->max_ku)) ?
  1.1546 +	      c1 + ((unsigned long) p2->tab) : UBOGON;
  1.1547 +	  }	  
  1.1548 +	  else {		/* CS2 not set up */
  1.1549 +	    c = UBOGON;		/* swallow byte, say bogon */
  1.1550 +	    if (i < text->size) i++;
  1.1551 +	  }
  1.1552 +	  break;
  1.1553 +	case EUC_CS3:		/* CS3 */
  1.1554 +	  if (p3->base_ku) {	/* CS3 set up? */
  1.1555 +	    if (p3->base_ten)	/* yes, multibyte? */
  1.1556 +	      c = ((i < text->size) && ((c = text->data[i++]) & BIT8) &&
  1.1557 +		   ((ku = (c1 & BITS7) - p3->base_ku) < p3->max_ku) &&
  1.1558 +		   ((ten = (c & BITS7) - p3->base_ten) < p3->max_ten)) ?
  1.1559 +		     t3[(ku*p3->max_ten) + ten] : UBOGON;
  1.1560 +	    else c = ((c1 >= p3->base_ku) && (c1 < p3->max_ku)) ?
  1.1561 +	      c1 + ((unsigned long) p3->tab) : UBOGON;
  1.1562 +	  }	  
  1.1563 +	  else {		/* CS3 not set up */
  1.1564 +	    c = UBOGON;		/* swallow byte, say bogon */
  1.1565 +	    if (i < text->size) i++;
  1.1566 +	  }
  1.1567 +	  break;
  1.1568 +
  1.1569 +	default:
  1.1570 +	  if (((ku = (c & BITS7) - p1->base_ku) >= p1->max_ku) ||
  1.1571 +	      ((ten = (c1 & BITS7) - p1->base_ten) >= p1->max_ten)) c = UBOGON;
  1.1572 +	  else if (((c = t1[(ku*p1->max_ten) + ten]) == UBOGON) &&
  1.1573 +		   /* special hack for JIS X 0212: merge rows less than 10 */
  1.1574 +		   ku && (ku < 10) && t3 && p3->base_ten)
  1.1575 +	    c = t3[((ku - (p3->base_ku - p1->base_ku))*p3->max_ten) + ten];
  1.1576 +	}
  1.1577 +      }
  1.1578 +				/* convert if second pass */
  1.1579 +      if (pass) UTF8_WRITE_BMP (s,c,cv,de)
  1.1580 +      else UTF8_COUNT_BMP (ret->size,c,cv,de);
  1.1581 +    }
  1.1582 +    if (!pass) (s = ret->data = (unsigned char *)
  1.1583 +		fs_get (ret->size + 1))[ret->size] =NIL;
  1.1584 +  }
  1.1585 +}
  1.1586 +
  1.1587 +
  1.1588 +/* Convert ASCII + double-byte sized text to UTF-8
  1.1589 + * Accepts: source sized text
  1.1590 + *	    pointer to return sized text
  1.1591 + *	    conversion table
  1.1592 + *	    canonicalization function
  1.1593 + */
  1.1594 +
  1.1595 +void utf8_text_dbyte (SIZEDTEXT *text,SIZEDTEXT *ret,void *tab,ucs4cn_t cv,
  1.1596 +		      ucs4de_t de)
  1.1597 +{
  1.1598 +  unsigned long i;
  1.1599 +  unsigned char *s;
  1.1600 +  unsigned int c,c1,ku,ten;
  1.1601 +  struct utf8_eucparam *p1 = (struct utf8_eucparam *) tab;
  1.1602 +  unsigned short *t1 = (unsigned short *) p1->tab;
  1.1603 +  for (ret->size = i = 0; i < text->size;) {
  1.1604 +    if ((c = text->data[i++]) & BIT8) {
  1.1605 +				/* special hack for GBK: 0x80 is Euro */
  1.1606 +      if ((c == 0x80) && (t1 == (unsigned short *) gb2312tab)) c = UCS2_EURO;
  1.1607 +      else c = ((i < text->size) && (c1 = text->data[i++]) &&
  1.1608 +		((ku = c - p1->base_ku) < p1->max_ku) &&
  1.1609 +		((ten = c1 - p1->base_ten) < p1->max_ten)) ?
  1.1610 +	     t1[(ku*p1->max_ten) + ten] : UBOGON;
  1.1611 +    }
  1.1612 +    UTF8_COUNT_BMP (ret->size,c,cv,de)
  1.1613 +  }
  1.1614 +  (s = ret->data = (unsigned char *) fs_get (ret->size + 1))[ret->size] = NIL;
  1.1615 +  for (i = 0; i < text->size;) {
  1.1616 +    if ((c = text->data[i++]) & BIT8) {
  1.1617 +				/* special hack for GBK: 0x80 is Euro */
  1.1618 +      if ((c == 0x80) && (t1 == (unsigned short *) gb2312tab)) c = UCS2_EURO;
  1.1619 +      else c = ((i < text->size) && (c1 = text->data[i++]) &&
  1.1620 +		((ku = c - p1->base_ku) < p1->max_ku) &&
  1.1621 +		((ten = c1 - p1->base_ten) < p1->max_ten)) ?
  1.1622 +	     t1[(ku*p1->max_ten) + ten] : UBOGON;
  1.1623 +    }
  1.1624 +    UTF8_WRITE_BMP (s,c,cv,de)	/* convert UCS-2 to UTF-8 */
  1.1625 +  }
  1.1626 +}
  1.1627 +
  1.1628 +/* Convert ASCII + double byte 2 plane sized text to UTF-8
  1.1629 + * Accepts: source sized text
  1.1630 + *	    pointer to return sized text
  1.1631 + *	    conversion table
  1.1632 + *	    canonicalization function
  1.1633 + */
  1.1634 +
  1.1635 +void utf8_text_dbyte2 (SIZEDTEXT *text,SIZEDTEXT *ret,void *tab,ucs4cn_t cv,
  1.1636 +		       ucs4de_t de)
  1.1637 +{
  1.1638 +  unsigned long i;
  1.1639 +  unsigned char *s;
  1.1640 +  unsigned int c,c1,ku,ten;
  1.1641 +  struct utf8_eucparam *p1 = (struct utf8_eucparam *) tab;
  1.1642 +  struct utf8_eucparam *p2 = p1 + 1;
  1.1643 +  unsigned short *t = (unsigned short *) p1->tab;
  1.1644 +  for (ret->size = i = 0; i < text->size;) {
  1.1645 +    if ((c = text->data[i++]) & BIT8) {
  1.1646 +      if ((i >= text->size) || !(c1 = text->data[i++]))
  1.1647 +	c = UBOGON;		/* out of space or bogon */
  1.1648 +      else if (c1 & BIT8)	/* high vs. low plane */
  1.1649 +	c = ((ku = c - p2->base_ku) < p2->max_ku &&
  1.1650 +	     ((ten = c1 - p2->base_ten) < p2->max_ten)) ?
  1.1651 +	       t[(ku*(p1->max_ten + p2->max_ten)) + p1->max_ten + ten] :UBOGON;
  1.1652 +      else c = ((ku = c - p1->base_ku) < p1->max_ku &&
  1.1653 +		((ten = c1 - p1->base_ten) < p1->max_ten)) ?
  1.1654 +		  t[(ku*(p1->max_ten + p2->max_ten)) + ten] : UBOGON;
  1.1655 +    }
  1.1656 +    UTF8_COUNT_BMP (ret->size,c,cv,de)
  1.1657 +  }
  1.1658 +  (s = ret->data = (unsigned char *) fs_get (ret->size + 1))[ret->size] = NIL;
  1.1659 +  for (i = 0; i < text->size;) {
  1.1660 +    if ((c = text->data[i++]) & BIT8) {
  1.1661 +      if ((i >= text->size) || !(c1 = text->data[i++]))
  1.1662 +	c = UBOGON;		/* out of space or bogon */
  1.1663 +      else if (c1 & BIT8)	/* high vs. low plane */
  1.1664 +	c = ((ku = c - p2->base_ku) < p2->max_ku &&
  1.1665 +	     ((ten = c1 - p2->base_ten) < p2->max_ten)) ?
  1.1666 +	       t[(ku*(p1->max_ten + p2->max_ten)) + p1->max_ten + ten] :UBOGON;
  1.1667 +      else c = ((ku = c - p1->base_ku) < p1->max_ku &&
  1.1668 +		((ten = c1 - p1->base_ten) < p1->max_ten)) ?
  1.1669 +		  t[(ku*(p1->max_ten + p2->max_ten)) + ten] : UBOGON;
  1.1670 +    }
  1.1671 +    UTF8_WRITE_BMP (s,c,cv,de)	/* convert UCS-2 to UTF-8 */
  1.1672 +  }
  1.1673 +}
  1.1674 +
  1.1675 +#ifdef JISTOUNICODE		/* Japanese */
  1.1676 +/* Convert Shift JIS sized text to UTF-8
  1.1677 + * Accepts: source sized text
  1.1678 + *	    pointer to return sized text
  1.1679 + *	    canonicalization function
  1.1680 + */
  1.1681 +
  1.1682 +void utf8_text_sjis (SIZEDTEXT *text,SIZEDTEXT *ret,ucs4cn_t cv,
  1.1683 +		     ucs4de_t de)
  1.1684 +{
  1.1685 +  unsigned long i;
  1.1686 +  unsigned char *s;
  1.1687 +  unsigned int c,c1,ku,ten;
  1.1688 +  for (ret->size = i = 0; i < text->size;) {
  1.1689 +    if ((c = text->data[i++]) & BIT8) {
  1.1690 +				/* half-width katakana */
  1.1691 +      if ((c >= MIN_KANA_8) && (c < MAX_KANA_8)) c += KANA_8;
  1.1692 +      else if (i >= text->size) c = UBOGON;
  1.1693 +      else {			/* Shift-JIS */
  1.1694 +	c1 = text->data[i++];
  1.1695 +	SJISTOJIS (c,c1);
  1.1696 +	c = JISTOUNICODE (c,c1,ku,ten);
  1.1697 +      }
  1.1698 +    }
  1.1699 +				/* compromise - do yen sign but not overline */
  1.1700 +    else if (c == JISROMAN_YEN) c = UCS2_YEN;
  1.1701 +    UTF8_COUNT_BMP (ret->size,c,cv,de)
  1.1702 +  }
  1.1703 +  (s = ret->data = (unsigned char *) fs_get (ret->size + 1))[ret->size] = NIL;
  1.1704 +  for (i = 0; i < text->size;) {
  1.1705 +    if ((c = text->data[i++]) & BIT8) {
  1.1706 +				/* half-width katakana */
  1.1707 +      if ((c >= MIN_KANA_8) && (c < MAX_KANA_8)) c += KANA_8;
  1.1708 +      else {			/* Shift-JIS */
  1.1709 +	c1 = text->data[i++];
  1.1710 +	SJISTOJIS (c,c1);
  1.1711 +	c = JISTOUNICODE (c,c1,ku,ten);
  1.1712 +      }
  1.1713 +    }
  1.1714 +				/* compromise - do yen sign but not overline */
  1.1715 +    else if (c == JISROMAN_YEN) c = UCS2_YEN;
  1.1716 +    UTF8_WRITE_BMP (s,c,cv,de)	/* convert UCS-2 to UTF-8 */
  1.1717 +  }
  1.1718 +}
  1.1719 +#endif
  1.1720 +
  1.1721 +/* Convert ISO-2022 sized text to UTF-8
  1.1722 + * Accepts: source sized text
  1.1723 + *	    pointer to returned sized text
  1.1724 + *	    canonicalization function
  1.1725 + */
  1.1726 +
  1.1727 +void utf8_text_2022 (SIZEDTEXT *text,SIZEDTEXT *ret,ucs4cn_t cv,ucs4de_t de)
  1.1728 +{
  1.1729 +  unsigned long i;
  1.1730 +  unsigned char *s;
  1.1731 +  unsigned int pass,state,c,co,gi,gl,gr,g[4],ku,ten;
  1.1732 +  for (pass = 0,s = NIL,ret->size = 0; pass <= 1; pass++) {
  1.1733 +    gi = 0;			/* quell compiler warnings */
  1.1734 +    state = I2S_CHAR;		/* initialize engine */
  1.1735 +    g[0]= g[2] = I2CS_ASCII;	/* G0 and G2 are ASCII */
  1.1736 +    g[1]= g[3] = I2CS_ISO8859_1;/* G1 and G3 are ISO-8850-1 */
  1.1737 +    gl = I2C_G0; gr = I2C_G1;	/* left is G0, right is G1 */
  1.1738 +    for (i = 0; i < text->size;) {
  1.1739 +      c = text->data[i++];
  1.1740 +      switch (state) {		/* dispatch based upon engine state */
  1.1741 +      case I2S_ESC:		/* ESC seen */
  1.1742 +	switch (c) {		/* process intermediate character */
  1.1743 +	case I2C_MULTI:		/* multibyte character? */
  1.1744 +	  state = I2S_MUL;	/* mark multibyte flag seen */
  1.1745 +	  break;
  1.1746 +        case I2C_SS2:		/* single shift GL to G2 */
  1.1747 +	case I2C_SS2_ALT:	/* Taiwan SeedNet */
  1.1748 +	  gl |= I2C_SG2;
  1.1749 +	  break;
  1.1750 +        case I2C_SS3:		/* single shift GL to G3 */
  1.1751 +	case I2C_SS3_ALT:	/* Taiwan SeedNet */
  1.1752 +	  gl |= I2C_SG3;
  1.1753 +	  break;
  1.1754 +        case I2C_LS2:		/* shift GL to G2 */
  1.1755 +	  gl = I2C_G2;
  1.1756 +	  break;
  1.1757 +        case I2C_LS3:		/* shift GL to G3 */
  1.1758 +	  gl = I2C_G3;
  1.1759 +	  break;
  1.1760 +        case I2C_LS1R:		/* shift GR to G1 */
  1.1761 +	  gr = I2C_G1;
  1.1762 +	  break;
  1.1763 +        case I2C_LS2R:		/* shift GR to G2 */
  1.1764 +	  gr = I2C_G2;
  1.1765 +	  break;
  1.1766 +        case I2C_LS3R:		/* shift GR to G3 */
  1.1767 +	  gr = I2C_G3;
  1.1768 +	  break;
  1.1769 +	case I2C_G0_94: case I2C_G1_94: case I2C_G2_94:	case I2C_G3_94:
  1.1770 +	  g[gi = c - I2C_G0_94] = (state == I2S_MUL) ? I2CS_94x94 : I2CS_94;
  1.1771 +	  state = I2S_INT;	/* ready for character set */
  1.1772 +	  break;
  1.1773 +	case I2C_G0_96:	case I2C_G1_96: case I2C_G2_96:	case I2C_G3_96:
  1.1774 +	  g[gi = c - I2C_G0_96] = (state == I2S_MUL) ? I2CS_96x96 : I2CS_96;
  1.1775 +	  state = I2S_INT;	/* ready for character set */
  1.1776 +	  break;
  1.1777 +	default:		/* bogon */
  1.1778 +	  if (pass) *s++ = I2C_ESC,*s++ = c;
  1.1779 +	  else ret->size += 2;
  1.1780 +	  state = I2S_CHAR;	/* return to previous state */
  1.1781 +	}
  1.1782 +	break;
  1.1783 +
  1.1784 +      case I2S_MUL:		/* ESC $ */
  1.1785 +	switch (c) {		/* process multibyte intermediate character */
  1.1786 +	case I2C_G0_94: case I2C_G1_94: case I2C_G2_94:	case I2C_G3_94:
  1.1787 +	  g[gi = c - I2C_G0_94] = I2CS_94x94;
  1.1788 +	  state = I2S_INT;	/* ready for character set */
  1.1789 +	  break;
  1.1790 +	case I2C_G0_96:	case I2C_G1_96: case I2C_G2_96:	case I2C_G3_96:
  1.1791 +	  g[gi = c - I2C_G0_96] = I2CS_96x96;
  1.1792 +	  state = I2S_INT;	/* ready for character set */
  1.1793 +	  break;
  1.1794 +	default:		/* probably omitted I2CS_94x94 */
  1.1795 +	  g[gi = I2C_G0] = I2CS_94x94 | c;
  1.1796 +	  state = I2S_CHAR;	/* return to character state */
  1.1797 +	}
  1.1798 +	break;
  1.1799 +      case I2S_INT:
  1.1800 +	state = I2S_CHAR;	/* return to character state */
  1.1801 +	g[gi] |= c;		/* set character set */
  1.1802 +	break;
  1.1803 +
  1.1804 +      case I2S_CHAR:		/* character data */
  1.1805 +	switch (c) {
  1.1806 +	case I2C_ESC:		/* ESC character */
  1.1807 +	  state = I2S_ESC;	/* see if ISO-2022 prefix */
  1.1808 +	  break;
  1.1809 +	case I2C_SI:		/* shift GL to G0 */
  1.1810 +	  gl = I2C_G0;
  1.1811 +	  break;
  1.1812 +	case I2C_SO:		/* shift GL to G1 */
  1.1813 +	  gl = I2C_G1;
  1.1814 +	  break;
  1.1815 +        case I2C_SS2_ALT:	/* single shift GL to G2 */
  1.1816 +	case I2C_SS2_ALT_7:
  1.1817 +	  gl |= I2C_SG2;
  1.1818 +	  break;
  1.1819 +        case I2C_SS3_ALT:	/* single shift GL to G3 */
  1.1820 +	case I2C_SS3_ALT_7:
  1.1821 +	  gl |= I2C_SG3;
  1.1822 +	  break;
  1.1823 +
  1.1824 +	default:		/* ordinary character */
  1.1825 +	  co = c;		/* note original character */
  1.1826 +	  if (gl & (3 << 2)) {	/* single shifted? */
  1.1827 +	    gi = g[gl >> 2];	/* get shifted character set */
  1.1828 +	    gl &= 0x3;		/* cancel shift */
  1.1829 +	  }
  1.1830 +				/* select left or right half */
  1.1831 +	  else gi = (c & BIT8) ? g[gr] : g[gl];
  1.1832 +	  c &= BITS7;		/* make 7-bit */
  1.1833 +	  switch (gi) {		/* interpret in character set */
  1.1834 +	  case I2CS_ASCII:	/* ASCII */
  1.1835 +	    break;		/* easy! */
  1.1836 +	  case I2CS_BRITISH:	/* British ASCII */
  1.1837 +				/* Pound sterling sign */
  1.1838 +	    if (c == BRITISH_POUNDSTERLING) c = UCS2_POUNDSTERLING;
  1.1839 +	    break;
  1.1840 +	  case I2CS_JIS_ROMAN:	/* JIS Roman */
  1.1841 +	  case I2CS_JIS_BUGROM:	/* old bugs */
  1.1842 +	    switch (c) {	/* two exceptions to ASCII */
  1.1843 +	    case JISROMAN_YEN:	/* Yen sign */
  1.1844 +	      c = UCS2_YEN;
  1.1845 +	      break;
  1.1846 +				/* overline */
  1.1847 +	    case JISROMAN_OVERLINE:
  1.1848 +	      c = UCS2_OVERLINE;
  1.1849 +	      break;
  1.1850 +	    }
  1.1851 +	    break;
  1.1852 +	  case I2CS_JIS_KANA:	/* JIS hankaku katakana */
  1.1853 +	    if ((c >= MIN_KANA_7) && (c < MAX_KANA_7)) c += KANA_7;
  1.1854 +	    break;
  1.1855 +
  1.1856 +	  case I2CS_ISO8859_1:	/* Latin-1 (West European) */
  1.1857 +	    c |= BIT8;		/* just turn on high bit */
  1.1858 +	    break;
  1.1859 +	  case I2CS_ISO8859_2:	/* Latin-2 (Czech, Slovak) */
  1.1860 +	    c = iso8859_2tab[c];
  1.1861 +	    break;
  1.1862 +	  case I2CS_ISO8859_3:	/* Latin-3 (Dutch, Turkish) */
  1.1863 +	    c = iso8859_3tab[c];
  1.1864 +	    break;
  1.1865 +	  case I2CS_ISO8859_4:	/* Latin-4 (Scandinavian) */
  1.1866 +	    c = iso8859_4tab[c];
  1.1867 +	    break;
  1.1868 +	  case I2CS_ISO8859_5:	/* Cyrillic */
  1.1869 +	    c = iso8859_5tab[c];
  1.1870 +	    break;
  1.1871 +	  case I2CS_ISO8859_6:	/* Arabic */
  1.1872 +	    c = iso8859_6tab[c];
  1.1873 +	    break;
  1.1874 +	  case I2CS_ISO8859_7:	/* Greek */
  1.1875 +	    c = iso8859_7tab[c];
  1.1876 +	    break;
  1.1877 +	  case I2CS_ISO8859_8:	/* Hebrew */
  1.1878 +	    c = iso8859_8tab[c];
  1.1879 +	    break;
  1.1880 +	  case I2CS_ISO8859_9:	/* Latin-5 (Finnish, Portuguese) */
  1.1881 +	    c = iso8859_9tab[c];
  1.1882 +	    break;
  1.1883 +	  case I2CS_TIS620:	/* Thai */
  1.1884 +	    c = tis620tab[c];
  1.1885 +	    break;
  1.1886 +	  case I2CS_ISO8859_10:	/* Latin-6 (Northern Europe) */
  1.1887 +	    c = iso8859_10tab[c];
  1.1888 +	    break;
  1.1889 +	  case I2CS_ISO8859_13:	/* Latin-7 (Baltic) */
  1.1890 +	    c = iso8859_13tab[c];
  1.1891 +	    break;
  1.1892 +	  case I2CS_VSCII:	/* Vietnamese */
  1.1893 +	    c = visciitab[c];
  1.1894 +	    break;
  1.1895 +	  case I2CS_ISO8859_14:	/* Latin-8 (Celtic) */
  1.1896 +	    c = iso8859_14tab[c];
  1.1897 +	    break;
  1.1898 +	  case I2CS_ISO8859_15:	/* Latin-9 (Euro) */
  1.1899 +	    c = iso8859_15tab[c];
  1.1900 +	    break;
  1.1901 +	  case I2CS_ISO8859_16:	/* Latin-10 (Baltic) */
  1.1902 +	    c = iso8859_16tab[c];
  1.1903 +	    break;
  1.1904 +
  1.1905 +	  default:		/* all other character sets */
  1.1906 +				/* multibyte character set */
  1.1907 +	    if ((gi & I2CS_MUL) && !(c & BIT8) && isgraph (c)) {
  1.1908 +	      c = (i < text->size) ? text->data[i++] : 0;
  1.1909 +	      switch (gi) {
  1.1910 +#ifdef GBTOUNICODE
  1.1911 +	      case I2CS_GB:	/* GB 2312 */
  1.1912 +		co |= BIT8;	/* make into EUC */
  1.1913 +		c |= BIT8;
  1.1914 +		c = GBTOUNICODE (co,c,ku,ten);
  1.1915 +		break;
  1.1916 +#endif
  1.1917 +#ifdef JISTOUNICODE
  1.1918 +	      case I2CS_JIS_OLD:/* JIS X 0208-1978 */
  1.1919 +	      case I2CS_JIS_NEW:/* JIS X 0208-1983 */
  1.1920 +		c = JISTOUNICODE (co,c,ku,ten);
  1.1921 +		break;
  1.1922 +#endif
  1.1923 +#ifdef JIS0212TOUNICODE
  1.1924 +	      case I2CS_JIS_EXT:/* JIS X 0212-1990 */
  1.1925 +		c = JIS0212TOUNICODE (co,c,ku,ten);
  1.1926 +		break;
  1.1927 +#endif
  1.1928 +#ifdef KSCTOUNICODE
  1.1929 +	      case I2CS_KSC:	/* KSC 5601 */
  1.1930 +		co |= BIT8;	/* make into EUC */
  1.1931 +		c |= BIT8;
  1.1932 +		c = KSCTOUNICODE (co,c,ku,ten);
  1.1933 +		break;
  1.1934 +#endif
  1.1935 +#ifdef CNS1TOUNICODE
  1.1936 +	      case I2CS_CNS1:	/* CNS 11643 plane 1 */
  1.1937 +		c = CNS1TOUNICODE (co,c,ku,ten);
  1.1938 +		break;
  1.1939 +#endif
  1.1940 +#ifdef CNS2TOUNICODE
  1.1941 +	      case I2CS_CNS2:	/* CNS 11643 plane 2 */
  1.1942 +		c = CNS2TOUNICODE (co,c,ku,ten);
  1.1943 +		break;
  1.1944 +#endif
  1.1945 +#ifdef CNS3TOUNICODE
  1.1946 +	      case I2CS_CNS3:	/* CNS 11643 plane 3 */
  1.1947 +		c = CNS3TOUNICODE (co,c,ku,ten);
  1.1948 +		break;
  1.1949 +#endif
  1.1950 +#ifdef CNS4TOUNICODE
  1.1951 +	      case I2CS_CNS4:	/* CNS 11643 plane 4 */
  1.1952 +		c = CNS4TOUNICODE (co,c,ku,ten);
  1.1953 +		break;
  1.1954 +#endif
  1.1955 +#ifdef CNS5TOUNICODE
  1.1956 +	      case I2CS_CNS5:	/* CNS 11643 plane 5 */
  1.1957 +		c = CNS5TOUNICODE (co,c,ku,ten);
  1.1958 +		break;
  1.1959 +#endif
  1.1960 +#ifdef CNS6TOUNICODE
  1.1961 +	      case I2CS_CNS6:	/* CNS 11643 plane 6 */
  1.1962 +		c = CNS6TOUNICODE (co,c,ku,ten);
  1.1963 +		break;
  1.1964 +#endif
  1.1965 +#ifdef CNS7TOUNICODE
  1.1966 +	      case I2CS_CNS7:	/* CNS 11643 plane 7 */
  1.1967 +		c = CNS7TOUNICODE (co,c,ku,ten);
  1.1968 +		break;
  1.1969 +#endif
  1.1970 +	      default:		/* unknown multibyte, treat as UCS-2 */
  1.1971 +		c |= (co << 8);	/* wrong, but nothing else to do */
  1.1972 +		break;
  1.1973 +	      }
  1.1974 +	    }
  1.1975 +	    else c = co;	/* unknown single byte, treat as 8859-1 */
  1.1976 +	  }
  1.1977 +				/* convert if second pass */
  1.1978 +	  if (pass) UTF8_WRITE_BMP (s,c,cv,de)
  1.1979 +	  else UTF8_COUNT_BMP (ret->size,c,cv,de);
  1.1980 +	}
  1.1981 +      }
  1.1982 +    }
  1.1983 +    if (!pass) (s = ret->data = (unsigned char *)
  1.1984 +		fs_get (ret->size + 1))[ret->size] = NIL;
  1.1985 +    else if (((unsigned long) (s - ret->data)) != ret->size)
  1.1986 +      fatal ("ISO-2022 to UTF-8 botch");
  1.1987 +  }
  1.1988 +}
  1.1989 +
  1.1990 +/* Convert UTF-7 sized text to UTF-8
  1.1991 + * Accepts: source sized text
  1.1992 + *	    pointer to returned sized text
  1.1993 + *	    canonicalization function
  1.1994 + */
  1.1995 +
  1.1996 +void utf8_text_utf7 (SIZEDTEXT *text,SIZEDTEXT *ret,ucs4cn_t cv,ucs4de_t de)
  1.1997 +{
  1.1998 +  unsigned long i;
  1.1999 +  unsigned char *s;
  1.2000 +  unsigned int c,c1,d,uc,pass,e,e1,state,surrh;
  1.2001 +  for (pass = 0,s = NIL,ret->size = 0; pass <= 1; pass++) {
  1.2002 +    c1 = d = uc = e = e1 = 0;
  1.2003 +    for (i = 0,state = NIL; i < text->size;) {
  1.2004 +      c = text->data[i++];	/* get next byte */
  1.2005 +      switch (state) {
  1.2006 +      case U7_PLUS:		/* previous character was + */
  1.2007 +	if (c == '-') {		/* +- means textual + */
  1.2008 +	  c = '+';
  1.2009 +	  state = U7_ASCII;	/* revert to ASCII */
  1.2010 +	  break;
  1.2011 +	}
  1.2012 +	state = U7_UNICODE;	/* enter Unicode state */
  1.2013 +	e = e1 = 0;		/* initialize Unicode quantum position */
  1.2014 +      case U7_UNICODE:		/* Unicode state */
  1.2015 +	if (c == '-') state = U7_MINUS;
  1.2016 +	else {			/* decode Unicode */
  1.2017 +	  /* don't use isupper/islower since this is ASCII only */
  1.2018 +	  if ((c >= 'A') && (c <= 'Z')) c -= 'A';
  1.2019 +	  else if ((c >= 'a') && (c <= 'z')) c -= 'a' - 26;
  1.2020 +	  else if (isdigit (c)) c -= '0' - 52;
  1.2021 +	  else if (c == '+') c = 62;
  1.2022 +	  else if (c == '/') c = 63;
  1.2023 +	  else state = U7_ASCII;/* end of modified BASE64 */
  1.2024 +	}
  1.2025 +	break;
  1.2026 +      case U7_MINUS:		/* previous character was absorbed - */
  1.2027 +	state = U7_ASCII;	/* revert to ASCII */
  1.2028 +      case U7_ASCII:		/* ASCII state */
  1.2029 +	if (c == '+') state = U7_PLUS;
  1.2030 +	break;
  1.2031 +      }
  1.2032 +
  1.2033 +      switch (state) {		/* store character if in character mode */
  1.2034 +      case U7_UNICODE:		/* Unicode */
  1.2035 +	switch (e++) {		/* install based on BASE64 state */
  1.2036 +	case 0:
  1.2037 +	  c1 = c << 2;		/* byte 1: high 6 bits */
  1.2038 +	  break;
  1.2039 +	case 1:
  1.2040 +	  d = c1 | (c >> 4);	/* byte 1: low 2 bits */
  1.2041 +	  c1 = c << 4;		/* byte 2: high 4 bits */
  1.2042 +	  break;
  1.2043 +	case 2:
  1.2044 +	  d = c1 | (c >> 2);	/* byte 2: low 4 bits */
  1.2045 +	  c1 = c << 6;		/* byte 3: high 2 bits */
  1.2046 +	  break;
  1.2047 +	case 3:
  1.2048 +	  d = c | c1;		/* byte 3: low 6 bits */
  1.2049 +	  e = 0;		/* reinitialize mechanism */
  1.2050 +	  break;
  1.2051 +	}
  1.2052 +	if (e == 1) break;	/* done if first BASE64 state */
  1.2053 +	if (!e1) {		/* first byte of UCS-2 character */
  1.2054 +	  uc = (d & 0xff) << 8;	/* note first byte */
  1.2055 +	  e1 = T;		/* enter second UCS-2 state */
  1.2056 +	  break;		/* done */
  1.2057 +	}
  1.2058 +	c = uc | (d & 0xff);	/* build UCS-2 character */
  1.2059 +	e1 = NIL;		/* back to first UCS-2 state, drop in */
  1.2060 +				/* surrogate pair?  */
  1.2061 +	if ((c >= UTF16_SURR) && (c <= UTF16_MAXSURR)) {
  1.2062 +				/* save high surrogate for later */
  1.2063 +	  if (c < UTF16_SURRL) surrh = c;
  1.2064 +	  else c = UTF16_BASE + ((surrh & UTF16_MASK) << UTF16_SHIFT) +
  1.2065 +		 (c & UTF16_MASK);
  1.2066 +	  break;		/* either way with surrogates, we're done */
  1.2067 +	}
  1.2068 +      case U7_ASCII:		/* just install if ASCII */
  1.2069 +				/* convert if second pass */
  1.2070 +	if (pass) UTF8_WRITE_BMP (s,c,cv,de)
  1.2071 +	else UTF8_COUNT_BMP (ret->size,c,cv,de);
  1.2072 +      }
  1.2073 +    }
  1.2074 +    if (!pass) (s = ret->data = (unsigned char *)
  1.2075 +		fs_get (ret->size + 1))[ret->size] = NIL;
  1.2076 +    else if (((unsigned long) (s - ret->data)) != ret->size)
  1.2077 +      fatal ("UTF-7 to UTF-8 botch");
  1.2078 +  }
  1.2079 +}
  1.2080 +
  1.2081 +
  1.2082 +/* Convert UTF-8 sized text to UTF-8
  1.2083 + * Accepts: source sized text
  1.2084 + *	    pointer to returned sized text
  1.2085 + *	    canonicalization function
  1.2086 + */
  1.2087 +
  1.2088 +void utf8_text_utf8 (SIZEDTEXT *text,SIZEDTEXT *ret,ucs4cn_t cv,ucs4de_t de)
  1.2089 +{
  1.2090 +  unsigned long i,c;
  1.2091 +  unsigned char *s,*t;
  1.2092 +  for (ret->size = 0, t = text->data, i = text->size; i;) {
  1.2093 +    if ((c = utf8_get (&t,&i)) & U8G_ERROR) {
  1.2094 +      ret->data = text->data;	/* conversion failed */
  1.2095 +      ret->size = text->size;
  1.2096 +      return;
  1.2097 +    }
  1.2098 +    UTF8_COUNT (ret->size,c,cv,de)
  1.2099 +  }
  1.2100 +  (s = ret->data = (unsigned char *) fs_get (ret->size + 1))[ret->size] =NIL;
  1.2101 +  for (t = text->data, i = text->size; i;) {
  1.2102 +    c = utf8_get (&t,&i);
  1.2103 +    UTF8_WRITE (s,c,cv,de)	/* convert UCS-4 to UTF-8 */
  1.2104 +  }
  1.2105 +  if (((unsigned long) (s - ret->data)) != ret->size)
  1.2106 +    fatal ("UTF-8 to UTF-8 botch");
  1.2107 +}
  1.2108 +
  1.2109 +/* Convert UCS-2 sized text to UTF-8
  1.2110 + * Accepts: source sized text
  1.2111 + *	    pointer to returned sized text
  1.2112 + *	    canonicalization function
  1.2113 + */
  1.2114 +
  1.2115 +void utf8_text_ucs2 (SIZEDTEXT *text,SIZEDTEXT *ret,ucs4cn_t cv,ucs4de_t de)
  1.2116 +{
  1.2117 +  unsigned long i;
  1.2118 +  unsigned char *s,*t;
  1.2119 +  unsigned int c;
  1.2120 +  for (ret->size = 0, t = text->data, i = text->size / 2; i; --i) {
  1.2121 +    c = *t++ << 8;
  1.2122 +    c |= *t++;
  1.2123 +    UTF8_COUNT_BMP (ret->size,c,cv,de);
  1.2124 +  }
  1.2125 +  (s = ret->data = (unsigned char *) fs_get (ret->size + 1))[ret->size] = NIL;
  1.2126 +  for (t = text->data, i = text->size / 2; i; --i) {
  1.2127 +    c = *t++ << 8;
  1.2128 +    c |= *t++;
  1.2129 +    UTF8_WRITE_BMP (s,c,cv,de)	/* convert UCS-2 to UTF-8 */
  1.2130 +  }
  1.2131 +  if (((unsigned long) (s - ret->data)) != ret->size)
  1.2132 +    fatal ("UCS-2 to UTF-8 botch");
  1.2133 +}
  1.2134 +
  1.2135 +
  1.2136 +/* Convert UCS-4 sized text to UTF-8
  1.2137 + * Accepts: source sized text
  1.2138 + *	    pointer to returned sized text
  1.2139 + *	    canonicalization function
  1.2140 + */
  1.2141 +
  1.2142 +void utf8_text_ucs4 (SIZEDTEXT *text,SIZEDTEXT *ret,ucs4cn_t cv,ucs4de_t de)
  1.2143 +{
  1.2144 +  unsigned long i;
  1.2145 +  unsigned char *s,*t;
  1.2146 +  unsigned long c;
  1.2147 +  for (ret->size = 0, t = text->data, i = text->size / 4; i; --i) {
  1.2148 +    c = *t++ << 24; c |= *t++ << 16; c |= *t++ << 8; c |= *t++;
  1.2149 +    UTF8_COUNT (ret->size,c,cv,de);
  1.2150 +  }
  1.2151 +  (s = ret->data = (unsigned char *) fs_get (ret->size + 1))[ret->size] = NIL;
  1.2152 +  for (t = text->data, i = text->size / 2; i; --i) {
  1.2153 +    c = *t++ << 24; c |= *t++ << 16; c |= *t++ << 8; c |= *t++;
  1.2154 +    UTF8_WRITE (s,c,cv,de)	/* convert UCS-4 to UTF-8 */
  1.2155 +  }
  1.2156 +  if (((unsigned long) (s - ret->data)) != ret->size)
  1.2157 +    fatal ("UCS-4 to UTF-8 botch");
  1.2158 +}
  1.2159 +
  1.2160 +/* Convert UTF-16 sized text to UTF-8
  1.2161 + * Accepts: source sized text
  1.2162 + *	    pointer to returned sized text
  1.2163 + *	    canonicalization function
  1.2164 + */
  1.2165 +
  1.2166 +void utf8_text_utf16 (SIZEDTEXT *text,SIZEDTEXT *ret,ucs4cn_t cv,ucs4de_t de)
  1.2167 +{
  1.2168 +  unsigned long i;
  1.2169 +  unsigned char *s,*t;
  1.2170 +  unsigned long c,d;
  1.2171 +  for (ret->size = 0, t = text->data, i = text->size / 2; i; --i) {
  1.2172 +    c = *t++ << 8;
  1.2173 +    c |= *t++;
  1.2174 +				/* possible surrogate? */
  1.2175 +    if ((c >= UTF16_SURR) && (c <= UTF16_MAXSURR)) {
  1.2176 +				/* invalid first surrogate */
  1.2177 +      if ((c > UTF16_SURRHEND) || !i) c = UBOGON;
  1.2178 +      else {			/* get second surrogate */
  1.2179 +	d = *t++ << 8;
  1.2180 +	d |= *t++;
  1.2181 +	--i;			/* swallowed another 16-bits */
  1.2182 +				/* invalid second surrogate */
  1.2183 +	if ((d < UTF16_SURRL) || (d > UTF16_SURRLEND)) c = UBOGON;
  1.2184 +	else c = UTF16_BASE + ((c & UTF16_MASK) << UTF16_SHIFT) +
  1.2185 +	       (d & UTF16_MASK);
  1.2186 +      }
  1.2187 +    }
  1.2188 +    UTF8_COUNT (ret->size,c,cv,de);
  1.2189 +  }
  1.2190 +  (s = ret->data = (unsigned char *) fs_get (ret->size + 1))[ret->size] = NIL;
  1.2191 +  for (t = text->data, i = text->size / 2; i; --i) {
  1.2192 +    c = *t++ << 8;
  1.2193 +    c |= *t++;
  1.2194 +				/* possible surrogate? */
  1.2195 +    if ((c >= UTF16_SURR) && (c <= UTF16_MAXSURR)) {
  1.2196 +				/* invalid first surrogate */
  1.2197 +      if ((c > UTF16_SURRHEND) || !i) c = UBOGON;
  1.2198 +      else {			/* get second surrogate */
  1.2199 +	d = *t++ << 8;
  1.2200 +	d |= *t++;
  1.2201 +	--i;			/* swallowed another 16-bits */
  1.2202 +				/* invalid second surrogate */
  1.2203 +	if ((d < UTF16_SURRL) || (d > UTF16_SURRLEND)) c = UBOGON;
  1.2204 +	else c = UTF16_BASE + ((c & UTF16_MASK) << UTF16_SHIFT) +
  1.2205 +	       (d & UTF16_MASK);
  1.2206 +      }
  1.2207 +    }
  1.2208 +    UTF8_WRITE (s,c,cv,de)	/* convert UCS-4 to UTF-8 */
  1.2209 +  }
  1.2210 +  if (((unsigned long) (s - ret->data)) != ret->size)
  1.2211 +    fatal ("UTF-16 to UTF-8 botch");
  1.2212 +}
  1.2213 +
  1.2214 +/* Size of UCS-4 character, possibly not in BMP, as UTF-8 octets
  1.2215 + * Accepts: character
  1.2216 + * Returns: size (0 means bogon)
  1.2217 + *
  1.2218 + * Use UTF8_SIZE macro if known to be in the BMP
  1.2219 + */
  1.2220 +
  1.2221 +unsigned long utf8_size (unsigned long c)
  1.2222 +{
  1.2223 +  if (c < 0x80) return 1;
  1.2224 +  else if (c < 0x800) return 2;
  1.2225 +  else if (c < 0x10000) return 3;
  1.2226 +  else if (c < 0x200000) return 4;
  1.2227 +  else if (c < 0x4000000) return 5;
  1.2228 +  else if (c < 0x80000000) return 6;
  1.2229 +  return 0;
  1.2230 +}
  1.2231 +
  1.2232 +
  1.2233 +/* Put UCS-4 character, possibly not in BMP, as UTF-8 octets
  1.2234 + * Accepts: destination string pointer
  1.2235 + *	    character
  1.2236 + * Returns: updated destination pointer
  1.2237 + *
  1.2238 + * Use UTF8_PUT_BMP macro if known to be in the BMP
  1.2239 + */
  1.2240 +
  1.2241 +unsigned char *utf8_put (unsigned char *s,unsigned long c)
  1.2242 +{
  1.2243 +  unsigned char mark[6] = {0x00,0xc0,0xe0,0xf0,0xf8,0xfc};
  1.2244 +  unsigned long size = utf8_size (c);
  1.2245 +  switch (size) {
  1.2246 +  case 6:
  1.2247 +    s[5] = 0x80 | (unsigned char) (c & 0x3f);
  1.2248 +    c >>= 6;
  1.2249 +  case 5:
  1.2250 +    s[4] = 0x80 | (unsigned char) (c & 0x3f);
  1.2251 +    c >>= 6;
  1.2252 +  case 4:
  1.2253 +    s[3] = 0x80 | (unsigned char) (c & 0x3f);
  1.2254 +    c >>= 6;
  1.2255 +  case 3:
  1.2256 +    s[2] = 0x80 | (unsigned char) (c & 0x3f);
  1.2257 +    c >>= 6;
  1.2258 +  case 2:
  1.2259 +    s[1] = 0x80 | (unsigned char) (c & 0x3f);
  1.2260 +    c >>= 6;
  1.2261 +  case 1:
  1.2262 +    *s = mark[size-1] | (unsigned char) (c & 0x7f);
  1.2263 +    break;
  1.2264 +  }
  1.2265 +  return s + size;
  1.2266 +}
  1.2267 +
  1.2268 +/* Return title case of a fixed-width UCS-4 character
  1.2269 + * Accepts: character
  1.2270 + * Returns: title case of character
  1.2271 + */
  1.2272 +
  1.2273 +unsigned long ucs4_titlecase (unsigned long c)
  1.2274 +{
  1.2275 +  if (c <= UCS4_TMAPMAX) return ucs4_tmaptab[c];
  1.2276 +  if (c < UCS4_TMAPHIMIN) return c;
  1.2277 +  if (c <= UCS4_TMAPHIMAX) return c - UCS4_TMAPHIMAP;
  1.2278 +  if (c < UCS4_TMAPDESERETMIN) return c;
  1.2279 +  if (c <= UCS4_TMAPDESERETMAX) return c - UCS4_TMAPDESERETMAP;
  1.2280 +  return c;
  1.2281 +}
  1.2282 +
  1.2283 +
  1.2284 +/* Return width of a fixed-width UCS-4 character in planes 0-2
  1.2285 + * Accepts: character
  1.2286 + * Returns: width (0, 1, 2) or negative error condition if not valid
  1.2287 + */
  1.2288 +
  1.2289 +long ucs4_width (unsigned long c)
  1.2290 +{
  1.2291 +  long ret;
  1.2292 +				/* out of range, not-a-char, or surrogates */
  1.2293 +  if ((c > UCS4_MAXUNICODE) || ((c & 0xfffe) == 0xfffe) ||
  1.2294 +      ((c >= UTF16_SURR) && (c <= UTF16_MAXSURR))) ret = U4W_NOTUNCD;
  1.2295 +				/* private-use */
  1.2296 +  else if (c >= UCS4_PVTBASE) ret = U4W_PRIVATE;
  1.2297 +				/* SSP are not printing characters */
  1.2298 +  else if (c >= UCS4_SSPBASE) ret = U4W_SSPCHAR;
  1.2299 +				/* unassigned planes */
  1.2300 +  else if (c >= UCS4_UNABASE) ret = U4W_UNASSGN;
  1.2301 +				/* SIP and reserved plane 3 are wide */
  1.2302 +  else if (c >= UCS4_SIPBASE) ret = 2;
  1.2303 +#if (UCS4_WIDLEN != UCS4_SIPBASE)
  1.2304 +#error "UCS4_WIDLEN != UCS4_SIPBASE"
  1.2305 +#endif
  1.2306 +				/* C0/C1 controls */
  1.2307 +  else if ((c <= UCS2_C0CONTROLEND) ||
  1.2308 +	   ((c >= UCS2_C1CONTROL) && (c <= UCS2_C1CONTROLEND)))
  1.2309 +    ret = U4W_CONTROL;
  1.2310 +				/* BMP and SMP get value from table */
  1.2311 +  else switch (ret = (ucs4_widthtab[(c >> 2)] >> ((3 - (c & 0x3)) << 1)) &0x3){
  1.2312 +  case 0:			/* zero-width */
  1.2313 +    if (c == 0x00ad) ret = 1;	/* force U+00ad (SOFT HYPHEN) to width 1 */
  1.2314 +  case 1:			/* single-width */
  1.2315 +  case 2:			/* double-width */
  1.2316 +    break;
  1.2317 +  case 3:			/* ambiguous width */
  1.2318 +    ret = (c >= 0x2100) ? 2 : 1;/* need to do something better than this */
  1.2319 +    break;
  1.2320 +  }
  1.2321 +  return ret;
  1.2322 +}
  1.2323 +
  1.2324 +/* Return screen width of UTF-8 string
  1.2325 + * Accepts: string
  1.2326 + * Returns: width or negative if not valid UTF-8
  1.2327 + */
  1.2328 +
  1.2329 +long utf8_strwidth (unsigned char *s)
  1.2330 +{
  1.2331 +  unsigned long c,i,ret;
  1.2332 +				/* go through string */
  1.2333 +  for (ret = 0; *s; ret += ucs4_width (c)) {
  1.2334 +    /* It's alright to give a fake value for the byte count to utf8_get()
  1.2335 +     * since the null of a null-terminated string will stop processing anyway.
  1.2336 +     */
  1.2337 +    i = 6;			/* fake value */
  1.2338 +    if ((c = utf8_get (&s,&i)) & U8G_ERROR) return -1;
  1.2339 +  }
  1.2340 +  return ret;
  1.2341 +}
  1.2342 +
  1.2343 +
  1.2344 +/* Return screen width of UTF-8 text
  1.2345 + * Accepts: SIZEDTEXT to string
  1.2346 + * Returns: width or negative if not valid UTF-8
  1.2347 + */
  1.2348 +
  1.2349 +long utf8_textwidth (SIZEDTEXT *utf8)
  1.2350 +{
  1.2351 +  unsigned long c;
  1.2352 +  unsigned char *s = utf8->data;
  1.2353 +  unsigned long i = utf8->size;
  1.2354 +  unsigned long ret = 0;
  1.2355 +  while (i) {			/* while there's a string to process */
  1.2356 +    if ((c = utf8_get (&s,&i)) & U8G_ERROR) return -1;
  1.2357 +    ret += ucs4_width (c);
  1.2358 +  }
  1.2359 +  return ret;
  1.2360 +}
  1.2361 +
/* Decomposition (phew!) */

/* Values for the type member of struct decomposemore */

#define MORESINGLE 1		/* single UCS-4 tail value */
#define MOREMULTIPLE 2		/* multiple UCS-2 tail values */

/* Continuation state handed back through the "more" argument of
 * ucs4_decompose() when a character decomposes into more than one value.
 * It holds the values that remain after the first one has been returned.
 */

struct decomposemore {
  short type;			/* type of more */
  union {
    unsigned long single;	/* single decomposed value */
    struct {			/* multiple BMP values */
      unsigned short *next;	/* next value to return */
      unsigned long count;	/* number of values left to return */
    } multiple;
  } data;
};
  1.2377 +
/* List node used by ucs4_decompose_recursive() as its "more" state.  Each
 * node wraps one pending struct decomposemore; the head of the list is the
 * node that is chased first.
 */

#define RECURSIVEMORE struct recursivemore

RECURSIVEMORE {
  struct decomposemore *more;	/* pending decomposition for this level */
  RECURSIVEMORE *next;		/* next node to chase after this one */
};
  1.2384 +
  1.2385 +
/* Return decomposition of a UCS-4 character
 * Accepts: character or U8G_ERROR to return next from "more"
 *	    pointer to returned more
 * Returns: [next] decomposed value, more set if still more decomposition
 *
 * This performs a single level of decomposition.  When called with a
 * character, *more is first set to NIL and then, if the decomposition has
 * more than one value, to a struct decomposemore obtained from fs_get().
 * The caller fetches the remaining values by calling again with U8G_ERROR
 * and the same more pointer; the block is released with fs_give() once its
 * last value has been returned (fs_give() presumably also clears *more -
 * confirm).  Characters without a decomposition are returned unchanged.
 */

unsigned long ucs4_decompose (unsigned long c,void **more)
{
  unsigned long i,ix,ret;
  struct decomposemore *m;
  if (c & U8G_ERROR) {		/* want to chase more? */
				/* do sanity check */
    if (m = (struct decomposemore *) *more) switch (m->type) {
    case MORESINGLE:		/* single value */
      ret = m->data.single;
      fs_give (more);		/* no more decomposition */
      break;
    case MOREMULTIPLE:		/* multiple value */
      ret = *m->data.multiple.next++;
				/* release the block after its last value */
      if (!--m->data.multiple.count) fs_give (more);
      break;
    default:			/* uh-oh */
      fatal ("invalid more block argument to ucs4_decompose!");
    }
    else fatal ("no more block provided to ucs4_decompose!");
  }

  else {			/* start decomposition */
    *more = NIL;		/* initially set no more */
				/* The chain below walks the table ranges in
				 * ascending order; a value that falls in a gap
				 * between two ranges is returned unchanged.
				 */
				/* BMP low decompositions */
    if (c < UCS4_BMPLOMIN) ret = c;
				/* fix this someday */
				/* (special case: an index entry of zero means
				 * "no decomposition" below, so table slot 0
				 * cannot be reached through the index table)
				 */
    else if (c == UCS4_BMPLOMIN) ret = ucs4_dbmplotab[0];
    else if (c <= UCS4_BMPLOMAX) {
				/* within range - have a decomposition? */
      if (i = ucs4_dbmploixtab[c - UCS4_BMPLOMIN]) {
				/* get first value of decomposition */
	ret = ucs4_dbmplotab[ix = i & UCS4_BMPLOIXMASK];
				/* has continuation? */
	if (i & UCS4_BMPLOSIZEMASK) {
				/* remaining values follow the first one in
				 * the table; the size field is their count
				 */
	  m = (struct decomposemore *)
	    (*more = memset (fs_get (sizeof (struct decomposemore)),0,
			    sizeof (struct decomposemore)));
	  m->type = MOREMULTIPLE;
	  m->data.multiple.next = &ucs4_dbmplotab[++ix];
	  m->data.multiple.count = i >> UCS4_BMPLOSIZESHIFT;
	}
      }
      else ret = c;		/* in range but doesn't decompose */
    }
				/* BMP CJK compatibility */
    else if (c < UCS4_BMPCJKMIN) ret = c;
    else if (c <= UCS4_BMPCJKMAX) {
				/* zero table entry means no decomposition */
      if (!(ret = ucs4_bmpcjk1decomptab[c - UCS4_BMPCJKMIN])) ret = c;
    }
				/* BMP CJK compatibility - some not in BMP */
				/* the gap test is compiled only if the two CJK
				 * ranges are not adjacent
				 */
#if UCS4_BMPCJK2MIN - (UCS4_BMPCJKMAX + 1)
    else if (c < UCS4_BMPCJK2MIN) ret = c;
#endif
				/* NOTE(review): unlike the other tables, a zero
				 * entry is not mapped back to c here - presumably
				 * every entry is non-zero; confirm
				 */
    else if (c <= UCS4_BMPCJK2MAX)
      ret = ucs4_bmpcjk2decomptab[c - UCS4_BMPCJK2MIN];
				/* BMP high decompositions */
    else if (c < UCS4_BMPHIMIN) ret = c;
    else if (c <= UCS4_BMPHIMAX) {
				/* within range - have a decomposition? */
      if (i = ucs4_dbmphiixtab[c - UCS4_BMPHIMIN]) {
				/* get first value of decomposition */
	ret = ucs4_dbmphitab[ix = i & UCS4_BMPHIIXMASK];
				/* has continuation? */
	if (i & UCS4_BMPHISIZEMASK) {
	  m = (struct decomposemore *)
	    (*more = memset (fs_get (sizeof (struct decomposemore)),0,
			    sizeof (struct decomposemore)));
	  m->type = MOREMULTIPLE;
	  m->data.multiple.next = &ucs4_dbmphitab[++ix];
	  m->data.multiple.count = i >> UCS4_BMPHISIZESHIFT;
	}
      }
      else ret = c;		/* in range but doesn't decompose */
    }

				/* BMP half and full width forms */
    else if (c < UCS4_BMPHALFFULLMIN) ret = c;
    else if (c <= UCS4_BMPHALFFULLMAX) {
      if (!(ret = ucs4_bmphalffulldecomptab[c - UCS4_BMPHALFFULLMIN])) ret = c;
    }
				/* SMP music */
				/* every entry is a pair: [0] is returned now,
				 * [1] is queued as a single tail value
				 */
    else if (c < UCS4_SMPMUSIC1MIN) ret = c;
    else if (c <= UCS4_SMPMUSIC1MAX) {
				/* note: c becomes the table index from here */
      ret = ucs4_smpmusic1decomptab[c -= UCS4_SMPMUSIC1MIN][0];
      m = (struct decomposemore *)
	(*more = memset (fs_get (sizeof (struct decomposemore)),0,
			 sizeof (struct decomposemore)));
      m->type = MORESINGLE;
      m->data.single = ucs4_smpmusic1decomptab[c][1];
    }
    else if (c < UCS4_SMPMUSIC2MIN) ret = c;
    else if (c <= UCS4_SMPMUSIC2MAX) {
				/* note: c becomes the table index from here */
      ret = ucs4_smpmusic2decomptab[c -= UCS4_SMPMUSIC2MIN][0];
      m = (struct decomposemore *)
	(*more = memset (fs_get (sizeof (struct decomposemore)),0,
			 sizeof (struct decomposemore)));
      m->type = MORESINGLE;
      m->data.single = ucs4_smpmusic2decomptab[c][1];
    }
				/* SMP mathematical forms */
    else if (c < UCS4_SMPMATHMIN) ret = c;
    else if (c <= UCS4_SMPMATHMAX) {
      if (!(ret = ucs4_smpmathdecomptab[c - UCS4_SMPMATHMIN])) ret = c;
    }
				/* CJK compatibility ideographs in SIP */
				/* last arm: table value if in range and
				 * non-zero, otherwise the character itself
				 */
    else if (!(ret = ((c >= UCS4_SIPMIN) && (c <= UCS4_SIPMAX)) ?
	       ucs4_sipdecomptab[c - UCS4_SIPMIN] : c)) ret = c;
  }
  return ret;
}
  1.2502 +
  1.2503 +/* Return recursive decomposition of a UCS-4 character
  1.2504 + * Accepts: character or U8G_ERROR to return next from "more"
  1.2505 + *	    pointer to returned more
  1.2506 + * Returns: [next] decomposed value, more set if still more decomposition
  1.2507 + */
  1.2508 +
  1.2509 +unsigned long ucs4_decompose_recursive (unsigned long c,void **more)
  1.2510 +{
  1.2511 +  unsigned long c1;
  1.2512 +  void *m,*mn;
  1.2513 +  RECURSIVEMORE *mr;
  1.2514 +  if (c & U8G_ERROR) {		/* want to chase more? */
  1.2515 +    mn = NIL;
  1.2516 +    if (mr = (RECURSIVEMORE *) *more) switch (mr->more->type) {
  1.2517 +    case MORESINGLE:		/* decompose single value */
  1.2518 +      c = ucs4_decompose_recursive (mr->more->data.single,&mn);
  1.2519 +      *more = mr->next;		/* done with this more, remove it */
  1.2520 +      fs_give ((void **) &mr->more);
  1.2521 +      fs_give ((void **) &mr);
  1.2522 +      break;
  1.2523 +    case MOREMULTIPLE:		/* decompose current value in multiple */
  1.2524 +      c = ucs4_decompose_recursive (*mr->more->data.multiple.next++,&mn);
  1.2525 +				/* if done with this multiple decomposition */
  1.2526 +      if (!--mr->more->data.multiple.count) {
  1.2527 +	*more = mr->next;	/* done with this more, remove it */
  1.2528 +	fs_give ((void **) &mr->more);
  1.2529 +	fs_give ((void **) &mr);
  1.2530 +      }
  1.2531 +      break;
  1.2532 +    default:			/* uh-oh */
  1.2533 +      fatal ("invalid more block argument to ucs4_decompose_recursive!");
  1.2534 +    }
  1.2535 +    else fatal ("no more block provided to ucs4_decompose_recursive!");
  1.2536 +    if (mr = mn) {		/* did this value recurse on us? */
  1.2537 +      mr->next = *more;		/* yes, insert new more at head */
  1.2538 +      *more = mr;
  1.2539 +    }
  1.2540 +  }
  1.2541 +  else {			/* start decomposition */
  1.2542 +    *more = NIL;		/* initially set no more */
  1.2543 +    mr = NIL;
  1.2544 +    do {			/* repeatedly decompose this codepoint */
  1.2545 +      c = ucs4_decompose (c1 = c,&m);
  1.2546 +      if (m) {			/* multi-byte decomposition */
  1.2547 +	if (c1 == c) fatal ("endless multiple decomposition!");
  1.2548 +				/* create a block to stash this more */
  1.2549 +	mr = memset (fs_get (sizeof (RECURSIVEMORE)),0,sizeof (RECURSIVEMORE));
  1.2550 +	mr->more = m;		/* note the expansion */
  1.2551 +	mr->next = *more;	/* old list is the tail */
  1.2552 +	*more = mr;		/* and this is the new head */
  1.2553 +      }
  1.2554 +    } while (c1 != c);		/* until nothing more to decompose */
  1.2555 +  }
  1.2556 +  return c;
  1.2557 +}

UW-IMAP'd extensions by yuuji