imapext-2007
diff src/c-client/utf8.h @ 0:ada5e610ab86
imap-2007e
author | yuuji@gentei.org |
---|---|
date | Mon, 14 Sep 2009 15:17:45 +0900 |
parents | |
children |
line diff
/* ========================================================================
 * Copyright 1988-2008 University of Washington
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 *
 * ========================================================================
 */

/*
 * Program:     UTF-8 routines
 *
 * Author:      Mark Crispin
 *              Networks and Distributed Computing
 *              Computing & Communications
 *              University of Washington
 *              Administration Building, AG-44
 *              Seattle, WA  98195
 *              Internet: MRC@CAC.Washington.EDU
 *
 * Date:        11 June 1997
 * Last Edited: 17 January 2008
 */

/* UTF-8 size and conversion routines from UCS-2 values (thus in the BMP).
 * Don't use these if UTF-16 data (surrogate pairs) are an issue.
 * For UCS-4 values, use the utf8_size() and utf8_put() functions.
 *
 * Both macros evaluate their `c' argument more than once, so do not pass
 * an expression with side effects.  Arguments are fully parenthesized so
 * that an expression such as (x | y) is tested as a whole.
 */

/* Number of UTF-8 octets (1, 2 or 3) needed for BMP codepoint c */
#define UTF8_SIZE_BMP(c) (((c) & 0xff80) ? (((c) & 0xf800) ? 3 : 2) : 1)

/* Store BMP codepoint c as UTF-8 at *b, advancing b past the octets written.
 * b must be a modifiable lvalue of pointer-to-octet type.
 *
 * This deliberately expands to a bare { } block rather than the usual
 * do { } while (0), so that any existing use which omits the trailing
 * semicolon (for example as the body of another statement) keeps compiling.
 */
#define UTF8_PUT_BMP(b,c) {                             \
  if ((c) & 0xff80) {           /* non-ASCII? */        \
    if ((c) & 0xf800) {         /* three byte code */   \
      *(b)++ = 0xe0 | ((c) >> 12);                      \
      *(b)++ = 0x80 | (((c) >> 6) & 0x3f);              \
    }                                                   \
    else *(b)++ = 0xc0 | (((c) >> 6) & 0x3f);           \
    *(b)++ = 0x80 | ((c) & 0x3f);                       \
  }                                                     \
  else *(b)++ = (c);                                    \
}

/* utf8_text() flag values */

#define U8T_CASECANON 2         /* canonicalize case */
#define U8T_DECOMPOSE 4         /* decompose */
                                /* full canonicalization */
#define U8T_CANONICAL (U8T_CASECANON | U8T_DECOMPOSE)


/* utf8_get() return values
 *
 * The error codes are parenthesized so that they behave as single values
 * inside larger expressions (e.g. subtraction or multiplication).
 */

                                /* 0x0000 - 0xffff BMP plane */
#define U8GM_NONBMP 0xffff0000  /* mask for non-BMP values */
                                /* 0x10000 - 0x10ffff extended planes */
                                /* 0x110000 - 0x7ffffff non-Unicode */
#define U8G_ERROR 0x80000000    /* error flag */
#define U8G_BADCONT (U8G_ERROR+1) /* continuation when not in progress */
#define U8G_INCMPLT (U8G_ERROR+2) /* incomplete UTF-8 character */
#define U8G_NOTUTF8 (U8G_ERROR+3) /* not a valid UTF-8 octet */
#define U8G_ENDSTRG (U8G_ERROR+4) /* end of string */
#define U8G_ENDSTRI (U8G_ERROR+5) /* end of string w/ incomplete UTF-8 char */
#define U8G_SURROGA (U8G_ERROR+6) /* surrogate codepoint */
#define U8G_NOTUNIC (U8G_ERROR+7) /* non-Unicode codepoint */


/* ucs4_width() return values (parenthesized for the same reason as above) */

#define U4W_ERROR 0x80000000    /* error flags */
#define U4W_NOTUNCD (U4W_ERROR+1) /* not a Unicode char */
#define U4W_PRIVATE (U4W_ERROR+2) /* private-space plane */
#define U4W_SSPCHAR (U4W_ERROR+3) /* Supplementary Special-purpose Plane */
#define U4W_UNASSGN (U4W_ERROR+4) /* unassigned space plane */
#define U4W_CONTROL (U4W_ERROR+5) /* C0/C1 control */
#define U4W_CTLSRGT U4W_CONTROL /* in case legacy code references this */

/* ISO-2022 engine states */

#define I2S_CHAR 0              /* character */
#define I2S_ESC 1               /* previous character was ESC */
#define I2S_MUL 2               /* previous character was multi-byte code */
#define I2S_INT 3               /* previous character was intermediate */


/* ISO-2022 Gn selections */

#define I2C_G0 0                /* G0 */
#define I2C_G1 1                /* G1 */
#define I2C_G2 2                /* G2 */
#define I2C_G3 3                /* G3 */
#define I2C_SG2 (2 << 2)        /* single shift G2 */
#define I2C_SG3 (3 << 2)        /* single shift G3 */


/* ISO-2022 octet definitions */

#define I2C_ESC 0x1b            /* ESCape */

        /* Intermediate character */
#define I2C_STRUCTURE 0x20      /* announce code structure */
#define I2C_C0 0x21             /* C0 */
#define I2C_C1 0x22             /* C1 */
#define I2C_CONTROL 0x23        /* single control function */
#define I2C_MULTI 0x24          /* multi-byte character set */
#define I2C_OTHER 0x25          /* other coding system */
#define I2C_REVISED 0x26        /* revised registration */
#define I2C_G0_94 0x28          /* G0 94-character set */
#define I2C_G1_94 0x29          /* G1 94-character set */
#define I2C_G2_94 0x2A          /* G2 94-character set */
#define I2C_G3_94 0x2B          /* G3 94-character set */
#define I2C_G0_96 0x2C          /* (not in ISO-2022) G0 96-character set */
#define I2C_G1_96 0x2D          /* G1 96-character set */
#define I2C_G2_96 0x2E          /* G2 96-character set */
#define I2C_G3_96 0x2F          /* G3 96-character set */

        /* Locking shifts */
#define I2C_SI 0x0f             /* lock shift to G0 (Shift In) */
#define I2C_SO 0x0e             /* lock shift to G1 (Shift Out) */
        /* prefixed by ESC */
#define I2C_LS2 0x6e            /* lock shift to G2 */
#define I2C_LS3 0x6f            /* lock shift to G3 */
#define I2C_LS1R 0x7e           /* lock shift GR to G1 */
#define I2C_LS2R 0x7d           /* lock shift GR to G2 */
#define I2C_LS3R 0x7c           /* lock shift GR to G3 */

        /* Single shifts */
#define I2C_SS2_ALT 0x8e        /* single shift to G2 (SS2) */
#define I2C_SS3_ALT 0x8f        /* single shift to G3 (SS3) */
#define I2C_SS2_ALT_7 0x19      /* single shift to G2 (SS2) */
#define I2C_SS3_ALT_7 0x1d      /* single shift to G3 (SS3) */
        /* prefixed by ESC */
#define I2C_SS2 0x4e            /* single shift to G2 (SS2) */
#define I2C_SS3 0x4f            /* single shift to G3 (SS3) */

/* 94 character sets */

                                /* 4/0 ISO 646 IRV */
#define I2CS_94_BRITISH 0x41    /* 4/1 ISO 646 British */
#define I2CS_94_ASCII 0x42      /* 4/2 ISO 646 USA (ASCII) */
                                /* 4/3 NATS Finland/Sweden (primary) */
                                /* 4/4 NATS Finland/Sweden (secondary) */
                                /* 4/5 NATS Denmark/Norway (primary) */
                                /* 4/6 NATS Denmark/Norway (secondary) */
                                /* 4/7 ISO 646 Swedish SEN 850200 */
                                /* 4/8 ISO 646 Swedish names */
#define I2CS_94_JIS_BUGROM 0x48 /* 4/8 some buggy software does this */
#define I2CS_94_JIS_KANA 0x49   /* 4/9 JIS X 0201-1976 right half */
#define I2CS_94_JIS_ROMAN 0x4a  /* 4/a JIS X 0201-1976 left half */
                                /* 4/b ISO 646 German */
                                /* 4/c ISO 646 Portuguese (Olivetti) */
                                /* 4/d ISO 6438 African */
                                /* 4/e ISO 5427 Cyrillic (Honeywell-Bull) */
                                /* 4/f DIN 31624 extended bibliography  */
                                /* 5/0 ISO 5426-1980 Bibliography */
                                /* 5/1 ISO 5427-1981 Cyrillic*/
                                /* 5/2 ISO 646 French (withdrawn) */
                                /* 5/3 ISO 5428-1980 Greek bibliography */
                                /* 5/4 GB 1988-80 Chinese */
                                /* 5/5 Latin-Greek (Honeywell-Bull) */
                                /* 5/6 UK Viewdata/Teletext */
                                /* 5/7 INIS (IRV subset) */
                                /* 5/8 ISO 5428 Greek Bibliography */
                                /* 5/9 ISO 646 Italian (Olivetti) */
                                /* 5/a ISO 646 Spanish (Olivetti) */
                                /* 5/b Greek (Olivetti) */
                                /* 5/c Latin-Greek (Olivetti) */
                                /* 5/d INIS non-standard extension */
                                /* 5/e INIS Cyrillic extension */
                                /* 5/f Arabic CODAR-U IERA */
                                /* 6/0 ISO 646 Norwegian */
                                /* 6/1 Norwegian version 2 (withdrawn) */
                                /* 6/2 Videotex supplementary */
                                /* 6/3 Videotex supplementary #2 */
                                /* 6/4 Videotex supplementary #3 */
                                /* 6/5 APL */
                                /* 6/6 ISO 646 French */
                                /* 6/7 ISO 646 Portuguese (IBM) */
                                /* 6/8 ISO 646 Spanish (IBM) */
                                /* 6/9 ISO 646 Hungarian */
                                /* 6/a Greek ELOT (withdrawn) */
                                /* 6/b ISO 9036 Arabic 7-bit */
                                /* 6/c ISO 646 IRV supplementary set */
                                /* 6/d JIS C6229-1984 OCR-A */
                                /* 6/e JIS C6229-1984 OCR-B */
                                /* 6/f JIS C6229-1984 OCR-B additional */
                                /* 7/0 JIS C6229-1984 hand-printed */
                                /* 7/1 JIS C6229-1984 additional hand-printed */
                                /* 7/2 JIS C6229-1984 katakana hand-printed */
                                /* 7/3 E13B Japanese graphic */
                                /* 7/4 Supplementary Videotex (withdrawn) */
                                /* 7/5 Teletex primary CCITT T.61 */
                                /* 7/6 Teletex secondary CCITT T.61 */
                                /* 7/7 CSA Z 243.4-1985 Alternate primary #1 */
                                /* 7/8 CSA Z 243.4-1985 Alternate primary #2 */
                                /* 7/9 Mosaic CCITT T.101 */
                                /* 7/a Serbocroatian/Slovenian Latin */
                                /* 7/b Serbocroatian Cyrillic */
                                /* 7/c Supplementary CCITT T.101 */
                                /* 7/d Macedonian Cyrillic */

/* 94 character sets - second intermediate byte */

                                /* 4/0 Greek primary CCITT */
                                /* 4/1 Cuba */
                                /* 4/2 ISO/IEC 646 invariant */
                                /* 4/3 Irish Gaelic 7-bit */
                                /* 4/4 Turkmen */


/* 94x94 character sets */

#define I2CS_94x94_JIS_OLD 0x40 /* 4/0 JIS X 0208-1978 */
#define I2CS_94x94_GB 0x41      /* 4/1 GB 2312 */
#define I2CS_94x94_JIS_NEW 0x42 /* 4/2 JIS X 0208-1983 */
#define I2CS_94x94_KSC 0x43     /* 4/3 KSC 5601 */
#define I2CS_94x94_JIS_EXT 0x44 /* 4/4 JIS X 0212-1990 */
                                /* 4/5 CCITT Chinese */
                                /* 4/6 Blisssymbol Graphic */
#define I2CS_94x94_CNS1 0x47    /* 4/7 CNS 11643 plane 1 */
#define I2CS_94x94_CNS2 0x48    /* 4/8 CNS 11643 plane 2 */
#define I2CS_94x94_CNS3 0x49    /* 4/9 CNS 11643 plane 3 */
#define I2CS_94x94_CNS4 0x4a    /* 4/a CNS 11643 plane 4 */
#define I2CS_94x94_CNS5 0x4b    /* 4/b CNS 11643 plane 5 */
#define I2CS_94x94_CNS6 0x4c    /* 4/c CNS 11643 plane 6 */
#define I2CS_94x94_CNS7 0x4d    /* 4/d CNS 11643 plane 7 */
                                /* 4/e DPRK (North Korea) KGCII */
                                /* 4/f JGCII plane 1 */
                                /* 5/0 JGCII plane 2 */

/* 96 character sets */

#define I2CS_96_ISO8859_1 0x41  /* 4/1 Latin-1 (Western Europe) */
#define I2CS_96_ISO8859_2 0x42  /* 4/2 Latin-2 (Czech, Slovak) */
#define I2CS_96_ISO8859_3 0x43  /* 4/3 Latin-3 (Dutch, Turkish) */
#define I2CS_96_ISO8859_4 0x44  /* 4/4 Latin-4 (Scandinavian) */
                                /* 4/5 CSA Z 243.4-1985 */
#define I2CS_96_ISO8859_7 0x46  /* 4/6 Greek */
#define I2CS_96_ISO8859_6 0x47  /* 4/7 Arabic */
#define I2CS_96_ISO8859_8 0x48  /* 4/8 Hebrew */
                                /* 4/9 Czechoslovak CSN 369103 */
                                /* 4/a Supplementary Latin and non-alpha */
                                /* 4/b Technical */
#define I2CS_96_ISO8859_5 0x4c  /* 4/c Cyrillic */
#define I2CS_96_ISO8859_9 0x4d  /* 4/d Latin-5 (Finnish, Portuguese) */
                                /* 4/e ISO 6937-2 residual */
                                /* 4/f Basic Cyrillic */
                                /* 5/0 Supplementary Latin 1, 2 and 5 */
                                /* 5/1 Basic Box */
                                /* 5/2 Supplementary ISO/IEC 6937 : 1992 */
                                /* 5/3 CCITT Hebrew supplementary */
#define I2CS_96_TIS620 0x54     /* 5/4 TIS 620 */
                                /* 5/5 Arabic/French/German */
#define I2CS_96_ISO8859_10 0x56 /* 5/6 Latin-6 (Northern Europe) */
                                /* 5/7 ??? */
                                /* 5/8 Sami (Lappish) supplementary */
#define I2CS_96_ISO8859_13 0x59 /* 5/9 Latin-7 (Baltic) */
#define I2CS_96_VSCII 0x5a      /* 5/a Vietnamese */
                                /* 5/b Technical #1 IEC 1289 */
#define I2CS_96_ISO8859_14 0x5c /* 5/c Latin-8 (Celtic) */
                                /* 5/d Sami supplementary Latin */
                                /* 5/e Latin/Hebrew */
                                /* 5/f Celtic supplementary Latin */
                                /* 6/0 Uralic supplementary Cyrillic */
                                /* 6/1 Volgaic supplementary Cyrillic */
#define I2CS_96_ISO8859_15 0x62 /* 6/2 Latin-9 (Euro) */
                                /* 6/3 Latin-1 with Euro */
                                /* 6/4 Latin-4 with Euro */
                                /* 6/5 Latin-7 with Euro */
#define I2CS_96_ISO8859_16 0x66 /* 6/6 Latin-10 (Balkan) */
                                /* 6/7 Ogham */
                                /* 6/8 Sami supplementary Latin #2 */
                                /* 7/d Supplementary Mosaic for CCITT 101 */

/* 96x96 character sets */

/* Types of character sets */

#define I2CS_94 0x000           /* 94 character set */
#define I2CS_96 0x100           /* 96 character set */
#define I2CS_MUL 0x200          /* multi-byte */
#define I2CS_94x94 (I2CS_MUL | I2CS_94)
#define I2CS_96x96 (I2CS_MUL | I2CS_96)


/* Character set identifiers stored in Gn
 *
 * Each identifier is a charset type (I2CS_94, I2CS_96, I2CS_94x94) OR'd
 * with the final-byte value for that set defined above.
 */

#define I2CS_BRITISH (I2CS_94 | I2CS_94_BRITISH)
#define I2CS_ASCII (I2CS_94 | I2CS_94_ASCII)
#define I2CS_JIS_BUGROM (I2CS_94 | I2CS_94_JIS_BUGROM)
#define I2CS_JIS_KANA (I2CS_94 | I2CS_94_JIS_KANA)
#define I2CS_JIS_ROMAN (I2CS_94 | I2CS_94_JIS_ROMAN)
#define I2CS_JIS_OLD (I2CS_94x94 | I2CS_94x94_JIS_OLD)
#define I2CS_GB (I2CS_94x94 | I2CS_94x94_GB)
#define I2CS_JIS_NEW (I2CS_94x94 | I2CS_94x94_JIS_NEW)
#define I2CS_KSC (I2CS_94x94 | I2CS_94x94_KSC)
#define I2CS_JIS_EXT (I2CS_94x94 | I2CS_94x94_JIS_EXT)
#define I2CS_CNS1 (I2CS_94x94 | I2CS_94x94_CNS1)
#define I2CS_CNS2 (I2CS_94x94 | I2CS_94x94_CNS2)
#define I2CS_CNS3 (I2CS_94x94 | I2CS_94x94_CNS3)
#define I2CS_CNS4 (I2CS_94x94 | I2CS_94x94_CNS4)
#define I2CS_CNS5 (I2CS_94x94 | I2CS_94x94_CNS5)
#define I2CS_CNS6 (I2CS_94x94 | I2CS_94x94_CNS6)
#define I2CS_CNS7 (I2CS_94x94 | I2CS_94x94_CNS7)
#define I2CS_ISO8859_1 (I2CS_96 | I2CS_96_ISO8859_1)
#define I2CS_ISO8859_2 (I2CS_96 | I2CS_96_ISO8859_2)
#define I2CS_ISO8859_3 (I2CS_96 | I2CS_96_ISO8859_3)
#define I2CS_ISO8859_4 (I2CS_96 | I2CS_96_ISO8859_4)
#define I2CS_ISO8859_7 (I2CS_96 | I2CS_96_ISO8859_7)
#define I2CS_ISO8859_6 (I2CS_96 | I2CS_96_ISO8859_6)
#define I2CS_ISO8859_8 (I2CS_96 | I2CS_96_ISO8859_8)
#define I2CS_ISO8859_5 (I2CS_96 | I2CS_96_ISO8859_5)
#define I2CS_ISO8859_9 (I2CS_96 | I2CS_96_ISO8859_9)
#define I2CS_TIS620 (I2CS_96 | I2CS_96_TIS620)
#define I2CS_ISO8859_10 (I2CS_96 | I2CS_96_ISO8859_10)
#define I2CS_ISO8859_13 (I2CS_96 | I2CS_96_ISO8859_13)
#define I2CS_VSCII (I2CS_96 | I2CS_96_VSCII)
#define I2CS_ISO8859_14 (I2CS_96 | I2CS_96_ISO8859_14)
#define I2CS_ISO8859_15 (I2CS_96 | I2CS_96_ISO8859_15)
#define I2CS_ISO8859_16 (I2CS_96 | I2CS_96_ISO8859_16)


/* Miscellaneous ISO 2022 definitions */

#define EUC_CS2 0x8e            /* single shift CS2 */
#define EUC_CS3 0x8f            /* single shift CS3 */

#define BITS7 0x7f              /* 7-bit value mask */
#define BIT8 0x80               /* 8th bit mask */

/* The following saves us from having to have yet more charset tables */

/* Unicode codepoints */

#define UCS2_C0CONTROL 0x00     /* first C0 control */
#define UCS2_C0CONTROLEND 0x1F  /* last C0 control */
#define UCS2_C1CONTROL 0x80     /* first C1 control */
#define UCS2_C1CONTROLEND 0x9F  /* last C1 control */

                                /* ISO 646 substituted Unicode codepoints */
#define UCS2_POUNDSTERLING 0x00a3
#define UCS2_YEN 0x00a5
#define UCS2_OVERLINE 0x203e
#define UCS2_EURO 0x20ac
#define UCS2_KATAKANA 0xff61    /* first katakana codepoint */
#define UCS2_BOM 0xfeff         /* byte order mark */
#define UCS2_BOGON 0xfffd       /* replacement character */
                                /* next two codepoints are not Unicode chars */
#define UCS2_BOMCHECK 0xfffe    /* used to check byte order with UCS2_BOM */
#define UCS2_NOTCHAR 0xffff     /* not a character */

#define UCS4_BMPBASE 0x0000     /* Basic Multilingual Plane */
#define UCS4_SMPBASE 0x10000    /* Supplementary Multilingual Plane */
#define UCS4_SIPBASE 0x20000    /* Supplementary Ideographic Plane */
                                /* EastAsianWidth says plane 3 is wide */
#define UCS4_UNABASE 0x40000    /* unassigned space */
#define UCS4_SSPBASE 0xe0000    /* Supplementary Special-purpose Plane */
#define UCS4_PVTBASE 0xf0000    /* private-space (two planes) */
#define UCS4_MAXUNICODE 0x10ffff/* highest Unicode codepoint */

#define UTF16_BASE 0x10000      /* base of codepoints needing surrogates */
#define UTF16_SHIFT 10          /* surrogate shift */
#define UTF16_MASK 0x3ff        /* surrogate mask */
#define UTF16_SURR 0xd800       /* UTF-16 surrogate area */
#define UTF16_SURRH 0xd800      /* UTF-16 first high surrogate */
#define UTF16_SURRHEND 0xdbff   /* UTF-16 last high surrogate */
#define UTF16_SURRL 0xdc00      /* UTF-16 first low surrogate */
#define UTF16_SURRLEND 0xdfff   /* UTF-16 last low surrogate */
#define UTF16_MAXSURR 0xdfff    /* end of UTF-16 surrogates */


/* UBOGON is used to represent a codepoint in a character set which does not
 * map to Unicode.  It is also used for mapping failures, e.g. incomplete
 * shift sequences.  This name has the same text width as 0x????, for
 * convenience in the mapping tables.
 *
 * NOCHAR is used to represent a codepoint in Unicode which does not map to
 * the target character set in a reverse mapping table.  This name has the
 * same text width as 0x???? in case we ever add static reverse mapping tables.
 */

#define UBOGON UCS2_BOGON
#define NOCHAR UCS2_NOTCHAR

/* Codepoints in non-Unicode character sets */

/* Codepoints in ISO 646 character sets */

/* British ASCII codepoints */

#define BRITISH_POUNDSTERLING 0x23

/* JIS Roman codepoints */

#define JISROMAN_YEN 0x5c
#define JISROMAN_OVERLINE 0x7e


/* Hankaku katakana codepoints & parameters
 *
 * In earlier versions, MAX_KANA_7 and MAX_KANA_8 were the maximum codepoint
 * values.  Although this made sense, it was confusing with the "max ku" and
 * "max ten" values used in the double-byte tables; these are 1-origin, but
 * the calculated values used for "ku" and "ten" are 0-origin (derived by
 * subtracting the "base").  What this all meant is that for double byte
 * characters the limit test is of the form (value < max_ku), but for single
 * byte characters (which used the same cell to hold the max ku) the limit
 * test was (value <= max_ku).
 *
 * By making MAX_KANA_[78] be maximum+1, the same (value < max_ku) limit test
 * is used throughout.  - 6/15/2006
 */

#define MIN_KANA_7 0x21
#define MAX_KANA_7 0x60         /* maximum value + 1 */
#define KANA_7 (UCS2_KATAKANA - MIN_KANA_7)
#define MIN_KANA_8 (MIN_KANA_7 | BIT8)
#define MAX_KANA_8 (MAX_KANA_7 | BIT8)
#define KANA_8 (UCS2_KATAKANA - MIN_KANA_8)

/* Charset scripts */

/* The term "script" is used here in a very loose sense, enough to make
 * purists cringe.  Basically, the idea is to give the main program some
 * idea of how it should treat the characters of text in a charset with
 * respect to font, drawing routines, etc.
 *
 * In some cases, "script" is associated with a charset; in other cases,
 * it's more closely tied to a language.
 */

#define SC_UNICODE 0x1          /* Unicode */
#define SC_LATIN_1 0x10         /* Western Europe */
#define SC_LATIN_2 0x20         /* Eastern Europe */
#define SC_LATIN_3 0x40         /* Southern Europe */
#define SC_LATIN_4 0x80         /* Northern Europe */
#define SC_LATIN_5 0x100        /* Turkish */
#define SC_LATIN_6 0x200        /* Nordic */
#define SC_LATIN_7 0x400        /* Baltic */
#define SC_LATIN_8 0x800        /* Celtic */
#define SC_LATIN_9 0x1000       /* Euro */
#define SC_LATIN_0 SC_LATIN_9   /* colloquial name for Latin-9 */
#define SC_ARABIC 0x2000
#define SC_CYRILLIC 0x4000
#define SC_GREEK 0x8000
#define SC_HEBREW 0x10000
#define SC_THAI 0x20000
#define SC_UKRANIAN 0x40000
#define SC_LATIN_10 0x80000     /* Balkan */
#define SC_VIETNAMESE 0x100000
#define SC_CHINESE_SIMPLIFIED 0x1000000
#define SC_CHINESE_TRADITIONAL 0x2000000
#define SC_JAPANESE 0x4000000
#define SC_KOREAN 0x8000000


/* Script table */

typedef struct utf8_scent {
  char *name;                   /* script name */
  char *description;            /* script description */
  unsigned long script;         /* script bitmask */
} SCRIPT;

/* Character set table support */

typedef struct utf8_csent {
  char *name;                   /* charset name */
  unsigned short type;          /* type of charset */
  unsigned short flags;         /* charset flags */
  void *tab;                    /* additional data */
  unsigned long script;         /* script(s) implemented by this charset */
  char *preferred;              /* preferred charset over this one */
} CHARSET;


struct utf8_eucparam {
  unsigned int base_ku : 8;     /* base row */
  unsigned int base_ten : 8;    /* base column */
  unsigned int max_ku : 8;      /* maximum row */
  unsigned int max_ten : 8;     /* maximum column */
  void *tab;                    /* conversion table */
};


/* Charset types */

#define CT_UNKNOWN 0            /* unknown 8-bit */
#define CT_ASCII 1              /* 7-bit ASCII no table */
#define CT_UCS2 2               /* 2 byte 16-bit Unicode no table */
#define CT_UCS4 3               /* 4 byte 32-bit Unicode no table */
#define CT_1BYTE0 10            /* 1 byte ISO 8859-1 no table */
#define CT_1BYTE 11             /* 1 byte ASCII + table 0x80-0xff */
#define CT_1BYTE8 12            /* 1 byte table 0x00 - 0xff */
#define CT_EUC 100              /* 2 byte ASCII + utf8_eucparam base/CS2/CS3 */
#define CT_DBYTE 101            /* 2 byte ASCII + utf8_eucparam */
#define CT_DBYTE2 102           /* 2 byte ASCII + utf8_eucparam plane1/2 */
#define CT_UTF16 1000           /* variable UTF-16 encoded Unicode no table */
#define CT_UTF8 1001            /* variable UTF-8 encoded Unicode no table */
#define CT_UTF7 1002            /* variable UTF-7 encoded Unicode no table */
#define CT_2022 10000           /* variable ISO-2022 encoded no table */
#define CT_SJIS 10001           /* 2 byte Shift-JIS encoded JIS no table */


/* Character set flags */

#define CF_PRIMARY 0x1          /* primary name for this charset */
#define CF_DISPLAY 0x2          /* charset used in displays */
#define CF_POSTING 0x4          /* charset used in email posting */
#define CF_UNSUPRT 0x8          /* charset unsupported (can't convert to it) */
#define CF_NOEMAIL 0x10         /* charset not used in email */


/* UTF-7 engine states */

#define U7_ASCII 0              /* ASCII character */
#define U7_PLUS 1               /* plus seen */
#define U7_UNICODE 2            /* Unicode characters */
#define U7_MINUS 3              /* absorbed minus seen */
+ 1.528 +/* Function prototypes */ 1.529 + 1.530 +typedef unsigned long (*ucs4cn_t) (unsigned long c); 1.531 +typedef unsigned long (*ucs4de_t) (unsigned long c,void **more); 1.532 + 1.533 +SCRIPT *utf8_script (char *script); 1.534 +const CHARSET *utf8_charset (char *charset); 1.535 +char *utf8_badcharset (char *charset); 1.536 +long utf8_text (SIZEDTEXT *text,char *charset,SIZEDTEXT *ret,long flags); 1.537 +long utf8_text_cs (SIZEDTEXT *text,const CHARSET *cs,SIZEDTEXT *ret, 1.538 + ucs4cn_t cv,ucs4de_t de); 1.539 +long utf8_cstext (SIZEDTEXT *text,char *charset,SIZEDTEXT *ret, 1.540 + unsigned long errch); 1.541 +long utf8_cstocstext (SIZEDTEXT *text,char *sc,SIZEDTEXT *ret,char *dc, 1.542 + unsigned long errch); 1.543 +unsigned short *utf8_rmap (char *charset); 1.544 +unsigned short *utf8_rmap_cs (const CHARSET *cs); 1.545 +unsigned short *utf8_rmap_gen (const CHARSET *cs,unsigned short *oldmap); 1.546 +long utf8_rmaptext (SIZEDTEXT *text,unsigned short *rmap,SIZEDTEXT *ret, 1.547 + unsigned long errch,long iso2022jp); 1.548 +unsigned long utf8_rmapsize (SIZEDTEXT *text,unsigned short *rmap, 1.549 + unsigned long errch,long iso2022jp); 1.550 +long ucs4_rmaptext (unsigned long *ucs4,unsigned long len,unsigned short *rmap, 1.551 + SIZEDTEXT *ret,unsigned long errch); 1.552 +long ucs4_rmaplen (unsigned long *ucs4,unsigned long len,unsigned short *rmap, 1.553 + unsigned long errch); 1.554 +long ucs4_rmapbuf (unsigned char *t,unsigned long *ucs4,unsigned long len, 1.555 + unsigned short *rmap,unsigned long errch); 1.556 +unsigned long utf8_get (unsigned char **s,unsigned long *i); 1.557 +unsigned long utf8_get_raw (unsigned char **s,unsigned long *i); 1.558 +unsigned long ucs4_cs_get (CHARSET *cs,unsigned char **s,unsigned long *i); 1.559 +unsigned long *utf8_csvalidmap (char *charsets[]); 1.560 +const CHARSET *utf8_infercharset (SIZEDTEXT *src); 1.561 +long utf8_validate (unsigned char *s,unsigned long i); 1.562 +void utf8_text_1byte0 (SIZEDTEXT *text,SIZEDTEXT 
*ret,ucs4cn_t cv,ucs4de_t de); 1.563 +void utf8_text_1byte (SIZEDTEXT *text,SIZEDTEXT *ret,void *tab,ucs4cn_t cv, 1.564 + ucs4de_t de); 1.565 +void utf8_text_1byte8 (SIZEDTEXT *text,SIZEDTEXT *ret,void *tab,ucs4cn_t cv, 1.566 + ucs4de_t de); 1.567 +void utf8_text_euc (SIZEDTEXT *text,SIZEDTEXT *ret,void *tab,ucs4cn_t cv, 1.568 + ucs4de_t de); 1.569 +void utf8_text_dbyte (SIZEDTEXT *text,SIZEDTEXT *ret,void *tab,ucs4cn_t cv, 1.570 + ucs4de_t de); 1.571 +void utf8_text_dbyte2 (SIZEDTEXT *text,SIZEDTEXT *ret,void *tab,ucs4cn_t cv, 1.572 + ucs4de_t de); 1.573 +void utf8_text_sjis (SIZEDTEXT *text,SIZEDTEXT *ret,ucs4cn_t cv,ucs4de_t de); 1.574 +void utf8_text_2022 (SIZEDTEXT *text,SIZEDTEXT *ret,ucs4cn_t cv,ucs4de_t de); 1.575 +void utf8_text_utf7 (SIZEDTEXT *text,SIZEDTEXT *ret,ucs4cn_t cv,ucs4de_t de); 1.576 +void utf8_text_utf8 (SIZEDTEXT *text,SIZEDTEXT *ret,ucs4cn_t cv,ucs4de_t de); 1.577 +void utf8_text_ucs2 (SIZEDTEXT *text,SIZEDTEXT *ret,ucs4cn_t cv,ucs4de_t de); 1.578 +void utf8_text_ucs4 (SIZEDTEXT *text,SIZEDTEXT *ret,ucs4cn_t cv,ucs4de_t de); 1.579 +void utf8_text_utf16 (SIZEDTEXT *text,SIZEDTEXT *ret,ucs4cn_t cv,ucs4de_t de); 1.580 +unsigned long utf8_size (unsigned long c); 1.581 +unsigned char *utf8_put (unsigned char *s,unsigned long c); 1.582 +unsigned long ucs4_titlecase (unsigned long c); 1.583 +long ucs4_width (unsigned long c); 1.584 +long utf8_strwidth (unsigned char *s); 1.585 +long utf8_textwidth (SIZEDTEXT *utf8); 1.586 +unsigned long ucs4_decompose (unsigned long c,void **more); 1.587 +unsigned long ucs4_decompose_recursive (unsigned long c,void **more);