imapext-2007

annotate src/c-client/utf8.h @ 0:ada5e610ab86

imap-2007e
author yuuji@gentei.org
date Mon, 14 Sep 2009 15:17:45 +0900
parents
children
rev   line source
yuuji@0 1 /* ========================================================================
yuuji@0 2 * Copyright 1988-2008 University of Washington
yuuji@0 3 *
yuuji@0 4 * Licensed under the Apache License, Version 2.0 (the "License");
yuuji@0 5 * you may not use this file except in compliance with the License.
yuuji@0 6 * You may obtain a copy of the License at
yuuji@0 7 *
yuuji@0 8 * http://www.apache.org/licenses/LICENSE-2.0
yuuji@0 9 *
yuuji@0 10 *
yuuji@0 11 * ========================================================================
yuuji@0 12 */
yuuji@0 13
yuuji@0 14 /*
yuuji@0 15 * Program: UTF-8 routines
yuuji@0 16 *
yuuji@0 17 * Author: Mark Crispin
yuuji@0 18 * Networks and Distributed Computing
yuuji@0 19 * Computing & Communications
yuuji@0 20 * University of Washington
yuuji@0 21 * Administration Building, AG-44
yuuji@0 22 * Seattle, WA 98195
yuuji@0 23 * Internet: MRC@CAC.Washington.EDU
yuuji@0 24 *
yuuji@0 25 * Date: 11 June 1997
yuuji@0 26 * Last Edited: 17 January 2008
yuuji@0 27 */
yuuji@0 28
yuuji@0 29 /* UTF-8 size and conversion routines from UCS-2 values (thus in the BMP).
yuuji@0 30 * Don't use these if UTF-16 data (surrogate pairs) are an issue.
yuuji@0 31 * For UCS-4 values, use the utf8_size() and utf8_put() functions.
yuuji@0 32 */
yuuji@0 33
/* UTF8_SIZE_BMP(c): number of octets (1, 2 or 3) needed to encode the UCS-2
 * value c in UTF-8.
 *
 * The argument is parenthesized at every use so that an expression argument
 * such as (hi | lo) is tested as a whole rather than being split by operator
 * precedence.  c is evaluated more than once; do not pass an expression with
 * side effects.
 */

#define UTF8_SIZE_BMP(c) (((c) & 0xff80) ? (((c) & 0xf800) ? 3 : 2) : 1)
yuuji@0 35
/* UTF8_PUT_BMP(b,c): store the UTF-8 encoding of the UCS-2 (BMP) value c at
 * octet pointer b, advancing b past the 1, 2 or 3 octets written.
 *
 * b must be a modifiable pointer lvalue; it is post-incremented once per
 * octet stored.  c is evaluated several times, so it must not have side
 * effects.  Both arguments are parenthesized at every use so that expression
 * arguments (e.g. hi | lo) are encoded as a whole.
 *
 * The expansion is kept as a plain brace block, not do { } while (0), so
 * that any existing call site that omits the trailing semicolon continues
 * to compile.
 */

#define UTF8_PUT_BMP(b,c) {                             \
  if ((c) & 0xff80) {           /* non-ASCII? */        \
    if ((c) & 0xf800) {         /* three byte code */   \
      *(b)++ = 0xe0 | ((c) >> 12);                      \
      *(b)++ = 0x80 | (((c) >> 6) & 0x3f);              \
    }                                                   \
    else *(b)++ = 0xc0 | (((c) >> 6) & 0x3f);           \
    *(b)++ = 0x80 | ((c) & 0x3f);                       \
  }                                                     \
  else *(b)++ = (c);                                    \
}
yuuji@0 47
yuuji@0 48 /* utf8_text() flag values */
yuuji@0 49
yuuji@0 50 #define U8T_CASECANON 2 /* canonicalize case */
yuuji@0 51 #define U8T_DECOMPOSE 4 /* decompose */
yuuji@0 52 /* full canonicalization */
yuuji@0 53 #define U8T_CANONICAL (U8T_CASECANON | U8T_DECOMPOSE)
yuuji@0 54
yuuji@0 55
/* utf8_get() return values
 *
 * Values below U8G_ERROR are codepoints, laid out as noted in the range
 * comments; values with the U8G_ERROR flag set are error codes.  Each derived
 * code is parenthesized so that it expands as a single operand in any
 * expression (for example U8G_NOTUTF8 - U8G_BADCONT, or U8G_ENDSTRG % 16).
 */

                                /* 0x0000 - 0xffff BMP plane */
#define U8GM_NONBMP 0xffff0000  /* mask for non-BMP values */
                                /* 0x10000 - 0x10ffff extended planes */
                                /* 0x110000 - 0x7ffffff non-Unicode */
#define U8G_ERROR 0x80000000    /* error flag */
#define U8G_BADCONT (U8G_ERROR+1) /* continuation when not in progress */
#define U8G_INCMPLT (U8G_ERROR+2) /* incomplete UTF-8 character */
#define U8G_NOTUTF8 (U8G_ERROR+3) /* not a valid UTF-8 octet */
#define U8G_ENDSTRG (U8G_ERROR+4) /* end of string */
#define U8G_ENDSTRI (U8G_ERROR+5) /* end of string w/ incomplete UTF-8 char */
#define U8G_SURROGA (U8G_ERROR+6) /* surrogate codepoint */
#define U8G_NOTUNIC (U8G_ERROR+7) /* non-Unicode codepoint */
yuuji@0 70
yuuji@0 71
/* ucs4_width() return values
 *
 * Error results carry the U4W_ERROR flag plus a small code.  Each derived
 * code is parenthesized so that it expands as a single operand in any
 * expression (for example U4W_UNASSGN - U4W_NOTUNCD).
 */

#define U4W_ERROR 0x80000000    /* error flags */
#define U4W_NOTUNCD (U4W_ERROR+1) /* not a Unicode char */
#define U4W_PRIVATE (U4W_ERROR+2) /* private-space plane */
#define U4W_SSPCHAR (U4W_ERROR+3) /* Supplementary Special-purpose Plane */
#define U4W_UNASSGN (U4W_ERROR+4) /* unassigned space plane */
#define U4W_CONTROL (U4W_ERROR+5) /* C0/C1 control */
#define U4W_CTLSRGT U4W_CONTROL /* in case legacy code references this */
yuuji@0 81
yuuji@0 82 /* ISO-2022 engine states */
yuuji@0 83
yuuji@0 84 #define I2S_CHAR 0 /* character */
yuuji@0 85 #define I2S_ESC 1 /* previous character was ESC */
yuuji@0 86 #define I2S_MUL 2 /* previous character was multi-byte code */
yuuji@0 87 #define I2S_INT 3 /* previous character was intermediate */
yuuji@0 88
yuuji@0 89
yuuji@0 90 /* ISO-2022 Gn selections: G0-G3 are the values 0-3; a single shift is the Gn number shifted left by 2 */
yuuji@0 91
yuuji@0 92 #define I2C_G0 0 /* G0 */
yuuji@0 93 #define I2C_G1 1 /* G1 */
yuuji@0 94 #define I2C_G2 2 /* G2 */
yuuji@0 95 #define I2C_G3 3 /* G3 */
yuuji@0 96 #define I2C_SG2 (2 << 2) /* single shift G2 */
yuuji@0 97 #define I2C_SG3 (3 << 2) /* single shift G3 */
yuuji@0 98
yuuji@0 99
yuuji@0 100 /* ISO-2022 octet definitions */
yuuji@0 101
yuuji@0 102 #define I2C_ESC 0x1b /* ESCape */
yuuji@0 103
yuuji@0 104 /* Intermediate character */
yuuji@0 105 #define I2C_STRUCTURE 0x20 /* announce code structure */
yuuji@0 106 #define I2C_C0 0x21 /* C0 */
yuuji@0 107 #define I2C_C1 0x22 /* C1 */
yuuji@0 108 #define I2C_CONTROL 0x23 /* single control function */
yuuji@0 109 #define I2C_MULTI 0x24 /* multi-byte character set */
yuuji@0 110 #define I2C_OTHER 0x25 /* other coding system */
yuuji@0 111 #define I2C_REVISED 0x26 /* revised registration */
yuuji@0 112 #define I2C_G0_94 0x28 /* G0 94-character set */
yuuji@0 113 #define I2C_G1_94 0x29 /* G1 94-character set */
yuuji@0 114 #define I2C_G2_94 0x2A /* G2 94-character set */
yuuji@0 115 #define I2C_G3_94 0x2B /* G3 94-character set */
yuuji@0 116 #define I2C_G0_96 0x2C /* (not in ISO-2022) G0 96-character set */
yuuji@0 117 #define I2C_G1_96 0x2D /* G1 96-character set */
yuuji@0 118 #define I2C_G2_96 0x2E /* G2 96-character set */
yuuji@0 119 #define I2C_G3_96 0x2F /* G3 96-character set */
yuuji@0 120
yuuji@0 121 /* Locking shifts */
yuuji@0 122 #define I2C_SI 0x0f /* lock shift to G0 (Shift In) */
yuuji@0 123 #define I2C_SO 0x0e /* lock shift to G1 (Shift Out) */
yuuji@0 124 /* prefixed by ESC */
yuuji@0 125 #define I2C_LS2 0x6e /* lock shift to G2 */
yuuji@0 126 #define I2C_LS3 0x6f /* lock shift to G3 */
yuuji@0 127 #define I2C_LS1R 0x7e /* lock shift GR to G1 */
yuuji@0 128 #define I2C_LS2R 0x7d /* lock shift GR to G2 */
yuuji@0 129 #define I2C_LS3R 0x7c /* lock shift GR to G3 */
yuuji@0 130
yuuji@0 131 /* Single shifts */
yuuji@0 132 #define I2C_SS2_ALT 0x8e /* single shift to G2 (SS2) */
yuuji@0 133 #define I2C_SS3_ALT 0x8f /* single shift to G3 (SS3) */
yuuji@0 134 #define I2C_SS2_ALT_7 0x19 /* single shift to G2 (SS2) */
yuuji@0 135 #define I2C_SS3_ALT_7 0x1d /* single shift to G3 (SS3) */
yuuji@0 136 /* prefixed by ESC */
yuuji@0 137 #define I2C_SS2 0x4e /* single shift to G2 (SS2) */
yuuji@0 138 #define I2C_SS3 0x4f /* single shift to G3 (SS3) */
yuuji@0 139
yuuji@0 140 /* 94 character sets */
yuuji@0 141
yuuji@0 142 /* 4/0 ISO 646 IRV */
yuuji@0 143 #define I2CS_94_BRITISH 0x41 /* 4/1 ISO 646 British */
yuuji@0 144 #define I2CS_94_ASCII 0x42 /* 4/2 ISO 646 USA (ASCII) */
yuuji@0 145 /* 4/3 NATS Finland/Sweden (primary) */
yuuji@0 146 /* 4/4 NATS Finland/Sweden (secondary) */
yuuji@0 147 /* 4/5 NATS Denmark/Norway (primary) */
yuuji@0 148 /* 4/6 NATS Denmark/Norway (secondary) */
yuuji@0 149 /* 4/7 ISO 646 Swedish SEN 850200 */
yuuji@0 150 /* 4/8 ISO 646 Swedish names */
yuuji@0 151 #define I2CS_94_JIS_BUGROM 0x48 /* 4/8 some buggy software does this */
yuuji@0 152 #define I2CS_94_JIS_KANA 0x49 /* 4/9 JIS X 0201-1976 right half */
yuuji@0 153 #define I2CS_94_JIS_ROMAN 0x4a /* 4/a JIS X 0201-1976 left half */
yuuji@0 154 /* 4/b ISO 646 German */
yuuji@0 155 /* 4/c ISO 646 Portuguese (Olivetti) */
yuuji@0 156 /* 4/d ISO 6438 African */
yuuji@0 157 /* 4/e ISO 5427 Cyrillic (Honeywell-Bull) */
yuuji@0 158 /* 4/f DIN 31624 extended bibliography */
yuuji@0 159 /* 5/0 ISO 5426-1980 Bibliography */
yuuji@0 160 /* 5/1 ISO 5427-1981 Cyrillic*/
yuuji@0 161 /* 5/2 ISO 646 French (withdrawn) */
yuuji@0 162 /* 5/3 ISO 5428-1980 Greek bibliography */
yuuji@0 163 /* 5/4 GB 1988-80 Chinese */
yuuji@0 164 /* 5/5 Latin-Greek (Honeywell-Bull) */
yuuji@0 165 /* 5/6 UK Viewdata/Teletext */
yuuji@0 166 /* 5/7 INIS (IRV subset) */
yuuji@0 167 /* 5/8 ISO 5428 Greek Bibliography */
yuuji@0 168 /* 5/9 ISO 646 Italian (Olivetti) */
yuuji@0 169 /* 5/a ISO 646 Spanish (Olivetti) */
yuuji@0 170 /* 5/b Greek (Olivetti) */
yuuji@0 171 /* 5/c Latin-Greek (Olivetti) */
yuuji@0 172 /* 5/d INIS non-standard extension */
yuuji@0 173 /* 5/e INIS Cyrillic extension */
yuuji@0 174 /* 5/f Arabic CODAR-U IERA */
yuuji@0 175 /* 6/0 ISO 646 Norwegian */
yuuji@0 176 /* 6/1 Norwegian version 2 (withdrawn) */
yuuji@0 177 /* 6/2 Videotex supplementary */
yuuji@0 178 /* 6/3 Videotex supplementary #2 */
yuuji@0 179 /* 6/4 Videotex supplementary #3 */
yuuji@0 180 /* 6/5 APL */
yuuji@0 181 /* 6/6 ISO 646 French */
yuuji@0 182 /* 6/7 ISO 646 Portuguese (IBM) */
yuuji@0 183 /* 6/8 ISO 646 Spanish (IBM) */
yuuji@0 184 /* 6/9 ISO 646 Hungarian */
yuuji@0 185 /* 6/a Greek ELOT (withdrawn) */
yuuji@0 186 /* 6/b ISO 9036 Arabic 7-bit */
yuuji@0 187 /* 6/c ISO 646 IRV supplementary set */
yuuji@0 188 /* 6/d JIS C6229-1984 OCR-A */
yuuji@0 189 /* 6/e JIS C6229-1984 OCR-B */
yuuji@0 190 /* 6/f JIS C6229-1984 OCR-B additional */
yuuji@0 191 /* 7/0 JIS C6229-1984 hand-printed */
yuuji@0 192 /* 7/1 JIS C6229-1984 additional hand-printed */
yuuji@0 193 /* 7/2 JIS C6229-1984 katakana hand-printed */
yuuji@0 194 /* 7/3 E13B Japanese graphic */
yuuji@0 195 /* 7/4 Supplementary Videotex (withdrawn) */
yuuji@0 196 /* 7/5 Teletex primary CCITT T.61 */
yuuji@0 197 /* 7/6 Teletex secondary CCITT T.61 */
yuuji@0 198 /* 7/7 CSA Z 243.4-1985 Alternate primary #1 */
yuuji@0 199 /* 7/8 CSA Z 243.4-1985 Alternate primary #2 */
yuuji@0 200 /* 7/9 Mosaic CCITT T.101 */
yuuji@0 201 /* 7/a Serbocroatian/Slovenian Latin */
yuuji@0 202 /* 7/b Serbocroatian Cyrillic */
yuuji@0 203 /* 7/c Supplementary CCITT T.101 */
yuuji@0 204 /* 7/d Macedonian Cyrillic */
yuuji@0 205
yuuji@0 206 /* 94 character sets - second intermediate byte */
yuuji@0 207
yuuji@0 208 /* 4/0 Greek primary CCITT */
yuuji@0 209 /* 4/1 Cuba */
yuuji@0 210 /* 4/2 ISO/IEC 646 invariant */
yuuji@0 211 /* 4/3 Irish Gaelic 7-bit */
yuuji@0 212 /* 4/4 Turkmen */
yuuji@0 213
yuuji@0 214
yuuji@0 215 /* 94x94 character sets */
yuuji@0 216
yuuji@0 217 #define I2CS_94x94_JIS_OLD 0x40 /* 4/0 JIS X 0208-1978 */
yuuji@0 218 #define I2CS_94x94_GB 0x41 /* 4/1 GB 2312 */
yuuji@0 219 #define I2CS_94x94_JIS_NEW 0x42 /* 4/2 JIS X 0208-1983 */
yuuji@0 220 #define I2CS_94x94_KSC 0x43 /* 4/3 KSC 5601 */
yuuji@0 221 #define I2CS_94x94_JIS_EXT 0x44 /* 4/4 JIS X 0212-1990 */
yuuji@0 222 /* 4/5 CCITT Chinese */
yuuji@0 223 /* 4/6 Blisssymbol Graphic */
yuuji@0 224 #define I2CS_94x94_CNS1 0x47 /* 4/7 CNS 11643 plane 1 */
yuuji@0 225 #define I2CS_94x94_CNS2 0x48 /* 4/8 CNS 11643 plane 2 */
yuuji@0 226 #define I2CS_94x94_CNS3 0x49 /* 4/9 CNS 11643 plane 3 */
yuuji@0 227 #define I2CS_94x94_CNS4 0x4a /* 4/a CNS 11643 plane 4 */
yuuji@0 228 #define I2CS_94x94_CNS5 0x4b /* 4/b CNS 11643 plane 5 */
yuuji@0 229 #define I2CS_94x94_CNS6 0x4c /* 4/c CNS 11643 plane 6 */
yuuji@0 230 #define I2CS_94x94_CNS7 0x4d /* 4/d CNS 11643 plane 7 */
yuuji@0 231 /* 4/e DPRK (North Korea) KGCII */
yuuji@0 232 /* 4/f JGCII plane 1 */
yuuji@0 233 /* 5/0 JGCII plane 2 */
yuuji@0 234
yuuji@0 235 /* 96 character sets */
yuuji@0 236
yuuji@0 237 #define I2CS_96_ISO8859_1 0x41 /* 4/1 Latin-1 (Western Europe) */
yuuji@0 238 #define I2CS_96_ISO8859_2 0x42 /* 4/2 Latin-2 (Czech, Slovak) */
yuuji@0 239 #define I2CS_96_ISO8859_3 0x43 /* 4/3 Latin-3 (Dutch, Turkish) */
yuuji@0 240 #define I2CS_96_ISO8859_4 0x44 /* 4/4 Latin-4 (Scandinavian) */
yuuji@0 241 /* 4/5 CSA Z 243.4-1985 */
yuuji@0 242 #define I2CS_96_ISO8859_7 0x46 /* 4/6 Greek */
yuuji@0 243 #define I2CS_96_ISO8859_6 0x47 /* 4/7 Arabic */
yuuji@0 244 #define I2CS_96_ISO8859_8 0x48 /* 4/8 Hebrew */
yuuji@0 245 /* 4/9 Czechoslovak CSN 369103 */
yuuji@0 246 /* 4/a Supplementary Latin and non-alpha */
yuuji@0 247 /* 4/b Technical */
yuuji@0 248 #define I2CS_96_ISO8859_5 0x4c /* 4/c Cyrillic */
yuuji@0 249 #define I2CS_96_ISO8859_9 0x4d /* 4/d Latin-5 (Finnish, Portuguese) */
yuuji@0 250 /* 4/e ISO 6937-2 residual */
yuuji@0 251 /* 4/f Basic Cyrillic */
yuuji@0 252 /* 5/0 Supplementary Latin 1, 2 and 5 */
yuuji@0 253 /* 5/1 Basic Box */
yuuji@0 254 /* 5/2 Supplementary ISO/IEC 6937 : 1992 */
yuuji@0 255 /* 5/3 CCITT Hebrew supplementary */
yuuji@0 256 #define I2CS_96_TIS620 0x54 /* 5/4 TIS 620 */
yuuji@0 257 /* 5/5 Arabic/French/German */
yuuji@0 258 #define I2CS_96_ISO8859_10 0x56 /* 5/6 Latin-6 (Northern Europe) */
yuuji@0 259 /* 5/7 ??? */
yuuji@0 260 /* 5/8 Sami (Lappish) supplementary */
yuuji@0 261 #define I2CS_96_ISO8859_13 0x59 /* 5/9 Latin-7 (Baltic) */
yuuji@0 262 #define I2CS_96_VSCII 0x5a /* 5/a Vietnamese */
yuuji@0 263 /* 5/b Technical #1 IEC 1289 */
yuuji@0 264 #define I2CS_96_ISO8859_14 0x5c /* 5/c Latin-8 (Celtic) */
yuuji@0 265 /* 5/d Sami supplementary Latin */
yuuji@0 266 /* 5/e Latin/Hebrew */
yuuji@0 267 /* 5/f Celtic supplementary Latin */
yuuji@0 268 /* 6/0 Uralic supplementary Cyrillic */
yuuji@0 269 /* 6/1 Volgaic supplementary Cyrillic */
yuuji@0 270 #define I2CS_96_ISO8859_15 0x62 /* 6/2 Latin-9 (Euro) */
yuuji@0 271 /* 6/3 Latin-1 with Euro */
yuuji@0 272 /* 6/4 Latin-4 with Euro */
yuuji@0 273 /* 6/5 Latin-7 with Euro */
yuuji@0 274 #define I2CS_96_ISO8859_16 0x66 /* 6/6 Latin-10 (Balkan) */
yuuji@0 275 /* 6/7 Ogham */
yuuji@0 276 /* 6/8 Sami supplementary Latin #2 */
yuuji@0 277 /* 7/d Supplementary Mosaic for CCITT 101 */
yuuji@0 278
yuuji@0 279 /* 96x96 character sets */
yuuji@0 280
yuuji@0 281 /* Types of character sets */
yuuji@0 282
yuuji@0 283 #define I2CS_94 0x000 /* 94 character set */
yuuji@0 284 #define I2CS_96 0x100 /* 96 character set */
yuuji@0 285 #define I2CS_MUL 0x200 /* multi-byte */
yuuji@0 286 #define I2CS_94x94 (I2CS_MUL | I2CS_94)
yuuji@0 287 #define I2CS_96x96 (I2CS_MUL | I2CS_96)
yuuji@0 288
yuuji@0 289
yuuji@0 290 /* Character set identifiers stored in Gn */
yuuji@0 291
yuuji@0 292 #define I2CS_BRITISH (I2CS_94 | I2CS_94_BRITISH)
yuuji@0 293 #define I2CS_ASCII (I2CS_94 | I2CS_94_ASCII)
yuuji@0 294 #define I2CS_JIS_BUGROM (I2CS_94 | I2CS_94_JIS_BUGROM)
yuuji@0 295 #define I2CS_JIS_KANA (I2CS_94 | I2CS_94_JIS_KANA)
yuuji@0 296 #define I2CS_JIS_ROMAN (I2CS_94 | I2CS_94_JIS_ROMAN)
yuuji@0 297 #define I2CS_JIS_OLD (I2CS_94x94 | I2CS_94x94_JIS_OLD)
yuuji@0 298 #define I2CS_GB (I2CS_94x94 | I2CS_94x94_GB)
yuuji@0 299 #define I2CS_JIS_NEW (I2CS_94x94 | I2CS_94x94_JIS_NEW)
yuuji@0 300 #define I2CS_KSC (I2CS_94x94 | I2CS_94x94_KSC)
yuuji@0 301 #define I2CS_JIS_EXT (I2CS_94x94 | I2CS_94x94_JIS_EXT)
yuuji@0 302 #define I2CS_CNS1 (I2CS_94x94 | I2CS_94x94_CNS1)
yuuji@0 303 #define I2CS_CNS2 (I2CS_94x94 | I2CS_94x94_CNS2)
yuuji@0 304 #define I2CS_CNS3 (I2CS_94x94 | I2CS_94x94_CNS3)
yuuji@0 305 #define I2CS_CNS4 (I2CS_94x94 | I2CS_94x94_CNS4)
yuuji@0 306 #define I2CS_CNS5 (I2CS_94x94 | I2CS_94x94_CNS5)
yuuji@0 307 #define I2CS_CNS6 (I2CS_94x94 | I2CS_94x94_CNS6)
yuuji@0 308 #define I2CS_CNS7 (I2CS_94x94 | I2CS_94x94_CNS7)
yuuji@0 309 #define I2CS_ISO8859_1 (I2CS_96 | I2CS_96_ISO8859_1)
yuuji@0 310 #define I2CS_ISO8859_2 (I2CS_96 | I2CS_96_ISO8859_2)
yuuji@0 311 #define I2CS_ISO8859_3 (I2CS_96 | I2CS_96_ISO8859_3)
yuuji@0 312 #define I2CS_ISO8859_4 (I2CS_96 | I2CS_96_ISO8859_4)
yuuji@0 313 #define I2CS_ISO8859_7 (I2CS_96 | I2CS_96_ISO8859_7)
yuuji@0 314 #define I2CS_ISO8859_6 (I2CS_96 | I2CS_96_ISO8859_6)
yuuji@0 315 #define I2CS_ISO8859_8 (I2CS_96 | I2CS_96_ISO8859_8)
yuuji@0 316 #define I2CS_ISO8859_5 (I2CS_96 | I2CS_96_ISO8859_5)
yuuji@0 317 #define I2CS_ISO8859_9 (I2CS_96 | I2CS_96_ISO8859_9)
yuuji@0 318 #define I2CS_TIS620 (I2CS_96 | I2CS_96_TIS620)
yuuji@0 319 #define I2CS_ISO8859_10 (I2CS_96 | I2CS_96_ISO8859_10)
yuuji@0 320 #define I2CS_ISO8859_13 (I2CS_96 | I2CS_96_ISO8859_13)
yuuji@0 321 #define I2CS_VSCII (I2CS_96 | I2CS_96_VSCII)
yuuji@0 322 #define I2CS_ISO8859_14 (I2CS_96 | I2CS_96_ISO8859_14)
yuuji@0 323 #define I2CS_ISO8859_15 (I2CS_96 | I2CS_96_ISO8859_15)
yuuji@0 324 #define I2CS_ISO8859_16 (I2CS_96 | I2CS_96_ISO8859_16)
yuuji@0 325
yuuji@0 326
yuuji@0 327 /* Miscellaneous ISO 2022 definitions */
yuuji@0 328
yuuji@0 329 #define EUC_CS2 0x8e /* single shift CS2 */
yuuji@0 330 #define EUC_CS3 0x8f /* single shift CS3 */
yuuji@0 331
yuuji@0 332 #define BITS7 0x7f /* 7-bit value mask */
yuuji@0 333 #define BIT8 0x80 /* 8th bit mask */
yuuji@0 334
yuuji@0 335 /* The following saves us from having to have yet more charset tables */
yuuji@0 336
yuuji@0 337 /* Unicode codepoints */
yuuji@0 338
yuuji@0 339 #define UCS2_C0CONTROL 0x00 /* first C0 control */
yuuji@0 340 #define UCS2_C0CONTROLEND 0x1F /* last C0 control */
yuuji@0 341 #define UCS2_C1CONTROL 0x80 /* first C1 control */
yuuji@0 342 #define UCS2_C1CONTROLEND 0x9F /* last C1 control */
yuuji@0 343
yuuji@0 344 /* ISO 646 substituted Unicode codepoints */
yuuji@0 345 #define UCS2_POUNDSTERLING 0x00a3
yuuji@0 346 #define UCS2_YEN 0x00a5
yuuji@0 347 #define UCS2_OVERLINE 0x203e
yuuji@0 348 #define UCS2_EURO 0x20ac
yuuji@0 349 #define UCS2_KATAKANA 0xff61 /* first katakana codepoint */
yuuji@0 350 #define UCS2_BOM 0xfeff /* byte order mark */
yuuji@0 351 #define UCS2_BOGON 0xfffd /* replacement character */
yuuji@0 352 /* next two codepoints are not Unicode chars */
yuuji@0 353 #define UCS2_BOMCHECK 0xfffe /* used to check byte order with UCS2_BOM */
yuuji@0 354 #define UCS2_NOTCHAR 0xffff /* not a character */
yuuji@0 355
yuuji@0 356 #define UCS4_BMPBASE 0x0000 /* Basic Multilingual Plane */
yuuji@0 357 #define UCS4_SMPBASE 0x10000 /* Supplementary Multilingual Plane */
yuuji@0 358 #define UCS4_SIPBASE 0x20000 /* Supplementary Ideographic Plane */
yuuji@0 359 /* EastAsianWidth says plane 3 is wide */
yuuji@0 360 #define UCS4_UNABASE 0x40000 /* unassigned space */
yuuji@0 361 #define UCS4_SSPBASE 0xe0000 /* Supplementary Special-purpose Plane */
yuuji@0 362 #define UCS4_PVTBASE 0xf0000 /* private-space (two planes) */
yuuji@0 363 #define UCS4_MAXUNICODE 0x10ffff/* highest Unicode codepoint */
yuuji@0 364
yuuji@0 365 #define UTF16_BASE 0x10000 /* base of codepoints needing surrogates */
yuuji@0 366 #define UTF16_SHIFT 10 /* surrogate shift */
yuuji@0 367 #define UTF16_MASK 0x3ff /* surrogate mask */
yuuji@0 368 #define UTF16_SURR 0xd800 /* UTF-16 surrogate area */
yuuji@0 369 #define UTF16_SURRH 0xd800 /* UTF-16 first high surrogate */
yuuji@0 370 #define UTF16_SURRHEND 0xdbff /* UTF-16 last high surrogate */
yuuji@0 371 #define UTF16_SURRL 0xdc00 /* UTF-16 first low surrogate */
yuuji@0 372 #define UTF16_SURRLEND 0xdfff /* UTF-16 last low surrogate */
yuuji@0 373 #define UTF16_MAXSURR 0xdfff /* end of UTF-16 surrogates */
yuuji@0 374
yuuji@0 375
yuuji@0 376 /* UBOGON is used to represent a codepoint in a character set which does not
yuuji@0 377 * map to Unicode. It is also used for mapping failures, e.g. incomplete
yuuji@0 378 * shift sequences. This name has the same text width as 0x????, for
yuuji@0 379 * convenience in the mapping tables.
yuuji@0 380 *
yuuji@0 381 * NOCHAR is used to represent a codepoint in Unicode which does not map to
yuuji@0 382 * the target character set in a reverse mapping table. This name has the
yuuji@0 383 * same text width as 0x???? in case we ever add static reverse mapping tables.
yuuji@0 384 */
yuuji@0 385
yuuji@0 386 #define UBOGON UCS2_BOGON
yuuji@0 387 #define NOCHAR UCS2_NOTCHAR
yuuji@0 388
yuuji@0 389 /* Codepoints in non-Unicode character sets */
yuuji@0 390
yuuji@0 391 /* Codepoints in ISO 646 character sets */
yuuji@0 392
yuuji@0 393 /* British ASCII codepoints */
yuuji@0 394
yuuji@0 395 #define BRITISH_POUNDSTERLING 0x23
yuuji@0 396
yuuji@0 397 /* JIS Roman codepoints */
yuuji@0 398
yuuji@0 399 #define JISROMAN_YEN 0x5c
yuuji@0 400 #define JISROMAN_OVERLINE 0x7e
yuuji@0 401
yuuji@0 402
yuuji@0 403 /* Hankaku katakana codepoints & parameters
yuuji@0 404 *
yuuji@0 405 * In earlier versions, MAX_KANA_7 and MAX_KANA_8 were the maximum codepoint
yuuji@0 406 * values. Although this made sense, it was confusing with the "max ku" and
yuuji@0 407 * "max ten" values used in the double-byte tables; these are 1-origin, but
yuuji@0 408 * the calculated values used for "ku" and "ten" are 0-origin (derived by
yuuji@0 409 * subtracting the "base"). What this all meant is that for double byte
yuuji@0 410 * characters the limit test is of the form (value < max_ku), but for single
yuuji@0 411 * byte characters (which used the same cell to hold the max ku) the limit
yuuji@0 412 * test was (value <= max_ku).
yuuji@0 413 *
yuuji@0 414 * By making MAX_KANA_[78] be maximum+1, the same (value < max_ku) limit test
yuuji@0 415 * is used throughout. - 6/15/2006
yuuji@0 416 */
yuuji@0 417
yuuji@0 418 #define MIN_KANA_7 0x21
yuuji@0 419 #define MAX_KANA_7 0x60 /* maximum value + 1 */
yuuji@0 420 #define KANA_7 (UCS2_KATAKANA - MIN_KANA_7)
yuuji@0 421 #define MIN_KANA_8 (MIN_KANA_7 | BIT8)
yuuji@0 422 #define MAX_KANA_8 (MAX_KANA_7 | BIT8)
yuuji@0 423 #define KANA_8 (UCS2_KATAKANA - MIN_KANA_8)
yuuji@0 424
yuuji@0 425 /* Charset scripts */
yuuji@0 426
yuuji@0 427 /* The term "script" is used here in a very loose sense, enough to make
yuuji@0 428 * purists cringe. Basically, the idea is to give the main program some
yuuji@0 429 * idea of how it should treat the characters of text in a charset with
yuuji@0 430 * respect to font, drawing routines, etc.
yuuji@0 431 *
yuuji@0 432 * In some cases, "script" is associated with a charset; in other cases,
yuuji@0 433 * it's more closely tied to a language.
yuuji@0 434 */
yuuji@0 435
yuuji@0 436 #define SC_UNICODE 0x1 /* Unicode */
yuuji@0 437 #define SC_LATIN_1 0x10 /* Western Europe */
yuuji@0 438 #define SC_LATIN_2 0x20 /* Eastern Europe */
yuuji@0 439 #define SC_LATIN_3 0x40 /* Southern Europe */
yuuji@0 440 #define SC_LATIN_4 0x80 /* Northern Europe */
yuuji@0 441 #define SC_LATIN_5 0x100 /* Turkish */
yuuji@0 442 #define SC_LATIN_6 0x200 /* Nordic */
yuuji@0 443 #define SC_LATIN_7 0x400 /* Baltic */
yuuji@0 444 #define SC_LATIN_8 0x800 /* Celtic */
yuuji@0 445 #define SC_LATIN_9 0x1000 /* Euro */
yuuji@0 446 #define SC_LATIN_0 SC_LATIN_9 /* colloquial name for Latin-9 */
yuuji@0 447 #define SC_ARABIC 0x2000
yuuji@0 448 #define SC_CYRILLIC 0x4000
yuuji@0 449 #define SC_GREEK 0x8000
yuuji@0 450 #define SC_HEBREW 0x10000
yuuji@0 451 #define SC_THAI 0x20000
yuuji@0 452 #define SC_UKRANIAN 0x40000
yuuji@0 453 #define SC_LATIN_10 0x80000 /* Balkan */
yuuji@0 454 #define SC_VIETNAMESE 0x100000
yuuji@0 455 #define SC_CHINESE_SIMPLIFIED 0x1000000
yuuji@0 456 #define SC_CHINESE_TRADITIONAL 0x2000000
yuuji@0 457 #define SC_JAPANESE 0x4000000
yuuji@0 458 #define SC_KOREAN 0x8000000
yuuji@0 459
yuuji@0 460
yuuji@0 461 /* Script table entry: a script's name, description and bitmask (utf8_script() returns a pointer to one) */
yuuji@0 462
yuuji@0 463 typedef struct utf8_scent {
yuuji@0 464 char *name; /* script name */
yuuji@0 465 char *description; /* script description */
yuuji@0 466 unsigned long script; /* script bitmask (presumably SC_xxx values from "Charset scripts" above - confirm in utf8.c) */
yuuji@0 467 } SCRIPT;
yuuji@0 468
yuuji@0 469 /* Character set table support: entry describing one charset (utf8_charset() returns a pointer to one) */
yuuji@0 470
yuuji@0 471 typedef struct utf8_csent {
yuuji@0 472 char *name; /* charset name */
yuuji@0 473 unsigned short type; /* type of charset (see "Charset types" below) */
yuuji@0 474 unsigned short flags; /* charset flags (see "Character set flags" below) */
yuuji@0 475 void *tab; /* additional data (untyped; presumably interpreted according to type - confirm in utf8.c) */
yuuji@0 476 unsigned long script; /* script(s) implemented by this charset */
yuuji@0 477 char *preferred; /* preferred charset over this one */
yuuji@0 478 } CHARSET;
yuuji@0 479
yuuji@0 480
yuuji@0 481 struct utf8_eucparam { /* double-byte table parameters; named by the CT_EUC/CT_DBYTE/CT_DBYTE2 descriptions below */
yuuji@0 482 unsigned int base_ku : 8; /* base row (8-bit field) */
yuuji@0 483 unsigned int base_ten : 8; /* base column (8-bit field) */
yuuji@0 484 unsigned int max_ku : 8; /* maximum row; per the kana note above the limit test is (value < max_ku) */
yuuji@0 485 unsigned int max_ten : 8; /* maximum column (8-bit field) */
yuuji@0 486 void *tab; /* conversion table */
yuuji@0 487 };
yuuji@0 488
yuuji@0 489
yuuji@0 490 /* Charset types */
yuuji@0 491
yuuji@0 492 #define CT_UNKNOWN 0 /* unknown 8-bit */
yuuji@0 493 #define CT_ASCII 1 /* 7-bit ASCII no table */
yuuji@0 494 #define CT_UCS2 2 /* 2 byte 16-bit Unicode no table */
yuuji@0 495 #define CT_UCS4 3 /* 4 byte 32-bit Unicode no table */
yuuji@0 496 #define CT_1BYTE0 10 /* 1 byte ISO 8859-1 no table */
yuuji@0 497 #define CT_1BYTE 11 /* 1 byte ASCII + table 0x80-0xff */
yuuji@0 498 #define CT_1BYTE8 12 /* 1 byte table 0x00 - 0xff */
yuuji@0 499 #define CT_EUC 100 /* 2 byte ASCII + utf8_eucparam base/CS2/CS3 */
yuuji@0 500 #define CT_DBYTE 101 /* 2 byte ASCII + utf8_eucparam */
yuuji@0 501 #define CT_DBYTE2 102 /* 2 byte ASCII + utf8_eucparam plane1/2 */
yuuji@0 502 #define CT_UTF16 1000 /* variable UTF-16 encoded Unicode no table */
yuuji@0 503 #define CT_UTF8 1001 /* variable UTF-8 encoded Unicode no table */
yuuji@0 504 #define CT_UTF7 1002 /* variable UTF-7 encoded Unicode no table */
yuuji@0 505 #define CT_2022 10000 /* variable ISO-2022 encoded no table */
yuuji@0 506 #define CT_SJIS 10001 /* 2 byte Shift-JIS encoded JIS no table */
yuuji@0 507
yuuji@0 508
yuuji@0 509 /* Character set flags */
yuuji@0 510
yuuji@0 511 #define CF_PRIMARY 0x1 /* primary name for this charset */
yuuji@0 512 #define CF_DISPLAY 0x2 /* charset used in displays */
yuuji@0 513 #define CF_POSTING 0x4 /* charset used in email posting */
yuuji@0 514 #define CF_UNSUPRT 0x8 /* charset unsupported (can't convert to it) */
yuuji@0 515 #define CF_NOEMAIL 0x10 /* charset not used in email */
yuuji@0 516
yuuji@0 517
yuuji@0 518 /* UTF-7 engine states */
yuuji@0 519
yuuji@0 520 #define U7_ASCII 0 /* ASCII character */
yuuji@0 521 #define U7_PLUS 1 /* plus seen */
yuuji@0 522 #define U7_UNICODE 2 /* Unicode characters */
yuuji@0 523 #define U7_MINUS 3 /* absorbed minus seen */
yuuji@0 524
yuuji@0 525 /* Function prototypes; the two callback types below are the cv and de arguments of the utf8_text_*() routines */
yuuji@0 526
yuuji@0 527 typedef unsigned long (*ucs4cn_t) (unsigned long c); /* takes and returns an unsigned long; ucs4_titlecase() has this signature */
yuuji@0 528 typedef unsigned long (*ucs4de_t) (unsigned long c,void **more); /* as above plus a void **more argument; ucs4_decompose() and ucs4_decompose_recursive() have this signature */
yuuji@0 529
yuuji@0 530 SCRIPT *utf8_script (char *script);
yuuji@0 531 const CHARSET *utf8_charset (char *charset);
yuuji@0 532 char *utf8_badcharset (char *charset);
yuuji@0 533 long utf8_text (SIZEDTEXT *text,char *charset,SIZEDTEXT *ret,long flags);
yuuji@0 534 long utf8_text_cs (SIZEDTEXT *text,const CHARSET *cs,SIZEDTEXT *ret,
yuuji@0 535 ucs4cn_t cv,ucs4de_t de);
yuuji@0 536 long utf8_cstext (SIZEDTEXT *text,char *charset,SIZEDTEXT *ret,
yuuji@0 537 unsigned long errch);
yuuji@0 538 long utf8_cstocstext (SIZEDTEXT *text,char *sc,SIZEDTEXT *ret,char *dc,
yuuji@0 539 unsigned long errch);
yuuji@0 540 unsigned short *utf8_rmap (char *charset);
yuuji@0 541 unsigned short *utf8_rmap_cs (const CHARSET *cs);
yuuji@0 542 unsigned short *utf8_rmap_gen (const CHARSET *cs,unsigned short *oldmap);
yuuji@0 543 long utf8_rmaptext (SIZEDTEXT *text,unsigned short *rmap,SIZEDTEXT *ret,
yuuji@0 544 unsigned long errch,long iso2022jp);
yuuji@0 545 unsigned long utf8_rmapsize (SIZEDTEXT *text,unsigned short *rmap,
yuuji@0 546 unsigned long errch,long iso2022jp);
yuuji@0 547 long ucs4_rmaptext (unsigned long *ucs4,unsigned long len,unsigned short *rmap,
yuuji@0 548 SIZEDTEXT *ret,unsigned long errch);
yuuji@0 549 long ucs4_rmaplen (unsigned long *ucs4,unsigned long len,unsigned short *rmap,
yuuji@0 550 unsigned long errch);
yuuji@0 551 long ucs4_rmapbuf (unsigned char *t,unsigned long *ucs4,unsigned long len,
yuuji@0 552 unsigned short *rmap,unsigned long errch);
yuuji@0 553 unsigned long utf8_get (unsigned char **s,unsigned long *i);
yuuji@0 554 unsigned long utf8_get_raw (unsigned char **s,unsigned long *i);
yuuji@0 555 unsigned long ucs4_cs_get (CHARSET *cs,unsigned char **s,unsigned long *i);
yuuji@0 556 unsigned long *utf8_csvalidmap (char *charsets[]);
yuuji@0 557 const CHARSET *utf8_infercharset (SIZEDTEXT *src);
yuuji@0 558 long utf8_validate (unsigned char *s,unsigned long i);
yuuji@0 559 void utf8_text_1byte0 (SIZEDTEXT *text,SIZEDTEXT *ret,ucs4cn_t cv,ucs4de_t de);
yuuji@0 560 void utf8_text_1byte (SIZEDTEXT *text,SIZEDTEXT *ret,void *tab,ucs4cn_t cv,
yuuji@0 561 ucs4de_t de);
yuuji@0 562 void utf8_text_1byte8 (SIZEDTEXT *text,SIZEDTEXT *ret,void *tab,ucs4cn_t cv,
yuuji@0 563 ucs4de_t de);
yuuji@0 564 void utf8_text_euc (SIZEDTEXT *text,SIZEDTEXT *ret,void *tab,ucs4cn_t cv,
yuuji@0 565 ucs4de_t de);
yuuji@0 566 void utf8_text_dbyte (SIZEDTEXT *text,SIZEDTEXT *ret,void *tab,ucs4cn_t cv,
yuuji@0 567 ucs4de_t de);
yuuji@0 568 void utf8_text_dbyte2 (SIZEDTEXT *text,SIZEDTEXT *ret,void *tab,ucs4cn_t cv,
yuuji@0 569 ucs4de_t de);
yuuji@0 570 void utf8_text_sjis (SIZEDTEXT *text,SIZEDTEXT *ret,ucs4cn_t cv,ucs4de_t de);
yuuji@0 571 void utf8_text_2022 (SIZEDTEXT *text,SIZEDTEXT *ret,ucs4cn_t cv,ucs4de_t de);
yuuji@0 572 void utf8_text_utf7 (SIZEDTEXT *text,SIZEDTEXT *ret,ucs4cn_t cv,ucs4de_t de);
yuuji@0 573 void utf8_text_utf8 (SIZEDTEXT *text,SIZEDTEXT *ret,ucs4cn_t cv,ucs4de_t de);
yuuji@0 574 void utf8_text_ucs2 (SIZEDTEXT *text,SIZEDTEXT *ret,ucs4cn_t cv,ucs4de_t de);
yuuji@0 575 void utf8_text_ucs4 (SIZEDTEXT *text,SIZEDTEXT *ret,ucs4cn_t cv,ucs4de_t de);
yuuji@0 576 void utf8_text_utf16 (SIZEDTEXT *text,SIZEDTEXT *ret,ucs4cn_t cv,ucs4de_t de);
yuuji@0 577 unsigned long utf8_size (unsigned long c);
yuuji@0 578 unsigned char *utf8_put (unsigned char *s,unsigned long c);
yuuji@0 579 unsigned long ucs4_titlecase (unsigned long c);
yuuji@0 580 long ucs4_width (unsigned long c);
yuuji@0 581 long utf8_strwidth (unsigned char *s);
yuuji@0 582 long utf8_textwidth (SIZEDTEXT *utf8);
yuuji@0 583 unsigned long ucs4_decompose (unsigned long c,void **more);
yuuji@0 584 unsigned long ucs4_decompose_recursive (unsigned long c,void **more);

UW-IMAP'd extensions by yuuji