imapext-2007

diff src/c-client/utf8.h @ 0:ada5e610ab86
imap-2007e
author: yuuji@gentei.org
date: Mon, 14 Sep 2009 15:17:45 +0900
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/src/c-client/utf8.h	Mon Sep 14 15:17:45 2009 +0900
     1.3 @@ -0,0 +1,584 @@
     1.4 +/* ========================================================================
     1.5 + * Copyright 1988-2008 University of Washington
     1.6 + *
     1.7 + * Licensed under the Apache License, Version 2.0 (the "License");
     1.8 + * you may not use this file except in compliance with the License.
     1.9 + * You may obtain a copy of the License at
    1.10 + *
    1.11 + *     http://www.apache.org/licenses/LICENSE-2.0
    1.12 + *
    1.13 + * 
    1.14 + * ========================================================================
    1.15 + */
    1.16 +
    1.17 +/*
    1.18 + * Program:	UTF-8 routines
    1.19 + *
    1.20 + * Author:	Mark Crispin
    1.21 + *		Networks and Distributed Computing
    1.22 + *		Computing & Communications
    1.23 + *		University of Washington
    1.24 + *		Administration Building, AG-44
    1.25 + *		Seattle, WA  98195
    1.26 + *		Internet: MRC@CAC.Washington.EDU
    1.27 + *
    1.28 + * Date:	11 June 1997
    1.29 + * Last Edited:	17 January 2008
    1.30 + */
    1.31 +
    1.32 +/* UTF-8 size and conversion routines from UCS-2 values (thus in the BMP).
    1.33 + * Don't use these if UTF-16 data (surrogate pairs) are an issue.
    1.34 + * For UCS-4 values, use the utf8_size() and utf8_put() functions.
    1.35 + */
    1.36 +
    1.37 +#define UTF8_SIZE_BMP(c) ((c & 0xff80) ? ((c & 0xf800) ? 3 : 2) : 1)
    1.38 +
    1.39 +#define UTF8_PUT_BMP(b,c) {				\
    1.40 +  if (c & 0xff80) {		/* non-ASCII? */	\
    1.41 +    if (c & 0xf800) {		/* three byte code */	\
    1.42 +      *b++ = 0xe0 | (c >> 12);				\
    1.43 +      *b++ = 0x80 | ((c >> 6) & 0x3f);			\
    1.44 +    }							\
    1.45 +    else *b++ = 0xc0 | ((c >> 6) & 0x3f);		\
    1.46 +    *b++ = 0x80 | (c & 0x3f); 				\
    1.47 +  }							\
    1.48 +  else *b++ = c;					\
    1.49 +}
    1.50 +
    1.51 +/* utf8_text() flag values */
    1.52 +
    1.53 +#define U8T_CASECANON 2		/* canonicalize case */
    1.54 +#define U8T_DECOMPOSE 4		/* decompose */
    1.55 +				/* full canonicalization */
    1.56 +#define U8T_CANONICAL (U8T_CASECANON | U8T_DECOMPOSE)
    1.57 +
    1.58 +
    1.59 +/* utf8_get() return values */
    1.60 +
    1.61 +				/* 0x0000 - 0xffff BMP plane */
    1.62 +#define U8GM_NONBMP 0xffff0000	/* mask for non-BMP values */
    1.63 +				/* 0x10000 - 0x10ffff extended planes */
    1.64 +				/* 0x110000 - 0x7fffffff non-Unicode */
    1.65 +#define U8G_ERROR 0x80000000	/* error flag */
    1.66 +#define U8G_BADCONT U8G_ERROR+1	/* continuation when not in progress */
    1.67 +#define U8G_INCMPLT U8G_ERROR+2	/* incomplete UTF-8 character */
    1.68 +#define U8G_NOTUTF8 U8G_ERROR+3	/* not a valid UTF-8 octet */
    1.69 +#define U8G_ENDSTRG U8G_ERROR+4	/* end of string */
    1.70 +#define U8G_ENDSTRI U8G_ERROR+5	/* end of string w/ incomplete UTF-8 char */
    1.71 +#define U8G_SURROGA U8G_ERROR+6	/* surrogate codepoint */
    1.72 +#define U8G_NOTUNIC U8G_ERROR+7	/* non-Unicode codepoint */
    1.73 +
    1.74 +
    1.75 +/* ucs4_width() return values */
    1.76 +
    1.77 +#define U4W_ERROR 0x80000000	/* error flags */
    1.78 +#define U4W_NOTUNCD U4W_ERROR+1	/* not a Unicode char */
    1.79 +#define U4W_PRIVATE U4W_ERROR+2	/* private-space plane */
    1.80 +#define U4W_SSPCHAR U4W_ERROR+3	/* Supplementary Special-purpose Plane */
    1.81 +#define U4W_UNASSGN U4W_ERROR+4	/* unassigned space plane */
    1.82 +#define U4W_CONTROL U4W_ERROR+5	/* C0/C1 control */
    1.83 +#define U4W_CTLSRGT U4W_CONTROL	/* in case legacy code references this */
    1.84 +
    1.85 +/* ISO-2022 engine states */
    1.86 +
    1.87 +#define I2S_CHAR 0		/* character */
    1.88 +#define I2S_ESC 1		/* previous character was ESC */
    1.89 +#define I2S_MUL 2		/* previous character was multi-byte code */
    1.90 +#define I2S_INT 3		/* previous character was intermediate */
    1.91 +
    1.92 +
    1.93 +/* ISO-2022 Gn selections */
    1.94 +
    1.95 +#define I2C_G0 0		/* G0 */
    1.96 +#define I2C_G1 1		/* G1 */
    1.97 +#define I2C_G2 2		/* G2 */
    1.98 +#define I2C_G3 3		/* G3 */
    1.99 +#define I2C_SG2 (2 << 2)	/* single shift G2 */
   1.100 +#define I2C_SG3 (3 << 2)	/* single shift G3 */
   1.101 +
   1.102 +
   1.103 +/* ISO-2022 octet definitions */
   1.104 +
   1.105 +#define I2C_ESC 0x1b		/* ESCape */
   1.106 +
   1.107 +	/* Intermediate character */
   1.108 +#define I2C_STRUCTURE 0x20	/* announce code structure */
   1.109 +#define I2C_C0 0x21		/* C0 */
   1.110 +#define I2C_C1 0x22		/* C1 */
   1.111 +#define I2C_CONTROL 0x23	/* single control function */
   1.112 +#define I2C_MULTI 0x24		/* multi-byte character set */
   1.113 +#define I2C_OTHER 0x25		/* other coding system */
   1.114 +#define I2C_REVISED 0x26	/* revised registration */
   1.115 +#define I2C_G0_94 0x28		/* G0 94-character set */
   1.116 +#define I2C_G1_94 0x29		/* G1 94-character set */
   1.117 +#define I2C_G2_94 0x2A		/* G2 94-character set */
   1.118 +#define I2C_G3_94 0x2B		/* G3 94-character set */
   1.119 +#define I2C_G0_96 0x2C		/* (not in ISO-2022) G0 96-character set */
   1.120 +#define I2C_G1_96 0x2D		/* G1 96-character set */
   1.121 +#define I2C_G2_96 0x2E		/* G2 96-character set */
   1.122 +#define I2C_G3_96 0x2F		/* G3 96-character set */
   1.123 +
   1.124 +	/* Locking shifts */
   1.125 +#define I2C_SI 0x0f		/* lock shift to G0 (Shift In) */
   1.126 +#define I2C_SO 0x0e		/* lock shift to G1 (Shift Out) */
   1.127 +	/* prefixed by ESC */
   1.128 +#define I2C_LS2 0x6e		/* lock shift to G2 */
   1.129 +#define I2C_LS3 0x6f		/* lock shift to G3 */
   1.130 +#define I2C_LS1R 0x7e		/* lock shift GR to G1 */
   1.131 +#define I2C_LS2R 0x7d		/* lock shift GR to G2 */
   1.132 +#define I2C_LS3R 0x7c		/* lock shift GR to G3 */
   1.133 +
   1.134 +	/* Single shifts */
   1.135 +#define I2C_SS2_ALT 0x8e	/* single shift to G2 (SS2) */
   1.136 +#define I2C_SS3_ALT 0x8f	/* single shift to G3 (SS3) */
   1.137 +#define I2C_SS2_ALT_7 0x19	/* single shift to G2 (SS2) */
   1.138 +#define I2C_SS3_ALT_7 0x1d	/* single shift to G3 (SS3) */
   1.139 +	/* prefixed by ESC */
   1.140 +#define I2C_SS2 0x4e		/* single shift to G2 (SS2) */
   1.141 +#define I2C_SS3 0x4f		/* single shift to G3 (SS3) */
   1.142 +
   1.143 +/* 94 character sets */
   1.144 +
   1.145 +				/* 4/0 ISO 646 IRV */
   1.146 +#define I2CS_94_BRITISH 0x41	/* 4/1 ISO 646 British */
   1.147 +#define I2CS_94_ASCII 0x42	/* 4/2 ISO 646 USA (ASCII) */
   1.148 +				/* 4/3 NATS Finland/Sweden (primary) */
   1.149 +				/* 4/4 NATS Finland/Sweden (secondary) */
   1.150 +				/* 4/5 NATS Denmark/Norway (primary) */
   1.151 +				/* 4/6 NATS Denmark/Norway (secondary) */
   1.152 +				/* 4/7 ISO 646 Swedish SEN 850200 */
   1.153 +				/* 4/8 ISO 646 Swedish names */
   1.154 +#define I2CS_94_JIS_BUGROM 0x48	/* 4/8 some buggy software does this */
   1.155 +#define I2CS_94_JIS_KANA 0x49	/* 4/9 JIS X 0201-1976 right half */
   1.156 +#define I2CS_94_JIS_ROMAN 0x4a	/* 4/a JIS X 0201-1976 left half */
   1.157 +				/* 4/b ISO 646 German */
   1.158 +				/* 4/c ISO 646 Portuguese (Olivetti) */
   1.159 +				/* 4/d ISO 6438 African */
   1.160 +				/* 4/e ISO 5427 Cyrillic (Honeywell-Bull) */
   1.161 +				/* 4/f DIN 31624 extended bibliography  */
   1.162 +				/* 5/0 ISO 5426-1980 Bibliography */
   1.163 +				/* 5/1 ISO 5427-1981 Cyrillic*/
   1.164 +				/* 5/2 ISO 646 French (withdrawn) */
   1.165 +				/* 5/3 ISO 5428-1980 Greek bibliography */
   1.166 +				/* 5/4 GB 1988-80 Chinese */
   1.167 +				/* 5/5 Latin-Greek (Honeywell-Bull) */
   1.168 +				/* 5/6 UK Viewdata/Teletext */
   1.169 +				/* 5/7 INIS (IRV subset) */
   1.170 +				/* 5/8 ISO 5428 Greek Bibliography */
   1.171 +				/* 5/9 ISO 646 Italian (Olivetti) */
   1.172 +				/* 5/a ISO 646 Spanish (Olivetti) */
   1.173 +				/* 5/b Greek (Olivetti) */
   1.174 +				/* 5/c Latin-Greek (Olivetti) */
   1.175 +				/* 5/d INIS non-standard extension */
   1.176 +				/* 5/e INIS Cyrillic extension */
   1.177 +				/* 5/f Arabic CODAR-U IERA */
   1.178 +				/* 6/0 ISO 646 Norwegian */
   1.179 +				/* 6/1 Norwegian version 2 (withdrawn) */
   1.180 +				/* 6/2 Videotex supplementary */
   1.181 +				/* 6/3 Videotex supplementary #2 */
   1.182 +				/* 6/4 Videotex supplementary #3 */
   1.183 +				/* 6/5 APL */
   1.184 +				/* 6/6 ISO 646 French */
   1.185 +				/* 6/7 ISO 646 Portuguese (IBM) */
   1.186 +				/* 6/8 ISO 646 Spanish (IBM) */
   1.187 +				/* 6/9 ISO 646 Hungarian */
   1.188 +				/* 6/a Greek ELOT (withdrawn) */
   1.189 +				/* 6/b ISO 9036 Arabic 7-bit */
   1.190 +				/* 6/c ISO 646 IRV supplementary set */
   1.191 +				/* 6/d JIS C6229-1984 OCR-A */
   1.192 +				/* 6/e JIS C6229-1984 OCR-B */
   1.193 +				/* 6/f JIS C6229-1984 OCR-B additional */
   1.194 +				/* 7/0 JIS C6229-1984 hand-printed */
   1.195 +				/* 7/1 JIS C6229-1984 additional hand-printd */
   1.196 +				/* 7/2 JIS C6229-1984 katakana hand-printed */
   1.197 +				/* 7/3 E13B Japanese graphic */
   1.198 +				/* 7/4 Supplementary Videotex (withdrawn) */
   1.199 +				/* 7/5 Teletex primary CCITT T.61 */
   1.200 +				/* 7/6 Teletex secondary CCITT T.61 */
   1.201 +				/* 7/7 CSA Z 243.4-1985 Alternate primary #1 */
   1.202 +				/* 7/8 CSA Z 243.4-1985 Alternate primary #2 */
   1.203 +				/* 7/9 Mosaic CCITT T.101 */
   1.204 +				/* 7/a Serbocroatian/Slovenian Latin */
   1.205 +				/* 7/b Serbocroatian Cyrillic */
   1.206 +				/* 7/c Supplementary CCITT T.101 */
   1.207 +				/* 7/d Macedonian Cyrillic */
   1.208 +
   1.209 +/* 94 character sets - second intermediate byte */
   1.210 +
   1.211 +				/* 4/0 Greek primary CCITT */
   1.212 +				/* 4/1 Cuba */
   1.213 +				/* 4/2 ISO/IEC 646 invariant */
   1.214 +				/* 4/3 Irish Gaelic 7-bit */
   1.215 +				/* 4/4 Turkmen */
   1.216 +
   1.217 +
   1.218 +/* 94x94 character sets */
   1.219 +
   1.220 +#define I2CS_94x94_JIS_OLD 0x40	/* 4/0 JIS X 0208-1978 */
   1.221 +#define I2CS_94x94_GB 0x41	/* 4/1 GB 2312 */
   1.222 +#define I2CS_94x94_JIS_NEW 0x42	/* 4/2 JIS X 0208-1983 */
   1.223 +#define I2CS_94x94_KSC 0x43	/* 4/3 KSC 5601 */
   1.224 +#define I2CS_94x94_JIS_EXT 0x44	/* 4/4 JIS X 0212-1990 */
   1.225 +				/* 4/5 CCITT Chinese */
   1.226 +				/* 4/6 Blisssymbol Graphic */
   1.227 +#define I2CS_94x94_CNS1 0x47	/* 4/7 CNS 11643 plane 1 */
   1.228 +#define I2CS_94x94_CNS2 0x48	/* 4/8 CNS 11643 plane 2 */
   1.229 +#define I2CS_94x94_CNS3 0x49	/* 4/9 CNS 11643 plane 3 */
   1.230 +#define I2CS_94x94_CNS4 0x4a	/* 4/a CNS 11643 plane 4 */
   1.231 +#define I2CS_94x94_CNS5 0x4b	/* 4/b CNS 11643 plane 5 */
   1.232 +#define I2CS_94x94_CNS6 0x4c	/* 4/c CNS 11643 plane 6 */
   1.233 +#define I2CS_94x94_CNS7 0x4d	/* 4/d CNS 11643 plane 7 */
   1.234 +				/* 4/e DPRK (North Korea) KGCII */
   1.235 +				/* 4/f JGCII plane 1 */
   1.236 +				/* 5/0 JGCII plane 2 */
   1.237 +
   1.238 +/* 96 character sets */
   1.239 +
   1.240 +#define I2CS_96_ISO8859_1 0x41	/* 4/1 Latin-1 (Western Europe) */
   1.241 +#define I2CS_96_ISO8859_2 0x42	/* 4/2 Latin-2 (Czech, Slovak) */
   1.242 +#define I2CS_96_ISO8859_3 0x43	/* 4/3 Latin-3 (Dutch, Turkish) */
   1.243 +#define I2CS_96_ISO8859_4 0x44	/* 4/4 Latin-4 (Scandinavian) */
   1.244 +				/* 4/5 CSA Z 243.4-1985 */
   1.245 +#define I2CS_96_ISO8859_7 0x46	/* 4/6 Greek */
   1.246 +#define I2CS_96_ISO8859_6 0x47	/* 4/7 Arabic */
   1.247 +#define I2CS_96_ISO8859_8 0x48	/* 4/8 Hebrew */
   1.248 +				/* 4/9 Czechoslovak CSN 369103 */
   1.249 +				/* 4/a Supplementary Latin and non-alpha */
   1.250 +				/* 4/b Technical */
   1.251 +#define I2CS_96_ISO8859_5 0x4c	/* 4/c Cyrillic */
   1.252 +#define I2CS_96_ISO8859_9 0x4d	/* 4/d Latin-5 (Finnish, Portuguese) */
   1.253 +				/* 4/e ISO 6937-2 residual */
   1.254 +				/* 4/f Basic Cyrillic */
   1.255 +				/* 5/0 Supplementary Latin 1, 2 and 5 */
   1.256 +				/* 5/1 Basic Box */
   1.257 +				/* 5/2 Supplementary ISO/IEC 6937 : 1992 */
   1.258 +				/* 5/3 CCITT Hebrew supplementary */
   1.259 +#define I2CS_96_TIS620 0x54	/* 5/4 TIS 620 */
   1.260 +				/* 5/5 Arabic/French/German */
   1.261 +#define I2CS_96_ISO8859_10 0x56	/* 5/6 Latin-6 (Northern Europe) */
   1.262 +				/* 5/7 ??? */
   1.263 +				/* 5/8 Sami (Lappish) supplementary */
   1.264 +#define I2CS_96_ISO8859_13 0x59	/* 5/9 Latin-7 (Baltic) */
   1.265 +#define I2CS_96_VSCII 0x5a	/* 5/a Vietnamese */
   1.266 +				/* 5/b Technical #1 IEC 1289 */
   1.267 +#define I2CS_96_ISO8859_14 0x5c	/* 5/c Latin-8 (Celtic) */
   1.268 +				/* 5/d Sami supplementary Latin */
   1.269 +				/* 5/e Latin/Hebrew */
   1.270 +				/* 5/f Celtic supplementary Latin */
   1.271 +				/* 6/0 Uralic supplementary Cyrillic */
   1.272 +				/* 6/1 Volgaic supplementary Cyrillic */
   1.273 +#define I2CS_96_ISO8859_15 0x62	/* 6/2 Latin-9 (Euro) */
   1.274 +				/* 6/3 Latin-1 with Euro */
   1.275 +				/* 6/4 Latin-4 with Euro */
   1.276 +				/* 6/5 Latin-7 with Euro */
   1.277 +#define I2CS_96_ISO8859_16 0x66	/* 6/6 Latin-10 (Balkan) */
   1.278 +				/* 6/7 Ogham */
   1.279 +				/* 6/8 Sami supplementary Latin #2 */
   1.280 +				/* 7/d Supplementary Mosaic for CCITT 101 */
   1.281 +
   1.282 +/* 96x96 character sets */
   1.283 +
   1.284 +/* Types of character sets */
   1.285 +
   1.286 +#define I2CS_94 0x000		/* 94 character set */
   1.287 +#define I2CS_96 0x100		/* 96 character set */
   1.288 +#define I2CS_MUL 0x200		/* multi-byte */
   1.289 +#define I2CS_94x94 (I2CS_MUL | I2CS_94)
   1.290 +#define I2CS_96x96 (I2CS_MUL | I2CS_96)
   1.291 +
   1.292 +
   1.293 +/* Character set identifiers stored in Gn */
   1.294 +
   1.295 +#define I2CS_BRITISH (I2CS_94 | I2CS_94_BRITISH)
   1.296 +#define I2CS_ASCII (I2CS_94 | I2CS_94_ASCII)
   1.297 +#define I2CS_JIS_BUGROM (I2CS_94 | I2CS_94_JIS_BUGROM)
   1.298 +#define I2CS_JIS_KANA (I2CS_94 | I2CS_94_JIS_KANA)
   1.299 +#define I2CS_JIS_ROMAN (I2CS_94 | I2CS_94_JIS_ROMAN)
   1.300 +#define I2CS_JIS_OLD (I2CS_94x94 | I2CS_94x94_JIS_OLD)
   1.301 +#define I2CS_GB (I2CS_94x94 | I2CS_94x94_GB)
   1.302 +#define I2CS_JIS_NEW (I2CS_94x94 | I2CS_94x94_JIS_NEW)
   1.303 +#define I2CS_KSC (I2CS_94x94 | I2CS_94x94_KSC)
   1.304 +#define I2CS_JIS_EXT (I2CS_94x94 | I2CS_94x94_JIS_EXT)
   1.305 +#define I2CS_CNS1 (I2CS_94x94 | I2CS_94x94_CNS1)
   1.306 +#define I2CS_CNS2 (I2CS_94x94 | I2CS_94x94_CNS2)
   1.307 +#define I2CS_CNS3 (I2CS_94x94 | I2CS_94x94_CNS3)
   1.308 +#define I2CS_CNS4 (I2CS_94x94 | I2CS_94x94_CNS4)
   1.309 +#define I2CS_CNS5 (I2CS_94x94 | I2CS_94x94_CNS5)
   1.310 +#define I2CS_CNS6 (I2CS_94x94 | I2CS_94x94_CNS6)
   1.311 +#define I2CS_CNS7 (I2CS_94x94 | I2CS_94x94_CNS7)
   1.312 +#define I2CS_ISO8859_1 (I2CS_96 | I2CS_96_ISO8859_1)
   1.313 +#define I2CS_ISO8859_2 (I2CS_96 | I2CS_96_ISO8859_2)
   1.314 +#define I2CS_ISO8859_3 (I2CS_96 | I2CS_96_ISO8859_3)
   1.315 +#define I2CS_ISO8859_4 (I2CS_96 | I2CS_96_ISO8859_4)
   1.316 +#define I2CS_ISO8859_7 (I2CS_96 | I2CS_96_ISO8859_7)
   1.317 +#define I2CS_ISO8859_6 (I2CS_96 | I2CS_96_ISO8859_6)
   1.318 +#define I2CS_ISO8859_8 (I2CS_96 | I2CS_96_ISO8859_8)
   1.319 +#define I2CS_ISO8859_5 (I2CS_96 | I2CS_96_ISO8859_5)
   1.320 +#define I2CS_ISO8859_9 (I2CS_96 | I2CS_96_ISO8859_9)
   1.321 +#define I2CS_TIS620 (I2CS_96 | I2CS_96_TIS620)
   1.322 +#define I2CS_ISO8859_10 (I2CS_96 | I2CS_96_ISO8859_10)
   1.323 +#define I2CS_ISO8859_13 (I2CS_96 | I2CS_96_ISO8859_13)
   1.324 +#define I2CS_VSCII (I2CS_96 | I2CS_96_VSCII)
   1.325 +#define I2CS_ISO8859_14 (I2CS_96 | I2CS_96_ISO8859_14)
   1.326 +#define I2CS_ISO8859_15 (I2CS_96 | I2CS_96_ISO8859_15)
   1.327 +#define I2CS_ISO8859_16 (I2CS_96 | I2CS_96_ISO8859_16)
   1.328 +
   1.329 +
   1.330 +/* Miscellaneous ISO 2022 definitions */
   1.331 +
   1.332 +#define EUC_CS2 0x8e		/* single shift CS2 */
   1.333 +#define EUC_CS3 0x8f		/* single shift CS3 */
   1.334 +
   1.335 +#define BITS7 0x7f		/* 7-bit value mask */
   1.336 +#define BIT8 0x80		/* 8th bit mask */
   1.337 +
   1.338 +/* The following saves us from having to have yet more charset tables */
   1.339 +
   1.340 +/* Unicode codepoints */
   1.341 +
   1.342 +#define UCS2_C0CONTROL 0x00	/* first C0 control */
   1.343 +#define UCS2_C0CONTROLEND 0x1F	/* last C0 control */
   1.344 +#define UCS2_C1CONTROL 0x80	/* first C1 control */
   1.345 +#define UCS2_C1CONTROLEND 0x9F	/* last C1 control */
   1.346 +
   1.347 +				/* ISO 646 substituted Unicode codepoints */
   1.348 +#define UCS2_POUNDSTERLING 0x00a3
   1.349 +#define UCS2_YEN 0x00a5
   1.350 +#define UCS2_OVERLINE 0x203e
   1.351 +#define UCS2_EURO 0x20ac
   1.352 +#define UCS2_KATAKANA 0xff61	/* first katakana codepoint */
   1.353 +#define UCS2_BOM 0xfeff		/* byte order mark */
   1.354 +#define UCS2_BOGON 0xfffd	/* replacement character */
   1.355 +				/* next two codepoints are not Unicode chars */
   1.356 +#define UCS2_BOMCHECK 0xfffe	/* used to check byte order with UCS2_BOM */
   1.357 +#define UCS2_NOTCHAR 0xffff	/* not a character */
   1.358 +
   1.359 +#define UCS4_BMPBASE 0x0000	/* Basic Multilingual Plane */
   1.360 +#define UCS4_SMPBASE 0x10000	/* Supplementary Multilingual Plane */
   1.361 +#define UCS4_SIPBASE 0x20000	/* Supplementary Ideographic Plane */
   1.362 +				/* EastAsianWidth says plane 3 is wide */
   1.363 +#define UCS4_UNABASE 0x40000	/* unassigned space */
   1.364 +#define UCS4_SSPBASE 0xe0000	/* Supplementary Special-purpose Plane */
   1.365 +#define UCS4_PVTBASE 0xf0000	/* private-space (two planes) */
   1.366 +#define UCS4_MAXUNICODE 0x10ffff/* highest Unicode codepoint */
   1.367 +
   1.368 +#define UTF16_BASE 0x10000	/* base of codepoints needing surrogates */
   1.369 +#define UTF16_SHIFT 10		/* surrogate shift */
   1.370 +#define UTF16_MASK 0x3ff	/* surrogate mask */
   1.371 +#define UTF16_SURR 0xd800	/* UTF-16 surrogate area */
   1.372 +#define UTF16_SURRH 0xd800	/* UTF-16 first high surrogate */
   1.373 +#define UTF16_SURRHEND 0xdbff	/* UTF-16 last high surrogate */
   1.374 +#define UTF16_SURRL 0xdc00	/* UTF-16 first low surrogate */
   1.375 +#define UTF16_SURRLEND 0xdfff	/* UTF-16 last low surrogate */
   1.376 +#define UTF16_MAXSURR 0xdfff	/* end of UTF-16 surrogates */
   1.377 +
   1.378 +
   1.379 +/* UBOGON is used to represent a codepoint in a character set which does not
   1.380 + * map to Unicode.  It is also used for mapping failures, e.g. incomplete
   1.381 + * shift sequences.  This name has the same text width as 0x????, for
   1.382 + * convenience in the mapping tables.
   1.383 + *
   1.384 + * NOCHAR is used to represent a codepoint in Unicode which does not map to
   1.385 + * the target character set in a reverse mapping table.  This name has the
   1.386 + * same text width as 0x???? in case we ever add static reverse mapping tables.
   1.387 + */
   1.388 +
   1.389 +#define UBOGON UCS2_BOGON
   1.390 +#define NOCHAR UCS2_NOTCHAR
   1.391 +
   1.392 +/* Codepoints in non-Unicode character sets */
   1.393 +
   1.394 +/* Codepoints in ISO 646 character sets */
   1.395 +
   1.396 +/* British ASCII codepoints */
   1.397 +
   1.398 +#define BRITISH_POUNDSTERLING 0x23
   1.399 +
   1.400 +/* JIS Roman codepoints */
   1.401 +
   1.402 +#define JISROMAN_YEN 0x5c
   1.403 +#define JISROMAN_OVERLINE 0x7e
   1.404 +
   1.405 +
   1.406 +/* Hankaku katakana codepoints & parameters
   1.407 + *
   1.408 + * In earlier versions, MAX_KANA_7 and MAX_KANA_8 were the maximum codepoint
   1.409 + * values.  Although this made sense, it was confusing with the "max ku" and
   1.410 + * "max ten" values used in the double-byte tables; these are 1-origin, but
   1.411 + * the calculated values used for "ku" and "ten" are 0-origin (derived by
   1.412 + * subtracting the "base").  What this all meant is that for double byte
   1.413 + * characters the limit test is of the form (value < max_ku), but for single
   1.414 + * byte characters (which used the same cell to hold the max ku) the limit
   1.415 + * test was (value <= max_ku).
   1.416 + *
   1.417 + * By making MAX_KANA_[78] be maximum+1, the same (value < max_ku) limit test
   1.418 + * is used throughout.  - 6/15/2006
   1.419 + */
   1.420 +
   1.421 +#define MIN_KANA_7 0x21
   1.422 +#define MAX_KANA_7 0x60		/* maximum value + 1 */
   1.423 +#define KANA_7 (UCS2_KATAKANA - MIN_KANA_7)
   1.424 +#define MIN_KANA_8 (MIN_KANA_7 | BIT8)
   1.425 +#define MAX_KANA_8 (MAX_KANA_7 | BIT8)
   1.426 +#define KANA_8 (UCS2_KATAKANA - MIN_KANA_8)
   1.427 +
   1.428 +/* Charset scripts */
   1.429 +
   1.430 +/*  The term "script" is used here in a very loose sense, enough to make
   1.431 + * purists cringe.  Basically, the idea is to give the main program some
   1.432 + * idea of how it should treat the characters of text in a charset with
   1.433 + * respect to font, drawing routines, etc.
   1.434 + *
   1.435 + *  In some cases, "script" is associated with a charset; in other cases,
   1.436 + * it's more closely tied to a language.
   1.437 + */
   1.438 +
   1.439 +#define SC_UNICODE 0x1		/* Unicode */
   1.440 +#define SC_LATIN_1 0x10		/* Western Europe */
   1.441 +#define SC_LATIN_2 0x20		/* Eastern Europe */
   1.442 +#define SC_LATIN_3 0x40		/* Southern Europe */
   1.443 +#define SC_LATIN_4 0x80		/* Northern Europe */
   1.444 +#define SC_LATIN_5 0x100	/* Turkish */
   1.445 +#define SC_LATIN_6 0x200	/* Nordic */
   1.446 +#define SC_LATIN_7 0x400	/* Baltic */
   1.447 +#define SC_LATIN_8 0x800	/* Celtic */
   1.448 +#define SC_LATIN_9 0x1000	/* Euro */
   1.449 +#define SC_LATIN_0 SC_LATIN_9	/* colloquial name for Latin-9 */
   1.450 +#define SC_ARABIC 0x2000
   1.451 +#define SC_CYRILLIC 0x4000
   1.452 +#define SC_GREEK 0x8000
   1.453 +#define SC_HEBREW 0x10000
   1.454 +#define SC_THAI 0x20000
   1.455 +#define SC_UKRANIAN 0x40000
   1.456 +#define SC_LATIN_10 0x80000	/* Balkan */
   1.457 +#define SC_VIETNAMESE 0x100000
   1.458 +#define SC_CHINESE_SIMPLIFIED 0x1000000
   1.459 +#define SC_CHINESE_TRADITIONAL 0x2000000
   1.460 +#define SC_JAPANESE 0x4000000
   1.461 +#define SC_KOREAN 0x8000000
   1.462 +
   1.463 +
   1.464 +/* Script table */
   1.465 +
   1.466 +typedef struct utf8_scent {
   1.467 +  char *name;			/* script name */
   1.468 +  char *description;		/* script description */
   1.469 +  unsigned long script;		/* script bitmask */
   1.470 +} SCRIPT;
   1.471 +
   1.472 +/* Character set table support */
   1.473 +
   1.474 +typedef struct utf8_csent {
   1.475 +  char *name;			/* charset name */
   1.476 +  unsigned short type;		/* type of charset */
   1.477 +  unsigned short flags;		/* charset flags */
   1.478 +  void *tab;			/* additional data */
   1.479 +  unsigned long script;		/* script(s) implemented by this charset */
   1.480 +  char *preferred;		/* preferred charset over this one */
   1.481 +} CHARSET;
   1.482 +
   1.483 +
   1.484 +struct utf8_eucparam {
   1.485 +  unsigned int base_ku : 8;	/* base row */
   1.486 +  unsigned int base_ten : 8;	/* base column */
   1.487 +  unsigned int max_ku : 8;	/* maximum row */
   1.488 +  unsigned int max_ten : 8;	/* maximum column */
   1.489 +  void *tab;			/* conversion table */
   1.490 +};
   1.491 +
   1.492 +
   1.493 +/* Charset types */
   1.494 +
   1.495 +#define CT_UNKNOWN 0		/* unknown 8-bit */
   1.496 +#define CT_ASCII 1		/* 7-bit ASCII no table */
   1.497 +#define CT_UCS2 2		/* 2 byte 16-bit Unicode no table */
   1.498 +#define CT_UCS4 3		/* 4 byte 32-bit Unicode no table */
   1.499 +#define CT_1BYTE0 10		/* 1 byte ISO 8859-1 no table */
   1.500 +#define CT_1BYTE 11		/* 1 byte ASCII + table 0x80-0xff */
   1.501 +#define CT_1BYTE8 12		/* 1 byte table 0x00 - 0xff */
   1.502 +#define CT_EUC 100		/* 2 byte ASCII + utf8_eucparam base/CS2/CS3 */
   1.503 +#define CT_DBYTE 101		/* 2 byte ASCII + utf8_eucparam */
   1.504 +#define CT_DBYTE2 102		/* 2 byte ASCII + utf8_eucparam plane1/2 */
   1.505 +#define CT_UTF16 1000		/* variable UTF-16 encoded Unicode no table */
   1.506 +#define CT_UTF8 1001		/* variable UTF-8 encoded Unicode no table */
   1.507 +#define CT_UTF7 1002		/* variable UTF-7 encoded Unicode no table */
   1.508 +#define CT_2022 10000		/* variable ISO-2022 encoded no table */
   1.509 +#define CT_SJIS 10001		/* 2 byte Shift-JIS encoded JIS no table */
   1.510 +
   1.511 +
   1.512 +/* Character set flags */
   1.513 +
   1.514 +#define CF_PRIMARY 0x1		/* primary name for this charset */
   1.515 +#define CF_DISPLAY 0x2		/* charset used in displays */
   1.516 +#define CF_POSTING 0x4		/* charset used in email posting */
   1.517 +#define CF_UNSUPRT 0x8		/* charset unsupported (can't convert to it) */
   1.518 +#define CF_NOEMAIL 0x10		/* charset not used in email */
   1.519 +
   1.520 +
   1.521 +/* UTF-7 engine states */
   1.522 +
   1.523 +#define U7_ASCII 0		/* ASCII character */
   1.524 +#define U7_PLUS 1		/* plus seen */
   1.525 +#define U7_UNICODE 2		/* Unicode characters */
   1.526 +#define U7_MINUS 3		/* absorbed minus seen */
   1.527 +
   1.528 +/* Function prototypes */
   1.529 +
   1.530 +typedef unsigned long (*ucs4cn_t) (unsigned long c);
   1.531 +typedef unsigned long (*ucs4de_t) (unsigned long c,void **more);
   1.532 +
   1.533 +SCRIPT *utf8_script (char *script);
   1.534 +const CHARSET *utf8_charset (char *charset);
   1.535 +char *utf8_badcharset (char *charset);
   1.536 +long utf8_text (SIZEDTEXT *text,char *charset,SIZEDTEXT *ret,long flags);
   1.537 +long utf8_text_cs (SIZEDTEXT *text,const CHARSET *cs,SIZEDTEXT *ret,
   1.538 +		   ucs4cn_t cv,ucs4de_t de);
   1.539 +long utf8_cstext (SIZEDTEXT *text,char *charset,SIZEDTEXT *ret,
   1.540 +		  unsigned long errch);
   1.541 +long utf8_cstocstext (SIZEDTEXT *text,char *sc,SIZEDTEXT *ret,char *dc,
   1.542 +		      unsigned long errch);
   1.543 +unsigned short *utf8_rmap (char *charset);
   1.544 +unsigned short *utf8_rmap_cs (const CHARSET *cs);
   1.545 +unsigned short *utf8_rmap_gen (const CHARSET *cs,unsigned short *oldmap);
   1.546 +long utf8_rmaptext (SIZEDTEXT *text,unsigned short *rmap,SIZEDTEXT *ret,
   1.547 +		    unsigned long errch,long iso2022jp);
   1.548 +unsigned long utf8_rmapsize (SIZEDTEXT *text,unsigned short *rmap,
   1.549 +			     unsigned long errch,long iso2022jp);
   1.550 +long ucs4_rmaptext (unsigned long *ucs4,unsigned long len,unsigned short *rmap,
   1.551 +		    SIZEDTEXT *ret,unsigned long errch);
   1.552 +long ucs4_rmaplen (unsigned long *ucs4,unsigned long len,unsigned short *rmap,
   1.553 +		   unsigned long errch);
   1.554 +long ucs4_rmapbuf (unsigned char *t,unsigned long *ucs4,unsigned long len,
   1.555 +		   unsigned short *rmap,unsigned long errch);
   1.556 +unsigned long utf8_get (unsigned char **s,unsigned long *i);
   1.557 +unsigned long utf8_get_raw (unsigned char **s,unsigned long *i);
   1.558 +unsigned long ucs4_cs_get (CHARSET *cs,unsigned char **s,unsigned long *i);
   1.559 +unsigned long *utf8_csvalidmap (char *charsets[]);
   1.560 +const CHARSET *utf8_infercharset (SIZEDTEXT *src);
   1.561 +long utf8_validate (unsigned char *s,unsigned long i);
   1.562 +void utf8_text_1byte0 (SIZEDTEXT *text,SIZEDTEXT *ret,ucs4cn_t cv,ucs4de_t de);
   1.563 +void utf8_text_1byte (SIZEDTEXT *text,SIZEDTEXT *ret,void *tab,ucs4cn_t cv,
   1.564 +		      ucs4de_t de);
   1.565 +void utf8_text_1byte8 (SIZEDTEXT *text,SIZEDTEXT *ret,void *tab,ucs4cn_t cv,
   1.566 +		       ucs4de_t de);
   1.567 +void utf8_text_euc (SIZEDTEXT *text,SIZEDTEXT *ret,void *tab,ucs4cn_t cv,
   1.568 +		    ucs4de_t de);
   1.569 +void utf8_text_dbyte (SIZEDTEXT *text,SIZEDTEXT *ret,void *tab,ucs4cn_t cv,
   1.570 +		      ucs4de_t de);
   1.571 +void utf8_text_dbyte2 (SIZEDTEXT *text,SIZEDTEXT *ret,void *tab,ucs4cn_t cv,
   1.572 +		       ucs4de_t de);
   1.573 +void utf8_text_sjis (SIZEDTEXT *text,SIZEDTEXT *ret,ucs4cn_t cv,ucs4de_t de);
   1.574 +void utf8_text_2022 (SIZEDTEXT *text,SIZEDTEXT *ret,ucs4cn_t cv,ucs4de_t de);
   1.575 +void utf8_text_utf7 (SIZEDTEXT *text,SIZEDTEXT *ret,ucs4cn_t cv,ucs4de_t de);
   1.576 +void utf8_text_utf8 (SIZEDTEXT *text,SIZEDTEXT *ret,ucs4cn_t cv,ucs4de_t de);
   1.577 +void utf8_text_ucs2 (SIZEDTEXT *text,SIZEDTEXT *ret,ucs4cn_t cv,ucs4de_t de);
   1.578 +void utf8_text_ucs4 (SIZEDTEXT *text,SIZEDTEXT *ret,ucs4cn_t cv,ucs4de_t de);
   1.579 +void utf8_text_utf16 (SIZEDTEXT *text,SIZEDTEXT *ret,ucs4cn_t cv,ucs4de_t de);
   1.580 +unsigned long utf8_size (unsigned long c);
   1.581 +unsigned char *utf8_put (unsigned char *s,unsigned long c);
   1.582 +unsigned long ucs4_titlecase (unsigned long c);
   1.583 +long ucs4_width (unsigned long c);
   1.584 +long utf8_strwidth (unsigned char *s);
   1.585 +long utf8_textwidth (SIZEDTEXT *utf8);
   1.586 +unsigned long ucs4_decompose (unsigned long c,void **more);
   1.587 +unsigned long ucs4_decompose_recursive (unsigned long c,void **more);
author	yuuji@gentei.org
date	Mon, 14 Sep 2009 15:17:45 +0900
parents
children