imapext-2007
diff src/c-client/utf8aux.c @ 0:ada5e610ab86
imap-2007e
author | yuuji@gentei.org |
---|---|
date | Mon, 14 Sep 2009 15:17:45 +0900 |
parents | |
children |
line diff
1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/src/c-client/utf8aux.c Mon Sep 14 15:17:45 2009 +0900 1.3 @@ -0,0 +1,449 @@ 1.4 +/* ======================================================================== 1.5 + * Copyright 1988-2007 University of Washington 1.6 + * 1.7 + * Licensed under the Apache License, Version 2.0 (the "License"); 1.8 + * you may not use this file except in compliance with the License. 1.9 + * You may obtain a copy of the License at 1.10 + * 1.11 + * http://www.apache.org/licenses/LICENSE-2.0 1.12 + * 1.13 + * 1.14 + * ======================================================================== 1.15 + */ 1.16 + 1.17 +/* 1.18 + * Program: UTF-8 auxillary routines (c-client and MIME2 support) 1.19 + * 1.20 + * Author: Mark Crispin 1.21 + * Networks and Distributed Computing 1.22 + * Computing & Communications 1.23 + * University of Washington 1.24 + * Administration Building, AG-44 1.25 + * Seattle, WA 98195 1.26 + * Internet: MRC@CAC.Washington.EDU 1.27 + * 1.28 + * Date: 11 June 1997 1.29 + * Last Edited: 12 October 2007 1.30 + */ 1.31 + 1.32 + 1.33 +#include <stdio.h> 1.34 +#include <ctype.h> 1.35 +#include "c-client.h" 1.36 + 1.37 +/* Convert charset labelled stringlist to UTF-8 in place 1.38 + * Accepts: string list 1.39 + * charset 1.40 + */ 1.41 + 1.42 +static void utf8_stringlist (STRINGLIST *st,char *charset) 1.43 +{ 1.44 + SIZEDTEXT txt; 1.45 + /* convert entire stringstruct */ 1.46 + if (st) do if (utf8_text (&st->text,charset,&txt,U8T_CANONICAL)) { 1.47 + fs_give ((void **) &st->text.data); 1.48 + st->text.data = txt.data; /* transfer this text */ 1.49 + st->text.size = txt.size; 1.50 + } while (st = st->next); 1.51 +} 1.52 + 1.53 + 1.54 +/* Convert charset labelled searchpgm to UTF-8 in place 1.55 + * Accepts: search program 1.56 + * charset 1.57 + */ 1.58 + 1.59 +void utf8_searchpgm (SEARCHPGM *pgm,char *charset) 1.60 +{ 1.61 + SIZEDTEXT txt; 1.62 + SEARCHHEADER *hl; 1.63 + SEARCHOR *ol; 1.64 + SEARCHPGMLIST *pl; 1.65 + if (pgm) { /* must have a search program */ 1.66 + utf8_stringlist (pgm->bcc,charset); 1.67 + utf8_stringlist (pgm->cc,charset); 1.68 + utf8_stringlist (pgm->from,charset); 1.69 + utf8_stringlist (pgm->to,charset); 1.70 + utf8_stringlist (pgm->subject,charset); 1.71 + for (hl = pgm->header; hl; hl = hl->next) { 1.72 + if (utf8_text (&hl->line,charset,&txt,U8T_CANONICAL)) { 1.73 + fs_give ((void **) &hl->line.data); 1.74 + hl->line.data = txt.data; 1.75 + hl->line.size = txt.size; 1.76 + } 1.77 + if (utf8_text (&hl->text,charset,&txt,U8T_CANONICAL)) { 1.78 + fs_give ((void **) &hl->text.data); 1.79 + hl->text.data = txt.data; 1.80 + hl->text.size = txt.size; 1.81 + } 1.82 + } 1.83 + utf8_stringlist (pgm->body,charset); 1.84 + utf8_stringlist (pgm->text,charset); 1.85 + for (ol = pgm->or; ol; ol = ol->next) { 1.86 + utf8_searchpgm (ol->first,charset); 1.87 + utf8_searchpgm (ol->second,charset); 1.88 + } 1.89 + for (pl = pgm->not; pl; pl = pl->next) utf8_searchpgm (pl->pgm,charset); 1.90 + utf8_stringlist (pgm->return_path,charset); 1.91 + utf8_stringlist (pgm->sender,charset); 1.92 + utf8_stringlist (pgm->reply_to,charset); 1.93 + utf8_stringlist (pgm->in_reply_to,charset); 1.94 + utf8_stringlist (pgm->message_id,charset); 1.95 + utf8_stringlist (pgm->newsgroups,charset); 1.96 + utf8_stringlist (pgm->followup_to,charset); 1.97 + utf8_stringlist (pgm->references,charset); 1.98 + } 1.99 +} 1.100 + 1.101 +/* Convert MIME-2 sized text to UTF-8 1.102 + * Accepts: source sized text 1.103 + * charset 1.104 + * flags (same as utf8_text()) 1.105 + * Returns: T if successful, NIL if failure 1.106 + */ 1.107 + 1.108 +#define MINENCWORD 9 1.109 +#define MAXENCWORD 75 1.110 + 1.111 +/* This resizing algorithm is stupid, but hopefully it should never be triggered 1.112 + * except for a pathological header. The main concern is that we don't get a 1.113 + * buffer overflow. 1.114 + */ 1.115 + 1.116 +#define DSIZE 65536 /* real headers should never be this big */ 1.117 +#define FUZZ 10 /* paranoia fuzz */ 1.118 + 1.119 +long utf8_mime2text (SIZEDTEXT *src,SIZEDTEXT *dst,long flags) 1.120 +{ 1.121 + unsigned char *s,*se,*e,*ee,*t,*te; 1.122 + char *cs,*ce,*ls; 1.123 + SIZEDTEXT txt,rtxt; 1.124 + unsigned long i; 1.125 + size_t dsize = min (DSIZE,((src->size / 4) + 1) * 9); 1.126 + /* always create buffer if canonicalizing */ 1.127 + dst->data = (flags & U8T_CANONICAL) ? 1.128 + (unsigned char *) fs_get ((size_t) dsize) : NIL; 1.129 + dst->size = 0; /* nothing written yet */ 1.130 + /* look for encoded words */ 1.131 + for (s = src->data, se = src->data + src->size; s < se; s++) { 1.132 + if (((se - s) > MINENCWORD) && (*s == '=') && (s[1] == '?') && 1.133 + (cs = (char *) mime2_token (s+2,se,(unsigned char **) &ce)) && 1.134 + (e = mime2_token ((unsigned char *) ce+1,se,&ee)) && 1.135 + (te = mime2_text (t = e+2,se)) && (ee == e + 1) && 1.136 + ((te - s) < MAXENCWORD)) { 1.137 + if (mime2_decode (e,t,te,&txt)) { 1.138 + *ce = '\0'; /* temporarily tie off charset */ 1.139 + if (ls = strchr (cs,'*')) *ls = '\0'; 1.140 + /* convert to UTF-8 as best we can */ 1.141 + if (!utf8_text (&txt,cs,&rtxt,flags)) utf8_text (&txt,NIL,&rtxt,flags); 1.142 + if (dst->data) { /* make sure existing buffer fits */ 1.143 + while (dsize <= (dst->size + rtxt.size + FUZZ)) { 1.144 + dsize += DSIZE; /* kick it up */ 1.145 + fs_resize ((void **) &dst->data,dsize); 1.146 + } 1.147 + } 1.148 + else { /* make a new buffer */ 1.149 + while (dsize <= (dst->size + rtxt.size)) dsize += DSIZE; 1.150 + memcpy (dst->data = (unsigned char *) fs_get (dsize),src->data, 1.151 + dst->size = s - src->data); 1.152 + } 1.153 + for (i = 0; i < rtxt.size; i++) dst->data[dst->size++] = rtxt.data[i]; 1.154 + 1.155 + /* all done with converted text */ 1.156 + if (rtxt.data != txt.data) fs_give ((void **) &rtxt.data); 1.157 + if (ls) *ls = '*'; /* restore language tag delimiter */ 1.158 + *ce = '?'; /* restore charset delimiter */ 1.159 + /* all done with decoded text */ 1.160 + fs_give ((void **) &txt.data); 1.161 + s = te+1; /* continue scan after encoded word */ 1.162 + /* skip leading whitespace */ 1.163 + for (t = s + 1; (t < se) && ((*t == ' ') || (*t == '\t')); t++); 1.164 + /* see if likely continuation encoded word */ 1.165 + if (t < (se - MINENCWORD)) switch (*t) { 1.166 + case '=': /* possible encoded word? */ 1.167 + if (t[1] == '?') s = t - 1; 1.168 + break; 1.169 + case '\015': /* CR, eat a following LF */ 1.170 + if (t[1] == '\012') t++; 1.171 + case '\012': /* possible end of logical line */ 1.172 + if ((t[1] == ' ') || (t[1] == '\t')) { 1.173 + do t++; 1.174 + while ((t < (se - MINENCWORD)) && ((t[1] == ' ')||(t[1] == '\t'))); 1.175 + if ((t < (se - MINENCWORD)) && (t[1] == '=') && (t[2] == '?')) 1.176 + s = t; /* definitely looks like continuation */ 1.177 + } 1.178 + } 1.179 + } 1.180 + else { /* restore original text */ 1.181 + if (dst->data) fs_give ((void **) &dst->data); 1.182 + dst->data = src->data; 1.183 + dst->size = src->size; 1.184 + return NIL; /* syntax error: MIME-2 decoding failure */ 1.185 + } 1.186 + } 1.187 + else do if (dst->data) { /* stash ASCII characters until LWSP */ 1.188 + if (dsize < (dst->size + FUZZ)) { 1.189 + dsize += DSIZE; /* kick it up */ 1.190 + fs_resize ((void **) &dst->data,dsize); 1.191 + } 1.192 + /* kludge: assumes ASCII doesn't decompose and titlecases to one byte */ 1.193 + dst->data[dst->size++] = (flags & U8T_CASECANON) ? 1.194 + (unsigned char) ucs4_titlecase (*s) : *s; 1.195 + } 1.196 + while ((*s != ' ') && (*s != '\t') && (*s != '\015') && (*s != '\012') && 1.197 + (++s < se)); 1.198 + } 1.199 + if (dst->data) dst->data[dst->size] = '\0'; 1.200 + else { /* nothing converted, return identity */ 1.201 + dst->data = src->data; 1.202 + dst->size = src->size; 1.203 + } 1.204 + return T; /* success */ 1.205 +} 1.206 + 1.207 +/* Decode MIME-2 text 1.208 + * Accepts: Encoding 1.209 + * text 1.210 + * text end 1.211 + * destination sized text 1.212 + * Returns: T if successful, else NIL 1.213 + */ 1.214 + 1.215 +long mime2_decode (unsigned char *e,unsigned char *t,unsigned char *te, 1.216 + SIZEDTEXT *txt) 1.217 +{ 1.218 + unsigned char *q; 1.219 + txt->data = NIL; /* initially no returned data */ 1.220 + switch (*e) { /* dispatch based upon encoding */ 1.221 + case 'Q': case 'q': /* sort-of QUOTED-PRINTABLE */ 1.222 + txt->data = (unsigned char *) fs_get ((size_t) (te - t) + 1); 1.223 + for (q = t,txt->size = 0; q < te; q++) switch (*q) { 1.224 + case '=': /* quoted character */ 1.225 + /* both must be hex */ 1.226 + if (!isxdigit (q[1]) || !isxdigit (q[2])) { 1.227 + fs_give ((void **) &txt->data); 1.228 + return NIL; /* syntax error: bad quoted character */ 1.229 + } 1.230 + /* assemble character */ 1.231 + txt->data[txt->size++] = hex2byte (q[1],q[2]); 1.232 + q += 2; /* advance past quoted character */ 1.233 + break; 1.234 + case '_': /* convert to space */ 1.235 + txt->data[txt->size++] = ' '; 1.236 + break; 1.237 + default: /* ordinary character */ 1.238 + txt->data[txt->size++] = *q; 1.239 + break; 1.240 + } 1.241 + txt->data[txt->size] = '\0'; 1.242 + break; 1.243 + case 'B': case 'b': /* BASE64 */ 1.244 + if (txt->data = (unsigned char *) rfc822_base64 (t,te - t,&txt->size)) 1.245 + break; 1.246 + default: /* any other encoding is unknown */ 1.247 + return NIL; /* syntax error: unknown encoding */ 1.248 + } 1.249 + return T; 1.250 +} 1.251 + 1.252 +/* Get MIME-2 token from encoded word 1.253 + * Accepts: current text pointer 1.254 + * text limit pointer 1.255 + * pointer to returned end pointer 1.256 + * Returns: current text pointer & end pointer if success, else NIL 1.257 + */ 1.258 + 1.259 +unsigned char *mime2_token (unsigned char *s,unsigned char *se, 1.260 + unsigned char **t) 1.261 +{ 1.262 + for (*t = s; **t != '?'; ++*t) { 1.263 + if ((*t < se) && isgraph (**t)) switch (**t) { 1.264 + case '(': case ')': case '<': case '>': case '@': case ',': case ';': 1.265 + case ':': case '\\': case '"': case '/': case '[': case ']': case '.': 1.266 + case '=': 1.267 + return NIL; /* none of these are valid in tokens */ 1.268 + } 1.269 + else return NIL; /* out of text or CTL or space */ 1.270 + } 1.271 + return s; 1.272 +} 1.273 + 1.274 + 1.275 +/* Get MIME-2 text from encoded word 1.276 + * Accepts: current text pointer 1.277 + * text limit pointer 1.278 + * pointer to returned end pointer 1.279 + * Returns: end pointer if success, else NIL 1.280 + */ 1.281 + 1.282 +unsigned char *mime2_text (unsigned char *s,unsigned char *se) 1.283 +{ 1.284 + unsigned char *t = se - 1; 1.285 + /* search for closing ?, make sure valid */ 1.286 + while ((s < t) && (*s != '?') && isgraph (*s++)); 1.287 + return ((s < t) && (*s == '?') && (s[1] == '=') && 1.288 + ((se == (s + 2)) || (s[2] == ' ') || (s[2] == '\t') || 1.289 + (s[2] == '\015') || (s[2] == '\012'))) ? s : NIL; 1.290 +} 1.291 + 1.292 +/* Convert UTF-16 string to Modified Base64 1.293 + * Accepts: destination pointer 1.294 + * source string 1.295 + * source length in octets 1.296 + * Returns: updated destination pointer 1.297 + */ 1.298 + 1.299 +static unsigned char *utf16_to_mbase64 (unsigned char *t,unsigned char *s, 1.300 + size_t i) 1.301 +{ 1.302 + char *v = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+,"; 1.303 + *t++ = '&'; /* write shift-in */ 1.304 + while (i >= 3) { /* process tuplets */ 1.305 + *t++ = v[s[0] >> 2]; /* byte 1: high 6 bits (1) */ 1.306 + /* byte 2: low 2 bits (1), high 4 bits (2) */ 1.307 + *t++ = v[((s[0] << 4) + (s[1] >> 4)) & 0x3f]; 1.308 + /* byte 3: low 4 bits (2), high 2 bits (3) */ 1.309 + *t++ = v[((s[1] << 2) + (s[2] >> 6)) & 0x3f]; 1.310 + *t++ = v[s[2] & 0x3f]; /* byte 4: low 6 bits (3) */ 1.311 + s += 3; 1.312 + i -= 3; 1.313 + } 1.314 + if (i) { 1.315 + *t++ = v[s[0] >> 2]; /* byte 1: high 6 bits (1) */ 1.316 + /* byte 2: low 2 bits (1), high 4 bits (2) */ 1.317 + *t++ = v[((s[0] << 4) + (--i ? (s[1] >> 4) : 0)) & 0x3f]; 1.318 + /* byte 3: low 4 bits (2) */ 1.319 + if (i) *t++ = v[(s[1] << 2) & 0x3f]; 1.320 + } 1.321 + *t++ = '-'; /* write shift-out */ 1.322 + return t; 1.323 +} 1.324 + 1.325 + 1.326 +/* Poot a UTF-16 value to a buffer 1.327 + * Accepts: buffer pointer 1.328 + * value 1.329 + * Returns: updated pointer 1.330 + */ 1.331 + 1.332 +static unsigned char *utf16_poot (unsigned char *s,unsigned long c) 1.333 +{ 1.334 + *s++ = (unsigned char) (c >> 8); 1.335 + *s++ = (unsigned char) (c & 0xff); 1.336 + return s; 1.337 +} 1.338 + 1.339 +/* Convert UTF-8 to Modified UTF-7 1.340 + * Accepts: UTF-8 string 1.341 + * Returns: Modified UTF-7 string on success, NIL if invalid UTF-8 1.342 + */ 1.343 + 1.344 +#define MAXUNIUTF8 4 /* maximum length of Unicode UTF-8 sequence */ 1.345 + 1.346 +unsigned char *utf8_to_mutf7 (unsigned char *src) 1.347 +{ 1.348 + unsigned char *u16buf,*utf16; 1.349 + unsigned char *ret,*t; 1.350 + unsigned long j,c; 1.351 + unsigned char *s = src; 1.352 + unsigned long i = 0; 1.353 + int nonascii = 0; 1.354 + while (*s) { /* pass one: count destination octets */ 1.355 + if (*s & 0x80) { /* non-ASCII character? */ 1.356 + j = MAXUNIUTF8; /* get single UCS-4 codepoint */ 1.357 + if ((c = utf8_get (&s,&j)) & U8G_ERROR) return NIL; 1.358 + /* tally number of UTF-16 octets */ 1.359 + nonascii += (c & U8GM_NONBMP) ? 4 : 2; 1.360 + } 1.361 + else { /* ASCII character */ 1.362 + if (nonascii) { /* add pending Modified BASE64 size + shifts */ 1.363 + i += ((nonascii / 3) * 4) + ((j = nonascii % 3) ? j + 1 : 0) + 2; 1.364 + nonascii = 0; /* back to ASCII */ 1.365 + } 1.366 + if (*s == '&') i += 2; /* two octets if the escape */ 1.367 + else ++i; /* otherwise just count another octet */ 1.368 + ++s; /* advance to next source octet */ 1.369 + } 1.370 + } 1.371 + if (nonascii) /* add pending Modified BASE64 size + shifts */ 1.372 + i += ((nonascii / 3) * 4) + ((j = nonascii % 3) ? j + 1 : 0) + 2; 1.373 + 1.374 + /* create return buffer */ 1.375 + t = ret = (unsigned char *) fs_get (i + 1); 1.376 + /* and scratch buffer */ 1.377 + utf16 = u16buf = (unsigned char *) fs_get (i + 1); 1.378 + for (s = src; *s;) { /* pass two: copy destination octets */ 1.379 + if (*s & 0x80) { /* non-ASCII character? */ 1.380 + j = MAXUNIUTF8; /* get single UCS-4 codepoint */ 1.381 + if ((c = utf8_get (&s,&j)) & U8G_ERROR) return NIL; 1.382 + if (c & U8GM_NONBMP) { /* non-BMP? */ 1.383 + c -= UTF16_BASE; /* yes, convert to surrogate */ 1.384 + utf16 = utf16_poot (utf16_poot (utf16,(c >> UTF16_SHIFT)+UTF16_SURRH), 1.385 + (c & UTF16_MASK) + UTF16_SURRL); 1.386 + } 1.387 + else utf16 = utf16_poot (utf16,c); 1.388 + } 1.389 + else { /* ASCII character */ 1.390 + if (utf16 != u16buf) { /* add pending Modified BASE64 size + shifts */ 1.391 + t = utf16_to_mbase64 (t,u16buf,utf16 - u16buf); 1.392 + utf16 = u16buf; /* reset buffer */ 1.393 + } 1.394 + *t++ = *s; /* copy the character */ 1.395 + if (*s == '&') *t++ = '-';/* special sequence if the escape */ 1.396 + ++s; /* advance to next source octet */ 1.397 + } 1.398 + } 1.399 + /* add pending Modified BASE64 size + shifts */ 1.400 + if (utf16 != u16buf) t = utf16_to_mbase64 (t,u16buf,utf16 - u16buf); 1.401 + *t = '\0'; /* tie off destination */ 1.402 + if (i != (t - ret)) fatal ("utf8_to_mutf7 botch"); 1.403 + fs_give ((void **) &u16buf); 1.404 + return ret; 1.405 +} 1.406 + 1.407 +/* Convert Modified UTF-7 to UTF-8 1.408 + * Accepts: Modified UTF-7 string 1.409 + * Returns: UTF-8 string on success, NIL if invalid Modified UTF-7 1.410 + */ 1.411 + 1.412 +unsigned char *utf8_from_mutf7 (unsigned char *src) 1.413 +{ 1.414 + SIZEDTEXT utf8,utf7; 1.415 + unsigned char *s; 1.416 + int mbase64 = 0; 1.417 + /* disallow bogus strings */ 1.418 + if (mail_utf7_valid (src)) return NIL; 1.419 + /* initialize SIZEDTEXTs */ 1.420 + memset (&utf7,0,sizeof (SIZEDTEXT)); 1.421 + memset (&utf8,0,sizeof (SIZEDTEXT)); 1.422 + /* make copy of source */ 1.423 + for (s = cpytxt (&utf7,src,strlen (src)); *s; ++s) switch (*s) { 1.424 + case '&': /* Modified UTF-7 uses & instead of + */ 1.425 + *s = '+'; 1.426 + mbase64 = T; /* note that we are in Modified BASE64 */ 1.427 + break; 1.428 + case '+': /* temporarily swap text + to & */ 1.429 + if (!mbase64) *s = '&'; 1.430 + break; 1.431 + case '-': /* shift back to ASCII */ 1.432 + mbase64 = NIL; 1.433 + break; 1.434 + case ',': /* Modified UTF-7 uses , instead of / ... */ 1.435 + if (mbase64) *s = '/'; /* ...in Modified BASE64 */ 1.436 + break; 1.437 + } 1.438 + /* do the conversion */ 1.439 + utf8_text_utf7 (&utf7,&utf8,NIL,NIL); 1.440 + /* no longer need copy of source */ 1.441 + fs_give ((void **) &utf7.data); 1.442 + /* post-process: switch & and + */ 1.443 + for (s = utf8.data; *s; ++s) switch (*s) { 1.444 + case '&': 1.445 + *s = '+'; 1.446 + break; 1.447 + case '+': 1.448 + *s = '&'; 1.449 + break; 1.450 + } 1.451 + return utf8.data; 1.452 +}