imapext-2007

diff src/c-client/utf8aux.c @ 0:ada5e610ab86

imap-2007e
author yuuji@gentei.org
date Mon, 14 Sep 2009 15:17:45 +0900
parents
children
line diff
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/src/c-client/utf8aux.c	Mon Sep 14 15:17:45 2009 +0900
     1.3 @@ -0,0 +1,449 @@
     1.4 +/* ========================================================================
     1.5 + * Copyright 1988-2007 University of Washington
     1.6 + *
     1.7 + * Licensed under the Apache License, Version 2.0 (the "License");
     1.8 + * you may not use this file except in compliance with the License.
     1.9 + * You may obtain a copy of the License at
    1.10 + *
    1.11 + *     http://www.apache.org/licenses/LICENSE-2.0
    1.12 + *
    1.13 + * 
    1.14 + * ========================================================================
    1.15 + */
    1.16 +
    1.17 +/*
     1.18 + * Program:	UTF-8 auxiliary routines (c-client and MIME2 support)
    1.19 + *
    1.20 + * Author:	Mark Crispin
    1.21 + *		Networks and Distributed Computing
    1.22 + *		Computing & Communications
    1.23 + *		University of Washington
    1.24 + *		Administration Building, AG-44
    1.25 + *		Seattle, WA  98195
    1.26 + *		Internet: MRC@CAC.Washington.EDU
    1.27 + *
    1.28 + * Date:	11 June 1997
    1.29 + * Last Edited:	12 October 2007
    1.30 + */
    1.31 +
    1.32 +
    1.33 +#include <stdio.h>
    1.34 +#include <ctype.h>
    1.35 +#include "c-client.h"
    1.36 +
     1.37 +/* Convert charset labelled stringlist to UTF-8 in place
     1.38 + * Accepts: string list (may be NIL, in which case nothing is done)
     1.39 + *	    charset name, handed unchanged to utf8_text()
     1.40 + * Returns: nothing; a node whose utf8_text() call fails keeps its old text */
     1.41 +
     1.42 +static void utf8_stringlist (STRINGLIST *st,char *charset)
     1.43 +{
     1.44 +  SIZEDTEXT txt;
     1.45 +				/* visit every node, converting its text */
     1.46 +  if (st) do if (utf8_text (&st->text,charset,&txt,U8T_CANONICAL)) {
     1.47 +    fs_give ((void **) &st->text.data);	/* old text freed before install; NOTE(review): assumes txt.data is never the input buffer - confirm */
     1.48 +    st->text.data = txt.data; /* node takes over the converted text */
     1.49 +    st->text.size = txt.size;
     1.50 +  } while (st = st->next);	/* assignment intended: advance, stop at NIL */
     1.51 +}
    1.52 +
    1.53 +
     1.54 +/* Convert charset labelled searchpgm to UTF-8 in place
     1.55 + * Accepts: search program (may be NIL, in which case nothing is done)
     1.56 + *	    charset name, handed unchanged to utf8_stringlist()/utf8_text()
     1.57 + * Returns: nothing; OR and NOT sub-programs are converted recursively */
     1.58 +
     1.59 +void utf8_searchpgm (SEARCHPGM *pgm,char *charset)
     1.60 +{
     1.61 +  SIZEDTEXT txt;
     1.62 +  SEARCHHEADER *hl;
     1.63 +  SEARCHOR *ol;
     1.64 +  SEARCHPGMLIST *pl;
     1.65 +  if (pgm) {			/* must have a search program */
     1.66 +    utf8_stringlist (pgm->bcc,charset);
     1.67 +    utf8_stringlist (pgm->cc,charset);
     1.68 +    utf8_stringlist (pgm->from,charset);
     1.69 +    utf8_stringlist (pgm->to,charset);
     1.70 +    utf8_stringlist (pgm->subject,charset);
     1.71 +    for (hl = pgm->header; hl; hl = hl->next) {	/* header criteria: convert both line and text */
     1.72 +      if (utf8_text (&hl->line,charset,&txt,U8T_CANONICAL)) {
     1.73 +	fs_give ((void **) &hl->line.data);	/* replace only on successful conversion */
     1.74 +	hl->line.data = txt.data;
     1.75 +	hl->line.size = txt.size;
     1.76 +      }
     1.77 +      if (utf8_text (&hl->text,charset,&txt,U8T_CANONICAL)) {
     1.78 +	fs_give ((void **) &hl->text.data);	/* replace only on successful conversion */
     1.79 +	hl->text.data = txt.data;
     1.80 +	hl->text.size = txt.size;
     1.81 +      }
     1.82 +    }
     1.83 +    utf8_stringlist (pgm->body,charset);
     1.84 +    utf8_stringlist (pgm->text,charset);
     1.85 +    for (ol = pgm->or; ol; ol = ol->next) {	/* each OR node holds two sub-programs */
     1.86 +      utf8_searchpgm (ol->first,charset);
     1.87 +      utf8_searchpgm (ol->second,charset);
     1.88 +    }
     1.89 +    for (pl = pgm->not; pl; pl = pl->next) utf8_searchpgm (pl->pgm,charset);
     1.90 +    utf8_stringlist (pgm->return_path,charset);
     1.91 +    utf8_stringlist (pgm->sender,charset);
     1.92 +    utf8_stringlist (pgm->reply_to,charset);
     1.93 +    utf8_stringlist (pgm->in_reply_to,charset);
     1.94 +    utf8_stringlist (pgm->message_id,charset);
     1.95 +    utf8_stringlist (pgm->newsgroups,charset);
     1.96 +    utf8_stringlist (pgm->followup_to,charset);
     1.97 +    utf8_stringlist (pgm->references,charset);
     1.98 +  }
     1.99 +}
   1.100 +
    1.101 +/* Convert MIME-2 sized text to UTF-8
    1.102 + * Accepts: source sized text (encoded words of the form =?charset?enc?text?=)
    1.103 + *	    destination sized text (filled in; aliases source if nothing converted)
    1.104 + *	    flags (same as utf8_text())
    1.105 + * Returns: T if successful, NIL if mime2_decode() rejects an encoded word
    1.106 + *	    (on NIL, destination data/size are set to the source's) */
    1.107 +
    1.108 +#define MINENCWORD 9		/* more than this many octets must remain to try an encoded word */
    1.109 +#define MAXENCWORD 75		/* closing ? must be fewer than this many octets past the opening = */
    1.110 +
    1.111 +/* This resizing algorithm is crude, but hopefully it should never be triggered
    1.112 + * except for a pathological header.  The main concern is that we don't get a
    1.113 + * buffer overflow.
    1.114 + */
    1.115 +
    1.116 +#define DSIZE 65536		/* real headers should never be this big */
    1.117 +#define FUZZ 10			/* paranoia fuzz */
    1.118 +
    1.119 +long utf8_mime2text (SIZEDTEXT *src,SIZEDTEXT *dst,long flags)
    1.120 +{
    1.121 +  unsigned char *s,*se,*e,*ee,*t,*te;
    1.122 +  char *cs,*ce,*ls;
    1.123 +  SIZEDTEXT txt,rtxt;
    1.124 +  unsigned long i;
    1.125 +  size_t dsize = min (DSIZE,((src->size / 4) + 1) * 9);	/* initial buffer size, capped at DSIZE */
    1.126 +				/* always create buffer if canonicalizing */
    1.127 +  dst->data = (flags & U8T_CANONICAL) ?
    1.128 +    (unsigned char *) fs_get ((size_t) dsize) : NIL;
    1.129 +  dst->size = 0;		/* nothing written yet */
    1.130 +				/* look for encoded words */
    1.131 +  for (s = src->data, se = src->data + src->size; s < se; s++) {
    1.132 +    if (((se - s) > MINENCWORD) && (*s == '=') && (s[1] == '?') &&
    1.133 +      (cs = (char *) mime2_token (s+2,se,(unsigned char **) &ce)) &&
    1.134 +	(e = mime2_token ((unsigned char *) ce+1,se,&ee)) &&
    1.135 +	(te = mime2_text (t = e+2,se)) && (ee == e + 1) &&
    1.136 +	((te - s) < MAXENCWORD)) {	/* cs..ce charset, e one-octet encoding, t..te text */
    1.137 +      if (mime2_decode (e,t,te,&txt)) {
    1.138 +	*ce = '\0';		/* temporarily tie off charset */
    1.139 +	if (ls = strchr (cs,'*')) *ls = '\0';	/* also cut at '*' (language tag delimiter) */
    1.140 +				/* convert to UTF-8 as best we can; retry with NIL charset on failure */
    1.141 +	if (!utf8_text (&txt,cs,&rtxt,flags)) utf8_text (&txt,NIL,&rtxt,flags);
    1.142 +	if (dst->data) {	/* make sure existing buffer fits */
    1.143 +	  while (dsize <= (dst->size + rtxt.size + FUZZ)) {
    1.144 +	    dsize += DSIZE;	/* kick it up */
    1.145 +	    fs_resize ((void **) &dst->data,dsize);
    1.146 +	  }
    1.147 +	}
    1.148 +	else {			/* make a new buffer; NOTE(review): dst->size is still 0 in the sizing test below, confirm dsize covers the prefix copied */
    1.149 +	  while (dsize <= (dst->size + rtxt.size)) dsize += DSIZE;
    1.150 +	  memcpy (dst->data = (unsigned char *) fs_get (dsize),src->data,
    1.151 +		  dst->size = s - src->data);	/* copy the unencoded prefix seen so far */
    1.152 +	}
    1.153 +	for (i = 0; i < rtxt.size; i++) dst->data[dst->size++] = rtxt.data[i];
    1.154 +
    1.155 +				/* all done with converted text (free only if separately allocated) */
    1.156 +	if (rtxt.data != txt.data) fs_give ((void **) &rtxt.data);
    1.157 +	if (ls) *ls = '*';	/* restore language tag delimiter */
    1.158 +	*ce = '?';		/* restore charset delimiter */
    1.159 +				/* all done with decoded text */
    1.160 +	fs_give ((void **) &txt.data);
    1.161 +	s = te+1;		/* continue scan after encoded word */
    1.162 +				/* skip leading whitespace */
    1.163 +	for (t = s + 1; (t < se) && ((*t == ' ') || (*t == '\t')); t++);
    1.164 +				/* see if likely continuation encoded word */
    1.165 +	if (t < (se - MINENCWORD)) switch (*t) {
    1.166 +	case '=':		/* possible encoded word? */
    1.167 +	  if (t[1] == '?') s = t - 1;	/* drop the whitespace; loop's s++ lands on the '=' */
    1.168 +	  break;
    1.169 +	case '\015':		/* CR, eat a following LF */
    1.170 +	  if (t[1] == '\012') t++;	/* deliberately falls through to the LF case */
    1.171 +	case '\012':		/* possible end of logical line */
    1.172 +	  if ((t[1] == ' ') || (t[1] == '\t')) {
    1.173 +	    do t++;
    1.174 +	    while ((t < (se - MINENCWORD)) && ((t[1] == ' ')||(t[1] == '\t')));
    1.175 +	    if ((t < (se - MINENCWORD)) && (t[1] == '=') && (t[2] == '?'))
    1.176 +	      s = t;		/* definitely looks like continuation */
    1.177 +	  }
    1.178 +	}
    1.179 +      }
    1.180 +      else {			/* restore original text */
    1.181 +	if (dst->data) fs_give ((void **) &dst->data);
    1.182 +	dst->data = src->data;
    1.183 +	dst->size = src->size;
    1.184 +	return NIL;		/* syntax error: MIME-2 decoding failure */
    1.185 +      }
    1.186 +    }
    1.187 +    else do if (dst->data) {	/* stash ASCII characters until LWSP; nothing is copied while no buffer exists */
    1.188 +      if (dsize < (dst->size + FUZZ)) {
    1.189 +	dsize += DSIZE;		/* kick it up */
    1.190 +	fs_resize ((void **) &dst->data,dsize);
    1.191 +      }
    1.192 +      /* kludge: assumes ASCII doesn't decompose and titlecases to one byte */
    1.193 +      dst->data[dst->size++] = (flags & U8T_CASECANON) ?
    1.194 +	(unsigned char) ucs4_titlecase (*s) : *s;
    1.195 +    }
    1.196 +    while ((*s != ' ') && (*s != '\t') && (*s != '\015') && (*s != '\012') &&
    1.197 +	   (++s < se));		/* stops on the whitespace octet itself, or at end of text */
    1.198 +  }
    1.199 +  if (dst->data) dst->data[dst->size] = '\0';	/* tie off the converted text */
    1.200 +  else {			/* nothing converted, return identity */
    1.201 +    dst->data = src->data;
    1.202 +    dst->size = src->size;
    1.203 +  }
    1.204 +  return T;			/* success */
    1.205 +}
   1.206 +
    1.207 +/* Decode MIME-2 text
    1.208 + * Accepts: pointer to encoding octet (Q/q or B/b; anything else fails)
    1.209 + *	    text
    1.210 + *	    text end (decoding stops before this octet)
    1.211 + *	    destination sized text (data is NIL on failure, allocated on success)
    1.212 + * Returns: T if successful, else NIL
    1.213 + */
    1.214 +
    1.215 +long mime2_decode (unsigned char *e,unsigned char *t,unsigned char *te,
    1.216 +		   SIZEDTEXT *txt)
    1.217 +{
    1.218 +  unsigned char *q;
    1.219 +  txt->data = NIL;		/* initially no returned data */
    1.220 +  switch (*e) {			/* dispatch based upon encoding */
    1.221 +  case 'Q': case 'q':		/* sort-of QUOTED-PRINTABLE */
    1.222 +    txt->data = (unsigned char *) fs_get ((size_t) (te - t) + 1);	/* output never exceeds input, +1 for NUL */
    1.223 +    for (q = t,txt->size = 0; q < te; q++) switch (*q) {
    1.224 +    case '=':			/* quoted character */
    1.225 +				/* both must be hex */
    1.226 +      if (!isxdigit (q[1]) || !isxdigit (q[2])) {
    1.227 +	fs_give ((void **) &txt->data);
    1.228 +	return NIL;		/* syntax error: bad quoted character */
    1.229 +      }
    1.230 +				/* assemble character */
    1.231 +      txt->data[txt->size++] = hex2byte (q[1],q[2]);
    1.232 +      q += 2;			/* advance past quoted character */
    1.233 +      break;
    1.234 +    case '_':			/* convert to space */
    1.235 +      txt->data[txt->size++] = ' ';
    1.236 +      break;
    1.237 +    default:			/* ordinary character */
    1.238 +      txt->data[txt->size++] = *q;
    1.239 +      break;
    1.240 +    }
    1.241 +    txt->data[txt->size] = '\0';	/* tie off decoded text */
    1.242 +    break;
    1.243 +  case 'B': case 'b':		/* BASE64 */
    1.244 +    if (txt->data = (unsigned char *) rfc822_base64 (t,te - t,&txt->size))
    1.245 +      break;			/* decoded; otherwise deliberately falls into default */
    1.246 +  default:			/* any other encoding is unknown */
    1.247 +    return NIL;			/* syntax error: unknown encoding (or BASE64 decode failure) */
    1.248 +  }
    1.249 +  return T;
    1.250 +}
   1.251 +
    1.252 +/* Get MIME-2 token from encoded word
    1.253 + * Accepts: current text pointer
    1.254 + *	    text limit pointer
    1.255 + *	    pointer to returned end pointer (set to the terminating '?' on success)
    1.256 + * Returns: current text pointer & end pointer if success, else NIL
    1.257 + */
    1.258 +
    1.259 +unsigned char *mime2_token (unsigned char *s,unsigned char *se,
    1.260 +			    unsigned char **t)
    1.261 +{
    1.262 +  for (*t = s; **t != '?'; ++*t) {	/* NOTE(review): **t is read here before the *t < se test below - confirm se is readable */
    1.263 +    if ((*t < se) && isgraph (**t)) switch (**t) {
    1.264 +    case '(': case ')': case '<': case '>': case '@': case ',': case ';':
    1.265 +    case ':': case '\\': case '"': case '/': case '[': case ']': case '.':
    1.266 +    case '=':
    1.267 +      return NIL;		/* none of these are valid in tokens */
    1.268 +    }
    1.269 +    else return NIL;		/* out of text or CTL or space */
    1.270 +  }
    1.271 +  return s;			/* token runs from s up to (not including) *t */
    1.272 +}
   1.273 +
   1.274 +
    1.275 +/* Get MIME-2 text from encoded word
    1.276 + * Accepts: current text pointer
    1.277 + *	    text limit pointer
    1.278 + *	    (only these two arguments; the end pointer is the return value)
    1.279 + * Returns: pointer to the closing '?' of "?=" if success, else NIL
    1.280 + */
    1.281 +
    1.282 +unsigned char *mime2_text (unsigned char *s,unsigned char *se)
    1.283 +{
    1.284 +  unsigned char *t = se - 1;	/* last octet at which a '?' can still be followed by '=' */
    1.285 +				/* search for closing ?; NOTE(review): s++ inside isgraph() steps past one non-graphic octet before the test below */
    1.286 +  while ((s < t) && (*s != '?') && isgraph (*s++));
    1.287 +  return ((s < t) && (*s == '?') && (s[1] == '=') &&
    1.288 +	  ((se == (s + 2)) || (s[2] == ' ') || (s[2] == '\t') ||
    1.289 +	   (s[2] == '\015') || (s[2] == '\012'))) ? s : NIL;	/* "?=" must end the text or precede SP/TAB/CR/LF */
    1.290 +}
   1.291 +
    1.292 +/* Convert UTF-16 string to Modified Base64
    1.293 + * Accepts: destination pointer (caller supplies enough room; nothing is checked here)
    1.294 + *	    source string
    1.295 + *	    source length in octets
    1.296 + * Returns: updated destination pointer (just past the '-' shift-out)
    1.297 + */
    1.298 +
    1.299 +static unsigned char *utf16_to_mbase64 (unsigned char *t,unsigned char *s,
    1.300 +					size_t i)
    1.301 +{
    1.302 +  char *v = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+,";
    1.303 +  *t++ = '&';			/* write shift-in */
    1.304 +  while (i >= 3) {		/* process tuplets: 3 source octets become 4 digits */
    1.305 +    *t++ = v[s[0] >> 2];	/* byte 1: high 6 bits (1) */
    1.306 +				/* byte 2: low 2 bits (1), high 4 bits (2) */
    1.307 +    *t++ = v[((s[0] << 4) + (s[1] >> 4)) & 0x3f];
    1.308 +				/* byte 3: low 4 bits (2), high 2 bits (3) */
    1.309 +    *t++ = v[((s[1] << 2) + (s[2] >> 6)) & 0x3f];
    1.310 +    *t++ = v[s[2] & 0x3f];	/* byte 4: low 6 bits (3) */
    1.311 +    s += 3;
    1.312 +    i -= 3;
    1.313 +  }
    1.314 +  if (i) {			/* 1 or 2 octets left over: emit 2 or 3 digits, no padding */
    1.315 +    *t++ = v[s[0] >> 2];	/* byte 1: high 6 bits (1) */
    1.316 +				/* byte 2: low 2 bits (1), high 4 bits (2); --i leaves i nonzero only if a second octet exists */
    1.317 +    *t++ = v[((s[0] << 4) + (--i ? (s[1] >> 4) : 0)) & 0x3f];
    1.318 +				/* byte 3: low 4 bits (2) */
    1.319 +    if (i) *t++ = v[(s[1] << 2) & 0x3f];
    1.320 +  }
    1.321 +  *t++ = '-';			/* write shift-out */
    1.322 +  return t;
    1.323 +}
   1.324 +
   1.325 +
    1.326 +/* Poot (put) a UTF-16 value to a buffer as two octets, high octet first
    1.327 + * Accepts: buffer pointer
    1.328 + *	    value (only the low 16 bits are stored)
    1.329 + * Returns: updated pointer (two octets further on)
    1.330 + */
    1.331 +
    1.332 +static unsigned char *utf16_poot (unsigned char *s,unsigned long c)
    1.333 +{
    1.334 +  *s++ = (unsigned char) (c >> 8);	/* high octet */
    1.335 +  *s++ = (unsigned char) (c & 0xff);	/* low octet */
    1.336 +  return s;
    1.337 +}
   1.338 +
    1.339 +/* Convert UTF-8 to Modified UTF-7
    1.340 + * Accepts: UTF-8 string (NUL-terminated)
    1.341 + * Returns: Modified UTF-7 string (from fs_get()) on success, NIL if invalid UTF-8
    1.342 + */
    1.343 +
    1.344 +#define MAXUNIUTF8 4		/* maximum length of Unicode UTF-8 sequence */
    1.345 +
    1.346 +unsigned char *utf8_to_mutf7 (unsigned char *src)
    1.347 +{
    1.348 +  unsigned char *u16buf,*utf16;
    1.349 +  unsigned char *ret,*t;
    1.350 +  unsigned long j,c;
    1.351 +  unsigned char *s = src;
    1.352 +  unsigned long i = 0;		/* destination octet count from pass one */
    1.353 +  int nonascii = 0;		/* UTF-16 octets pending in the current non-ASCII run */
    1.354 +  while (*s) {			/* pass one: count destination octets */
    1.355 +    if (*s & 0x80) {		/* non-ASCII character? */
    1.356 +      j = MAXUNIUTF8;		/* get single UCS-4 codepoint */
    1.357 +      if ((c = utf8_get (&s,&j)) & U8G_ERROR) return NIL;
    1.358 +				/* tally number of UTF-16 octets */
    1.359 +      nonascii += (c & U8GM_NONBMP) ? 4 : 2;
    1.360 +    }
    1.361 +    else {			/* ASCII character */
    1.362 +      if (nonascii) {		/* add pending Modified BASE64 size + shifts: 4 per 3 octets, rem+1 for a remainder, +2 for & and - */
    1.363 +	i += ((nonascii / 3) * 4) + ((j = nonascii % 3) ? j + 1 : 0) + 2;
    1.364 +	nonascii = 0;		/* back to ASCII */
    1.365 +      }
    1.366 +      if (*s == '&') i += 2;	/* two octets if the escape */
    1.367 +      else ++i;			/* otherwise just count another octet */
    1.368 +      ++s;			/* advance to next source octet */
    1.369 +    }
    1.370 +  }
    1.371 +  if (nonascii)			/* add pending Modified BASE64 size + shifts */
    1.372 +    i += ((nonascii / 3) * 4) + ((j = nonascii % 3) ? j + 1 : 0) + 2;
    1.373 +
    1.374 +				/* create return buffer */
    1.375 +  t = ret = (unsigned char *) fs_get (i + 1);
    1.376 +				/* and scratch buffer for one run of UTF-16 octets */
    1.377 +  utf16 = u16buf = (unsigned char *) fs_get (i + 1);
    1.378 +  for (s = src; *s;) {		/* pass two: copy destination octets */
    1.379 +    if (*s & 0x80) {		/* non-ASCII character? */
    1.380 +      j = MAXUNIUTF8;		/* get single UCS-4 codepoint */
    1.381 +      if ((c = utf8_get (&s,&j)) & U8G_ERROR) return NIL;	/* NOTE(review): would not free ret/u16buf; presumably unreachable since pass one already checked this input */
    1.382 +      if (c & U8GM_NONBMP) {	/* non-BMP? */
    1.383 +	c -= UTF16_BASE;	/* yes, convert to surrogate */
    1.384 +	utf16 = utf16_poot (utf16_poot (utf16,(c >> UTF16_SHIFT)+UTF16_SURRH),
    1.385 +			    (c & UTF16_MASK) + UTF16_SURRL);
    1.386 +      }
    1.387 +      else utf16 = utf16_poot (utf16,c);
    1.388 +    }
    1.389 +    else {			/* ASCII character */
    1.390 +      if (utf16 != u16buf) {	/* flush the pending UTF-16 run as Modified BASE64 */
    1.391 +	t = utf16_to_mbase64 (t,u16buf,utf16 - u16buf);
    1.392 +	utf16 = u16buf;		/* reset buffer */
    1.393 +      }
    1.394 +      *t++ = *s;		/* copy the character */
    1.395 +      if (*s == '&') *t++ = '-';/* special sequence if the escape */
    1.396 +      ++s;			/* advance to next source octet */
    1.397 +    }
    1.398 +  }
    1.399 +				/* flush any UTF-16 run still pending at end of string */
    1.400 +  if (utf16 != u16buf) t = utf16_to_mbase64 (t,u16buf,utf16 - u16buf);
    1.401 +  *t = '\0';			/* tie off destination */
    1.402 +  if (i != (t - ret)) fatal ("utf8_to_mutf7 botch");	/* pass-one count must match what pass two wrote */
    1.403 +  fs_give ((void **) &u16buf);
    1.404 +  return ret;
    1.405 +}
   1.406 +
    1.407 +/* Convert Modified UTF-7 to UTF-8
    1.408 + * Accepts: Modified UTF-7 string (NUL-terminated; not modified, a copy is edited)
    1.409 + * Returns: UTF-8 string on success, NIL if invalid Modified UTF-7
    1.410 + */
    1.411 +
    1.412 +unsigned char *utf8_from_mutf7 (unsigned char *src)
    1.413 +{
    1.414 +  SIZEDTEXT utf8,utf7;
    1.415 +  unsigned char *s;
    1.416 +  int mbase64 = 0;		/* non-zero while between '&' and '-' */
    1.417 +				/* disallow bogus strings (a non-NIL result from the validator means reject) */
    1.418 +  if (mail_utf7_valid (src)) return NIL;
    1.419 +				/* initialize SIZEDTEXTs */
    1.420 +  memset (&utf7,0,sizeof (SIZEDTEXT));
    1.421 +  memset (&utf8,0,sizeof (SIZEDTEXT));
    1.422 +				/* make copy of source and rewrite it into ordinary UTF-7 */
    1.423 +  for (s = cpytxt (&utf7,src,strlen (src)); *s; ++s) switch (*s) {
    1.424 +  case '&':			/* Modified UTF-7 uses & instead of + */
    1.425 +    *s = '+';
    1.426 +    mbase64 = T;		/* note that we are in Modified BASE64 */
    1.427 +    break;
    1.428 +  case '+':			/* temporarily swap text + to & */
    1.429 +    if (!mbase64) *s = '&';
    1.430 +    break;
    1.431 +  case '-':			/* shift back to ASCII */
    1.432 +    mbase64 = NIL;
    1.433 +    break;
    1.434 +  case ',':			/* Modified UTF-7 uses , instead of / ... */
    1.435 +    if (mbase64) *s = '/';	/* ...in Modified BASE64 */
    1.436 +    break;
    1.437 +  }
    1.438 +				/* do the conversion; NOTE(review): result is ignored, confirm it cannot fail after validation */
    1.439 +  utf8_text_utf7 (&utf7,&utf8,NIL,NIL);
    1.440 +				/* no longer need copy of source */
    1.441 +  fs_give ((void **) &utf7.data);
    1.442 +				/* post-process: switch & and +; NOTE(review): utf8.data is dereferenced without a NIL check */
    1.443 +  for (s = utf8.data; *s; ++s) switch (*s) {
    1.444 +  case '&':
    1.445 +    *s = '+';
    1.446 +    break;
    1.447 +  case '+':
    1.448 +    *s = '&';
    1.449 +    break;
    1.450 +  }
    1.451 +  return utf8.data;		/* hands the converted buffer to the caller */
    1.452 +}

UW-IMAP'd extensions by yuuji