imapext-2007: ada5e610ab86 src/c-client/utf8aux.c

imapext-2007

view src/c-client/utf8aux.c @ 0:ada5e610ab86

imap-2007e

author	yuuji@gentei.org
date	Mon, 14 Sep 2009 15:17:45 +0900
parents
children

line source

1 /* ========================================================================

3 *

4 * Licensed under the Apache License, Version 2.0 (the "License");

5 * you may not use this file except in compliance with the License.

6 * You may obtain a copy of the License at

7 *

8 * http://www.apache.org/licenses/LICENSE-2.0

9 *

10 *

11 * ========================================================================

12 */

14 /*

15 * Program: UTF-8 auxiliary routines (c-client and MIME2 support)

16 *

17 * Author: Mark Crispin

18 * Networks and Distributed Computing

19 * Computing & Communications

20 * University of Washington

21 * Administration Building, AG-44

22 * Seattle, WA 98195

23 * Internet: MRC@CAC.Washington.EDU

24 *

25 * Date: 11 June 1997

26 * Last Edited: 12 October 2007

27 */

30 #include <stdio.h>

31 #include <ctype.h>

32 #include "c-client.h"

34 /* Convert charset labelled stringlist to UTF-8 in place

35 * Accepts: string list

36 * charset

37 */

39 static void utf8_stringlist (STRINGLIST *st,char *charset)

40 {

41 SIZEDTEXT txt;

42 /* convert entire stringstruct */

43 if (st) do if (utf8_text (&st->text,charset,&txt,U8T_CANONICAL)) {

44 fs_give ((void **) &st->text.data);

45 st->text.data = txt.data; /* transfer this text */

46 st->text.size = txt.size;

47 } while (st = st->next);

48 }

51 /* Convert charset labelled searchpgm to UTF-8 in place

52 * Accepts: search program

53 * charset

54 */

56 void utf8_searchpgm (SEARCHPGM *pgm,char *charset)

57 {

58 SIZEDTEXT txt;

59 SEARCHHEADER *hl;

60 SEARCHOR *ol;

61 SEARCHPGMLIST *pl;

62 if (pgm) { /* must have a search program */

63 utf8_stringlist (pgm->bcc,charset);

64 utf8_stringlist (pgm->cc,charset);

65 utf8_stringlist (pgm->from,charset);

66 utf8_stringlist (pgm->to,charset);

67 utf8_stringlist (pgm->subject,charset);

68 for (hl = pgm->header; hl; hl = hl->next) {

69 if (utf8_text (&hl->line,charset,&txt,U8T_CANONICAL)) {

70 fs_give ((void **) &hl->line.data);

71 hl->line.data = txt.data;

72 hl->line.size = txt.size;

73 }

74 if (utf8_text (&hl->text,charset,&txt,U8T_CANONICAL)) {

75 fs_give ((void **) &hl->text.data);

76 hl->text.data = txt.data;

77 hl->text.size = txt.size;

78 }

79 }

80 utf8_stringlist (pgm->body,charset);

81 utf8_stringlist (pgm->text,charset);

82 for (ol = pgm->or; ol; ol = ol->next) {

83 utf8_searchpgm (ol->first,charset);

84 utf8_searchpgm (ol->second,charset);

85 }

86 for (pl = pgm->not; pl; pl = pl->next) utf8_searchpgm (pl->pgm,charset);

87 utf8_stringlist (pgm->return_path,charset);

88 utf8_stringlist (pgm->sender,charset);

89 utf8_stringlist (pgm->reply_to,charset);

90 utf8_stringlist (pgm->in_reply_to,charset);

91 utf8_stringlist (pgm->message_id,charset);

92 utf8_stringlist (pgm->newsgroups,charset);

93 utf8_stringlist (pgm->followup_to,charset);

94 utf8_stringlist (pgm->references,charset);

95 }

96 }

98 /* Convert MIME-2 sized text to UTF-8

99 * Accepts: source sized text

100 * charset

101 * flags (same as utf8_text())

102 * Returns: T if successful, NIL if failure

103 */

104

105 #define MINENCWORD 9

106 #define MAXENCWORD 75

107

108 /* This resizing algorithm is stupid, but hopefully it should never be triggered

109 * except for a pathological header. The main concern is that we don't get a

110 * buffer overflow.

111 */

112

113 #define DSIZE 65536 /* real headers should never be this big */

114 #define FUZZ 10 /* paranoia fuzz */

115

116 long utf8_mime2text (SIZEDTEXT *src,SIZEDTEXT *dst,long flags)

117 {

118 unsigned char *s,*se,*e,*ee,*t,*te;

119 char *cs,*ce,*ls;

120 SIZEDTEXT txt,rtxt;

121 unsigned long i;

122 size_t dsize = min (DSIZE,((src->size / 4) + 1) * 9);

123 /* always create buffer if canonicalizing */

124 dst->data = (flags & U8T_CANONICAL) ?

125 (unsigned char *) fs_get ((size_t) dsize) : NIL;

126 dst->size = 0; /* nothing written yet */

127 /* look for encoded words */

128 for (s = src->data, se = src->data + src->size; s < se; s++) {

129 if (((se - s) > MINENCWORD) && (*s == '=') && (s[1] == '?') &&

130 (cs = (char *) mime2_token (s+2,se,(unsigned char **) &ce)) &&

131 (e = mime2_token ((unsigned char *) ce+1,se,&ee)) &&

132 (te = mime2_text (t = e+2,se)) && (ee == e + 1) &&

133 ((te - s) < MAXENCWORD)) {

134 if (mime2_decode (e,t,te,&txt)) {

135 *ce = '\0'; /* temporarily tie off charset */

136 if (ls = strchr (cs,'*')) *ls = '\0';

137 /* convert to UTF-8 as best we can */

138 if (!utf8_text (&txt,cs,&rtxt,flags)) utf8_text (&txt,NIL,&rtxt,flags);

139 if (dst->data) { /* make sure existing buffer fits */

140 while (dsize <= (dst->size + rtxt.size + FUZZ)) {

141 dsize += DSIZE; /* kick it up */

142 fs_resize ((void **) &dst->data,dsize);

143 }

144 }

145 else { /* make a new buffer */

146 while (dsize <= (dst->size + rtxt.size)) dsize += DSIZE;

147 memcpy (dst->data = (unsigned char *) fs_get (dsize),src->data,

148 dst->size = s - src->data);

149 }

150 for (i = 0; i < rtxt.size; i++) dst->data[dst->size++] = rtxt.data[i];

151

152 /* all done with converted text */

153 if (rtxt.data != txt.data) fs_give ((void **) &rtxt.data);

154 if (ls) *ls = '*'; /* restore language tag delimiter */

155 *ce = '?'; /* restore charset delimiter */

156 /* all done with decoded text */

157 fs_give ((void **) &txt.data);

158 s = te+1; /* continue scan after encoded word */

159 /* skip leading whitespace */

160 for (t = s + 1; (t < se) && ((*t == ' ') || (*t == '\t')); t++);

161 /* see if likely continuation encoded word */

162 if (t < (se - MINENCWORD)) switch (*t) {

163 case '=': /* possible encoded word? */

164 if (t[1] == '?') s = t - 1;

165 break;

166 case '\015': /* CR, eat a following LF */

167 if (t[1] == '\012') t++;

168 case '\012': /* possible end of logical line */

169 if ((t[1] == ' ') || (t[1] == '\t')) {

170 do t++;

171 while ((t < (se - MINENCWORD)) && ((t[1] == ' ')||(t[1] == '\t')));

172 if ((t < (se - MINENCWORD)) && (t[1] == '=') && (t[2] == '?'))

173 s = t; /* definitely looks like continuation */

174 }

175 }

176 }

177 else { /* restore original text */

178 if (dst->data) fs_give ((void **) &dst->data);

179 dst->data = src->data;

180 dst->size = src->size;

181 return NIL; /* syntax error: MIME-2 decoding failure */

182 }

183 }

184 else do if (dst->data) { /* stash ASCII characters until LWSP */

185 if (dsize < (dst->size + FUZZ)) {

186 dsize += DSIZE; /* kick it up */

187 fs_resize ((void **) &dst->data,dsize);

188 }

189 /* kludge: assumes ASCII doesn't decompose and titlecases to one byte */

190 dst->data[dst->size++] = (flags & U8T_CASECANON) ?

191 (unsigned char) ucs4_titlecase (*s) : *s;

192 }

193 while ((*s != ' ') && (*s != '\t') && (*s != '\015') && (*s != '\012') &&

194 (++s < se));

195 }

196 if (dst->data) dst->data[dst->size] = '\0';

197 else { /* nothing converted, return identity */

198 dst->data = src->data;

199 dst->size = src->size;

200 }

201 return T; /* success */

202 }

203

204 /* Decode MIME-2 text

205 * Accepts: Encoding

206 * text

207 * text end

208 * destination sized text

209 * Returns: T if successful, else NIL

210 */

211

212 long mime2_decode (unsigned char *e,unsigned char *t,unsigned char *te,

213 SIZEDTEXT *txt)

214 {

215 unsigned char *q;

216 txt->data = NIL; /* initially no returned data */

217 switch (*e) { /* dispatch based upon encoding */

218 case 'Q': case 'q': /* sort-of QUOTED-PRINTABLE */

219 txt->data = (unsigned char *) fs_get ((size_t) (te - t) + 1);

220 for (q = t,txt->size = 0; q < te; q++) switch (*q) {

221 case '=': /* quoted character */

222 /* both must be hex */

223 if (!isxdigit (q[1]) || !isxdigit (q[2])) {

224 fs_give ((void **) &txt->data);

225 return NIL; /* syntax error: bad quoted character */

226 }

227 /* assemble character */

228 txt->data[txt->size++] = hex2byte (q[1],q[2]);

229 q += 2; /* advance past quoted character */

230 break;

231 case '_': /* convert to space */

232 txt->data[txt->size++] = ' ';

233 break;

234 default: /* ordinary character */

235 txt->data[txt->size++] = *q;

236 break;

237 }

238 txt->data[txt->size] = '\0';

239 break;

240 case 'B': case 'b': /* BASE64 */

241 if (txt->data = (unsigned char *) rfc822_base64 (t,te - t,&txt->size))

242 break;

243 default: /* any other encoding is unknown */

244 return NIL; /* syntax error: unknown encoding */

245 }

246 return T;

247 }

248

/* Get MIME-2 token from encoded word
 * Accepts: current text pointer
 *	    text limit pointer (first byte that must not be examined)
 *	    pointer to returned end pointer
 * Returns: current text pointer on success, with the end pointer set to the
 *	    '?' that terminates the token; NULL if the limit is reached
 *	    before a '?' or if the token contains a byte that is not
 *	    allowed in a token
 *
 * The limit is checked before each byte is read, so nothing at or beyond
 * the limit is ever dereferenced and a '?' located exactly at the limit
 * does not count as a terminator.  On failure the end pointer is left at
 * the offending position.
 */

unsigned char *mime2_token (unsigned char *s,unsigned char *se,
			    unsigned char **t)
{
  for (*t = s; ; ++*t) {
    if (*t >= se) return NULL;	/* out of text before the delimiter */
    if (**t == '?') break;	/* found the end of the token */
    if (!isgraph (**t)) return NULL;	/* CTL or space */
    switch (**t) {
    case '(': case ')': case '<': case '>': case '@': case ',': case ';':
    case ':': case '\\': case '"': case '/': case '[': case ']': case '.':
    case '=':
      return NULL;		/* none of these are valid in tokens */
    default:
      break;
    }
  }
  return s;
}

270

271

/* Find the end of the text in a MIME-2 encoded word
 * Accepts: current text pointer
 *	    text limit pointer
 * Returns: pointer to the '?' of the closing "?=" if found, else NULL
 *
 * The closing "?=" is only accepted when it is at the very end of the text
 * or is followed by space, tab, CR or LF.
 *
 * The scan stops at the first '?', or just past the first non-graphic byte.
 * The position reached is then tested for "?=", so a single non-graphic
 * byte immediately before the closing "?=" is tolerated.
 */

unsigned char *mime2_text (unsigned char *s,unsigned char *se)
{
  unsigned char *last = se - 1;	/* s must stay below this to read s[1] */
  unsigned char after;
				/* search for closing ? */
  while ((s < last) && (*s != '?')) {
				/* s moves past the byte even when it fails */
    if (!isgraph (*s++)) break;
  }
				/* need "?=" within the text */
  if (!(s < last)) return NULL;
  if ((*s != '?') || (s[1] != '=')) return NULL;
				/* fine if encoded word ends the text */
  if (se == (s + 2)) return s;
  after = s[2];			/* otherwise must be followed by LWSP/EOL */
  if ((after == ' ') || (after == '\t') ||
      (after == '\015') || (after == '\012')) return s;
  return NULL;
}

288

/* Write a run of UTF-16 octets as a Modified BASE64 shift sequence
 * Accepts: destination pointer
 *	    source octets
 *	    source length in octets
 * Returns: updated destination pointer (one past the last octet written)
 *
 * Output is '&', the source octets in BASE64 using ',' as the last alphabet
 * character and no '=' padding, then '-'.  No terminating NUL is written.
 */

static unsigned char *utf16_to_mbase64 (unsigned char *t,unsigned char *s,
					size_t i)
{
  static const char alphabet[] =
    "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+,";
  *t++ = '&';			/* write shift-in */
				/* whole groups: 3 octets become 4 characters */
  for (; i >= 3; i -= 3, s += 3) {
    *t++ = alphabet[s[0] >> 2];
    *t++ = alphabet[((s[0] << 4) | (s[1] >> 4)) & 0x3f];
    *t++ = alphabet[((s[1] << 2) | (s[2] >> 6)) & 0x3f];
    *t++ = alphabet[s[2] & 0x3f];
  }
  switch (i) {			/* leftover octets, missing bits are zero */
  case 2:			/* two octets become three characters */
    *t++ = alphabet[s[0] >> 2];
    *t++ = alphabet[((s[0] << 4) | (s[1] >> 4)) & 0x3f];
    *t++ = alphabet[(s[1] << 2) & 0x3f];
    break;
  case 1:			/* one octet becomes two characters */
    *t++ = alphabet[s[0] >> 2];
    *t++ = alphabet[(s[0] << 4) & 0x3f];
    break;
  default:			/* nothing left over */
    break;
  }
  *t++ = '-';			/* write shift-out */
  return t;
}

321

322

/* Put a UTF-16 value into a buffer, high-order octet first
 * Accepts: buffer pointer
 *	    value
 * Returns: updated pointer (two octets past the one given)
 *
 * Only the low 16 bits of the value are stored.
 */

static unsigned char *utf16_poot (unsigned char *s,unsigned long c)
{
  s[0] = (unsigned char) (c >> 8);	/* high-order octet */
  s[1] = (unsigned char) (c & 0xff);	/* low-order octet */
  return s + 2;
}

335

336 /* Convert UTF-8 to Modified UTF-7

337 * Accepts: UTF-8 string

338 * Returns: Modified UTF-7 string on success, NIL if invalid UTF-8

339 */

340

341 #define MAXUNIUTF8 4 /* maximum length of Unicode UTF-8 sequence */

342

343 unsigned char *utf8_to_mutf7 (unsigned char *src)

344 {

345 unsigned char *u16buf,*utf16;

346 unsigned char *ret,*t;

347 unsigned long j,c;

348 unsigned char *s = src;

349 unsigned long i = 0;

350 int nonascii = 0;

351 while (*s) { /* pass one: count destination octets */

352 if (*s & 0x80) { /* non-ASCII character? */

353 j = MAXUNIUTF8; /* get single UCS-4 codepoint */

354 if ((c = utf8_get (&s,&j)) & U8G_ERROR) return NIL;

355 /* tally number of UTF-16 octets */

356 nonascii += (c & U8GM_NONBMP) ? 4 : 2;

357 }

358 else { /* ASCII character */

359 if (nonascii) { /* add pending Modified BASE64 size + shifts */

360 i += ((nonascii / 3) * 4) + ((j = nonascii % 3) ? j + 1 : 0) + 2;

361 nonascii = 0; /* back to ASCII */

362 }

363 if (*s == '&') i += 2; /* two octets if the escape */

364 else ++i; /* otherwise just count another octet */

365 ++s; /* advance to next source octet */

366 }

367 }

368 if (nonascii) /* add pending Modified BASE64 size + shifts */

369 i += ((nonascii / 3) * 4) + ((j = nonascii % 3) ? j + 1 : 0) + 2;

370

371 /* create return buffer */

372 t = ret = (unsigned char *) fs_get (i + 1);

373 /* and scratch buffer */

374 utf16 = u16buf = (unsigned char *) fs_get (i + 1);

375 for (s = src; *s;) { /* pass two: copy destination octets */

376 if (*s & 0x80) { /* non-ASCII character? */

377 j = MAXUNIUTF8; /* get single UCS-4 codepoint */

378 if ((c = utf8_get (&s,&j)) & U8G_ERROR) return NIL;

379 if (c & U8GM_NONBMP) { /* non-BMP? */

380 c -= UTF16_BASE; /* yes, convert to surrogate */

381 utf16 = utf16_poot (utf16_poot (utf16,(c >> UTF16_SHIFT)+UTF16_SURRH),

382 (c & UTF16_MASK) + UTF16_SURRL);

383 }

384 else utf16 = utf16_poot (utf16,c);

385 }

386 else { /* ASCII character */

387 if (utf16 != u16buf) { /* add pending Modified BASE64 size + shifts */

388 t = utf16_to_mbase64 (t,u16buf,utf16 - u16buf);

389 utf16 = u16buf; /* reset buffer */

390 }

391 *t++ = *s; /* copy the character */

392 if (*s == '&') *t++ = '-';/* special sequence if the escape */

393 ++s; /* advance to next source octet */

394 }

395 }

396 /* add pending Modified BASE64 size + shifts */

397 if (utf16 != u16buf) t = utf16_to_mbase64 (t,u16buf,utf16 - u16buf);

398 *t = '\0'; /* tie off destination */

399 if (i != (t - ret)) fatal ("utf8_to_mutf7 botch");

400 fs_give ((void **) &u16buf);

401 return ret;

402 }

403

404 /* Convert Modified UTF-7 to UTF-8

405 * Accepts: Modified UTF-7 string

406 * Returns: UTF-8 string on success, NIL if invalid Modified UTF-7

407 */

408

409 unsigned char *utf8_from_mutf7 (unsigned char *src)

410 {

411 SIZEDTEXT utf8,utf7;

412 unsigned char *s;

413 int mbase64 = 0;

414 /* disallow bogus strings */

415 if (mail_utf7_valid (src)) return NIL;

416 /* initialize SIZEDTEXTs */

417 memset (&utf7,0,sizeof (SIZEDTEXT));

418 memset (&utf8,0,sizeof (SIZEDTEXT));

419 /* make copy of source */

420 for (s = cpytxt (&utf7,src,strlen (src)); *s; ++s) switch (*s) {

421 case '&': /* Modified UTF-7 uses & instead of + */

422 *s = '+';

423 mbase64 = T; /* note that we are in Modified BASE64 */

424 break;

425 case '+': /* temporarily swap text + to & */

426 if (!mbase64) *s = '&';

427 break;

428 case '-': /* shift back to ASCII */

429 mbase64 = NIL;

430 break;

431 case ',': /* Modified UTF-7 uses , instead of / ... */

432 if (mbase64) *s = '/'; /* ...in Modified BASE64 */

433 break;

434 }

435 /* do the conversion */

436 utf8_text_utf7 (&utf7,&utf8,NIL,NIL);

437 /* no longer need copy of source */

438 fs_give ((void **) &utf7.data);

439 /* post-process: switch & and + */

440 for (s = utf8.data; *s; ++s) switch (*s) {

441 case '&':

442 *s = '+';

443 break;

444 case '+':

445 *s = '&';

446 break;

447 }

448 return utf8.data;

449 }