Libparserutils
codec_ascii.c
Go to the documentation of this file.
1/*
2 * This file is part of LibParserUtils.
3 * Licensed under the MIT License,
4 * http://www.opensource.org/licenses/mit-license.php
5 * Copyright 2008 John-Mark Bell <jmb@netsurf-browser.org>
6 */
7
8#include <assert.h>
9#include <stdlib.h>
10#include <string.h>
11
13
15#include "utils/endian.h"
16#include "utils/utils.h"
17
21typedef struct charset_ascii_codec {
24#define READ_BUFSIZE (8)
28 size_t read_len;
30#define WRITE_BUFSIZE (8)
34 size_t write_len;
37
38static bool charset_ascii_codec_handles_charset(const char *charset);
40 const char *charset, parserutils_charset_codec **codec);
45 const uint8_t **source, size_t *sourcelen,
46 uint8_t **dest, size_t *destlen);
49 const uint8_t **source, size_t *sourcelen,
50 uint8_t **dest, size_t *destlen);
55 const uint8_t **source, size_t *sourcelen,
56 uint8_t **dest, size_t *destlen);
59 uint32_t ucs4, uint8_t **dest, size_t *destlen);
61 uint32_t ucs4, uint8_t **s, size_t *len);
63 const uint8_t *s, size_t len, uint32_t *ucs4);
64
/**
 * Determine whether this codec handles a specific charset.
 *
 * \param charset  Charset name to test. It is passed to strlen(), so it
 *                 must be a valid NUL-terminated string.
 * \return true if the name maps to the same MIB enum as "US-ASCII",
 *         false otherwise (including when "US-ASCII" itself cannot be
 *         resolved).
 */
bool charset_ascii_codec_handles_charset(const char *charset)
{
	/* Cached MIB enum for US-ASCII; 0 means "not yet resolved". */
	static uint16_t ascii;
	uint16_t match = parserutils_charset_mibenum_from_name(charset,
			strlen(charset));

	if (ascii == 0) {
		ascii = parserutils_charset_mibenum_from_name(
				"US-ASCII", SLEN("US-ASCII"));
	}

	if (ascii != 0 && ascii == match)
		return true;

	return false;
}
87
99{
101
102 UNUSED(charset);
103
104 c = malloc(sizeof(charset_ascii_codec));
105 if (c == NULL)
106 return PARSERUTILS_NOMEM;
107
108 c->read_buf[0] = 0;
109 c->read_len = 0;
110
111 c->write_buf[0] = 0;
112 c->write_len = 0;
113
114 /* Finally, populate vtable */
119
120 *codec = (parserutils_charset_codec *) c;
121
122 return PARSERUTILS_OK;
123}
124
132{
133 UNUSED(codec);
134
135 return PARSERUTILS_OK;
136}
137
166 const uint8_t **source, size_t *sourcelen,
167 uint8_t **dest, size_t *destlen)
168{
170 uint32_t ucs4;
171 uint32_t *towrite;
172 size_t towritelen;
173 parserutils_error error;
174
175 /* Process any outstanding characters from the previous call */
176 if (c->write_len > 0) {
177 uint32_t *pwrite = c->write_buf;
178
179 while (c->write_len > 0) {
180 error = charset_ascii_from_ucs4(c, pwrite[0],
181 dest, destlen);
182 if (error != PARSERUTILS_OK) {
183 uint32_t len;
184 assert(error == PARSERUTILS_NOMEM);
185
186 for (len = 0; len < c->write_len; len++) {
187 c->write_buf[len] = pwrite[len];
188 }
189
190 return error;
191 }
192
193 pwrite++;
194 c->write_len--;
195 }
196 }
197
198 /* Now process the characters for this call */
199 while (*sourcelen > 0) {
200 ucs4 = endian_big_to_host(*((uint32_t *) (void *) *source));
201 towrite = &ucs4;
202 towritelen = 1;
203
204 /* Output current characters */
205 while (towritelen > 0) {
206 error = charset_ascii_from_ucs4(c, towrite[0], dest,
207 destlen);
208 if (error != PARSERUTILS_OK) {
209 uint32_t len;
210 if (error != PARSERUTILS_NOMEM) {
211 return error;
212 }
213
214 /* Insufficient output space */
215 assert(towritelen < WRITE_BUFSIZE);
216
217 c->write_len = towritelen;
218
219 /* Copy pending chars to save area, for
220 * processing next call. */
221 for (len = 0; len < towritelen; len++)
222 c->write_buf[len] = towrite[len];
223
224 /* Claim character we've just buffered,
225 * so it's not reprocessed */
226 *source += 4;
227 *sourcelen -= 4;
228
229 return PARSERUTILS_NOMEM;
230 }
231
232 towrite++;
233 towritelen--;
234 }
235
236 *source += 4;
237 *sourcelen -= 4;
238 }
239
240 return PARSERUTILS_OK;
241}
242
285 const uint8_t **source, size_t *sourcelen,
286 uint8_t **dest, size_t *destlen)
287{
289 parserutils_error error;
290
291 if (c->read_len > 0) {
292 /* Output left over from last decode */
293 uint32_t *pread = c->read_buf;
294
295 while (c->read_len > 0 && *destlen >= c->read_len * 4) {
296 *((uint32_t *) (void *) *dest) =
297 endian_host_to_big(pread[0]);
298
299 *dest += 4;
300 *destlen -= 4;
301
302 pread++;
303 c->read_len--;
304 }
305
306 if (*destlen < c->read_len * 4) {
307 /* Ran out of output buffer */
308 size_t i;
309
310 /* Shuffle remaining output down */
311 for (i = 0; i < c->read_len; i++)
312 c->read_buf[i] = pread[i];
313
314 return PARSERUTILS_NOMEM;
315 }
316 }
317
318 /* Finally, the "normal" case; process all outstanding characters */
319 while (*sourcelen > 0) {
321 source, sourcelen, dest, destlen);
322 if (error != PARSERUTILS_OK) {
323 return error;
324 }
325 }
326
327 return PARSERUTILS_OK;
328}
329
337{
339
340 c->read_buf[0] = 0;
341 c->read_len = 0;
342
343 c->write_buf[0] = 0;
344 c->write_len = 0;
345
346 return PARSERUTILS_OK;
347}
348
349
379 const uint8_t **source, size_t *sourcelen,
380 uint8_t **dest, size_t *destlen)
381{
382 uint32_t ucs4;
383 parserutils_error error;
384
385 /* Convert a single character */
386 error = charset_ascii_to_ucs4(c, *source, *sourcelen, &ucs4);
387 if (error == PARSERUTILS_OK) {
388 /* Read a character */
390 ucs4, dest, destlen);
391 if (error == PARSERUTILS_OK || error == PARSERUTILS_NOMEM) {
392 /* output succeeded; update source pointers */
393 *source += 1;
394 *sourcelen -= 1;
395 }
396
397 return error;
398 } else if (error == PARSERUTILS_NEEDDATA) {
399 /* Can only happen if sourcelen == 0 */
400 return error;
401 } else if (error == PARSERUTILS_INVALID) {
402 /* Illegal input sequence */
403
404 /* Strict errormode; simply flag invalid character */
405 if (c->base.errormode ==
407 return PARSERUTILS_INVALID;
408 }
409
410 /* output U+FFFD and continue processing. */
412 0xFFFD, dest, destlen);
413 if (error == PARSERUTILS_OK || error == PARSERUTILS_NOMEM) {
414 /* output succeeded; update source pointers */
415 *source += 1;
416 *sourcelen -= 1;
417 }
418
419 return error;
420 }
421
422 return PARSERUTILS_OK;
423}
424
437 uint32_t ucs4, uint8_t **dest, size_t *destlen)
438{
439 if (*destlen < 4) {
440 /* Run out of output buffer */
441 c->read_len = 1;
442 c->read_buf[0] = ucs4;
443
444 return PARSERUTILS_NOMEM;
445 }
446
447 *((uint32_t *) (void *) *dest) = endian_host_to_big(ucs4);
448 *dest += 4;
449 *destlen -= 4;
450
451 return PARSERUTILS_OK;
452}
453
471 uint32_t ucs4, uint8_t **s, size_t *len)
472{
473 uint8_t out = 0;
474
475 if (*len < 1)
476 return PARSERUTILS_NOMEM;
477
478 if (ucs4 < 0x80) {
479 /* ASCII */
480 out = ucs4;
481 } else {
483 return PARSERUTILS_INVALID;
484 else
485 out = '?';
486 }
487
488 *(*s) = out;
489 (*s)++;
490 (*len)--;
491
492 return PARSERUTILS_OK;
493}
494
507 const uint8_t *s, size_t len, uint32_t *ucs4)
508{
509 uint32_t out;
510
511 UNUSED(c);
512
513 if (len < 1)
515
516 if (*s < 0x80) {
517 out = *s;
518 } else {
519 return PARSERUTILS_INVALID;
520 }
521
522 *ucs4 = out;
523
524 return PARSERUTILS_OK;
525}
526
530};
531
@ PARSERUTILS_CHARSET_CODEC_ERROR_STRICT
Abort processing if unrepresentable character encountered.
Definition: codec.h:64
size_t len
Definition: codec_8859.c:23
static parserutils_error charset_ascii_codec_encode(parserutils_charset_codec *codec, const uint8_t **source, size_t *sourcelen, uint8_t **dest, size_t *destlen)
Encode a chunk of UCS-4 (big endian) data into US-ASCII.
Definition: codec_ascii.c:165
static parserutils_error charset_ascii_codec_output_decoded_char(charset_ascii_codec *c, uint32_t ucs4, uint8_t **dest, size_t *destlen)
Output a UCS-4 character (big endian)
Definition: codec_ascii.c:435
static parserutils_error charset_ascii_codec_read_char(charset_ascii_codec *c, const uint8_t **source, size_t *sourcelen, uint8_t **dest, size_t *destlen)
Read a character from US-ASCII to UCS-4 (big endian)
Definition: codec_ascii.c:378
static parserutils_error charset_ascii_codec_destroy(parserutils_charset_codec *codec)
Destroy a US-ASCII codec.
Definition: codec_ascii.c:131
static parserutils_error charset_ascii_codec_create(const char *charset, parserutils_charset_codec **codec)
Create a US-ASCII codec.
Definition: codec_ascii.c:97
#define READ_BUFSIZE
Definition: codec_ascii.c:24
static parserutils_error charset_ascii_codec_decode(parserutils_charset_codec *codec, const uint8_t **source, size_t *sourcelen, uint8_t **dest, size_t *destlen)
Decode a chunk of US-ASCII data into UCS-4 (big endian)
Definition: codec_ascii.c:284
struct charset_ascii_codec charset_ascii_codec
US-ASCII charset codec.
static parserutils_error charset_ascii_codec_reset(parserutils_charset_codec *codec)
Clear a US-ASCII codec's encoding state.
Definition: codec_ascii.c:336
static parserutils_error charset_ascii_from_ucs4(charset_ascii_codec *c, uint32_t ucs4, uint8_t **s, size_t *len)
Convert a UCS4 (host endian) character to US-ASCII.
Definition: codec_ascii.c:470
static bool charset_ascii_codec_handles_charset(const char *charset)
Determine whether this codec handles a specific charset.
Definition: codec_ascii.c:71
const parserutils_charset_handler charset_ascii_codec_handler
Definition: codec_ascii.c:527
static parserutils_error charset_ascii_to_ucs4(charset_ascii_codec *c, const uint8_t *s, size_t len, uint32_t *ucs4)
Convert a US-ASCII character to UCS4 (host endian)
Definition: codec_ascii.c:506
#define WRITE_BUFSIZE
Definition: codec_ascii.c:30
static uint32_t endian_host_to_big(uint32_t host)
Definition: endian.h:24
static uint32_t endian_big_to_host(uint32_t big)
Definition: endian.h:32
parserutils_error
Definition: errors.h:18
@ PARSERUTILS_OK
Definition: errors.h:19
@ PARSERUTILS_NEEDDATA
Definition: errors.h:25
@ PARSERUTILS_INVALID
Definition: errors.h:23
@ PARSERUTILS_NOMEM
Definition: errors.h:21
uint16_t parserutils_charset_mibenum_from_name(const char *alias, size_t len)
Retrieve the MIB enum value assigned to an encoding name.
Definition: aliases.c:107
US-ASCII charset codec.
Definition: codec_ascii.c:21
size_t read_len
Character length of read_buf.
Definition: codec_ascii.c:28
size_t write_len
Character length of write_buf.
Definition: codec_ascii.c:34
uint32_t read_buf[READ_BUFSIZE]
Buffer for partial output sequences (decode) (host-endian)
Definition: codec_ascii.c:25
parserutils_charset_codec base
Base class.
Definition: codec_ascii.c:22
uint32_t write_buf[WRITE_BUFSIZE]
Buffer for partial output sequences (encode) (host-endian)
Definition: codec_ascii.c:31
Core charset codec definition; implementations extend this.
Definition: codec_impl.h:19
parserutils_charset_codec_errormode errormode
error mode
Definition: codec_impl.h:22
parserutils_error(* encode)(parserutils_charset_codec *codec, const uint8_t **source, size_t *sourcelen, uint8_t **dest, size_t *destlen)
Definition: codec_impl.h:26
parserutils_error(* destroy)(parserutils_charset_codec *codec)
Definition: codec_impl.h:25
parserutils_error(* decode)(parserutils_charset_codec *codec, const uint8_t **source, size_t *sourcelen, uint8_t **dest, size_t *destlen)
Definition: codec_impl.h:29
parserutils_error(* reset)(parserutils_charset_codec *codec)
Definition: codec_impl.h:32
struct parserutils_charset_codec::@3 handler
Vtable for handler code.
Codec factory component definition.
Definition: codec_impl.h:39
#define UNUSED(x)
Definition: utils.h:25
#define SLEN(s)
Definition: utils.h:21