Hubbub
parser.c
Go to the documentation of this file.
1 /*
2  * This file is part of Hubbub.
3  * Licensed under the MIT License,
4  * http://www.opensource.org/licenses/mit-license.php
5  * Copyright 2007-8 John-Mark Bell <jmb@netsurf-browser.org>
6  */
7 
8 #include <assert.h>
9 #include <string.h>
10 
11 #include <parserutils/charset/mibenum.h>
12 #include <parserutils/input/inputstream.h>
13 
14 #include <hubbub/parser.h>
15 
16 #include "charset/detect.h"
17 #include "tokeniser/tokeniser.h"
19 #include "utils/parserutilserror.h"
20 
24 struct hubbub_parser {
25  parserutils_inputstream *stream;
28 };
29 
41 hubbub_error hubbub_parser_create(const char *enc, bool fix_enc,
42  hubbub_parser **parser)
43 {
44  parserutils_error perror;
45  hubbub_error error;
46  hubbub_parser *p;
47 
48  if (parser == NULL)
49  return HUBBUB_BADPARM;
50 
51  p = malloc(sizeof(hubbub_parser));
52  if (p == NULL)
53  return HUBBUB_NOMEM;
54 
55  /* If we have an encoding and we're permitted to fix up likely broken
56  * ones, then attempt to do so. */
57  if (enc != NULL && fix_enc == true) {
58  uint16_t mibenum = parserutils_charset_mibenum_from_name(enc,
59  strlen(enc));
60 
61  if (mibenum != 0) {
63 
64  enc = parserutils_charset_mibenum_to_name(mibenum);
65  }
66  }
67 
68  perror = parserutils_inputstream_create(enc,
71  if (perror != PARSERUTILS_OK) {
72  free(p);
74  }
75 
76  error = hubbub_tokeniser_create(p->stream, &p->tok);
77  if (error != HUBBUB_OK) {
78  parserutils_inputstream_destroy(p->stream);
79  free(p);
80  return error;
81  }
82 
83  error = hubbub_treebuilder_create(p->tok, &p->tb);
84  if (error != HUBBUB_OK) {
86  parserutils_inputstream_destroy(p->stream);
87  free(p);
88  return error;
89  }
90 
91  *parser = p;
92 
93  return HUBBUB_OK;
94 }
95 
103 {
104  if (parser == NULL)
105  return HUBBUB_BADPARM;
106 
108 
109  hubbub_tokeniser_destroy(parser->tok);
110 
111  parserutils_inputstream_destroy(parser->stream);
112 
113  free(parser);
114 
115  return HUBBUB_OK;
116 }
117 
/* Configure a hubbub parser (tail of the hubbub_parser_setopt() definition).
 *
 * Returns HUBBUB_BADPARM if parser or params is NULL, HUBBUB_INVALID for an
 * option type that matches no case, and otherwise the result of the setopt
 * call the option was forwarded to (HUBBUB_OK if nothing was forwarded).
 *
 * NOTE(review): this listing omits several lines: the first line of the
 * signature, every case label except HUBBUB_PARSER_PAUSE, and the
 * option-type argument of each forwarded setopt call. They must be restored
 * from the original file before this block will compile. */
128  hubbub_parser_optparams *params)
129 {
130  hubbub_error result = HUBBUB_OK;
131 
132  if (parser == NULL || params == NULL)
133  return HUBBUB_BADPARM;
134 
135  switch (type) {
/* NOTE(review): case label missing here; per the comment below this branch
 * appears to handle a client-supplied token handler -- confirm. */
137  if (parser->tb != NULL) {
138  /* Client is defining their own token handler,
139  * so we must destroy the default treebuilder */
/* NOTE(review): listing line 140 is missing; presumably it held the call
 * that destroys the treebuilder before the pointer is cleared -- confirm. */
141  parser->tb = NULL;
142  }
/* Forward the option to the tokeniser. */
143  result = hubbub_tokeniser_setopt(parser->tok,
145  (hubbub_tokeniser_optparams *) params);
146  break;
147 
/* NOTE(review): case label missing here; per the comment below this branch
 * appears to handle the error handler option -- confirm. */
149  /* The error handler does not cascade, so tell both the
150  * treebuilder (if extant) and the tokeniser. */
151  if (parser->tb != NULL) {
152  result = hubbub_treebuilder_setopt(parser->tb,
154  (hubbub_treebuilder_optparams *) params);
155  }
/* The tokeniser is only told if the treebuilder call succeeded (or there
 * was no treebuilder, leaving result at its initial HUBBUB_OK). */
156  if (result == HUBBUB_OK) {
157  result = hubbub_tokeniser_setopt(parser->tok,
159  (hubbub_tokeniser_optparams *) params);
160  }
161  break;
162 
/* NOTE(review): case label missing here. Tokeniser-only option: forwarded
 * unconditionally. */
164  result = hubbub_tokeniser_setopt(parser->tok,
166  (hubbub_tokeniser_optparams *) params);
167  break;
168 
/* Pause option: forwarded to the tokeniser only. */
169  case HUBBUB_PARSER_PAUSE:
170  result = hubbub_tokeniser_setopt(parser->tok,
172  (hubbub_tokeniser_optparams *) params);
173  break;
174 
/* NOTE(review): case label missing here. Treebuilder-only option: if no
 * treebuilder exists nothing is forwarded and result stays HUBBUB_OK. */
176  if (parser->tb != NULL) {
177  result = hubbub_treebuilder_setopt(parser->tb,
179  (hubbub_treebuilder_optparams *) params);
180  }
181  break;
182 
/* NOTE(review): case label missing here. Treebuilder-only option, handled
 * the same way as the previous branch. */
184  if (parser->tb != NULL) {
185  result = hubbub_treebuilder_setopt(parser->tb,
187  (hubbub_treebuilder_optparams *) params);
188  }
189  break;
190 
/* NOTE(review): case label missing here. Treebuilder-only option, handled
 * the same way as the previous branch. */
192  if (parser->tb != NULL) {
193  result = hubbub_treebuilder_setopt(parser->tb,
195  (hubbub_treebuilder_optparams *) params);
196  }
197  break;
198 
/* Any option type not matched above is rejected. */
199  default:
200  result = HUBBUB_INVALID;
201  }
202 
203  return result;
204 }
205 
219  const uint8_t *data, size_t len)
220 {
221  if (parser == NULL || data == NULL)
222  return HUBBUB_BADPARM;
223 
224  return hubbub_tokeniser_insert_chunk(parser->tok, data, len);
225 }
226 
236  const uint8_t *data, size_t len)
237 {
238  parserutils_error perror;
239  hubbub_error error;
240 
241  if (parser == NULL || data == NULL)
242  return HUBBUB_BADPARM;
243 
244  perror = parserutils_inputstream_append(parser->stream, data, len);
245  if (perror != PARSERUTILS_OK)
247 
248  error = hubbub_tokeniser_run(parser->tok);
249  if (error == HUBBUB_BADENCODING) {
250  /* Ok, we autodetected an encoding that we don't actually
251  * support. We've not actually processed any data at this
252  * point so fall back to Windows-1252 and hope for the best
253  */
254  perror = parserutils_inputstream_change_charset(parser->stream,
255  "Windows-1252", HUBBUB_CHARSET_TENTATIVE);
256  /* Under no circumstances should we get here if we've managed
257  * to process data. If there is a way, I want to know about it
258  */
259  assert(perror != PARSERUTILS_INVALID);
260  if (perror != PARSERUTILS_OK)
262 
263  /* Retry the tokenisation */
264  error = hubbub_tokeniser_run(parser->tok);
265  }
266 
267  if (error != HUBBUB_OK)
268  return error;
269 
270  return HUBBUB_OK;
271 }
272 
280 {
281  parserutils_error perror;
282  hubbub_error error;
283 
284  if (parser == NULL)
285  return HUBBUB_BADPARM;
286 
287  perror = parserutils_inputstream_append(parser->stream, NULL, 0);
288  if (perror != PARSERUTILS_OK)
290 
291  error = hubbub_tokeniser_run(parser->tok);
292  if (error != HUBBUB_OK)
293  return error;
294 
295  return HUBBUB_OK;
296 }
297 
306  hubbub_charset_source *source)
307 {
308  const char *name;
309  uint32_t src;
310 
311  if (parser == NULL || source == NULL)
312  return NULL;
313 
314  name = parserutils_inputstream_read_charset(parser->stream, &src);
315 
316  *source = (hubbub_charset_source) src;
317 
318  return name;
319 }
320 
Charset may be changed with further data.
Definition: types.h:24
hubbub_parser_opttype
Hubbub parser option types.
Definition: parser.h:29
Hubbub parser option parameters.
Definition: parser.h:42
hubbub_error hubbub_tokeniser_setopt(hubbub_tokeniser *tokeniser, hubbub_tokeniser_opttype type, hubbub_tokeniser_optparams *params)
Configure a hubbub tokeniser.
Definition: tokeniser.c:366
Hubbub treebuilder option parameters.
Definition: treebuilder.h:36
parserutils_inputstream * stream
Input stream instance.
Definition: parser.c:25
hubbub_charset_source
Source of charset information, in order of importance. A client-dictated charset will override all others.
Definition: types.h:22
Tokeniser data structure.
Definition: tokeniser.c:165
Charset definite.
Definition: types.h:26
Hubbub tokeniser option parameters.
Definition: tokeniser.h:36
hubbub_error hubbub_tokeniser_destroy(hubbub_tokeniser *tokeniser)
Destroy a hubbub tokeniser.
Definition: tokeniser.c:340
hubbub_error hubbub_tokeniser_insert_chunk(hubbub_tokeniser *tokeniser, const uint8_t *data, size_t len)
Insert a chunk of data into the input stream.
Definition: tokeniser.c:415
hubbub_error hubbub_parser_completed(hubbub_parser *parser)
Inform the parser that the last chunk of data has been parsed.
Definition: parser.c:279
hubbub_error hubbub_parser_destroy(hubbub_parser *parser)
Destroy a hubbub parser.
Definition: parser.c:102
hubbub_treebuilder * tb
Treebuilder instance.
Definition: parser.c:27
Hubbub parser object.
Definition: parser.c:24
hubbub_error hubbub_parser_insert_chunk(hubbub_parser *parser, const uint8_t *data, size_t len)
Insert a chunk of data into a hubbub parser input stream.
Definition: parser.c:218
const char * name
Definition: initial.c:22
hubbub_error hubbub_parser_setopt(hubbub_parser *parser, hubbub_parser_opttype type, hubbub_parser_optparams *params)
Configure a hubbub parser.
Definition: parser.c:126
hubbub_error
Definition: errors.h:18
hubbub_error hubbub_treebuilder_destroy(hubbub_treebuilder *treebuilder)
Destroy a hubbub treebuilder.
Definition: treebuilder.c:155
hubbub_error hubbub_tokeniser_create(parserutils_inputstream *input, hubbub_tokeniser **tokeniser)
Create a hubbub tokeniser.
Definition: tokeniser.c:285
hubbub_error hubbub_treebuilder_setopt(hubbub_treebuilder *treebuilder, hubbub_treebuilder_opttype type, hubbub_treebuilder_optparams *params)
Configure a hubbub treebuilder.
Definition: treebuilder.c:232
element_type type
Definition: treebuilder.c:26
No error.
Definition: errors.h:19
size_t len
Definition: initial.c:23
static hubbub_error hubbub_error_from_parserutils_error(parserutils_error error)
Convert a ParserUtils error into a Hubbub error.
const char * hubbub_parser_read_charset(hubbub_parser *parser, hubbub_charset_source *source)
Read the document charset.
Definition: parser.c:305
parserutils_error hubbub_charset_extract(const uint8_t *data, size_t len, uint16_t *mibenum, uint32_t *source)
Extract a charset from a chunk of data.
Definition: detect.c:44
hubbub_error hubbub_tokeniser_run(hubbub_tokeniser *tokeniser)
Process remaining data in the input stream.
Definition: tokeniser.c:436
void hubbub_charset_fix_charset(uint16_t *charset)
Fix charsets, according to the override table in HTML5, section 8.2.2.2.
Definition: detect.c:667
Treebuilder object.
Definition: internal.h:116
hubbub_error hubbub_parser_parse_chunk(hubbub_parser *parser, const uint8_t *data, size_t len)
Pass a chunk of data to a hubbub parser for parsing.
Definition: parser.c:235
hubbub_error hubbub_parser_create(const char *enc, bool fix_enc, hubbub_parser **parser)
Create a hubbub parser.
Definition: parser.c:41
hubbub_tokeniser * tok
Tokeniser instance.
Definition: parser.c:26
hubbub_error hubbub_treebuilder_create(hubbub_tokeniser *tokeniser, hubbub_treebuilder **treebuilder)
Create a hubbub treebuilder.
Definition: treebuilder.c:94