13 #include <parserutils/charset/mibenum.h>
27 const uint8_t **
name, uint32_t *namelen,
28 const uint8_t **value, uint32_t *valuelen);
45 uint16_t *mibenum, uint32_t *source)
49 if (data == NULL || mibenum == NULL || source == NULL)
50 return PARSERUTILS_BADPARM;
69 return PARSERUTILS_OK;
80 goto default_encoding;
88 return PARSERUTILS_OK;
117 if (charset != parserutils_charset_mibenum_from_name(
118 "UTF-32",
SLEN(
"UTF-32")) &&
119 charset != parserutils_charset_mibenum_from_name(
120 "UTF-32LE",
SLEN(
"UTF-32LE")) &&
121 charset != parserutils_charset_mibenum_from_name(
122 "UTF-32BE",
SLEN(
"UTF-32BE"))) {
127 return PARSERUTILS_OK;
141 charset = parserutils_charset_mibenum_from_name(
"Windows-1252",
142 SLEN(
"Windows-1252"));
144 charset = parserutils_charset_mibenum_from_name(
"ISO-8859-1",
150 return PARSERUTILS_OK;
171 if (data[0] == 0xFE && data[1] == 0xFF) {
172 return parserutils_charset_mibenum_from_name(
"UTF-16BE",
174 }
else if (data[0] == 0xFF && data[1] == 0xFE) {
175 return parserutils_charset_mibenum_from_name(
"UTF-16LE",
177 }
else if (data[0] == 0xEF && data[1] == 0xBB && data[2] == 0xBF) {
178 return parserutils_charset_mibenum_from_name(
"UTF-8",
186 (pos < end - SLEN(a) && \
187 strncasecmp((const char *) pos, a, SLEN(a)) == 0)
190 while (pos < end - SLEN(a)) { \
196 if (pos == end - SLEN(a)) \
200 (a == 0x09 || a == 0x0a || a == 0x0c || \
201 a == 0x0d || a == 0x20 || a == 0x2f)
212 const uint8_t *pos = data;
219 end = pos +
min(512, len);
228 }
else if (
PEEK(
"<meta")) {
229 if (pos +
SLEN(
"<meta") >= end - 1)
234 pos +=
SLEN(
"<meta");
245 }
else if ((
PEEK(
"</") && (pos < end - 3 &&
246 (0x41 <= (*(pos + 2) & ~ 0x20) &&
247 (*(pos + 2) & ~ 0x20) <= 0x5A))) ||
248 (pos < end - 2 && *pos ==
'<' &&
249 (0x41 <= (*(pos + 1) & ~ 0x20) &&
250 (*(pos + 1) & ~ 0x20) <= 0x5A))) {
258 *pos ==
'>' || *pos ==
'<')
304 const uint8_t *value;
305 uint32_t namelen, valuelen;
306 uint16_t mibenum = 0;
308 if (pos == NULL || *pos == NULL || end == NULL)
313 &name, &namelen, &value, &valuelen)) {
317 if (namelen ==
SLEN(
"charset") && valuelen > 0 &&
318 strncasecmp((
const char *) name,
"charset",
319 SLEN(
"charset")) == 0) {
326 while (valuelen > 0 &&
ISSPACE(value[valuelen - 1]))
329 mibenum = parserutils_charset_mibenum_from_name(
330 (
const char *) value, valuelen);
332 }
else if (namelen ==
SLEN(
"content") && valuelen > 0 &&
333 strncasecmp((
const char *) name,
"content",
334 SLEN(
"content")) == 0) {
340 if (mibenum == parserutils_charset_mibenum_from_name(
341 "UTF-16LE",
SLEN(
"UTF-16LE")) ||
343 parserutils_charset_mibenum_from_name(
344 "UTF-16BE",
SLEN(
"UTF-16BE")) ||
346 parserutils_charset_mibenum_from_name(
347 "UTF-16",
SLEN(
"UTF-16"))) {
348 mibenum = parserutils_charset_mibenum_from_name(
349 "UTF-8",
SLEN(
"UTF-8"));
373 const uint8_t *tentative = NULL;
374 uint32_t tentative_len = 0;
379 end = value + valuelen;
382 while (value < end) {
395 while (value < end &&
ISSPACE(*value)) {
403 if (value < end -
SLEN(
"charset") &&
404 strncasecmp((
const char *) value,
405 "charset",
SLEN(
"charset")) != 0)
408 value +=
SLEN(
"charset");
411 while (value < end &&
ISSPACE(*value)) {
425 while (value < end &&
ISSPACE(*value)) {
437 while (++value < end && *value !=
'"') {
446 }
else if (*value ==
'\'') {
447 while (++value < end && *value !=
'\'') {
457 while (value < end && !
ISSPACE(*value)) {
464 if (tentative != NULL) {
465 return parserutils_charset_mibenum_from_name(
466 (
const char *) tentative, tentative_len);
488 const uint8_t **
name, uint32_t *namelen,
489 const uint8_t **value, uint32_t *valuelen)
493 if (data == NULL || *data == NULL || end == NULL || name == NULL ||
494 namelen == NULL || value == NULL || valuelen == NULL)
500 while (pos < end && (
ISSPACE(*pos) || *pos ==
'/')) {
525 *value = (
const uint8_t *)
"";
541 if (*pos ==
'/' || *pos ==
'<' || *pos ==
'>') {
562 while (pos < end &&
ISSPACE(*pos)) {
583 while (pos < end &&
ISSPACE(*pos)) {
594 if (*pos ==
'\'' || *pos ==
'"') {
596 const uint8_t *quote = pos;
599 while (++pos < end) {
601 if (*pos == *quote) {
602 *value = (quote + 1);
622 if (*pos ==
'<' || *pos ==
'>') {
635 if (
ISSPACE(*pos) || *pos ==
'<' || *pos ==
'>') {
670 assert(*charset != 0);
673 if (*charset == parserutils_charset_mibenum_from_name(
674 "ISO-8859-1",
SLEN(
"ISO-8859-1"))) {
675 tmp = parserutils_charset_mibenum_from_name(
676 "Windows-1252",
SLEN(
"Windows-1252"));
677 assert(tmp != 0 &&
"Windows-1252 MUST be supported");
679 }
else if (*charset == parserutils_charset_mibenum_from_name(
680 "ISO-8859-9",
SLEN(
"ISO-8859-9"))) {
681 tmp = parserutils_charset_mibenum_from_name(
682 "Windows-1254",
SLEN(
"Windows-1254"));
684 }
else if (*charset == parserutils_charset_mibenum_from_name(
685 "ISO-8859-11",
SLEN(
"ISO-8859-11"))) {
686 tmp = parserutils_charset_mibenum_from_name(
687 "Windows-874",
SLEN(
"Windows-874"));
689 }
else if (*charset == parserutils_charset_mibenum_from_name(
690 "KS_C_5601-1987",
SLEN(
"KS_C_5601-1987")) ||
691 *charset == parserutils_charset_mibenum_from_name(
692 "EUC-KR",
SLEN(
"EUC-KR"))) {
693 tmp = parserutils_charset_mibenum_from_name(
694 "Windows-949",
SLEN(
"Windows-949"));
696 }
else if (*charset == parserutils_charset_mibenum_from_name(
697 "TIS-620",
SLEN(
"TIS-620"))) {
698 tmp = parserutils_charset_mibenum_from_name(
699 "Windows-874",
SLEN(
"Windows-874"));
701 }
else if (*charset == parserutils_charset_mibenum_from_name(
702 "x-x-big5",
SLEN(
"x-x-big5"))) {
703 tmp = parserutils_charset_mibenum_from_name(
704 "Big5",
SLEN(
"Big5"));
706 }
else if (*charset == parserutils_charset_mibenum_from_name(
707 "GB2312",
SLEN(
"GB2312")) ||
708 *charset == parserutils_charset_mibenum_from_name(
709 "GB_2312-80",
SLEN(
"GB_2312-80"))) {
710 tmp = parserutils_charset_mibenum_from_name(
Charset may be changed with further data.
static uint16_t hubbub_charset_read_bom(const uint8_t *data, size_t len)
Inspect the beginning of a buffer of data for the presence of a UTF Byte Order Mark.
static bool hubbub_charset_get_attribute(const uint8_t **data, const uint8_t *end, const uint8_t **name, uint32_t *namelen, const uint8_t **value, uint32_t *valuelen)
Extract an attribute from the data stream.
static uint16_t hubbub_charset_scan_meta(const uint8_t *data, size_t len)
Search for a meta charset within a buffer of data.
static uint16_t hubbub_charset_parse_attributes(const uint8_t **pos, const uint8_t *end)
Parse attributes on a meta tag.
uint16_t hubbub_charset_parse_content(const uint8_t *value, uint32_t valuelen)
Parse a content= attribute's value.
parserutils_error hubbub_charset_extract(const uint8_t *data, size_t len, uint16_t *mibenum, uint32_t *source)
Extract a charset from a chunk of data.
void hubbub_charset_fix_charset(uint16_t *charset)
Fix charsets, according to the override table in HTML5, section 8.2.2.2.