Mercurial > dovecot > core-2.2
view src/lib-mail/mail-html2text.c @ 19552:0f22db71df7a
global: freshen copyright
git ls-files | xargs perl -p -i -e 's/(\d+)-201[0-5]/$1-2016/g;s/ (201[0-5]) Dovecot/ $1-2016 Dovecot/'
author | Timo Sirainen <timo.sirainen@dovecot.fi> |
---|---|
date | Wed, 13 Jan 2016 12:24:03 +0200 |
parents | 17aac21b303d |
children | d2e877350c52 |
line wrap: on
line source
/* Copyright (c) 2011-2016 Dovecot authors, see the included COPYING file */

#include "lib.h"
#include "buffer.h"
#include "unichar.h"
#include "message-parser.h"
#include "mail-html2text.h"

/* Zero-width space (&#8203;) apparently also belongs here, but that gets a
   bit tricky to handle.. is it actually used anywhere? */
#define HTML_WHITESPACE(c) \
	((c) == ' ' || (c) == '\t' || (c) == '\r' || (c) == '\n')

enum html_state {
	/* regular text */
	HTML_STATE_TEXT,
	/* tag outside "quoted string" */
	HTML_STATE_TAG,
	/* tag inside "double quoted string" */
	HTML_STATE_TAG_DQUOTED,
	/* tag -> "escape\ */
	HTML_STATE_TAG_DQUOTED_ESCAPE,
	/* tag inside 'single quoted string' */
	HTML_STATE_TAG_SQUOTED,
	/* tag -> 'escape\ */
	HTML_STATE_TAG_SQUOTED_ESCAPE,
	/* comment */
	HTML_STATE_COMMENT,
	/* comment is ending, we've seen "--" and now just waiting for ">" */
	HTML_STATE_COMMENT_END,
	/* (java)script */
	HTML_STATE_SCRIPT,
	/* CSS style */
	HTML_STATE_STYLE,
	/* <![CDATA[...]]> */
	HTML_STATE_CDATA
};

struct mail_html2text {
	enum mail_html2text_flags flags;
	/* parser state carried over between mail_html2text_more() calls */
	enum html_state state;
	/* trailing input bytes that couldn't be parsed yet, because the
	   token they begin was still incomplete */
	buffer_t *input;

	/* <blockquote> nesting depth. Only tracked when
	   MAIL_HTML2TEXT_FLAG_SKIP_QUOTED is set. Text is written to output
	   only while this is 0. */
	unsigned int quote_level;
	/* NOTE(review): only ever cleared in this file, never set or read */
	bool ignore_next_text;
};

/* Named HTML entity -> Unicode code point table. The entries come from
   html-entities.h. */
static const struct {
	const char *name;
	unichar_t chr;
} html_entities[] = {
#include "html-entities.h"
};

/* Allocate a new HTML-to-text converter using the given flags.
   Free it with mail_html2text_deinit(). */
struct mail_html2text *
mail_html2text_init(enum mail_html2text_flags flags)
{
	struct mail_html2text *ht;

	ht = i_new(struct mail_html2text, 1);
	ht->flags = flags;
	ht->input = buffer_create_dynamic(default_pool, 512);
	return ht;
}

/* Look at the beginning of a tag and switch ht->state accordingly.
   data points to the first byte after the '<'.

   Returns the number of input bytes consumed, counting the '<' itself,
   or 0 if there isn't yet enough data to recognize the tag. */
static size_t
parse_tag_name(struct mail_html2text *ht,
	       const unsigned char *data, size_t size)
{
	size_t i;

	if (size >= 3 && memcmp(data, "!--", 3) == 0) {
		ht->state = HTML_STATE_COMMENT;
		return 3 + 1;
	}
	if (size >= 7 && i_memcasecmp(data, "script", 6) == 0 &&
	    (HTML_WHITESPACE(data[6]) || data[6] == '>')) {
		ht->state = HTML_STATE_SCRIPT;
		return 7 + 1;
	}
	if (size >= 6 && i_memcasecmp(data, "style", 5) == 0 &&
	    (HTML_WHITESPACE(data[5]) || data[5] == '>')) {
		ht->state = HTML_STATE_STYLE;
		return 6 + 1;
	}
	if (size >= 8 &&
	    i_memcasecmp(data, "![CDATA[", 8) == 0) {
		ht->state = HTML_STATE_CDATA;
		return 8 + 1;
	}
	if ((ht->flags & MAIL_HTML2TEXT_FLAG_SKIP_QUOTED) != 0) {
		/* data[10] is examined, so at least 11 bytes are needed.
		   With exactly 10 bytes we fall through to the
		   "need more data" check below. */
		if (size >= 11 && i_memcasecmp(data, "blockquote", 10) == 0 &&
		    (HTML_WHITESPACE(data[10]) || data[10] == '>')) {
			ht->quote_level++;
			ht->state = HTML_STATE_TAG;
			return 1;
		} else if (ht->quote_level > 0 &&
			   size >= 12 &&
			   i_memcasecmp(data, "/blockquote>", 12) == 0) {
			if (--ht->quote_level == 0)
				ht->ignore_next_text = FALSE;
			ht->state = HTML_STATE_TAG;
			return 1;
		}
	}
	if (size < 12) {
		/* can we see the whole tag name? */
		for (i = 0; i < size; i++) {
			if (HTML_WHITESPACE(data[i]) || data[i] == '>')
				break;
		}
		if (i == size) {
			/* need more data */
			return 0;
		}
	}
	ht->state = HTML_STATE_TAG;
	return 1;
}

/* Look up a named entity (without the '&' and ';') case-insensitively.
   Returns TRUE and sets *chr_r if the name is in html_entities[]. */
static bool html_entity_get_unichar(const char *name, unichar_t *chr_r)
{
	unsigned int i;

	for (i = 0; i < N_ELEMENTS(html_entities); i++) {
		if (strcasecmp(html_entities[i].name, name) == 0) {
			*chr_r = html_entities[i].chr;
			return TRUE;
		}
	}
	return FALSE;
}

/* Parse an entity reference. data points to the first byte after the '&'.
   A recognized entity is written as UTF-8 to output, unless output is NULL
   (used when the surrounding text is being skipped). Unknown entity names
   produce no output.

   Returns the number of input bytes consumed, counting the '&' itself:
   1 for a broken entity (only the '&' is dropped), or 0 if the terminating
   ';' hasn't been seen yet and more data is needed. */
static size_t parse_entity(const unsigned char *data, size_t size,
			   buffer_t *output)
{
	char entity[10];
	unichar_t chr;
	size_t i;

	for (i = 0; i < size; i++) {
		if (HTML_WHITESPACE(data[i]) || i >= sizeof(entity)) {
			/* broken entity */
			return 1;
		}
		if (data[i] == ';')
			break;
	}
	if (i == size)
		return 0;

	i_assert(i < sizeof(entity));
	memcpy(entity, data, i);
	entity[i] = '\0';

	if (output != NULL && html_entity_get_unichar(entity, &chr))
		uni_ucs4_to_utf8_c(chr, output);
	/* name + ';' + the leading '&' */
	return i + 1 + 1;
}

/* Append a space to output, unless output is empty or already ends with
   a space. */
static void mail_html2text_add_space(buffer_t *output)
{
	const unsigned char *data = output->data;

	if (output->used > 0 && data[output->used-1] != ' ')
		buffer_append_c(output, ' ');
}

/* Run the state machine over data, writing the extracted text to output.

   Returns the number of bytes consumed. This is less than size when the
   input ends in the middle of a token (tag name, entity, "--", "</script>",
   "</style>" or "]]>") that can't be classified yet. The caller must
   provide the unconsumed bytes again with more data appended. */
static size_t parse_data(struct mail_html2text *ht,
			 const unsigned char *data, size_t size,
			 buffer_t *output)
{
	size_t i, ret;

	for (i = 0; i < size; i++) {
		unsigned char c = data[i];

		switch (ht->state) {
		case HTML_STATE_TEXT:
			if (c == '<') {
				ret = parse_tag_name(ht, data+i+1, size-i-1);
				if (ret == 0)
					return i;
				/* the loop's i++ accounts for the last byte */
				i += ret - 1;
			} else if (c == '&') {
				/* entities inside skipped quotes are parsed,
				   but not written, the same as plain text */
				ret = parse_entity(data+i+1, size-i-1,
						   ht->quote_level == 0 ?
						   output : NULL);
				if (ret == 0)
					return i;
				i += ret - 1;
			} else if (ht->quote_level == 0) {
				buffer_append_c(output, c);
			}
			break;
		case HTML_STATE_TAG:
			if (c == '"')
				ht->state = HTML_STATE_TAG_DQUOTED;
			else if (c == '\'')
				ht->state = HTML_STATE_TAG_SQUOTED;
			else if (c == '>') {
				ht->state = HTML_STATE_TEXT;
				mail_html2text_add_space(output);
			}
			break;
		case HTML_STATE_TAG_DQUOTED:
			if (c == '"')
				ht->state = HTML_STATE_TAG;
			else if (c == '\\')
				ht->state = HTML_STATE_TAG_DQUOTED_ESCAPE;
			break;
		case HTML_STATE_TAG_DQUOTED_ESCAPE:
			ht->state = HTML_STATE_TAG_DQUOTED;
			break;
		case HTML_STATE_TAG_SQUOTED:
			if (c == '\'')
				ht->state = HTML_STATE_TAG;
			else if (c == '\\')
				ht->state = HTML_STATE_TAG_SQUOTED_ESCAPE;
			break;
		case HTML_STATE_TAG_SQUOTED_ESCAPE:
			ht->state = HTML_STATE_TAG_SQUOTED;
			break;
		case HTML_STATE_COMMENT:
			if (c == '-') {
				/* need the next byte to know if this is "--" */
				if (i+1 == size)
					return i;
				if (data[i+1] == '-') {
					ht->state = HTML_STATE_COMMENT_END;
					i++;
				}
			}
			break;
		case HTML_STATE_COMMENT_END:
			if (c == '>')
				ht->state = HTML_STATE_TEXT;
			else if (!HTML_WHITESPACE(c))
				ht->state = HTML_STATE_COMMENT;
			break;
		case HTML_STATE_SCRIPT:
			if (c == '<') {
				size_t max_len = I_MIN(size-i, 9);

				if (i_memcasecmp(data+i, "</script>",
						 max_len) == 0) {
					/* matches so far, but is incomplete */
					if (max_len < 9)
						return i;
					mail_html2text_add_space(output);
					ht->state = HTML_STATE_TEXT;
					i += 8;
				}
			}
			break;
		case HTML_STATE_STYLE:
			if (c == '<') {
				size_t max_len = I_MIN(size-i, 8);

				if (i_memcasecmp(data+i, "</style>",
						 max_len) == 0) {
					/* matches so far, but is incomplete */
					if (max_len < 8)
						return i;
					mail_html2text_add_space(output);
					ht->state = HTML_STATE_TEXT;
					i += 7;
				}
			}
			break;
		case HTML_STATE_CDATA:
			if (c == ']') {
				size_t max_len = I_MIN(size-i, 3);

				if (i_memcasecmp(data+i, "]]>", max_len) == 0) {
					/* matches so far, but is incomplete */
					if (max_len < 3)
						return i;
					ht->state = HTML_STATE_TEXT;
					i += 2;
					break;
				}
			}
			/* CDATA contents are regular text */
			if (ht->quote_level == 0)
				buffer_append_c(output, c);
			break;
		}
	}
	return i;
}

/* Feed more HTML input to the converter and append the resulting text to
   output. size must be larger than 0. Input that ends in the middle of
   a token is remembered in ht->input and finished by the next call. */
void mail_html2text_more(struct mail_html2text *ht,
			 const unsigned char *data, size_t size,
			 buffer_t *output)
{
	size_t pos, inc_size, buf_orig_size;

	i_assert(size > 0);

	while (ht->input->used > 0) {
		/* we didn't get enough input the last time to know
		   what to do. */
		buf_orig_size = ht->input->used;

		inc_size = I_MIN(size, 128);
		buffer_append(ht->input, data, inc_size);
		pos = parse_data(ht, ht->input->data,
				 ht->input->used, output);
		if (pos >= buf_orig_size) {
			/* all of the old data was parsed, and possibly
			   some of the new data as well */
			data += pos - buf_orig_size;
			size -= pos - buf_orig_size;
			buffer_set_used_size(ht->input, 0);
		} else {
			/* The buffer still begins with (pos == 0) or contains
			   (pos > 0) an incomplete token. In the latter case
			   the parsed prefix must be dropped, so it won't be
			   parsed and written to output twice. All of the
			   inc_size bytes stay in the buffer. */
			if (pos > 0)
				buffer_delete(ht->input, 0, pos);
			data += inc_size;
			size -= inc_size;
			if (size == 0)
				return;
		}
	}
	pos = parse_data(ht, data, size, output);
	/* remember the unparsed tail for the next call */
	buffer_append(ht->input, data + pos, size - pos);
}

/* Free the converter and set *_ht to NULL. */
void mail_html2text_deinit(struct mail_html2text **_ht)
{
	struct mail_html2text *ht = *_ht;

	*_ht = NULL;
	buffer_free(&ht->input);
	i_free(ht);
}