Mercurial > dovecot > original-hg > dovecot-1.2
view src/lib-mail/rfc822-tokenize.c @ 896:21ffcce83c70 HEAD
Rewrote rfc822-tokenize.c to work one token at a time so it won't uselessly
take memory, maybe also a bit faster. This caused pretty large changes all
around.
Also moved all string (un)escaping code to lib/strescape.c.
author | Timo Sirainen <tss@iki.fi> |
---|---|
date | Fri, 03 Jan 2003 17:57:12 +0200 |
parents | 5ac361acb316 |
children | e27267f227e6 |
line wrap: on
line source
/* Copyright (C) 2002 Timo Sirainen */

/* Incremental RFC 822 tokenizer: each call to rfc822_tokenize_next() scans
   forward from the previous position and exposes exactly one token, so no
   token list is ever allocated. */

#include "lib.h"
#include "str.h"
#include "strescape.h"
#include "rfc822-tokenize.h"

/* Tokenizer state. The input buffer is only referenced, never copied. */
struct _Rfc822TokenizeContext {
	const char *data;	/* input being tokenized */
	size_t size;		/* number of bytes usable in data */

	/* optional callback invoked on syntax errors; may be NULL */
	Rfc822TokenizeErrorFunc error_func;
	void *error_context;

	int token;		/* current token, -1 before the first
				   rfc822_tokenize_next() call */
	size_t token_pos, token_len; /* value of the current string token,
					as offset/length into data */
	size_t parse_pos;	/* offset where the next scan starts */

	/* NOTE(review): skip_comments is stored by init and by its setter,
	   but nothing in this file reads it - comments are always returned
	   as tokens. Confirm whether that is intended. */
	unsigned int skip_comments:1;
	unsigned int dot_token:1;  /* '.' is its own token instead of
				      being part of an atom */
	unsigned int in_bracket:1; /* '<' seen without matching '>' yet */
};

/* Report a generic syntax error at offset i. Makes the enclosing function
   return FALSE when the callback returns false. Expects ctx, data and i to
   be in scope. */
#define PARSE_ERROR() \
	STMT_START { \
	if (ctx->error_func != NULL && \
	    !ctx->error_func(data, i, '\0', ctx->error_context)) \
		return FALSE; \
	} STMT_END

/* Same as PARSE_ERROR(), but passes the character that was expected and
   not found to the callback. */
#define PARSE_ERROR_MISSING(c) \
	STMT_START { \
	if (ctx->error_func != NULL && \
	    !ctx->error_func(data, i, c, ctx->error_context)) \
		return FALSE; \
	} STMT_END

/* Allocate a tokenizer over data[0..size-1]. Scanning also stops at the
   first NUL byte. error_func may be NULL; error_context is handed back to
   it unchanged. Defaults: skip_comments and dot_token are both enabled.
   Free the result with rfc822_tokenize_deinit(). */
Rfc822TokenizeContext *
rfc822_tokenize_init(const char *data, size_t size,
		     Rfc822TokenizeErrorFunc error_func, void *error_context)
{
	Rfc822TokenizeContext *ctx;

	/* parse_pos and in_bracket are not set explicitly below -
	   presumably i_new() returns zeroed memory; TODO confirm */
	ctx = i_new(Rfc822TokenizeContext, 1);
	ctx->data = data;
	ctx->size = size;

	ctx->error_func = error_func;
	ctx->error_context = error_context;

	ctx->skip_comments = TRUE;
	ctx->dot_token = TRUE;

	ctx->token = -1;
	return ctx;
}

/* Free a context created by rfc822_tokenize_init(). */
void rfc822_tokenize_deinit(Rfc822TokenizeContext *ctx)
{
	i_free(ctx);
}

/* Store the skip-comments setting (see the review note on the struct
   field: the stored value is not consulted by the code in this file). */
void rfc822_tokenize_skip_comments(Rfc822TokenizeContext *ctx, int set)
{
	ctx->skip_comments = set;
}

/* Select whether '.' is returned as a separate token (set != 0) or is
   treated as an ordinary atom character (set == 0). */
void rfc822_tokenize_dot_token(Rfc822TokenizeContext *ctx, int set)
{
	ctx->dot_token = set;
}

/* Advance to the next token.

   Returns FALSE if the previous token already was TOKEN_LAST, or if the
   error callback returned false for a syntax error. Otherwise returns TRUE
   and the token is readable with rfc822_tokenize_get(); at end of input the
   token is TOKEN_LAST.

   Token values produced here are the literal character for specials and
   brackets, '(' for a comment, '[' for a domain literal, '"' for a quoted
   string and 'A' for an atom. For the four string-like tokens
   token_pos/token_len describe the content without the surrounding
   delimiters; backslash escapes are left in place. */
int rfc822_tokenize_next(Rfc822TokenizeContext *ctx)
{
	int token, level, last_atom;
	const char *data;
	size_t i, size;

	if (ctx->token == TOKEN_LAST)
		return FALSE;

	data = ctx->data;
	size = ctx->size;

	/* stays TOKEN_LAST unless something is found below */
	ctx->token = TOKEN_LAST;

	last_atom = FALSE;
	for (i = ctx->parse_pos; i < size && data[i] != '\0'; i++) {
		token = -1;
		switch (data[i]) {
		case ' ':
		case '\t':
		case '\r':
		case '\n':
			/* skip whitespace */
			break;

		case '(':
			/* (comment) - nesting is allowed */
			if (last_atom)
				break;

			token = '(';
			ctx->token_pos = ++i;

			level = 1;
			for (; i < size && data[i] != '\0'; i++) {
				if (data[i] == '\\' &&
				    i+1 < size && data[i+1] != '\0')
					i++;
				else if (data[i] == '(')
					level++;
				else if (data[i] == ')') {
					if (--level == 0)
						break;
				}
			}

			if (level > 0)
				PARSE_ERROR_MISSING(')');

			ctx->token_len = (size_t) (i - ctx->token_pos);
			break;

		case '[':
			/* domain literal - nesting isn't allowed */
			if (last_atom)
				break;

			token = '[';
			ctx->token_pos = ++i;

			while (i < size && data[i] != '\0' && data[i] != ']') {
				if (data[i] == '\\' &&
				    i+1 < size && data[i+1] != '\0')
					i++;
				else if (data[i] == '[') {
					/* nesting not allowed, but
					   continue anyway */
					PARSE_ERROR();
				}

				i++;
			}

			if (i == size || data[i] == '\0')
				PARSE_ERROR_MISSING(']');

			ctx->token_len = (size_t) (i - ctx->token_pos);
			break;

		case '"':
			/* quoted string */
			if (last_atom)
				break;

			token = '"';
			ctx->token_pos = ++i;

			while (i < size && data[i] != '\0' && data[i] != '"') {
				if (data[i] == '\\' &&
				    i+1 < size && data[i+1] != '\0')
					i++;
				i++;
			}

			if (i == size || data[i] == '\0')
				PARSE_ERROR_MISSING('"');

			ctx->token_len = (size_t) (i - ctx->token_pos);
			break;

		case '<':
			if (last_atom)
				break;

			if (ctx->in_bracket) {
				/* '<' cannot be nested */
				PARSE_ERROR();
			}

			token = '<';
			ctx->in_bracket = TRUE;
			break;

		case '>':
			if (last_atom)
				break;

			if (!ctx->in_bracket) {
				/* missing '<' */
				PARSE_ERROR();
			}

			token = '>';
			ctx->in_bracket = FALSE;
			break;

		case ')':
		case ']':
		case '\\':
			/* Stray closer or backslash. If it terminates an
			   atom, parse_pos is left pointing at it and the
			   next call reads it again, so report the error
			   only on that later call. Otherwise the callback
			   would see the same error twice, and a false
			   return from it would abort before the atom's
			   length was recorded. */
			if (!last_atom) {
				PARSE_ERROR();
			}
			/* fall through */

		/* RFC822 specials: */
		case '@':
		case ',':
		case ';':
		case ':':
		case '.':
		/* RFC 2045 specials: */
		case '/':
		case '?':
		case '=':
			token = ctx->data[i];
			if (token != '.' || ctx->dot_token)
				break;
			/* '.' with dot_token disabled: treat as atom char */
			/* fall through */
		default:
			/* atom */
			token = 'A';
			if (!last_atom) {
				ctx->token = token;
				ctx->token_pos = i;
				last_atom = TRUE;
			}
			break;
		}

		if (last_atom) {
			if (token != 'A') {
				/* end of atom; i is not advanced so the
				   terminating character is scanned again
				   by the next call */
				ctx->token_len = (size_t) (i - ctx->token_pos);
				last_atom = FALSE;
				break;
			}
		} else {
			if (token != -1) {
				ctx->token = token;
				/* step over the token's last character,
				   unless the scan already hit the end */
				if (i < ctx->size && data[i] != '\0')
					i++;
				break;
			}
		}

		if (i == ctx->size || data[i] == '\0') {
			/* unexpected eol */
			break;
		}
	}

	if (last_atom) {
		/* end of atom - input ended while still inside it */
		ctx->token_len = (size_t) (i - ctx->token_pos);
	}

	ctx->parse_pos = i;

	if (ctx->token == TOKEN_LAST && ctx->in_bracket &&
	    ctx->error_func != NULL) {
		/* input ended inside <...> */
		if (!ctx->error_func(data, i, '>', ctx->error_context))
			return FALSE;
	}

	return TRUE;
}

/* Return the token found by the latest rfc822_tokenize_next() call. */
Rfc822Token rfc822_tokenize_get(const Rfc822TokenizeContext *ctx)
{
	return ctx->token;
}

/* Return a pointer into the input buffer for the current token's value and
   store its length in *len. The value is not NUL-terminated. Asserts that
   the current token satisfies IS_TOKEN_STRING(). */
const char *rfc822_tokenize_get_value(const Rfc822TokenizeContext *ctx,
				      size_t *len)
{
	i_assert(IS_TOKEN_STRING(ctx->token));

	*len = ctx->token_len;
	return ctx->data + ctx->token_pos;
}

/* Read tokens and append their text to str until one of stop_tokens (an
   array terminated by TOKEN_LAST) or the end of input is reached.

   - Comments are never written to str. If comments is non-NULL their
     unescaped content is appended there, separated by single spaces.
   - Quoted strings are appended unescaped and without quotes.
   - Domain literals are appended verbatim, wrapped in [ ].
   - Two consecutive string-like tokens are separated by one space.
   - Any other token is appended as a single character.

   Returns TRUE when a stop token or TOKEN_LAST ended the scan (the token is
   still available through rfc822_tokenize_get()), FALSE when
   rfc822_tokenize_next() returned FALSE. */
int rfc822_tokenize_get_string(Rfc822TokenizeContext *ctx,
			       String *str, String *comments,
			       const Rfc822Token *stop_tokens)
{
	Rfc822Token token;
	const char *value;
	size_t len;
	int i, token_str, last_str;

	last_str = FALSE;
	while (rfc822_tokenize_next(ctx)) {
		token = rfc822_tokenize_get(ctx);
		if (token == TOKEN_LAST)
			return TRUE;

		for (i = 0; stop_tokens[i] != TOKEN_LAST; i++)
			if (token == stop_tokens[i])
				return TRUE;

		if (token == TOKEN_COMMENT) {
			/* handle comment specially; last_str is left
			   untouched so a comment between two strings does
			   not suppress their separating space */
			if (comments != NULL) {
				if (str_len(comments) > 0)
					str_append_c(comments, ' ');

				value = rfc822_tokenize_get_value(ctx, &len);
				str_append_unescaped(comments, value, len);
			}
			continue;
		}

		/* comments never reach this point, the TOKEN_COMMENT test
		   is kept only for completeness */
		token_str = token == TOKEN_ATOM || token == TOKEN_QSTRING ||
			token == TOKEN_DLITERAL || token == TOKEN_COMMENT;

		if (!token_str)
			str_append_c(str, token);
		else if (token == TOKEN_QSTRING) {
			/* unescape only quoted strings, since we're removing
			   the quotes. for domain literals I don't see much
			   point in unescaping if [] is still kept.. */
			if (last_str)
				str_append_c(str, ' ');

			value = rfc822_tokenize_get_value(ctx, &len);
			str_append_unescaped(str, value, len);
		} else {
			if (last_str)
				str_append_c(str, ' ');

			if (token == TOKEN_DLITERAL)
				str_append_c(str, '[');

			value = rfc822_tokenize_get_value(ctx, &len);
			str_append_n(str, value, len);

			if (token == TOKEN_DLITERAL)
				str_append_c(str, ']');
		}

		last_str = token_str;
	}

	return FALSE;
}