view src/lib-imap/imap-base-subject.c @ 2708:f1e9f3ec8135 HEAD

Buffer API change: we no longer support limited-size buffers where writes past the limit wouldn't kill the process. They were hardly used anywhere, they could have hidden bugs, and the code for handling them was too complex. This also changed the base64 and hex-binary APIs.
author Timo Sirainen <tss@iki.fi>
date Fri, 08 Oct 2004 20:51:47 +0300
parents da6b2e4a8b3a
children 55df57c028d4
line wrap: on
line source

/* Copyright (C) 2002 Timo Sirainen */

/* Implemented against draft-ietf-imapext-sort-10 and
   draft-ietf-imapext-thread-12 */

#include "lib.h"
#include "buffer.h"
#include "charset-utf8.h"
#include "message-header-decode.h"
#include "imap-base-subject.h"

/* Decoder callback handed to message_header_decode(): appends one decoded
   chunk of the header to the buffer given in context and uppercases the
   ASCII letters that were added.

   data/size - bytes of the chunk
   charset   - charset name of the chunk, or NULL when it's plain ASCII
   context   - the destination buffer_t

   Always returns TRUE. */
static int header_decode(const unsigned char *data, size_t size,
			 const char *charset, void *context)
{
	buffer_t *dest = context;
	struct charset_translation *trans;
	unsigned char *bytes;
	size_t i, end;

	/* everything from this offset onwards is what we add below */
	i = buffer_get_used_size(dest);

	if (charset != NULL) {
		/* if the translation can't be started, nothing is added */
		trans = charset_to_utf8_begin(charset, NULL);
		if (trans != NULL) {
			/* NOTE: size is passed by pointer, so the call may
			   update it before the check below */
			(void)charset_to_ucase_utf8(trans, data, &size, dest);
			charset_to_utf8_end(trans);
		}
	} else {
		/* plain ASCII, copy as-is */
		buffer_append(dest, data, size);
	}

	if (size == 0)
		return TRUE;

	/* @UNSAFE: uppercase the newly added part. Current draft specifies
	   that only ASCII letters should be touched. */
	bytes = buffer_get_modifyable_data(dest, &end);
	while (i < end) {
		if (bytes[i] >= 'a' && bytes[i] <= 'z')
			bytes[i] = bytes[i] - 'a' + 'A';
		i++;
	}

	return TRUE;
}

/* Rewrites the NUL-terminated text in buf in place so that every run of
   space/TAB/CR/LF characters becomes a single space, then shrinks the
   buffer's used size to the new text plus its terminating NUL. */
static void pack_whitespace(buffer_t *buf)
{
	char *base, *src, *out;
	int prev_was_space = FALSE;

	base = buffer_get_modifyable_data(buf, NULL);

	/* skip over the leading part that is already in packed form */
	for (src = base; *src != '\0'; src++) {
		if (*src == '\t' || *src == '\n' || *src == '\r')
			break;
		if (*src == ' ' && (src[1] == ' ' || src[1] == '\t'))
			break;
	}

	/* reached the end without finding anything to fix */
	if (*src == '\0')
		return;

	/* @UNSAFE: copy the rest over itself, emitting one space per
	   whitespace run */
	for (out = src; *src != '\0'; src++) {
		switch (*src) {
		case '\t':
		case ' ':
		case '\r':
		case '\n':
			if (!prev_was_space) {
				*out++ = ' ';
				prev_was_space = TRUE;
			}
			break;
		default:
			*out++ = *src;
			prev_was_space = FALSE;
			break;
		}
	}
	*out = '\0';

	/* new used size = packed text length + the NUL */
	base = buffer_get_modifyable_data(buf, NULL);
	buffer_set_used_size(buf, (size_t) (out - base)+1);
}

/* Strips every trailing subj-trailer ("(fwd)" or a space) from the
   NUL-terminated subject in buf, repeating until none is left.

   buf                   - subject text, its used size includes the
                           trailing NUL
   start_pos             - offset where the still-unprocessed subject
                           begins; nothing at or before it is removed
   is_reply_or_forward_r - if non-NULL, set to TRUE when a "(fwd)"
                           trailer was removed

   The buffer is only modified when something was actually stripped. */
static void remove_subj_trailers(buffer_t *buf, size_t start_pos,
				 int *is_reply_or_forward_r)
{
	const char *data;
	size_t orig_size, orig_len, len;

	/* subj-trailer    = "(fwd)" / WSP */
	data = buffer_get_data(buf, &orig_size);

	if (orig_size < 2) /* size includes trailing \0 */
		return;

	/* len is the number of text bytes kept so far (NUL excluded), so
	   the last kept character is data[len-1] */
	orig_len = orig_size - 1;
	for (len = orig_len; len > start_pos; ) {
		if (data[len-1] == ' ')
			len--;
		else if (len - start_pos >= 5 &&
			 strncasecmp(data + len - 5, "(fwd)", 5) == 0) {
			/* header_decode() has uppercased the ASCII letters
			   by now, so the match must not be case-sensitive.
			   The length check keeps us from stripping into the
			   part before start_pos. */
			if (is_reply_or_forward_r != NULL)
				*is_reply_or_forward_r = TRUE;
			len -= 5;
		} else {
			break;
		}
	}

	if (len != orig_len) {
		/* truncate to the kept text and put the NUL back */
		buffer_set_used_size(buf, len);
		buffer_append_c(buf, '\0');
	}
}

/* Tries to skip one subj-blob ("[" ... "]" plus one optional following
   space) at *datap. The blob's contents may not contain '[' or ']'.

   Returns TRUE and advances *datap past the blob on success. Returns
   FALSE and leaves *datap untouched when the text doesn't start with a
   complete blob. */
static int remove_blob(const char **datap)
{
	const char *p = *datap;

	if (*p != '[')
		return FALSE;

	/* walk over the blob contents up to the closing bracket */
	for (p++; *p != ']'; p++) {
		if (*p == '\0' || *p == '[')
			return FALSE;
	}

	/* step over ']' and a single space after it */
	p++;
	if (*p == ' ')
		p++;

	*datap = p;
	return TRUE;
}

/* Tries to remove one subj-leader from the subject text beginning at
   *start_pos in buf. Nothing is erased from the buffer; *start_pos is
   simply moved forward past the removed text.

   A single leading space is always skipped. After it, any number of
   blobs followed by "re", "fw" or "fwd", an optional space, an optional
   blob and a ':' are skipped as one unit.

   Returns TRUE if *start_pos was advanced (by the space and/or the
   leader), FALSE otherwise. *is_reply_or_forward_r (if non-NULL) is set
   to TRUE only when a re/fw/fwd leader was removed. */
static int remove_subj_leader(buffer_t *buf, size_t *start_pos,
			      int *is_reply_or_forward_r)
{
	const char *begin, *p;
	int removed_space = FALSE;

	/* subj-leader     = (*subj-blob subj-refwd) / WSP

	   subj-blob       = "[" *BLOBCHAR "]" *WSP
	   subj-refwd      = ("re" / ("fw" ["d"])) *WSP [subj-blob] ":"

	   BLOBCHAR        = %x01-5a / %x5c / %x5e-7f
	                   ; any CHAR except '[' and ']' */
	begin = (const char *)buffer_get_data(buf, NULL) + *start_pos;

	if (*begin == ' ') {
		/* the space goes away no matter what follows it */
		begin++;
		(*start_pos)++;
		removed_space = TRUE;
	}
	p = begin;

	/* *subj-blob */
	while (*p == '[') {
		if (!remove_blob(&p))
			return removed_space;
	}

	/* "re" / "fw" / "fwd" */
	if (strncasecmp(p, "re", 2) == 0)
		p += 2;
	else if (strncasecmp(p, "fw", 2) == 0)
		p += strncasecmp(p, "fwd", 3) == 0 ? 3 : 2;
	else
		return removed_space;

	/* *WSP - whitespace has been packed, so one space at most */
	if (*p == ' ')
		p++;

	/* [subj-blob] */
	if (*p == '[' && !remove_blob(&p))
		return removed_space;

	if (*p != ':')
		return removed_space;
	p++;

	/* the whole leader matched, skip it */
	*start_pos += (size_t)(p - begin);
	if (is_reply_or_forward_r != NULL)
		*is_reply_or_forward_r = TRUE;
	return TRUE;
}

/* Skips a subj-blob at *start_pos in buf, but only if some text remains
   after it. Returns TRUE and advances *start_pos when the blob was
   skipped, FALSE (with *start_pos unchanged) otherwise. */
static int remove_blob_when_nonempty(buffer_t *buf, size_t *start_pos)
{
	const char *begin, *p;

	begin = (const char *)buffer_get_data(buf, NULL) + *start_pos;
	p = begin;

	if (*p != '[')
		return FALSE;
	if (!remove_blob(&p))
		return FALSE;
	if (*p == '\0') {
		/* the blob is all there is - keep it */
		return FALSE;
	}

	*start_pos += (size_t)(p - begin);
	return TRUE;
}

/* If the subject at *start_pos begins with "[fwd:" (any case) and the
   buffer's text ends with ']', drops that final ']' from the buffer and
   moves *start_pos past the "[fwd:" header.

   Returns TRUE when both were removed, FALSE when the subject doesn't
   have this form (nothing is changed then). *is_reply_or_forward_r (if
   non-NULL) is set to TRUE on a match. */
static int remove_subj_fwd_hdr(buffer_t *buf, size_t *start_pos,
			       int *is_reply_or_forward_r)
{
	const char *text;
	size_t used;

	/* subj-fwd        = subj-fwd-hdr subject subj-fwd-trl
	   subj-fwd-hdr    = "[fwd:"
	   subj-fwd-trl    = "]" */
	text = buffer_get_data(buf, &used);

	/* used counts the trailing NUL, so the last character of the text
	   is at used-2 */
	if (strncasecmp(text + *start_pos, "[fwd:", 5) != 0 ||
	    text[used-2] != ']')
		return FALSE;

	if (is_reply_or_forward_r != NULL)
		*is_reply_or_forward_r = TRUE;

	/* cut off the "]" and terminate the text again */
	buffer_set_used_size(buf, used-2);
	buffer_append_c(buf, '\0');

	/* skip the "[fwd:" */
	*start_pos += 5;
	return TRUE;
}

/* Extracts the "base subject" from a raw Subject header value, following
   the numbered steps of the SORT/THREAD drafts.

   pool                  - pool the working buffer is created from; the
                           returned string points into that buffer
   subject               - NUL-terminated header value
   is_reply_or_forward_r - if non-NULL, set to FALSE at the start and to
                           TRUE when a reply/forward marker was removed

   Returns a pointer to the NUL-terminated base subject. */
const char *imap_get_base_subject_cased(pool_t pool, const char *subject,
					int *is_reply_or_forward_r)
{
	buffer_t *buf;
	size_t start_pos = 0, subject_len;

	if (is_reply_or_forward_r != NULL)
		*is_reply_or_forward_r = FALSE;

	subject_len = strlen(subject);
	buf = buffer_create_dynamic(pool, subject_len);

	/* (1) Convert any RFC 2047 encoded-words in the subject to
	   UTF-8.  Convert all tabs and continuations to space.
	   Convert all multiple spaces to a single space. */
	message_header_decode((const unsigned char *)subject, subject_len,
			      header_decode, buf);
	buffer_append_c(buf, '\0');

	pack_whitespace(buf);

	for (;;) {
		/* (2) Strip everything at the end that matches the
		   subj-trailer ABNF, until nothing more matches. */
		remove_subj_trailers(buf, start_pos, is_reply_or_forward_r);

		for (;;) {
			int leader_removed, blob_removed;

			/* (3) Strip prefix text matching the subj-leader
			   ABNF. */
			leader_removed =
				remove_subj_leader(buf, &start_pos,
						   is_reply_or_forward_r);

			/* (4) Strip a prefix matching the subj-blob ABNF,
			   provided that doing so leaves a non-empty
			   subj-base. This is tried on every round, whether
			   or not (3) matched. */
			blob_removed =
				remove_blob_when_nonempty(buf, &start_pos);

			/* (5) Keep doing (3) and (4) while either one
			   matches. */
			if (!leader_removed && !blob_removed)
				break;
		}

		/* (6) If what's left starts with the subj-fwd-hdr ABNF and
		   ends with the subj-fwd-trl ABNF, remove both and go back
		   to step (2). */
		if (!remove_subj_fwd_hdr(buf, &start_pos,
					 is_reply_or_forward_r))
			break;
	}

	/* (7) What remains is the "base subject" used in the SORT. */
	return (const char *)buffer_get_data(buf, NULL) + start_pos;
}