changeset 5520:f322edd67a4f HEAD

Added Boyer-Moore string searching.
author Timo Sirainen <tss@iki.fi>
date Wed, 04 Apr 2007 11:17:03 +0300
parents 7b6511e67476
children baf3367d2450
files src/lib/Makefile.am src/lib/str-find.c src/lib/str-find.h
diffstat 3 files changed, 186 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- a/src/lib/Makefile.am	Wed Apr 04 09:15:31 2007 +0300
+++ b/src/lib/Makefile.am	Wed Apr 04 11:17:03 2007 +0300
@@ -78,6 +78,7 @@
 	seq-range-array.c \
 	sha1.c \
 	str.c \
+	str-find.c \
 	str-sanitize.c \
 	strescape.c \
 	strfuncs.c \
@@ -156,6 +157,7 @@
 	seq-range-array.h \
 	sha1.h \
 	str.h \
+	str-find.h \
 	str-sanitize.h \
 	strescape.h \
 	strfuncs.h \
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/lib/str-find.c	Wed Apr 04 11:17:03 2007 +0300
@@ -0,0 +1,167 @@
+/* Copyright (C) 2007 Timo Sirainen */
+
+#include "lib.h"
+#include "str-find.h"
+
+struct str_find_context {
+	pool_t pool;
+	unsigned char *key;
+	unsigned int key_len;
+
+	unsigned int *matches;
+	unsigned int match_count;
+
+	int badtab[UCHAR_MAX+1];
+	int goodtab[];
+};
+
+static void init_badtab(struct str_find_context *ctx)
+{
+	unsigned int i, len_1 = ctx->key_len - 1;
+
+	for (i = 0; i <= UCHAR_MAX; i++)
+		ctx->badtab[i] = ctx->key_len;
+
+	for (i = 0; i < len_1; i++)
+		ctx->badtab[ctx->key[i]] = len_1 - i;
+}
+
+static void init_suffixes(struct str_find_context *ctx, unsigned int *suffixes)
+{
+	unsigned int len_1 = ctx->key_len - 1;
+	int f = 0, g, i;
+
+	suffixes[len_1] = ctx->key_len;
+	g = len_1;
+	for (i = ctx->key_len - 2; i >= 0; i--) {
+		if (i > g && (int)suffixes[i + len_1 - f] < i - g)
+			suffixes[i] = suffixes[i + len_1 - f];
+		else {
+			if (i < g)
+				g = i;
+			f = i;
+			while (g >= 0 && ctx->key[g] == ctx->key[g + len_1 - f])
+				g--;
+			suffixes[i] = f - g;
+		}
+	}
+}
+
+static void init_goodtab(struct str_find_context *ctx)
+{
+	unsigned int len_1 = ctx->key_len - 1;
+	unsigned int j, *suffixes;
+	int i;
+
+	suffixes = t_buffer_get(ctx->key_len);
+	init_suffixes(ctx, suffixes);
+
+	for (i = 0; i < (int)ctx->key_len; i++)
+		ctx->goodtab[i] = ctx->key_len;
+
+	j = 0;
+	for (i = len_1; i >= -1; i--) {
+		if (i < 0 || suffixes[i] == (unsigned int)i + 1) {
+			for (; j < len_1 - i; j++) {
+				if (ctx->goodtab[j] == (int)ctx->key_len)
+					ctx->goodtab[j] = len_1 - i;
+			}
+		}
+	}
+	for (i = 0; i <= (int)ctx->key_len - 2; i++)
+		ctx->goodtab[len_1 - suffixes[i]] = len_1 - i;
+}
+ 
+struct str_find_context *str_find_init(pool_t pool, const char *key)
+{
+	struct str_find_context *ctx;
+	unsigned int key_len = strlen(key);
+
+	ctx = p_malloc(pool, sizeof(struct str_find_context) +
+		       sizeof(ctx->goodtab[0]) * key_len);
+	ctx->pool = pool;
+	ctx->matches = p_malloc(pool, key_len);
+	ctx->key_len = key_len;
+	ctx->key = p_malloc(pool, key_len);
+	memcpy(ctx->key, key, key_len);
+
+	init_goodtab(ctx);
+	init_badtab(ctx);
+	return ctx;
+}
+
+void str_find_deinit(struct str_find_context **_ctx)
+{
+	struct str_find_context *ctx = *_ctx;
+
+	*_ctx = NULL;
+	p_free(ctx->pool, ctx->matches);
+	p_free(ctx->pool, ctx->key);
+	p_free(ctx->pool, ctx);
+}
+
+bool str_find_more(struct str_find_context *ctx,
+		    const unsigned char *data, size_t size)
+{
+	unsigned int key_len = ctx->key_len;
+	unsigned int i, j, a, b;
+	int bad_value;
+
+	for (i = j = 0; i < ctx->match_count; i++) {
+		a = ctx->matches[i];
+		if (ctx->matches[i] + size >= key_len) {
+			/* we can fully determine this match now */
+			for (; a < key_len; a++) {
+				if (ctx->key[a] != data[a - ctx->matches[i]])
+					break;
+			}
+
+			if (a == key_len)
+				return TRUE;
+		} else {
+			for (b = 0; b < size; b++) {
+				if (ctx->key[a+b] != data[b])
+					break;
+			}
+
+			if (b == size)
+				ctx->matches[j++] = a + size;
+		}
+	}
+	if (j > 0) {
+		i_assert(j + size < key_len);
+		ctx->match_count = j;
+		j = 0;
+	} else {
+		/* Boyer-Moore searching */
+		j = 0;
+		while (j + key_len <= size) {
+			i = key_len - 1;
+			while (ctx->key[i] == data[i + j]) {
+				if (i == 0)
+					return TRUE;
+				i--;
+			}
+
+			bad_value = ctx->badtab[data[i + j]] + i + 1 - key_len;
+			j += I_MAX(ctx->goodtab[i], bad_value);
+		}
+		i_assert(j <= size);
+		ctx->match_count = 0;
+	}
+
+	for (; j < size; j++) {
+		for (i = j; i < size; i++) {
+			if (ctx->key[i-j] != data[i])
+				break;
+		}
+		if (i == size)
+			ctx->matches[ctx->match_count++] = size - j;
+	}
+	return FALSE;
+}
+
+void str_find_reset(struct str_find_context *ctx)
+{
+	ctx->match_count = 0;
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/lib/str-find.h	Wed Apr 04 11:17:03 2007 +0300
@@ -0,0 +1,17 @@
+#ifndef __STR_FIND_H
+#define __STR_FIND_H
+
+struct str_find_context;
+
+struct str_find_context *str_find_init(pool_t pool, const char *key);
+void str_find_deinit(struct str_find_context **ctx);
+
+/* Returns TRUE if key is found. It's possible to send the data in arbitrary
+   blocks and have the key still match. */
+bool str_find_more(struct str_find_context *ctx,
+		   const unsigned char *data, size_t size);
+/* Reset input data. The next str_find_more() call won't try to match the key
+   to earlier data. */
+void str_find_reset(struct str_find_context *ctx);
+
+#endif