# HG changeset patch # User mpm@selenic.com # Date 1119403678 28800 # Node ID 8b067bde6679bbeacae92a4d9b1e9d2a794ef3b4 # Parent f060d728fe546a6a22384fa20104f71822d5ba7b Add a fast binary diff extension (not yet used) -----BEGIN PGP SIGNED MESSAGE----- Hash: SHA1 Add a fast binary diff extension (not yet used) manifest hash: fbfdcab2b1f4856af45176fcf2423d602dfe88c4 -----BEGIN PGP SIGNATURE----- Version: GnuPG v1.4.0 (GNU/Linux) iD8DBQFCuL6eywK+sNU5EO8RAoT+AJ95z9gx2IrucBS30CnCQFkZT7lSbgCgnXh5 ScZcgwzJ/mEo9i5vuKNlCIs= =Gr58 -----END PGP SIGNATURE----- diff -r f060d728fe54 -r 8b067bde6679 mercurial/bdiff.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/mercurial/bdiff.c Tue Jun 21 17:27:58 2005 -0800 @@ -0,0 +1,279 @@ +/* + bdiff.c - efficient binary diff extension for Mercurial + + Copyright 2005 Matt Mackall + + This software may be used and distributed according to the terms of + the GNU General Public License, incorporated herein by reference. + + Based roughly on Python difflib +*/ + +#include +#include +#include +#include +#include + +struct line { + int h, len, n; + const char *l; +}; + +struct hunk { + int a1, a2, b1, b2; +}; + +struct hunklist { + struct hunk *base, *head; +}; + +static inline uint32_t rol32(uint32_t word, unsigned int shift) +{ + return (word << shift) | (word >> (32 - shift)); +} + +int splitlines(const char *a, int len, struct line **lr) +{ + int h, i; + const char *p, *b = a; + struct line *l; + + /* count the lines */ + i = 1; /* extra line for sentinel */ + for (p = a; p < a + len; p++) + if (*p == '\n' || p == a + len - 1) + i++; + + *lr = l = malloc(sizeof(struct line) * i); + if (!l) + return -1; + + /* build the line array and calculate hashes */ + h = 0; + for (p = a; p < a + len; p++) { + h = *p + rol32(h, 7); /* a simple hash from GNU diff */ + if (*p == '\n' || p == a + len - 1) { + l->len = p - b + 1; + l->h = h; + l->l = b; + l->n = -1; + l++; + b = p + 1; + h = 0; + } + } + + /* set up a sentinel */ + l->h = l->len = 0; + l->l = a + len; + return i - 1; +} + +int inline cmp(struct line *a, struct line *b) +{ + return a->len != b->len || memcmp(a->l, b->l, a->len); +} + +static int equatelines(struct line *a, int an, struct line *b, int bn) +{ + int i, j, buckets = 1, t, *h, *l; + + /* build a hash table of the next highest power of 2 */ + while (buckets < bn + 1) + buckets *= 2; + + h = malloc(buckets * sizeof(int)); + l = calloc(buckets, sizeof(int)); + buckets = buckets - 1; + if (!h || !l) { + free(h); + return 0; + } + + /* clear the hash table */ + for (i = 0; i <= buckets; i++) + h[i] = -1; + + /* add lines to the hash table chains */ + for (i = bn - 1; i >= 0; i--) { + /* find the equivalence class */ + for (j = b[i].h & buckets; h[j] != -1; j = (j + 1) & buckets) + if (!cmp(b + i, b + h[j])) + break; + + /* add to the head of the equivalence class */ + b[i].n = h[j]; + b[i].h = j; + h[j] = i; + l[j]++; /* keep track of popularity */ + } + + /* compute popularity threshold */ + t = (bn >= 200) ? bn / 100 : bn + 1; + + /* match items in a to their equivalence class in b */ + for (i = 0; i < an; i++) { + /* find the equivalence class */ + for (j = a[i].h & buckets; h[j] != -1; j = (j + 1) & buckets) + if (!cmp(a + i, b + h[j])) + break; + + a[i].h = j; /* use equivalence class for quick compare */ + if(l[j] <= t) + a[i].n = h[j]; /* point to head of match list */ + else + a[i].n = -1; /* too popular */ + } + + /* discard hash tables */ + free(h); + free(l); + return 1; +} + +static int longest_match(struct line *a, struct line *b, int *jpos, int *jlen, + int a1, int a2, int b1, int b2, int *omi, int *omj) +{ + int mi = a1, mj = b1, mk = 0, mb = 0, i, j, k; + + for (i = a1; i < a2; i++) { + /* skip things before the current block */ + for (j = a[i].n; j != -1 && j < b1; j = b[j].n) + ; + + /* loop through all lines match a[i] in b */ + for (; j != -1 && j < b2; j = b[j].n) { + /* does this extend an earlier match? */ + if (i > a1 && j > b1 && jpos[j - 1] == i) + k = jlen[j - 1] + 1; + else + k = 1; + jpos[j] = i + 1; + jlen[j] = k; + + /* best match so far? */ + if (k > mk) { + mi = i; + mj = j; + mk = k; + } + } + } + + if (mk) { + mi = mi - mk + 1; + mj = mj - mk + 1; + } + + /* expand match to include neighboring popular lines */ + while (mi - mb > a1 && mj - mb > b1 && + a[mi - mb - 1].h == b[mj - mb - 1].h) + mb++; + while (mi + mk < a2 && mj + mk < b2 && + a[mi + mk].h == b[mj + mk].h) + mk++; + + *omi = mi - mb; + *omj = mj - mb; + return mk + mb; +} + +static void recurse(struct line *a, struct line *b, int *jpos, int *jlen, + int a1, int a2, int b1, int b2, struct hunklist *l) +{ + int i, j, k; + + /* find the longest match in this chunk */ + k = longest_match(a, b, jpos, jlen, a1, a2, b1, b2, &i, &j); + if (!k) + return; + + /* and recurse on the remaining chunks on either side */ + recurse(a, b, jpos, jlen, a1, i, b1, j, l); + l->head->a1 = i; + l->head->a2 = i + k; + l->head->b1 = j; + l->head->b2 = j + k; + l->head++; + recurse(a, b, jpos, jlen, i + k, a2, j + k, b2, l); +} + +static PyObject *bdiff(PyObject *self, PyObject *args) +{ + PyObject *sa, *sb, *result = NULL; + struct hunklist l; + struct hunk *h; + struct line *al, *bl; + char encode[12], *rb; + int an, bn, len = 0, t, la = 0, lb = 0, *jpos, *jlen; + + if (!PyArg_ParseTuple(args, "SS:bdiff", &sa, &sb)) + return NULL; + + /* allocate and fill arrays */ + an = splitlines(PyString_AsString(sa), PyString_Size(sa), &al); + bn = splitlines(PyString_AsString(sb), PyString_Size(sb), &bl); + t = equatelines(al, an, bl, bn); + jpos = calloc(bn, sizeof(int)); + jlen = calloc(bn, sizeof(int)); + l.head = l.base = malloc(sizeof(struct hunk) * ((an + bn) / 4 + 2)); + if (!al || !bl || !jpos || !jlen || !l.base || !t) + goto nomem; + + /* generate the matching block list */ + recurse(al, bl, jpos, jlen, 0, an, 0, bn, &l); + l.head->a1 = an; + l.head->b1 = bn; + l.head++; + + /* calculate length of output */ + for(h = l.base; h != l.head; h++) { + if (h->a1 != la || h->b1 != lb) + len += 12 + bl[h->b1].l - bl[lb].l; + la = h->a2; + lb = h->b2; + } + + result = PyString_FromStringAndSize(NULL, len); + if (!result) + goto nomem; + + /* build binary patch */ + rb = PyString_AsString(result); + la = lb = 0; + + for(h = l.base; h != l.head; h++) { + if (h->a1 != la || h->b1 != lb) { + len = bl[h->b1].l - bl[lb].l; + *(uint32_t *)(encode) = htonl(al[la].l - al->l); + *(uint32_t *)(encode + 4) = htonl(al[h->a1].l - al->l); + *(uint32_t *)(encode + 8) = htonl(len); + memcpy(rb, encode, 12); + memcpy(rb + 12, bl[lb].l, len); + rb += 12 + len; + } + la = h->a2; + lb = h->b2; + } + +nomem: + free(al); + free(bl); + free(jpos); + free(jlen); + free(l.base); + return result ? result : PyErr_NoMemory(); +} + +static char mdiff_doc[] = "Efficient binary diff."; + +static PyMethodDef methods[] = { + {"bdiff", bdiff, METH_VARARGS, "calculate a binary diff\n"}, + {NULL, NULL} +}; + +PyMODINIT_FUNC initbdiff(void) +{ + Py_InitModule3("bdiff", methods, mdiff_doc); +} diff -r f060d728fe54 -r 8b067bde6679 setup.py --- a/setup.py Tue Jun 21 17:24:41 2005 -0800 +++ b/setup.py Tue Jun 21 17:27:58 2005 -0800 @@ -23,7 +23,8 @@ description='scalable distributed SCM', license='GNU GPL', packages=['mercurial'], - ext_modules=[Extension('mercurial.mpatch', ['mercurial/mpatch.c'])], + ext_modules=[Extension('mercurial.mpatch', ['mercurial/mpatch.c']), + Extension('mercurial.bdiff', ['mercurial/bdiff.c'])], data_files=[('mercurial/templates', ['templates/map'] + glob.glob('templates/map-*') + diff -r f060d728fe54 -r 8b067bde6679 tests/test-bdiff --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tests/test-bdiff Tue Jun 21 17:27:58 2005 -0800 @@ -0,0 +1,42 @@ +#!/usr/bin/env python + +import sys +from mercurial import bdiff, mpatch + +def test1(a, b): + d = bdiff.bdiff(a, b) + c = a + if d: + c = mpatch.patches(a, [d]) + if c != b: + print "***", `a`, `b` + print "bad:" + print `c`[:200] + print `d` + +def test(a, b): + print "***", `a`, `b` + test1(a, b) + test1(b, a) + +test("a\nc\n\n\n\n", "a\nb\n\n\n") +test("a\nb\nc\n", "a\nc\n") +test("", "") +test("a\nb\nc", "a\nb\nc") +test("a\nb\nc\nd\n", "a\nd\n") +test("a\nb\nc\nd\n", "a\nc\ne\n") +test("a\nb\nc\n", "a\nc\n") +test("a\n", "c\na\nb\n") +test("a\n", "") +test("a\n", "b\nc\n") +test("a\n", "c\na\n") +test("", "adjfkjdjksdhfksj") +test("", "ab") +test("", "abc") +test("a", "a") +test("ab", "ab") +test("abc", "abc") +test("a\n", "a\n") +test("a\nb", "a\nb") + +print "done" diff -r f060d728fe54 -r 8b067bde6679 tests/test-bdiff.out --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tests/test-bdiff.out Tue Jun 21 17:27:58 2005 -0800 @@ -0,0 +1,20 @@ +*** 'a\nc\n\n\n\n' 'a\nb\n\n\n' +*** 'a\nb\nc\n' 'a\nc\n' +*** '' '' +*** 'a\nb\nc' 'a\nb\nc' +*** 'a\nb\nc\nd\n' 'a\nd\n' +*** 'a\nb\nc\nd\n' 'a\nc\ne\n' +*** 'a\nb\nc\n' 'a\nc\n' +*** 'a\n' 'c\na\nb\n' +*** 'a\n' '' +*** 'a\n' 'b\nc\n' +*** 'a\n' 'c\na\n' +*** '' 'adjfkjdjksdhfksj' +*** '' 'ab' +*** '' 'abc' +*** 'a' 'a' +*** 'ab' 'ab' +*** 'abc' 'abc' +*** 'a\n' 'a\n' +*** 'a\nb' 'a\nb' +done