# HG changeset patch
# User Erling Ellingsen <erlingalf@gmail.com>
# Date 1171827565 -3600
# Node ID 6cb6cfe43c5d1ca3b4a46d90546bb039bce6245f
# Parent  9dc64c8414caf4a91cb11a19c978b677d9ff7431
Avoid some false positives for addremove -s

The original code uses the similary score

  1 - len(diff(after, before)) / len(after)

The diff can at most be the size of the 'before' file, so any small
'before' file would be considered very similar. Removing an empty file
would cause all files added in the same revision to be considered
copies of the removed file.

This changes the metric to

  bytes_overlap(before, after) / len(before + after)

i.e. the actual percentage of bytes shared between the two files.

diff -r 9dc64c8414ca -r 6cb6cfe43c5d mercurial/cmdutil.py
--- a/mercurial/cmdutil.py	Sun Mar 04 09:03:21 2007 -0300
+++ b/mercurial/cmdutil.py	Sun Feb 18 20:39:25 2007 +0100
@@ -7,7 +7,7 @@
 
 from node import *
 from i18n import _
-import os, sys, mdiff, util, templater, patch
+import os, sys, mdiff, bdiff, util, templater, patch
 
 revrangesep = ':'
 
@@ -146,20 +146,29 @@
         yield src, fn, util.pathto(repo.getcwd(), fn), fn in exact
 
 def findrenames(repo, added=None, removed=None, threshold=0.5):
+    '''find renamed files -- yields (before, after, score) tuples'''
     if added is None or removed is None:
         added, removed = repo.status()[1:3]
     ctx = repo.changectx()
     for a in added:
         aa = repo.wread(a)
-        bestscore, bestname = None, None
+        bestname, bestscore = None, threshold
         for r in removed:
             rr = ctx.filectx(r).data()
-            delta = mdiff.textdiff(aa, rr)
-            if len(delta) < len(aa):
-                myscore = 1.0 - (float(len(delta)) / len(aa))
-                if bestscore is None or myscore > bestscore:
-                    bestscore, bestname = myscore, r
-        if bestname and bestscore >= threshold:
+
+            # bdiff.blocks() returns blocks of matching lines
+            # count the number of bytes in each
+            equal = 0
+            alines = mdiff.splitnewlines(aa)
+            matches = bdiff.blocks(aa, rr)
+            for x1,x2,y1,y2 in matches:
+                for line in alines[x1:x2]:
+                    equal += len(line)
+
+            myscore = equal*2.0 / (len(aa)+len(rr))
+            if myscore >= bestscore:
+                bestname, bestscore = r, myscore
+        if bestname:
             yield bestname, a, bestscore
 
 def addremove(repo, pats=[], opts={}, wlock=None, dry_run=None,
diff -r 9dc64c8414ca -r 6cb6cfe43c5d tests/test-addremove-similar
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tests/test-addremove-similar	Sun Feb 18 20:39:25 2007 +0100
@@ -0,0 +1,37 @@
+#!/bin/sh
+
+hg init rep; cd rep
+
+touch empty-file
+python -c 'for x in range(10000): print x' > large-file
+
+hg addremove
+
+hg commit -m A
+
+rm large-file empty-file
+python -c 'for x in range(10,10000): print x' > another-file
+
+hg addremove -s50
+
+hg commit -m B
+
+cd ..
+
+hg init rep2; cd rep2
+
+python -c 'for x in range(10000): print x' > large-file
+python -c 'for x in range(50): print x' > tiny-file
+
+hg addremove
+
+hg commit -m A
+
+python -c 'for x in range(70): print x' > small-file
+rm tiny-file
+rm large-file
+
+hg addremove -s50
+
+hg commit -m B
+
diff -r 9dc64c8414ca -r 6cb6cfe43c5d tests/test-addremove-similar.out
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tests/test-addremove-similar.out	Sun Feb 18 20:39:25 2007 +0100
@@ -0,0 +1,12 @@
+adding empty-file
+adding large-file
+adding another-file
+removing empty-file
+removing large-file
+recording removal of large-file as rename to another-file (99% similar)
+adding large-file
+adding tiny-file
+adding small-file
+removing large-file
+removing tiny-file
+recording removal of tiny-file as rename to small-file (82% similar)