ultimatepp/uppsrc/TextDiffCtrl/TextDiff.cpp

#include "TextDiffCtrl.h"

namespace Upp {

template <class I>
static int CompareGetCount(I a, I b, int max_count)
{
	if(max_count <= 0 || *a != *b)
		return 0;
	int left;
	for(left = max_count; --left > 0;)
		if(*++a != *++b)
			return max_count - left;
	return max_count;
}

Vector<String> GetLineMap(Stream& stream, bool ignore_indent)
{
	Vector<String> out;
	int emp = 0;
	if(stream.IsOpen())
		while(!stream.IsEof()) {
			String s = stream.GetLine();
			const char *p = s, *e = s.end(), *f = e;
			if(ignore_indent)
				while((byte)*p <= 32 && p != e)
					p++;
			while(e > p && (byte)e[-1] < ' ')
				e--;
			if(e == p)
				emp++;
			else
			{
				while(emp-- > 0)
					out.Add(Null);
				if(p != s.begin())
					s = String(p, e);
				else
				if(e != f)
					s.Trim(int(e - p));
				out.Add(s);
				emp = 0;
			}
		}
	return out;
}

class TextComparator
{
public:
	TextComparator(const Vector<String>& f1, const Vector<String>& f2);

	Array<TextSection>    GetSections() const;

private:
	bool                  Find(int start1, int end1, int start2, int end2, int& best_match, int& best_count) const;
	void                  Split(Array<TextSection>& dest, int start1, int end1, int start2, int end2) const;

private:
	Vector<Index<dword>>  hash1;
	Vector<Index<dword>>  hash2;
	const Vector<String>& file1;
	const Vector<String>& file2;
};

Array<TextSection> CompareLineMaps(const Vector<String>& s1, const Vector<String>& s2)
{
	return TextComparator(s1, s2).GetSections();
}

static void CalcHash(Vector<Index<dword>>& hash, const Vector<String>& file, int limit)
{
	{ // 1st row
		Index<dword>& first = hash.Add();
		for(int i = 0; i < file.GetCount(); i++)
			first.Add(FoldHash(GetHashValue(file[i])));
	}
	static const int prime[] =
	{
		3,  5,  7,   11,  13,  17,  19,  21,
		23, 29, 31,  37,  41,  43,  47,  51,
		53, 61, 67,  71,  73,  79,  83,  87,
		89, 97, 101, 103, 107, 109, 113, 117,
	};
	const int *pp = prime;
	for(int l = 1; l < limit; l <<= 1) {
		Index<dword>& nhash = hash.Add();
		const Index<dword>& ohash = hash[hash.GetCount() - 2];
		int pri = *pp++;
		int t;
		for(t = l; t < ohash.GetCount(); t++)
			nhash.Add(ohash[t - l] + pri * ohash[t]);
		for(t -= l; t < ohash.GetCount(); t++)
			nhash.Add(ohash[t]);
	}
}

TextComparator::TextComparator(const Vector<String>& f1, const Vector<String>& f2)
: file1(f1), file2(f2)
{
	int limit = min(f1.GetCount(), f2.GetCount());
	CalcHash(hash1, f1, limit);
	CalcHash(hash2, f2, limit);
}

static bool CompareSection(const TextSection& ta, const TextSection& tb)
{
	return ta.start1 < tb.start1 || ta.start1 == tb.start1 && ta.start2 < tb.start2;
}

Array<TextSection> TextComparator::GetSections() const
{
	Array<TextSection> output;
	Split(output, 0, file1.GetCount(), 0, file2.GetCount());
	Sort(output, &CompareSection);
	return output;
}

static int GetHashLevel(int min_count, int hash_count)
{
	int l = 0;
	hash_count--;
	while(min_count > 1 && l < hash_count)
	{
		min_count >>= 1;
		l++;
	}
	return l;
}

bool TextComparator::Find(int start1, int end1, int start2, int end2, int& best_match, int& best_count) const
{
	ASSERT(end1 > start1 && end2 > start2);
	bool done = false;
	const String *f1 = file1.Begin() + start1;
	int len1 = end1 - start1;
	int lvl = GetHashLevel(best_count + 1, hash1.GetCount());
	int chunk = 1 << lvl;
	int last = max(best_count - chunk + 1, 0);
	const Index<dword> *hp1 = &hash1[lvl];
	const Index<dword> *hp2 = &hash2[lvl];
	const dword *h1 = hp1->begin() + start1;

	int i = hp2->Find(*h1);
	while(i >= 0)
		if(i + best_count >= end2)
			return done;
		else {
			if(i >= start2 && h1[last] == (*hp2)[i + last]) {
				int top = min(len1, end2 - i);
				int hc = CompareGetCount(h1, hp2->begin() + i, top) + chunk - 1;
				int cnt = CompareGetCount(f1, file2.begin() + i, min(hc, top));
				if(cnt > best_count) {
					best_count = cnt;
					best_match = i;
					done = true;
					last = best_count - chunk + 1;
					if(best_count + 1 >= 2 * chunk)
					{
						lvl = GetHashLevel(best_count + 1, hash1.GetCount());
						chunk = 1 << lvl;
						last = best_count - chunk + 1;
						hp1 = &hash1[lvl];
						hp2 = &hash2[lvl];
						h1 = hp1->begin() + start1;
						int oi = i;
						for(i = hp2->Find(*h1); i >= 0 && i <= oi; i = hp2->FindNext(i))
							;
						continue;
					}
				}
			}
			i = hp2->FindNext(i);
		}
	return done;
}

void TextComparator::Split(Array<TextSection>& dest, int start1, int end1, int start2, int end2) const
{
	ASSERT(start1 <= end1 && start2 <= end2);
	while(start1 < end1 && start2 < end2) {
		int new1 = -1, new2 = -1, count = 0;
		for(int i = start1; i + count < end1; i++)
			if(Find(i, end1, start2, end2, new2, count))
				new1 = i;
		if(count == 0)
			break; // no match at all
		ASSERT(new1 >= start1 && new1 + count <= end1);
		ASSERT(new2 >= start2 && new2 + count <= end2);
		dest.Add(TextSection(new1, count, new2, count, true));
		if(new1 - start1 >= end1 - new1 - count) { // head is longer - recurse for tail
			Split(dest, new1 + count, end1, new2 + count, end2);
			end1 = new1;
			end2 = new2;
		}
		else { // tail is longer - recurse for head
			Split(dest, start1, new1, start2, new2);
			start1 = new1 + count;
			start2 = new2 + count;
		}
		ASSERT(start1 <= end1 && start2 <= end2);
	}
	if(start1 < end1 || start2 < end2)
		dest.Add(TextSection(start1, end1 - start1, start2, end2 - start2, false));
}

};