.benchmarks: StringFind

git-svn-id: svn://ultimatepp.org/upp/trunk@6978 f0d560ea-af0d-0410-9eb7-867de7ffcac7
2026-05-15 14:16:07 -06:00 · 2014-03-02 10:04:47 +00:00 · 2014-03-02 10:04:47 +00:00 · 677a3e7dcf
commit 677a3e7dcf
parent 51ff437d49
6 changed files with 370 additions and 0 deletions
--- a/benchmarks/StringFind/StringFind.upp
+++ b/benchmarks/StringFind/StringFind.upp
@ -0,0 +1,11 @@
+uses
+	Core;
+
+file
+	bm.h,
+	bm.cpp,
+	main.cpp optimize_speed;
+
+mainconfig
+	"" = "SSE2";
+
--- a/benchmarks/StringFind/bm.cpp
+++ b/benchmarks/StringFind/bm.cpp
@ -0,0 +1,153 @@
+#include "bm.h"
+
+#define ALPHABET_LEN 256
+#define NOT_FOUND patlen
+ 
+// Builds the bad-character table (delta1) used by the Boyer-Moore search.
+//
+// For every byte value c, delta1[c] is the distance from the rightmost
+// occurrence of c in pat[0 .. patlen-2] to the last character of pat,
+// or patlen when c does not occur in that range. When the text byte c
+// under the search position does not match, the position can safely be
+// advanced by delta1[c] to line that byte up with a character of pat.
+// The final pattern character is left out of the scan, so for a
+// non-empty pattern no entry is ever zero.
+//
+//   delta1 - output array, ALPHABET_LEN entries
+//   pat    - pattern bytes
+//   patlen - number of bytes in pat
+//
+// Runs in ALPHABET_LEN + patlen steps.
+void make_delta1(int *delta1, uint8_t *pat, int32_t patlen) {
+    // Default: the byte is absent from the pattern, skip a whole pattern.
+    for (int c = 0; c < ALPHABET_LEN; ++c)
+        delta1[c] = patlen;
+
+    // Occurrences further to the right overwrite earlier ones.
+    const int last = patlen - 1;
+    for (int k = 0; k < last; ++k)
+        delta1[pat[k]] = last - k;
+}
+ 
+// Returns 1 when the suffix of word that starts at word[pos] is also a
+// prefix of word, and 0 otherwise. An empty suffix (pos == wordlen)
+// counts as a prefix.
+int is_prefix(uint8_t *word, int wordlen, int pos) {
+    // k walks the suffix while k - pos walks the front of the word.
+    for (int k = pos; k < wordlen; ++k) {
+        if (word[k - pos] != word[k])
+            return 0;
+    }
+    return 1;
+}
+ 
+// Counts how many characters, read backwards from word[pos], agree with
+// the characters read backwards from the end of word. The count never
+// exceeds pos.
+// Example: suffix_length("dddbcabc", 8, 4) = 2
+int suffix_length(uint8_t *word, int wordlen, int pos) {
+    int matched = 0;
+    // Stop at the first disagreement or once pos characters were matched.
+    while (matched < pos && word[pos - matched] == word[wordlen - 1 - matched])
+        ++matched;
+    return matched;
+}
+ 
+// delta2 table (good-suffix rule): delta2[pos] is how far the text index
+// may advance after a mismatch at pat[pos], based on what is already
+// known to match, namely pat[pos+1 .. patlen-1].
+//
+// Case 1:
+// pat[pos+1 .. patlen-1] does not occur elsewhere in pat, so the next
+// plausible match starts at or after the mismatch. If a suffix of
+// pat[pos+1 .. patlen-1] is also a prefix of pat, the next plausible
+// match begins there (with several candidates, the longest is used).
+// Otherwise the next plausible match starts past the character aligned
+// with pat[patlen-1].
+//
+// Case 2:
+// pat[pos+1 .. patlen-1] does occur elsewhere in pat. The mismatch shows
+// that we are not at the end of a match, but we may still be in the
+// middle of one.
+//
+// The first loop handles case 1. It is analogous to the KMP table,
+// adapted to a backwards scan and restricted to prefixes that are also
+// suffixes. In the worst case pat is one letter repeated, so every
+// suffix is a prefix. Unlike KMP, however, BM has no information about
+// the beginning of the partial match, so this loop alone is not enough:
+// take pat = "ABYXCDEYX" and text = ".....ABYXCDEYX". X and Y match,
+// then B != E. No prefix of pat lies in the suffix "YX", so the first
+// loop would move the pattern forward by 9 characters and jump over the
+// match that starts only 5 characters ahead.
+//
+// The second loop handles case 2. Because several positions can yield
+// the same suffix_length, later (closer) positions overwrite earlier
+// ones, leaving the smallest shift, i.e. the nearest potential match.
+//
+//   delta2 - output array, patlen entries
+//   pat    - pattern bytes
+//   patlen - number of bytes in pat
+void make_delta2(int *delta2, uint8_t *pat, int32_t patlen) {
+    const int last = patlen - 1;
+
+    // Pass 1: shifts derived from suffixes of pat that are also prefixes.
+    int prefix_at = last;
+    for (int p = last; p >= 0; --p) {
+        if (is_prefix(pat, patlen, p + 1))
+            prefix_at = p + 1;
+        delta2[p] = prefix_at + (last - p);
+    }
+
+    // Pass 2: shifts derived from inner re-occurrences of a suffix.
+    for (int p = 0; p < last; ++p) {
+        const int slen = suffix_length(pat, patlen, p);
+        if (pat[p - slen] != pat[last - slen])
+            delta2[last - slen] = last - p + slen;
+    }
+}
+
+// Stores a copy of pattern_ and rebuilds both shift tables for it.
+void BMFinder::SetPattern(const String& pattern_)
+{
+	pattern = pattern_;
+	const int patlen = pattern.GetLength();
+	byte *pat = (byte *)~pattern;
+
+	make_delta1(delta1, pat, patlen);
+	// delta2 needs one entry per pattern byte.
+	delta2.Alloc(patlen);
+	make_delta2(delta2, pat, patlen);
+}
+
+// Searches data[0 .. len-1] for the stored pattern using the tables
+// prepared by SetPattern.
+// Returns the offset of the first occurrence, or -1 if there is none.
+int BMFinder::FindIn(const char *data, int len) const
+{
+	const int last = pattern.GetLength() - 1;
+	const char *pat = pattern;
+	// i is the text index aligned with the pattern character being compared.
+	for(int i = last; i < len;) {
+		int j = last;
+		// Compare right to left; i and j move back together.
+		for(; j >= 0 && data[i] == pat[j]; --j)
+			--i;
+		if(j < 0)
+			return i + 1;
+		// Mismatch: advance by the larger of the two table shifts.
+		i += max(delta1[(byte)data[i]], delta2[j]);
+	}
+	return -1;
+}
+
+// Plain C-style Boyer-Moore search that builds its tables on every call.
+//
+//   string    - text to search
+//   stringlen - number of bytes in string
+//   pat       - pattern to look for
+//   patlen    - number of bytes in pat
+//
+// Returns a pointer to the first occurrence of pat inside string, or NULL
+// when there is none. An empty pattern matches at the start of the text,
+// the same answer BMFinder::FindIn gives (offset 0). NULL is also returned
+// when the delta2 table cannot be allocated.
+uint8_t* boyer_moore (uint8_t *string, uint32_t stringlen, uint8_t *pat, uint32_t patlen) {
+    // The empty pattern must be handled specially: there is no last
+    // character to compare and nothing to allocate.
+    if (patlen == 0)
+        return string;
+    // A pattern longer than the text cannot match; skip building tables.
+    if (patlen > stringlen)
+        return NULL;
+
+    int delta1[ALPHABET_LEN];
+    int *delta2 = (int *)malloc(patlen * sizeof(int));
+    if (delta2 == NULL)
+        return NULL;
+    make_delta1(delta1, pat, patlen);
+    make_delta2(delta2, pat, patlen);
+
+    uint8_t *found = NULL;
+    // 64-bit signed index: compares cleanly against the unsigned length
+    // and does not overflow for texts of 2 GiB and more.
+    int64_t i = (int64_t)patlen - 1;
+    while (i < (int64_t)stringlen) {
+        int j = (int)patlen - 1;
+        // Compare right to left; i and j move back together.
+        while (j >= 0 && (string[i] == pat[j])) {
+            --i;
+            --j;
+        }
+        if (j < 0) {
+            found = string + i + 1;
+            break;
+        }
+
+        // Mismatch: advance by the larger of the two table shifts.
+        i += max(delta1[string[i]], delta2[j]);
+    }
+    // Single exit so the table is released exactly once.
+    free(delta2);
+    return found;
+}
--- a/benchmarks/StringFind/bm.h
+++ b/benchmarks/StringFind/bm.h
@ -0,0 +1,23 @@
+#ifndef _BM_bm_h_
+#define _BM_bm_h_
+
+#include <Core/Core.h>
+
+using namespace Upp;
+
+// Boyer-Moore searcher bound to one pattern: SetPattern builds the shift
+// tables once, FindIn then reuses them for every search.
+struct BMFinder {
+	int         delta1[256]; // filled by make_delta1 in SetPattern; indexed by byte value
+	Buffer<int> delta2;      // filled by make_delta2 in SetPattern; one entry per pattern byte
+	String      pattern;     // copy of the pattern being searched for
+
+	// Replaces the stored pattern and rebuilds delta1 and delta2.
+	void SetPattern(const String& pattern);
+
+	// Returns the offset of the first occurrence of the pattern in
+	// data[0 .. len-1], or -1 when it does not occur.
+	int  FindIn(const char *data, int len) const;
+	// Same search over a whole String.
+	int  FindIn(const String& data) const     { return FindIn(data, data.GetCount()); }
+	
+	// Constructs the finder ready to search for pattern.
+	BMFinder(const String& pattern)           { SetPattern(pattern); }
+};
+
+uint8_t* boyer_moore (uint8_t *string, uint32_t stringlen, uint8_t *pat, uint32_t patlen);
+
+#endif
--- a/benchmarks/StringFind/init
+++ b/benchmarks/StringFind/init
@ -0,0 +1,4 @@
+#ifndef _StringFind_icpp_init_stub
+#define _StringFind_icpp_init_stub
+#include "Core/init"
+#endif
--- a/benchmarks/StringFind/main.cpp
+++ b/benchmarks/StringFind/main.cpp
@ -0,0 +1,178 @@
+#include "bm.h"
+
+using namespace Upp;
+
+// Substring search using a Boyer-Moore-like trick: the last character of
+// the needle is compared first, and after a failed candidate the search
+// advances by a skip value derived from that last character.
+//
+//   haystack, len - text to search and its length
+//   needle, nsize - pattern and its length
+//
+// Returns the offset of the first occurrence, or -1 when the needle is
+// empty, longer than the haystack, or not present.
+int ffind(const char *haystack, int len, const char *needle, int nsize)
+{
+    if (nsize == 0 || nsize > len)
+        return -1;
+
+    const int tail = nsize - 1;
+    const int tailChar = needle[tail];
+
+    // Skip applied after a failed candidate. Zero is not a valid skip, so
+    // it doubles as "not computed yet"; the value is filled in lazily.
+    int shift = 0;
+
+    // One past the last position where the needle still fits.
+    const char *const stop = haystack + len - tail;
+    const char *cur = haystack;
+
+    while (cur < stop) {
+        // Slide forward until the needle's last character lines up.
+        while (cur[tail] != tailChar) {
+            ++cur;
+            if (cur == stop)
+                return -1;
+        }
+
+        // Last character matches; verify the candidate front to back.
+        int k = 0;
+        while (k < nsize && cur[k] == needle[k])
+            ++k;
+        if (k == nsize)
+            return cur - haystack;
+
+        // Candidate failed: distance from the last character back to its
+        // previous occurrence in the needle (nsize when there is none).
+        if (shift == 0) {
+            shift = 1;
+            while (shift <= tail && needle[tail - shift] != tailChar)
+                ++shift;
+        }
+        cur += shift;
+    }
+    return -1;
+}
+
+// Convenience overload for NUL-terminated strings: both lengths are
+// taken with strlen and the search is done by the four-argument ffind.
+int ffind(const char *haystack, const char *needle)
+{
+	const size_t haystackLen = strlen(haystack);
+	const size_t needleLen = strlen(needle);
+	return ffind(haystack, haystackLen, needle, needleLen);
+}
+
+// Logs the position of pattern in data as reported by ffind, by the
+// C-style boyer_moore and by BMFinder, so the results can be compared.
+void test_find(const char *data, const char *pattern)
+{
+	LOG("=======================");
+	LOG("Haystack: " << data);
+	LOG("Needle: " << pattern);
+	LOG("Folly: " << ffind(data, pattern));
+	
+	BMFinder finder(pattern);
+	// boyer_moore works on unsigned bytes and returns a pointer (or NULL).
+	uint8_t *text = (uint8_t *)data;
+	uint8_t *pat = (uint8_t *)pattern;
+	uint8_t *hit = boyer_moore(text, strlen(data), pat, strlen(pattern));
+	LOG("Boyer-Moore0: " << (hit ? (char *)hit - data : -1));
+	LOG("Boyer-Moore1: " << finder.FindIn(data));
+}
+
+// Runs each search implementation once on (data, needle) and writes the
+// reported position and the elapsed time of each run to the log.
+void DoTest(const String& data, const String& needle)
+{
+	RLOG("------------");
+	RLOG("Needle: " << needle);
+	{ // String::Find
+		TimeStop timer;
+		RLOG("U++ Brute force: " << data.Find(needle));
+		RLOG("  Time elapsed: " << timer);
+	}
+	{ // ffind
+		TimeStop timer;
+		RLOG("Folly: " << ffind(data, data.GetLength(), needle, needle.GetCount()));
+		RLOG("  Time elapsed: " << timer);
+	}
+	{ // boyer_moore; the tables are built inside the call, so inside the timed region
+		TimeStop timer;
+		uint8_t *hit = boyer_moore((uint8_t *)~data, data.GetCount(),
+		                           (uint8_t *)~needle, needle.GetCount());
+		RLOG("Boyer-Moore C: " << (hit ? (char *)hit - ~data : -1));
+		RLOG("  Time elapsed: " << timer);
+	}
+	{ // BMFinder; construction (table setup) is part of the timed region
+		TimeStop timer;
+		BMFinder finder(needle);
+		RLOG("Boyer-Moore: " << finder.FindIn(data));
+		RLOG("  Time elapsed: " << timer);
+	}
+}
+
+// Benchmark driver: builds several haystacks and runs DoTest on each with
+// a set of needles. Results go to the log, which is set up to write to
+// both the console and a file.
+CONSOLE_APP_MAIN
+{
+	StdLogSetup(LOG_COUT|LOG_FILE);
+	String data, needle, n;
+
+// Disabled sanity checks comparing the finders on small inputs.
+/*
+	test_find("zxcvzxcvahoj", "ahoj");
+	test_find("aahoj", "ahoj");
+	test_find("jojajo", "ahoj");
+	test_find("jojajohadjohajohahojaj", "ahoj");
+	int n = String("0123456789").Find("012345");
+	RDUMP(n);
+*/	
+
+	// Scenario 1: a file loaded from disk with "Hello world!" appended.
+	// NOTE(review): the path is specific to the author's machine.
+	RLOG("====================================================");
+	RLOG("**** xml file");
+	data = LoadFile("/home/cxl/20131117_ST_ZZSZ.xml") + "Hello world!";
+	DoTest(data, "Hello world");
+	DoTest(data, "Hel");
+	
+	// Scenario 2: a long run of 'a' ended by a single 'b'; the needles
+	// are runs of 'a' of growing length followed by 'b'.
+	RLOG("====================================================");
+	RLOG("**** a..ab");
+	data = String('a', 100000000) + "b";
+	DoTest(data, "b");
+	DoTest(data, "ab");
+	DoTest(data, "aab");
+	DoTest(data, "aaab");
+	DoTest(data, "aaaab");
+	DoTest(data, "aaaaaaaaaaaaaaaaaab");
+	DoTest(data, String('a', 100) + "b");
+
+	// Scenario 3: a single 'b' in the middle of two long runs of 'a'.
+	RLOG("====================================================");
+	RLOG("**** a..aba..a");
+	data = String('a', 50000000) + "b" + String('a', 50000000);
+	DoTest(data, "ab");
+	DoTest(data, "aba");
+	DoTest(data, "aaba");
+	DoTest(data, "aabaa");
+	DoTest(data, "aaaabaaaa");
+	DoTest(data, "aaaaaabaaaaaa");
+	DoTest(data, String('a', 20) + "b" + String('a', 20));
+	DoTest(data, String('a', 100) + "b");
+	
+	// Scenario 4: many copies of "Hello_world!" followed by one
+	// "Hello world!", which is the needle.
+	RLOG("====================================================");
+	RLOG("**** 10000000 * Hello_world!");
+	data.Clear();
+	for(int i = 0; i < 10000000; i++)
+		data << "Hello_world!";
+	data << "Hello world!";
+	DoTest(data, "Hello world!");
+
+	// Scenario 5: needle = characters 32..126; the haystack is 1000000
+	// copies of it, then the needle gets 'x' appended and is added once
+	// at the end.
+	RLOG("====================================================");
+	RLOG("**** all chars repeated");
+	for(int i = 32; i < 127; i++)
+		needle.Cat(i);
+	data.Clear();
+	for(int i = 0; i < 1000000; i++)
+		data << needle;
+	needle << 'x';
+	data << needle;
+	DoTest(data, needle);
+
+	// Scenario 6: needle = characters 32..126; the haystack is 1000000
+	// copies of a variant n whose character at index 60 is replaced by a
+	// space, followed by one unmodified needle.
+	RLOG("====================================================");
+	RLOG("**** all chars repeated, space in middle");
+	needle.Clear();
+	for(int i = 32; i < 127; i++)
+		needle.Cat(i);
+	data.Clear();
+	n = needle;
+	n.Set(60, ' ');
+	for(int i = 0; i < 1000000; i++)
+		data << n;
+	data << needle;
+	DoTest(data, needle);
+}
--- a/benchmarks/sizeof/main.cpp
+++ b/benchmarks/sizeof/main.cpp
@ -38,6 +38,7 @@ CONSOLE_APP_MAIN
 	RDUMP(sizeof(Value));
 	RDUMP(sizeof(ValueArray));
 	RDUMP(sizeof(ValueMap));
+	RDUMP(sizeof(XmlNode));
 	RLOG("========================");
 	RDUMP(sizeof(Vector<int>));
 	RDUMP(sizeof(Array<int>));