diff --git a/benchmarks/StringFind/StringFind.upp b/benchmarks/StringFind/StringFind.upp
new file mode 100644
index 000000000..b21e20fa9
--- /dev/null
+++ b/benchmarks/StringFind/StringFind.upp
@@ -0,0 +1,11 @@
+uses
+    Core;
+
+file
+    bm.h,
+    bm.cpp,
+    main.cpp optimize_speed;
+
+mainconfig
+    "" = "SSE2";
+
diff --git a/benchmarks/StringFind/bm.cpp b/benchmarks/StringFind/bm.cpp
new file mode 100644
index 000000000..af28329ae
--- /dev/null
+++ b/benchmarks/StringFind/bm.cpp
@@ -0,0 +1,153 @@
+#include "bm.h"
+
+// Number of distinct byte values; this is the size of every delta1 table.
+#define ALPHABET_LEN 256
+// Expands to the identifier patlen. No use of this macro appears in this file.
+#define NOT_FOUND patlen
+
+// delta1 table: delta1[c] contains the distance between the last
+// character of pat and the rightmost occurrence of c in pat.
+// If c does not occur in pat, then delta1[c] = patlen.
+// If c is at string[i] and c != pat[patlen-1], we can
+// safely shift i over by delta1[c], which is the minimum distance
+// needed to shift pat forward to get string[i] lined up
+// with some character in pat.
+// this algorithm runs in alphabet_len+patlen time.
+//
+// Parameters:
+//   delta1 - output table; all ALPHABET_LEN entries are written.
+//   pat    - pattern bytes; only pat[0 .. patlen-2] are read, so the
+//            final byte of the pattern never contributes an entry.
+//   patlen - length of pat.
+// With patlen <= 1 the second loop does not run and every entry
+// stays at patlen.
+void make_delta1(int *delta1, uint8_t *pat, int32_t patlen) {
+    int i;
+    // default: byte does not occur in pat[0 .. patlen-2]
+    for (i=0; i < ALPHABET_LEN; i++) {
+        delta1[i] = patlen;
+    }
+    // left-to-right scan: a later (more rightward) occurrence of the same
+    // byte overwrites the earlier one, leaving the smallest distance
+    for (i=0; i < patlen-1; i++) {
+        delta1[pat[i]] = patlen-1 - i;
+    }
+}
+
+// true if the suffix of word starting from word[pos] is a prefix
+// of word
+//
+// Returns 1 when word[pos .. wordlen-1] equals word[0 .. wordlen-pos-1]
+// and 0 otherwise. When pos == wordlen the suffix is empty, the loop
+// does not run and the result is 1.
+int is_prefix(uint8_t *word, int wordlen, int pos) {
+    int i;
+    int suffixlen = wordlen - pos;
+    // could also use the strncmp() library function here
+    for (i = 0; i < suffixlen; i++) {
+        if (word[i] != word[pos+i]) {
+            return 0;
+        }
+    }
+    return 1;
+}
+
+// length of the longest suffix of word ending on word[pos].
+// suffix_length("dddbcabc", 8, 4) = 2
+//
+// Counts how many bytes, walking backwards from word[pos], equal the
+// corresponding bytes walking backwards from word[wordlen-1]. The
+// count is capped at pos, so word[0] is never counted.
+// The caller in this file passes 0 <= pos < wordlen-1.
+int suffix_length(uint8_t *word, int wordlen, int pos) {
+    int i;
+    // increment suffix length i to the first mismatch or beginning
+    // of the word; the bound is tested first so that no bytes are
+    // compared once the cap has been reached
+    for (i = 0; (i < pos) && (word[pos-i] == word[wordlen-1-i]); i++);
+    return i;
+}
+
+// delta2 table: given a mismatch at pat[pos], we want to align
+// with the next possible full match, based on what we
+// know about pat[pos+1] to pat[patlen-1].
+//
+// In case 1:
+// pat[pos+1] to pat[patlen-1] does not occur elsewhere in pat,
+// the next plausible match starts at or after the mismatch.
+// If, within the substring pat[pos+1 .. patlen-1], lies a prefix
+// of pat, the next plausible match is here (if there are multiple
+// prefixes in the substring, pick the longest). Otherwise, the
+// next plausible match starts past the character aligned with
+// pat[patlen-1].
+//
+// In case 2:
+// pat[pos+1] to pat[patlen-1] does occur elsewhere in pat. The
+// mismatch tells us that we are not looking at the end of a match.
+// We may, however, be looking at the middle of a match.
+//
+// The first loop, which takes care of case 1, is analogous to
+// the KMP table, adapted for a 'backwards' scan order with the
+// additional restriction that the substrings it considers as
+// potential prefixes are all suffixes. In the worst case scenario
+// pat consists of the same letter repeated, so every suffix is
+// a prefix. This loop alone is not sufficient, however:
+// Suppose that pat is "ABYXCDEYX", and text is ".....ABYXCDEYX".
+// We will match X, Y, and find B != E. There is no prefix of pat
+// in the suffix "YX", so the first loop tells us to skip forward
+// by 9 characters.
+// Although superficially similar to the KMP table, the KMP table
+// relies on information about the beginning of the partial match
+// that the BM algorithm does not have.
+//
+// The second loop addresses case 2. Since suffix_length may not be
+// unique, we want to take the minimum value, which will tell us
+// how far away the closest potential match is.
+//
+// Parameters:
+//   delta2 - output table; entries delta2[0 .. patlen-1] are written,
+//            so it must have room for patlen ints.
+//   pat    - pattern bytes.
+//   patlen - length of pat; with patlen == 0 neither loop runs.
+void make_delta2(int *delta2, uint8_t *pat, int32_t patlen) {
+    int p;
+    int last_prefix_index = patlen-1;
+
+    // first loop
+    for (p=patlen-1; p>=0; p--) {
+        if (is_prefix(pat, patlen, p+1)) {
+            last_prefix_index = p+1;
+        }
+        delta2[p] = last_prefix_index + (patlen-1 - p);
+    }
+
+    // second loop
+    for (p=0; p < patlen-1; p++) {
+        int slen = suffix_length(pat, patlen, p);
+        if (pat[p - slen] != pat[patlen-1 - slen]) {
+            delta2[patlen-1 - slen] = patlen-1 - p + slen;
+        }
+    }
+}
+
+// Stores a copy of the pattern and rebuilds both shift tables for it.
+// delta2 is (re)allocated to one int per pattern byte.
+void BMFinder::SetPattern(const String& pattern_)
+{
+    pattern = pattern_;
+    int l = pattern.GetLength();
+    make_delta1(delta1, (byte *)~pattern, l);
+    delta2.Alloc(l);
+    make_delta2(delta2, (byte *)~pattern, l);
+}
+
+// Searches data[0 .. len-1] for the stored pattern, comparing from the
+// last pattern byte backwards. Returns the index of the first match or
+// -1 when there is none. An empty pattern returns 0: i starts at -1,
+// the inner loop is skipped because j is already negative, and i + 1
+// is returned.
+int BMFinder::FindIn(const char *data, int len) const
+{
+    int patlen = pattern.GetLength();
+    const char *pat = pattern;
+    int i = patlen - 1;
+    while(i < len) {
+        int j = patlen - 1;
+        while(j >= 0 && (data[i] == pat[j])) {
+            --i;
+            --j;
+        }
+        if(j < 0)
+            return i + 1;
+        // data[i] is cast to byte so that it indexes delta1 as 0..255
+        i += max(delta1[(byte)data[i]], delta2[j]);
+    }
+    return -1;
+}
+
+// Free-function Boyer-Moore search over raw byte buffers.
+// Returns a pointer to the first occurrence of pat[0 .. patlen-1] in
+// string[0 .. stringlen-1]. Returns NULL when there is no occurrence
+// or when the delta2 table cannot be allocated. An empty pattern
+// matches at offset 0, which agrees with BMFinder::FindIn.
+uint8_t* boyer_moore (uint8_t *string, uint32_t stringlen, uint8_t *pat, uint32_t patlen) {
+    int i;
+    int delta1[ALPHABET_LEN];
+    int *delta2;
+
+    // The empty pattern must be handled before the loop: patlen-1 would
+    // be -1, and the unsigned comparison against stringlen would treat
+    // it as a huge value and report "not found".
+    if (patlen == 0) {
+        return string;
+    }
+    delta2 = (int *)malloc(patlen * sizeof(int));
+    if (delta2 == NULL) {
+        return NULL;
+    }
+    make_delta1(delta1, pat, patlen);
+    make_delta2(delta2, pat, patlen);
+    i = patlen-1;
+    // i is never negative here; the cast spells out the conversion the
+    // comparison performs anyway
+    while ((uint32_t)i < stringlen) {
+        int j = patlen-1;
+        while (j >= 0 && (string[i] == pat[j])) {
+            --i;
+            --j;
+        }
+        if (j < 0) {
+            free(delta2);
+            // i may be -1 after a match at offset 0; add the offset in
+            // one step so that string-1 is never formed
+            return string + (i + 1);
+        }
+
+        i += max(delta1[string[i]], delta2[j]);
+    }
+    free(delta2);
+    return NULL;
+}
diff --git a/benchmarks/StringFind/bm.h b/benchmarks/StringFind/bm.h
new file mode 100644
index 000000000..b6c062d39
--- /dev/null
+++ b/benchmarks/StringFind/bm.h
@@ -0,0 +1,23 @@
+#ifndef _BM_bm_h_
+#define _BM_bm_h_
+
+#include <Core/Core.h>
+
+using namespace Upp;
+
+struct BMFinder { // Boyer-Moore searcher that keeps its shift tables between searches
+    int delta1[256]; // per-byte shift table, filled by SetPattern
+    Buffer<int> delta2; // per-position shift table, one int per pattern byte
+    String pattern; // copy of the pattern being searched for
+
+    void SetPattern(const String& pattern); // stores pattern and rebuilds both tables
+
+    int FindIn(const char *data, int len) const; // index of first match, or -1
+    int FindIn(const String& data) const { return FindIn(data, data.GetCount()); }
+
+    BMFinder(const String& pattern) { SetPattern(pattern); }
+};
+
+uint8_t* boyer_moore (uint8_t *string, uint32_t stringlen, uint8_t *pat, uint32_t patlen);
+
+#endif
diff --git a/benchmarks/StringFind/init b/benchmarks/StringFind/init
new file mode 100644
index 000000000..042121e6c
--- /dev/null
+++ b/benchmarks/StringFind/init
@@ -0,0 +1,4 @@
+#ifndef _StringFind_icpp_init_stub
+#define _StringFind_icpp_init_stub
+#include "Core/init"
+#endif
diff --git a/benchmarks/StringFind/main.cpp b/benchmarks/StringFind/main.cpp
new file mode 100644
index 000000000..7e132dafa
--- /dev/null
+++ b/benchmarks/StringFind/main.cpp
@@ -0,0 +1,178 @@
+#include "bm.h"
+
+using namespace Upp;
+
+// Searches haystack[0 .. len-1] for needle[0 .. nsize-1] and returns the
+// offset of the first occurrence, or -1 when there is none. An empty
+// needle (nsize == 0) also yields -1.
+int ffind(const char *haystack, int len, const char *needle, int nsize)
+{
+    if (!nsize) return -1;
+    // A needle longer than the haystack cannot match. (The wording this
+    // guard was inherited with spoke of "nsize + pos" overflowing; this
+    // function has no pos parameter.)
+    if (nsize > len) return -1;
+    // Don't use std::search, use a Boyer-Moore-like trick by comparing
+    // the last characters first
+    int nsize_1 = nsize - 1;
+    int lastNeedle = needle[nsize_1];
+
+    // Boyer-Moore skip value for the last char in the needle. Zero is
+    // not a valid value; skip will be computed the first time it's
+    // needed.
+    int skip = 0;
+
+    const char *i = haystack;
+    // one past the last start position at which the whole needle still fits
+    const char *iEnd = haystack + len - nsize_1;
+
+    while (i < iEnd) {
+        // Boyer-Moore: match the last element in the needle
+        while (i[nsize_1] != lastNeedle) {
+            if (++i == iEnd) {
+                // not found
+                return -1;
+            }
+        }
+        // Here we know that the last char matches
+        // Continue in pedestrian mode
+        for (int j = 0; ; ) {
+            if (i[j] != needle[j]) {
+                // Not found, we can skip
+                // Compute the skip value lazily: the distance from the last
+                // needle char back to its previous occurrence in the needle,
+                // or nsize when it does not occur earlier
+                if (skip == 0) {
+                    skip = 1;
+                    while (skip <= nsize_1 && needle[nsize_1 - skip] != lastNeedle) {
+                        ++skip;
+                    }
+                }
+                i += skip;
+                break;
+            }
+            // Check if done searching
+            if (++j == nsize) {
+                // Yay
+                return i - haystack;
+            }
+        }
+    }
+    return -1;
+}
+
+// Convenience overload for NUL-terminated strings: both lengths come from
+// strlen and are passed on as int.
+int ffind(const char *haystack, const char *needle)
+{
+    return ffind(haystack, strlen(haystack), needle, strlen(needle));
+}
+
+// Runs ffind, boyer_moore and BMFinder on one haystack/needle pair and
+// writes each result through LOG. boyer_moore returns a pointer, which is
+// turned into an offset from data, or -1 when it returned null.
+void test_find(const char *data, const char *pattern)
+{
+    LOG("=======================");
+    LOG("Haystack: " << data);
+    LOG("Needle: " << pattern);
+    LOG("Folly: " << ffind(data, pattern));
+
+    BMFinder bm(pattern);
+    uint8_t *s = boyer_moore((uint8_t *)data, strlen(data), (uint8_t *)pattern, strlen(pattern));
+    LOG("Boyer-Moore0: " << (s ? (char *)s - data : -1));
+    LOG("Boyer-Moore1: " << bm.FindIn(data));
+}
+
+// Searches data for needle with four implementations and writes each
+// result and its elapsed time through RLOG. Every search sits in its own
+// scope with its own TimeStop, so each measurement starts from zero.
+void DoTest(const String& data, const String& needle)
+{
+    RLOG("------------");
+    RLOG("Needle: " << needle);
+    {
+        TimeStop tm;
+        RLOG("U++ Brute force: " << data.Find(needle));
+        RLOG(" Time elapsed: " << tm);
+    }
+    {
+        TimeStop tm;
+        RLOG("Folly: " << ffind(data, data.GetLength(), needle, needle.GetCount()));
+        RLOG(" Time elapsed: " << tm);
+    }
+    {
+        TimeStop tm;
+        uint8_t* t = boyer_moore ((uint8_t *)~data, data.GetCount(),
+                                  (uint8_t *)~needle, needle.GetCount());
+        RLOG("Boyer-Moore C: " << (t ? (char *)t - ~data : -1));
+        RLOG(" Time elapsed: " << tm);
+    }
+    {
+        TimeStop tm;
+        // the BMFinder is constructed after the timer starts, so building
+        // its tables is part of the measured time
+        BMFinder bm(needle);
+        RLOG("Boyer-Moore: " << bm.FindIn(data));
+        RLOG(" Time elapsed: " << tm);
+    }
+}
+
+// Benchmark entry point: builds several haystacks and runs DoTest on each
+// with a set of needles.
+CONSOLE_APP_MAIN
+{
+    StdLogSetup(LOG_COUT|LOG_FILE);
+    String data, needle, n;
+
+/*
+    test_find("zxcvzxcvahoj", "ahoj");
+    test_find("aahoj", "ahoj");
+    test_find("jojajo", "ahoj");
+    test_find("jojajohadjohajohahojaj", "ahoj");
+    int n = String("0123456789").Find("012345");
+    RDUMP(n);
+*/
+
+    RLOG("====================================================");
+    RLOG("**** xml file");
+    // NOTE(review): hard-coded absolute path; the haystack is whatever
+    // LoadFile returns for it, followed by "Hello world!"
+    data = LoadFile("/home/cxl/20131117_ST_ZZSZ.xml") + "Hello world!";
+    DoTest(data, "Hello world");
+    DoTest(data, "Hel");
+
+    RLOG("====================================================");
+    RLOG("**** a..ab");
+    // 100000000 'a' characters followed by a single 'b'
+    data = String('a', 100000000) + "b";
+    DoTest(data, "b");
+    DoTest(data, "ab");
+    DoTest(data, "aab");
+    DoTest(data, "aaab");
+    DoTest(data, "aaaab");
+    DoTest(data, "aaaaaaaaaaaaaaaaaab");
+    DoTest(data, String('a', 100) + "b");
+
+    RLOG("====================================================");
+    RLOG("**** a..aba..a");
+    // a single 'b' with 50000000 'a' characters on each side
+    data = String('a', 50000000) + "b" + String('a', 50000000);
+    DoTest(data, "ab");
+    DoTest(data, "aba");
+    DoTest(data, "aaba");
+    DoTest(data, "aabaa");
+    DoTest(data, "aaaabaaaa");
+    DoTest(data, "aaaaaabaaaaaa");
+    DoTest(data, String('a', 20) + "b" + String('a', 20));
+    DoTest(data, String('a', 100) + "b");
+
+    RLOG("====================================================");
+    RLOG("**** 10000000 * Hello_world!");
+    // near-misses ("Hello_world!") repeated, then the real needle once at the end
+    data.Clear();
+    for(int i = 0; i < 10000000; i++)
+        data << "Hello_world!";
+    data << "Hello world!";
+    DoTest(data, "Hello world!");
+
+    RLOG("====================================================");
+    RLOG("**** all chars repeated");
+    // needle starts as the characters 32..126; the haystack repeats it
+    // 1000000 times, then 'x' is appended to the needle and that longer
+    // needle is appended once
+    for(int i = 32; i < 127; i++)
+        needle.Cat(i);
+    data.Clear();
+    for(int i = 0; i < 1000000; i++)
+        data << needle;
+    needle << 'x';
+    data << needle;
+    DoTest(data, needle);
+
+    RLOG("====================================================");
+    RLOG("**** all chars repeated, space in middle");
+    // n is the needle with the character at index 60 replaced by a space;
+    // the haystack repeats n and ends with one unmodified needle
+    needle.Clear();
+    for(int i = 32; i < 127; i++)
+        needle.Cat(i);
+    data.Clear();
+    n = needle;
+    n.Set(60, ' ');
+    for(int i = 0; i < 1000000; i++)
+        data << n;
+    data << needle;
+    DoTest(data, needle);
+}
diff --git a/benchmarks/sizeof/main.cpp b/benchmarks/sizeof/main.cpp
index be748097e..f64ceca09 100644
--- a/benchmarks/sizeof/main.cpp
+++ b/benchmarks/sizeof/main.cpp
@@ -38,6 +38,7 @@ CONSOLE_APP_MAIN
     RDUMP(sizeof(Value));
     RDUMP(sizeof(ValueArray));
     RDUMP(sizeof(ValueMap));
+    RDUMP(sizeof(XmlNode));
     RLOG("========================");
     RDUMP(sizeof(Vector));
     RDUMP(sizeof(Array));