ultimatepp/benchmarks/StringFind/main.cpp
cxl c777e09587 .benchmarks: StringFind
git-svn-id: svn://ultimatepp.org/upp/trunk@6978 f0d560ea-af0d-0410-9eb7-867de7ffcac7
2014-03-02 10:04:47 +00:00

178 lines
4.9 KiB
C++

#include "bm.h"
using namespace Upp;
int ffind(const char *haystack, int len, const char *needle, int nsize)
{
if (!nsize) return -1;
// nsize + pos can overflow (eg pos == npos), guard against that by checking
// that nsize + pos does not wrap around.
if (nsize > len) return -1;
// Don't use std::search, use a Boyer-Moore-like trick by comparing
// the last characters first
int nsize_1 = nsize - 1;
int lastNeedle = needle[nsize_1];
// Boyer-Moore skip value for the last char in the needle. Zero is
// not a valid value; skip will be computed the first time it's
// needed.
int skip = 0;
const char *i = haystack;
const char *iEnd = haystack + len - nsize_1;
while (i < iEnd) {
// Boyer-Moore: match the last element in the needle
while (i[nsize_1] != lastNeedle) {
if (++i == iEnd) {
// not found
return -1;
}
}
// Here we know that the last char matches
// Continue in pedestrian mode
for (int j = 0; ; ) {
if (i[j] != needle[j]) {
// Not found, we can skip
// Compute the skip value lazily
if (skip == 0) {
skip = 1;
while (skip <= nsize_1 && needle[nsize_1 - skip] != lastNeedle) {
++skip;
}
}
i += skip;
break;
}
// Check if done searching
if (++j == nsize) {
// Yay
return i - haystack;
}
}
}
return -1;
}
int ffind(const char *haystack, const char *needle)
{
return ffind(haystack, strlen(haystack), needle, strlen(needle));
}
void test_find(const char *data, const char *pattern)
{
LOG("=======================");
LOG("Haystack: " << data);
LOG("Needle: " << pattern);
LOG("Folly: " << ffind(data, pattern));
BMFinder bm(pattern);
uint8_t *s = boyer_moore((uint8_t *)data, strlen(data), (uint8_t *)pattern, strlen(pattern));
LOG("Boyer-Moore0: " << (s ? (char *)s - data : -1));
LOG("Boyer-Moore1: " << bm.FindIn(data));
}
void DoTest(const String& data, const String& needle)
{
RLOG("------------");
RLOG("Needle: " << needle);
{
TimeStop tm;
RLOG("U++ Brute force: " << data.Find(needle));
RLOG(" Time elapsed: " << tm);
}
{
TimeStop tm;
RLOG("Folly: " << ffind(data, data.GetLength(), needle, needle.GetCount()));
RLOG(" Time elapsed: " << tm);
}
{
TimeStop tm;
uint8_t* t = boyer_moore ((uint8_t *)~data, data.GetCount(),
(uint8_t *)~needle, needle.GetCount());
RLOG("Boyer-Moore C: " << (t ? (char *)t - ~data : -1));
RLOG(" Time elapsed: " << tm);
}
{
TimeStop tm;
BMFinder bm(needle);
RLOG("Boyer-Moore: " << bm.FindIn(data));
RLOG(" Time elapsed: " << tm);
}
}
CONSOLE_APP_MAIN
{
StdLogSetup(LOG_COUT|LOG_FILE);
String data, needle, n;
/*
test_find("zxcvzxcvahoj", "ahoj");
test_find("aahoj", "ahoj");
test_find("jojajo", "ahoj");
test_find("jojajohadjohajohahojaj", "ahoj");
int n = String("0123456789").Find("012345");
RDUMP(n);
*/
RLOG("====================================================");
RLOG("**** xml file");
data = LoadFile("/home/cxl/20131117_ST_ZZSZ.xml") + "Hello world!";
DoTest(data, "Hello world");
DoTest(data, "Hel");
RLOG("====================================================");
RLOG("**** a..ab");
data = String('a', 100000000) + "b";
DoTest(data, "b");
DoTest(data, "ab");
DoTest(data, "aab");
DoTest(data, "aaab");
DoTest(data, "aaaab");
DoTest(data, "aaaaaaaaaaaaaaaaaab");
DoTest(data, String('a', 100) + "b");
RLOG("====================================================");
RLOG("**** a..aba..a");
data = String('a', 50000000) + "b" + String('a', 50000000);
DoTest(data, "ab");
DoTest(data, "aba");
DoTest(data, "aaba");
DoTest(data, "aabaa");
DoTest(data, "aaaabaaaa");
DoTest(data, "aaaaaabaaaaaa");
DoTest(data, String('a', 20) + "b" + String('a', 20));
DoTest(data, String('a', 100) + "b");
RLOG("====================================================");
RLOG("**** 10000000 * Hello_world!");
data.Clear();
for(int i = 0; i < 10000000; i++)
data << "Hello_world!";
data << "Hello world!";
DoTest(data, "Hello world!");
RLOG("====================================================");
RLOG("**** all chars repeated");
for(int i = 32; i < 127; i++)
needle.Cat(i);
data.Clear();
for(int i = 0; i < 1000000; i++)
data << needle;
needle << 'x';
data << needle;
DoTest(data, needle);
RLOG("====================================================");
RLOG("**** all chars repeated, space in middle");
needle.Clear();
for(int i = 32; i < 127; i++)
needle.Cat(i);
data.Clear();
n = needle;
n.Set(60, ' ');
for(int i = 0; i < 1000000; i++)
data << n;
data << needle;
DoTest(data, needle);
}