.benchmarks: StringFind

git-svn-id: svn://ultimatepp.org/upp/trunk@6978 f0d560ea-af0d-0410-9eb7-867de7ffcac7
This commit is contained in:
cxl 2014-03-02 10:04:47 +00:00
parent 51ff437d49
commit 677a3e7dcf
6 changed files with 370 additions and 0 deletions

View file

@ -0,0 +1,11 @@
uses
Core;
file
bm.h,
bm.cpp,
main.cpp optimize_speed;
mainconfig
"" = "SSE2";

View file

@ -0,0 +1,153 @@
#include "bm.h"
#define ALPHABET_LEN 256
#define NOT_FOUND patlen
// delta1 table (per-byte shift): for every byte value c, delta1[c] is the
// distance from the last character of pat back to the rightmost occurrence
// of c in pat[0 .. patlen-2]. The final character of pat is deliberately not
// recorded, so a byte that occurs only there (or not at all) keeps the
// default value patlen.
// If c is at string[i] and c != pat[patlen-1], the search index i can safely
// be moved forward by delta1[c], the minimum distance needed to line
// string[i] up with some character of pat.
// Runs in ALPHABET_LEN + patlen time.
//   delta1 - output array with room for ALPHABET_LEN ints
//   pat    - pattern bytes
//   patlen - number of bytes in pat
void make_delta1(int *delta1, uint8_t *pat, int32_t patlen) {
    // Default: the byte is absent, so the whole pattern can be skipped.
    for (int c = 0; c < ALPHABET_LEN; ++c)
        delta1[c] = patlen;
    // Walk left to right so that a later (more rightward) occurrence
    // overwrites an earlier one with its smaller distance.
    const int last = patlen - 1;
    for (int k = 0; k < last; ++k)
        delta1[pat[k]] = last - k;
}
// Returns 1 when the suffix of word that starts at word[pos] is also a
// prefix of word, 0 otherwise. An empty suffix (pos == wordlen) counts as
// a prefix.
//   word    - the bytes to examine
//   wordlen - number of bytes in word
//   pos     - index at which the suffix starts
int is_prefix(uint8_t *word, int wordlen, int pos) {
    const int tail_len = wordlen - pos;
    int k = 0;
    // Advance while the head of the word and the suffix keep agreeing.
    while (k < tail_len && word[k] == word[pos + k])
        ++k;
    // Every suffix byte was consumed only if no mismatch stopped the scan.
    return k >= tail_len ? 1 : 0;
}
// Length of the longest suffix of word that ends on word[pos]: counts how
// many bytes, read backwards from word[pos], equal the bytes read backwards
// from the end of word.
// suffix_length("dddbcabc", 8, 4) = 2
// The count is capped at pos, so the comparison that would involve word[0]
// as the final matching byte is never added to the result.
int suffix_length(uint8_t *word, int wordlen, int pos) {
    const int last = wordlen - 1;
    int matched = 0;
    // Grow the match until the first mismatch or until the cap is reached.
    while (word[pos - matched] == word[last - matched] && matched < pos)
        ++matched;
    return matched;
}
// delta2 table: given a mismatch at pat[pos], delta2[pos] is the amount to
// add to the text index so that pat is aligned with the next possible full
// match, based on what we know about pat[pos+1] to pat[patlen-1] (the part
// that has already matched).
//
// In case 1:
// pat[pos+1] to pat[patlen-1] does not occur elsewhere in pat,
// the next plausible match starts at or after the mismatch.
// If, within the substring pat[pos+1 .. patlen-1], lies a prefix
// of pat, the next plausible match is here (if there are multiple
// prefixes in the substring, pick the longest). Otherwise, the
// next plausible match starts past the character aligned with
// pat[patlen-1].
//
// In case 2:
// pat[pos+1] to pat[patlen-1] does occur elsewhere in pat. The
// mismatch tells us that we are not looking at the end of a match.
// We may, however, be looking at the middle of a match.
//
// The first loop, which takes care of case 1, is analogous to
// the KMP table, adapted for a 'backwards' scan order with the
// additional restriction that the substrings it considers as
// potential prefixes are all suffixes. In the worst case scenario
// pat consists of the same letter repeated, so every suffix is
// a prefix. This loop alone is not sufficient, however:
// Suppose that pat is "ABYXCDEYX", and text is ".....ABYXCDEYX".
// We will match X, Y, and find B != E. There is no prefix of pat
// in the suffix "YX", so the first loop tells us to skip forward
// by 9 characters.
// Although superficially similar to the KMP table, the KMP table
// relies on information about the beginning of the partial match
// that the BM algorithm does not have.
//
// The second loop addresses case 2. Since suffix_length may not be
// unique, we want to take the minimum value, which will tell us
// how far away the closest potential match is.
//
//   delta2 - output array with room for patlen ints
//   pat    - pattern bytes
//   patlen - number of bytes in pat
void make_delta2(int *delta2, uint8_t *pat, int32_t patlen) {
    const int last = patlen - 1;

    // First loop (case 1): scan from the end, remembering the start of the
    // most recently seen suffix that is also a prefix of pat.
    int prefix_at = last;
    for (int pos = last; pos >= 0; --pos) {
        const int next = pos + 1;
        if (is_prefix(pat, patlen, next))
            prefix_at = next;
        delta2[pos] = prefix_at + (last - pos);
    }

    // Second loop (case 2): a suffix that re-occurs ending at pos, preceded
    // by a different byte, gives a shorter shift. Scanning left to right
    // lets later (closer) occurrences overwrite earlier ones.
    for (int pos = 0; pos < last; ++pos) {
        const int slen = suffix_length(pat, patlen, pos);
        if (pat[pos - slen] != pat[last - slen])
            delta2[last - slen] = last - pos + slen;
    }
}
void BMFinder::SetPattern(const String& pattern_)
{
pattern = pattern_;
int l = pattern.GetLength();
make_delta1(delta1, (byte *)~pattern, l);
delta2.Alloc(l);
make_delta2(delta2, (byte *)~pattern, l);
}
int BMFinder::FindIn(const char *data, int len) const
{
int patlen = pattern.GetLength();
const char *pat = pattern;
int i = patlen - 1;
while(i < len) {
int j = patlen - 1;
while(j >= 0 && (data[i] == pat[j])) {
--i;
--j;
}
if(j < 0)
return i + 1;
i += max(delta1[(byte)data[i]], delta2[j]);
}
return -1;
}
// Stand-alone Boyer-Moore search over raw byte buffers.
//   string, stringlen - text to search and its length in bytes
//   pat, patlen       - pattern and its length in bytes; patlen must fit in
//                       int32_t because the table builders take int32_t
// Returns a pointer to the start of the match inside string, or NULL when
// there is no match. NULL is also returned for an empty pattern and when the
// temporary delta2 table cannot be allocated.
uint8_t* boyer_moore (uint8_t *string, uint32_t stringlen, uint8_t *pat, uint32_t patlen) {
    // An empty pattern never matches and a pattern longer than the text
    // cannot match; neither case needs the shift tables.
    if (patlen == 0 || patlen > stringlen) {
        return NULL;
    }
    int delta1[ALPHABET_LEN];
    int *delta2 = (int *)malloc(patlen * sizeof(int));
    if (delta2 == NULL) {
        // Out of memory: make_delta2 would otherwise write through NULL.
        return NULL;
    }
    make_delta1(delta1, pat, patlen);
    make_delta2(delta2, pat, patlen);

    // The text index is a signed 64-bit value: it is compared against an
    // unsigned 32-bit length and advanced by table shifts, so a plain int
    // would mix signedness and could overflow on texts of 2 GiB or more.
    const int64_t end = stringlen;
    const int last = (int)(patlen - 1);
    uint8_t *match = NULL;
    int64_t i = last;
    while (i < end) {
        // Compare backwards from the end of the pattern.
        int j = last;
        while (j >= 0 && (string[i] == pat[j])) {
            --i;
            --j;
        }
        if (j < 0) {
            match = string + i + 1; // i stopped one byte before the match
            break;
        }
        i += max(delta1[string[i]], delta2[j]);
    }
    free(delta2);
    return match;
}

View file

@ -0,0 +1,23 @@
#ifndef _BM_bm_h_
#define _BM_bm_h_
#include <Core/Core.h>
using namespace Upp;
// Boyer-Moore searcher for one fixed pattern: the shift tables are built
// once by SetPattern and reused by every FindIn call.
struct BMFinder {
	int         delta1[256]; // per-byte shift table, filled by SetPattern
	Buffer<int> delta2;      // per-pattern-position shift table, filled by SetPattern
	String      pattern;     // copy of the pattern being searched for

	// Builds the tables for pattern right away.
	BMFinder(const String& pattern) { SetPattern(pattern); }

	// Replaces the pattern and rebuilds both tables.
	void SetPattern(const String& pattern);

	// Returns the index of the match in data[0 .. len-1], or -1 if none.
	int  FindIn(const char *data, int len) const;

	// Convenience overload that searches a whole String.
	int  FindIn(const String& data) const
	{
		return FindIn(data, data.GetCount());
	}
};
// Boyer-Moore search over raw bytes: looks for pat[0 .. patlen-1] in
// string[0 .. stringlen-1]. Returns a pointer to the start of the match
// inside string, or NULL when no match is found.
uint8_t* boyer_moore (uint8_t *string, uint32_t stringlen, uint8_t *pat, uint32_t patlen);
#endif

View file

@ -0,0 +1,4 @@
#ifndef _StringFind_icpp_init_stub
#define _StringFind_icpp_init_stub
#include "Core/init"
#endif

View file

@ -0,0 +1,178 @@
#include "bm.h"
using namespace Upp;
// Folly-style substring search: a Boyer-Moore-like scan that lines up the
// last needle character first and then compares from the front.
//   haystack, len - text to search and its length
//   needle, nsize - pattern and its length
// Returns the index of the first match, or -1 when there is none. An empty
// needle and a needle longer than the haystack both yield -1.
int ffind(const char *haystack, int len, const char *needle, int nsize)
{
	if(nsize == 0 || nsize > len)
		return -1;

	const int tail = nsize - 1;         // index of the last needle character
	const int tailChar = needle[tail];
	// Shift applied after a failed comparison. Zero is not a valid shift and
	// means "not computed yet"; it is filled in the first time it is needed.
	int shift = 0;

	const char *const stop = haystack + len - tail; // one past the last start position
	const char *cur = haystack;
	while(cur < stop) {
		// Slide forward until the last needle character lines up.
		while(cur[tail] != tailChar)
			if(++cur == stop)
				return -1;

		// The last character matches; verify the candidate from the front.
		int k = 0;
		while(k < nsize && cur[k] == needle[k])
			++k;
		if(k == nsize)
			return cur - haystack;

		// Mismatch: shift by the distance from the end of the needle to the
		// previous occurrence of its last character (nsize if there is none).
		if(shift == 0) {
			shift = 1;
			while(shift <= tail && needle[tail - shift] != tailChar)
				++shift;
		}
		cur += shift;
	}
	return -1;
}
int ffind(const char *haystack, const char *needle)
{
return ffind(haystack, strlen(haystack), needle, strlen(needle));
}
void test_find(const char *data, const char *pattern)
{
LOG("=======================");
LOG("Haystack: " << data);
LOG("Needle: " << pattern);
LOG("Folly: " << ffind(data, pattern));
BMFinder bm(pattern);
uint8_t *s = boyer_moore((uint8_t *)data, strlen(data), (uint8_t *)pattern, strlen(pattern));
LOG("Boyer-Moore0: " << (s ? (char *)s - data : -1));
LOG("Boyer-Moore1: " << bm.FindIn(data));
}
void DoTest(const String& data, const String& needle)
{
RLOG("------------");
RLOG("Needle: " << needle);
{
TimeStop tm;
RLOG("U++ Brute force: " << data.Find(needle));
RLOG(" Time elapsed: " << tm);
}
{
TimeStop tm;
RLOG("Folly: " << ffind(data, data.GetLength(), needle, needle.GetCount()));
RLOG(" Time elapsed: " << tm);
}
{
TimeStop tm;
uint8_t* t = boyer_moore ((uint8_t *)~data, data.GetCount(),
(uint8_t *)~needle, needle.GetCount());
RLOG("Boyer-Moore C: " << (t ? (char *)t - ~data : -1));
RLOG(" Time elapsed: " << tm);
}
{
TimeStop tm;
BMFinder bm(needle);
RLOG("Boyer-Moore: " << bm.FindIn(data));
RLOG(" Time elapsed: " << tm);
}
}
// Benchmark driver: builds several large haystacks and calls DoTest on each
// haystack/needle combination. Results go to the console and the log file.
CONSOLE_APP_MAIN
{
	StdLogSetup(LOG_COUT|LOG_FILE);

	static const char *const separator = "====================================================";
	// Needles for the "a..ab" haystack, in the order they are run.
	static const char *const tail_b[] = {
		"b", "ab", "aab", "aaab", "aaaab", "aaaaaaaaaaaaaaaaaab",
	};
	// Needles for the "a..aba..a" haystack, in the order they are run.
	static const char *const mid_b[] = {
		"ab", "aba", "aaba", "aabaa", "aaaabaaaa", "aaaaaabaaaaaa",
	};

	String data, needle, n;
/*
test_find("zxcvzxcvahoj", "ahoj");
test_find("aahoj", "ahoj");
test_find("jojajo", "ahoj");
test_find("jojajohadjohajohahojaj", "ahoj");
int n = String("0123456789").Find("012345");
RDUMP(n);
*/
	// A file loaded from disk with the target phrase appended at the very end.
	RLOG(separator);
	RLOG("**** xml file");
	data = LoadFile("/home/cxl/20131117_ST_ZZSZ.xml") + "Hello world!";
	DoTest(data, "Hello world");
	DoTest(data, "Hel");

	// 100000000 'a' characters followed by a single 'b'.
	RLOG(separator);
	RLOG("**** a..ab");
	data = String('a', 100000000) + "b";
	for(int i = 0; i < (int)(sizeof(tail_b) / sizeof(tail_b[0])); i++)
		DoTest(data, tail_b[i]);
	DoTest(data, String('a', 100) + "b");

	// A single 'b' with 50000000 'a' characters on each side.
	RLOG(separator);
	RLOG("**** a..aba..a");
	data = String('a', 50000000) + "b" + String('a', 50000000);
	for(int i = 0; i < (int)(sizeof(mid_b) / sizeof(mid_b[0])); i++)
		DoTest(data, mid_b[i]);
	DoTest(data, String('a', 20) + "b" + String('a', 20));
	DoTest(data, String('a', 100) + "b");

	// Many near misses ("Hello_world!") followed by one exact hit.
	RLOG(separator);
	RLOG("**** 10000000 * Hello_world!");
	data.Clear();
	for(int i = 0; i < 10000000; i++)
		data << "Hello_world!";
	data << "Hello world!";
	DoTest(data, "Hello world!");

	// Characters 32..126 repeated; the needle is that run plus a trailing 'x'.
	RLOG(separator);
	RLOG("**** all chars repeated");
	for(int c = 32; c < 127; c++)
		needle.Cat(c);
	data.Clear();
	for(int i = 0; i < 1000000; i++)
		data << needle;
	needle << 'x';
	data << needle;
	DoTest(data, needle);

	// Same run of characters, but the repeated copies have the character at
	// index 60 replaced by a space; only the final copy is intact.
	RLOG(separator);
	RLOG("**** all chars repeated, space in middle");
	needle.Clear();
	for(int c = 32; c < 127; c++)
		needle.Cat(c);
	data.Clear();
	n = needle;
	n.Set(60, ' ');
	for(int i = 0; i < 1000000; i++)
		data << n;
	data << needle;
	DoTest(data, needle);
}

View file

@ -38,6 +38,7 @@ CONSOLE_APP_MAIN
RDUMP(sizeof(Value));
RDUMP(sizeof(ValueArray));
RDUMP(sizeof(ValueMap));
RDUMP(sizeof(XmlNode));
RLOG("========================");
RDUMP(sizeof(Vector<int>));
RDUMP(sizeof(Array<int>));