mirror of
https://github.com/ultimatepp/ultimatepp.git
synced 2026-05-15 14:16:07 -06:00
.benchmarks: StringFind
git-svn-id: svn://ultimatepp.org/upp/trunk@6978 f0d560ea-af0d-0410-9eb7-867de7ffcac7
This commit is contained in:
parent
51ff437d49
commit
677a3e7dcf
6 changed files with 370 additions and 0 deletions
11
benchmarks/StringFind/StringFind.upp
Normal file
11
benchmarks/StringFind/StringFind.upp
Normal file
|
|
@ -0,0 +1,11 @@
|
|||
uses
|
||||
Core;
|
||||
|
||||
file
|
||||
bm.h,
|
||||
bm.cpp,
|
||||
main.cpp optimize_speed;
|
||||
|
||||
mainconfig
|
||||
"" = "SSE2";
|
||||
|
||||
153
benchmarks/StringFind/bm.cpp
Normal file
153
benchmarks/StringFind/bm.cpp
Normal file
|
|
@ -0,0 +1,153 @@
|
|||
#include "bm.h"
|
||||
|
||||
#define ALPHABET_LEN 256
#define NOT_FOUND patlen

// Builds the bad-character table (delta1) of the Boyer-Moore search.
//
//   delta1  - output array with ALPHABET_LEN entries, indexed by byte value
//   pat     - pattern bytes
//   patlen  - number of bytes in pat
//
// After the call, delta1[c] is the distance from the rightmost occurrence
// of c in pat[0 .. patlen-2] to the last pattern position. Bytes that do
// not occur in that range get patlen (NOT_FOUND). When the text byte c
// aligned with the pattern does not match, the search may advance by
// delta1[c] to line that text byte up with some occurrence of c in pat.
//
// Cost: ALPHABET_LEN + patlen steps.
void make_delta1(int *delta1, uint8_t *pat, int32_t patlen) {
    // Default for every byte value: shift by the whole pattern length.
    for (int c = 0; c < ALPHABET_LEN; ++c)
        delta1[c] = NOT_FOUND;

    // The final pattern byte is left out on purpose; later occurrences
    // overwrite earlier ones, so the rightmost occurrence wins.
    const int32_t last = patlen - 1;
    for (int32_t k = 0; k < last; ++k)
        delta1[pat[k]] = last - k;
}
|
||||
|
||||
// Returns 1 when the suffix word[pos .. wordlen-1] is identical to the
// prefix of word of the same length, and 0 otherwise. An empty suffix
// (pos == wordlen) yields 1.
int is_prefix(uint8_t *word, int wordlen, int pos) {
    const uint8_t *front = word;        // walks the prefix
    const uint8_t *back = word + pos;   // walks the suffix
    for (int remaining = wordlen - pos; remaining > 0; --remaining) {
        if (*front++ != *back++)
            return 0;
    }
    return 1;
}
|
||||
|
||||
// Counts how many bytes, read backwards from word[pos], agree with the
// bytes read backwards from the end of word. The count never exceeds pos.
// Example: suffix_length("dddbcabc", 8, 4) = 2  ("bc" ends at index 4).
int suffix_length(uint8_t *word, int wordlen, int pos) {
    const int last = wordlen - 1;
    int matched = 0;
    // Stop at the first disagreement or once `pos` bytes have been matched.
    while (word[pos - matched] == word[last - matched] && matched < pos)
        ++matched;
    return matched;
}
|
||||
|
||||
// delta2 table: given a mismatch at pat[pos], we want to align
|
||||
// with the next possible full match could be based on what we
|
||||
// know about pat[pos+1] to pat[patlen-1].
|
||||
//
|
||||
// In case 1:
|
||||
// pat[pos+1] to pat[patlen-1] does not occur elsewhere in pat,
|
||||
// the next plausible match starts at or after the mismatch.
|
||||
// If, within the substring pat[pos+1 .. patlen-1], lies a prefix
|
||||
// of pat, the next plausible match is here (if there are multiple
|
||||
// prefixes in the substring, pick the longest). Otherwise, the
|
||||
// next plausible match starts past the character aligned with
|
||||
// pat[patlen-1].
|
||||
//
|
||||
// In case 2:
|
||||
// pat[pos+1] to pat[patlen-1] does occur elsewhere in pat. The
|
||||
// mismatch tells us that we are not looking at the end of a match.
|
||||
// We may, however, be looking at the middle of a match.
|
||||
//
|
||||
// The first loop, which takes care of case 1, is analogous to
|
||||
// the KMP table, adapted for a 'backwards' scan order with the
|
||||
// additional restriction that the substrings it considers as
|
||||
// potential prefixes are all suffixes. In the worst case scenario
|
||||
// pat consists of the same letter repeated, so every suffix is
|
||||
// a prefix. This loop alone is not sufficient, however:
|
||||
// Suppose that pat is "ABYXCDEYX", and text is ".....ABYXCDEYX".
|
||||
// We will match X, Y, and find B != E. There is no prefix of pat
|
||||
// in the suffix "YX", so the first loop tells us to skip forward
|
||||
// by 9 characters.
|
||||
// Although superficially similar to the KMP table, the KMP table
|
||||
// relies on information about the beginning of the partial match
|
||||
// that the BM algorithm does not have.
|
||||
//
|
||||
// The second loop addresses case 2. Since suffix_length may not be
|
||||
// unique, we want to take the minimum value, which will tell us
|
||||
// how far away the closest potential match is.
|
||||
void make_delta2(int *delta2, uint8_t *pat, int32_t patlen) {
|
||||
int p;
|
||||
int last_prefix_index = patlen-1;
|
||||
|
||||
// first loop
|
||||
for (p=patlen-1; p>=0; p--) {
|
||||
if (is_prefix(pat, patlen, p+1)) {
|
||||
last_prefix_index = p+1;
|
||||
}
|
||||
delta2[p] = last_prefix_index + (patlen-1 - p);
|
||||
}
|
||||
|
||||
// second loop
|
||||
for (p=0; p < patlen-1; p++) {
|
||||
int slen = suffix_length(pat, patlen, p);
|
||||
if (pat[p - slen] != pat[patlen-1 - slen]) {
|
||||
delta2[patlen-1 - slen] = patlen-1 - p + slen;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void BMFinder::SetPattern(const String& pattern_)
|
||||
{
|
||||
pattern = pattern_;
|
||||
int l = pattern.GetLength();
|
||||
make_delta1(delta1, (byte *)~pattern, l);
|
||||
delta2.Alloc(l);
|
||||
make_delta2(delta2, (byte *)~pattern, l);
|
||||
}
|
||||
|
||||
int BMFinder::FindIn(const char *data, int len) const
|
||||
{
|
||||
int patlen = pattern.GetLength();
|
||||
const char *pat = pattern;
|
||||
int i = patlen - 1;
|
||||
while(i < len) {
|
||||
int j = patlen - 1;
|
||||
while(j >= 0 && (data[i] == pat[j])) {
|
||||
--i;
|
||||
--j;
|
||||
}
|
||||
if(j < 0)
|
||||
return i + 1;
|
||||
i += max(delta1[(byte)data[i]], delta2[j]);
|
||||
}
|
||||
return -1;
|
||||
}
|
||||
|
||||
uint8_t* boyer_moore (uint8_t *string, uint32_t stringlen, uint8_t *pat, uint32_t patlen) {
|
||||
int i;
|
||||
int delta1[ALPHABET_LEN];
|
||||
int *delta2 = (int *)malloc(patlen * sizeof(int));
|
||||
make_delta1(delta1, pat, patlen);
|
||||
make_delta2(delta2, pat, patlen);
|
||||
i = patlen-1;
|
||||
while (i < stringlen) {
|
||||
int j = patlen-1;
|
||||
while (j >= 0 && (string[i] == pat[j])) {
|
||||
--i;
|
||||
--j;
|
||||
}
|
||||
if (j < 0) {
|
||||
free(delta2);
|
||||
return (string + i+1);
|
||||
}
|
||||
|
||||
i += max(delta1[string[i]], delta2[j]);
|
||||
}
|
||||
free(delta2);
|
||||
return NULL;
|
||||
}
|
||||
23
benchmarks/StringFind/bm.h
Normal file
23
benchmarks/StringFind/bm.h
Normal file
|
|
@ -0,0 +1,23 @@
|
|||
#ifndef _BM_bm_h_
|
||||
#define _BM_bm_h_
|
||||
|
||||
#include <Core/Core.h>
|
||||
|
||||
using namespace Upp;
|
||||
|
||||
struct BMFinder {
|
||||
int delta1[256];
|
||||
Buffer<int> delta2;
|
||||
String pattern;
|
||||
|
||||
void SetPattern(const String& pattern);
|
||||
|
||||
int FindIn(const char *data, int len) const;
|
||||
int FindIn(const String& data) const { return FindIn(data, data.GetCount()); }
|
||||
|
||||
BMFinder(const String& pattern) { SetPattern(pattern); }
|
||||
};
|
||||
|
||||
uint8_t* boyer_moore (uint8_t *string, uint32_t stringlen, uint8_t *pat, uint32_t patlen);
|
||||
|
||||
#endif
|
||||
4
benchmarks/StringFind/init
Normal file
4
benchmarks/StringFind/init
Normal file
|
|
@ -0,0 +1,4 @@
|
|||
#ifndef _StringFind_icpp_init_stub
|
||||
#define _StringFind_icpp_init_stub
|
||||
#include "Core/init"
|
||||
#endif
|
||||
178
benchmarks/StringFind/main.cpp
Normal file
178
benchmarks/StringFind/main.cpp
Normal file
|
|
@ -0,0 +1,178 @@
|
|||
#include "bm.h"
|
||||
|
||||
using namespace Upp;
|
||||
|
||||
// Substring search using the "compare the last character first" trick
// (a simplified Boyer-Moore variant).
//
//   haystack, len  - text and its length
//   needle, nsize  - pattern and its length
//
// Returns the offset of the first occurrence of needle in haystack, or -1
// when there is none. An empty needle, or a needle longer than the
// haystack, also yields -1.
int ffind(const char *haystack, int len, const char *needle, int nsize)
{
	if(nsize == 0 || nsize > len)
		return -1;

	const int last = nsize - 1;
	const int last_ch = needle[last];
	const int limit = len - last; // one past the final valid start offset

	// Shift applied after a failed candidate: distance from the last needle
	// character back to its previous occurrence in the needle. Zero means
	// "not computed yet"; it is worked out the first time it is needed.
	int skip = 0;

	int pos = 0;
	while(pos < limit) {
		// Fast scan: find a window whose last character matches.
		while(haystack[pos + last] != last_ch)
			if(++pos == limit)
				return -1;

		// Last character matches; verify the window from the front.
		int j = 0;
		while(j < nsize && haystack[pos + j] == needle[j])
			++j;
		if(j == nsize)
			return pos;

		// Candidate failed; compute the skip lazily, then jump.
		if(skip == 0) {
			skip = 1;
			while(skip <= last && needle[last - skip] != last_ch)
				++skip;
		}
		pos += skip;
	}
	return -1;
}
|
||||
|
||||
int ffind(const char *haystack, const char *needle)
|
||||
{
|
||||
return ffind(haystack, strlen(haystack), needle, strlen(needle));
|
||||
}
|
||||
|
||||
void test_find(const char *data, const char *pattern)
|
||||
{
|
||||
LOG("=======================");
|
||||
LOG("Haystack: " << data);
|
||||
LOG("Needle: " << pattern);
|
||||
LOG("Folly: " << ffind(data, pattern));
|
||||
|
||||
BMFinder bm(pattern);
|
||||
uint8_t *s = boyer_moore((uint8_t *)data, strlen(data), (uint8_t *)pattern, strlen(pattern));
|
||||
LOG("Boyer-Moore0: " << (s ? (char *)s - data : -1));
|
||||
LOG("Boyer-Moore1: " << bm.FindIn(data));
|
||||
}
|
||||
|
||||
void DoTest(const String& data, const String& needle)
|
||||
{
|
||||
RLOG("------------");
|
||||
RLOG("Needle: " << needle);
|
||||
{
|
||||
TimeStop tm;
|
||||
RLOG("U++ Brute force: " << data.Find(needle));
|
||||
RLOG(" Time elapsed: " << tm);
|
||||
}
|
||||
{
|
||||
TimeStop tm;
|
||||
RLOG("Folly: " << ffind(data, data.GetLength(), needle, needle.GetCount()));
|
||||
RLOG(" Time elapsed: " << tm);
|
||||
}
|
||||
{
|
||||
TimeStop tm;
|
||||
uint8_t* t = boyer_moore ((uint8_t *)~data, data.GetCount(),
|
||||
(uint8_t *)~needle, needle.GetCount());
|
||||
RLOG("Boyer-Moore C: " << (t ? (char *)t - ~data : -1));
|
||||
RLOG(" Time elapsed: " << tm);
|
||||
}
|
||||
{
|
||||
TimeStop tm;
|
||||
BMFinder bm(needle);
|
||||
RLOG("Boyer-Moore: " << bm.FindIn(data));
|
||||
RLOG(" Time elapsed: " << tm);
|
||||
}
|
||||
}
|
||||
|
||||
// Benchmark driver: builds several large haystacks and runs DoTest on each
// with a set of needles. Results go to the console and the log file.
CONSOLE_APP_MAIN
{
	StdLogSetup(LOG_COUT|LOG_FILE);

	String haystack, pattern, spaced;

	// Small correctness checks, currently disabled:
/*
	test_find("zxcvzxcvahoj", "ahoj");
	test_find("aahoj", "ahoj");
	test_find("jojajo", "ahoj");
	test_find("jojajohadjohajohahojaj", "ahoj");
	int n = String("0123456789").Find("012345");
	RDUMP(n);
*/

	// Scenario 1: file content with the target text appended at the end.
	// NOTE(review): the path is specific to the author's machine - confirm
	// what LoadFile yields when the file is absent.
	RLOG("====================================================");
	RLOG("**** xml file");
	haystack = LoadFile("/home/cxl/20131117_ST_ZZSZ.xml") + "Hello world!";
	DoTest(haystack, "Hello world");
	DoTest(haystack, "Hel");

	// Scenario 2: 100000000 times 'a' followed by a single 'b'.
	RLOG("====================================================");
	RLOG("**** a..ab");
	haystack = String('a', 100000000) + "b";
	DoTest(haystack, "b");
	DoTest(haystack, "ab");
	DoTest(haystack, "aab");
	DoTest(haystack, "aaab");
	DoTest(haystack, "aaaab");
	DoTest(haystack, "aaaaaaaaaaaaaaaaaab");
	DoTest(haystack, String('a', 100) + "b");

	// Scenario 3: a single 'b' in the middle of a long run of 'a'.
	RLOG("====================================================");
	RLOG("**** a..aba..a");
	haystack = String('a', 50000000) + "b" + String('a', 50000000);
	DoTest(haystack, "ab");
	DoTest(haystack, "aba");
	DoTest(haystack, "aaba");
	DoTest(haystack, "aabaa");
	DoTest(haystack, "aaaabaaaa");
	DoTest(haystack, "aaaaaabaaaaaa");
	DoTest(haystack, String('a', 20) + "b" + String('a', 20));
	DoTest(haystack, String('a', 100) + "b");

	// Scenario 4: many near-misses ("Hello_world!") before the real match.
	RLOG("====================================================");
	RLOG("**** 10000000 * Hello_world!");
	haystack.Clear();
	for(int i = 0; i < 10000000; i++) {
		haystack << "Hello_world!";
	}
	haystack << "Hello world!";
	DoTest(haystack, "Hello world!");

	// Scenario 5: characters 32..126 repeated; the needle is that sequence
	// plus a trailing 'x', which appears only at the very end.
	RLOG("====================================================");
	RLOG("**** all chars repeated");
	for(int i = 32; i < 127; i++) {
		pattern.Cat(i);
	}
	haystack.Clear();
	for(int i = 0; i < 1000000; i++) {
		haystack << pattern;
	}
	pattern << 'x';
	haystack << pattern;
	DoTest(haystack, pattern);

	// Scenario 6: the haystack repeats a copy of the sequence whose character
	// at index 60 is replaced by a space; the unmodified sequence is appended
	// once at the end and used as the needle.
	RLOG("====================================================");
	RLOG("**** all chars repeated, space in middle");
	pattern.Clear();
	for(int i = 32; i < 127; i++) {
		pattern.Cat(i);
	}
	haystack.Clear();
	spaced = pattern;
	spaced.Set(60, ' ');
	for(int i = 0; i < 1000000; i++) {
		haystack << spaced;
	}
	haystack << pattern;
	DoTest(haystack, pattern);
}
|
||||
|
|
@ -38,6 +38,7 @@ CONSOLE_APP_MAIN
|
|||
RDUMP(sizeof(Value));
|
||||
RDUMP(sizeof(ValueArray));
|
||||
RDUMP(sizeof(ValueMap));
|
||||
RDUMP(sizeof(XmlNode));
|
||||
RLOG("========================");
|
||||
RDUMP(sizeof(Vector<int>));
|
||||
RDUMP(sizeof(Array<int>));
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue