diff --git a/benchmarks/PainterClear/PainterClear.upp b/benchmarks/PainterClear/PainterClear.upp index 5872304d3..ca5121f3b 100644 --- a/benchmarks/PainterClear/PainterClear.upp +++ b/benchmarks/PainterClear/PainterClear.upp @@ -2,7 +2,8 @@ uses CtrlLib; file - main.cpp; + main.cpp, + info.txt; mainconfig "" = "GUI"; diff --git a/benchmarks/PainterClear/info.txt b/benchmarks/PainterClear/info.txt new file mode 100644 index 000000000..fee37d26d --- /dev/null +++ b/benchmarks/PainterClear/info.txt @@ -0,0 +1,7 @@ +TIMING Blend : 435.99 ms - 435.99 us (436.00 ms / 1000 ), min: 0.00 ns, max: 1.00 ms, nesting: 0 - 1000 +TIMING Stroke : 236.99 ms - 236.99 us (237.00 ms / 1000 ), min: 0.00 ns, max: 1.00 ms, nesting: 0 - 1000 +TIMING Fill : 471.99 ms - 471.99 us (472.00 ms / 1000 ), min: 0.00 ns, max: 1.00 ms, nesting: 0 - 1000 +TIMING Rect : 596.99 ms - 596.99 us (597.00 ms / 1000 ), min: 0.00 ns, max: 2.00 ms, nesting: 0 - 1000 +TIMING Clear 2 : 704.99 ms - 704.99 us (705.00 ms / 1000 ), min: 0.00 ns, max: 1.00 ms, nesting: 0 - 1000 +TIMING Clear : 604.99 ms - 604.99 us (605.00 ms / 1000 ), min: 0.00 ns, max: 2.00 ms, nesting: 0 - 1000 + diff --git a/benchmarks/memset/memset.cpp b/benchmarks/memset/memset.cpp index d62c5204a..7964d0c3f 100644 --- a/benchmarks/memset/memset.cpp +++ b/benchmarks/memset/memset.cpp @@ -2,10 +2,60 @@ using namespace Upp; +Buffer h(1024*1024*32); +const byte *h1 = h; + CONSOLE_APP_MAIN { - Buffer h(1024*1024*30); + for(int i = 0; i < 1000; i++) { + { + RTIMING("32MB memset"); + memset(h, 0, 1024*1024 * 32); + } + { + RTIMING("32MB memset8"); + memset8(h, 0, 1024*1024 * 32); + } + { + RTIMING("32MB memcpy"); + memcpy(h, h1, 1024*1024 * 32); + } + { + RTIMING("32MB memcpy8"); + memcpy8(h, h, 1024*1024 * 32); + } + } + for(int i = 0; i < 100000000; i++) { + { + RTIMING("32B memset"); + memset(h, 0, 32); + } + { + RTIMING("32B memset8"); + memset8(h, 0, 32); + } + { + RTIMING("32B memset32"); + memset32(h, 31525874, 32 / 4); + } + { + RTIMING("32B 
loop 32"); + dword *s = (dword *)~h; + dword *e = s + 32 / 4; + while(s < e) + *s++ = 31515927; + } + { + RTIMING("32B memcpy"); + memcpy(h, h1, 32); + } + { + RTIMING("32B memcpy8"); + memcpy8(h, h, 32); + } + } + for(int i = 0; i < 1000000; i++) { { RTIMING("32KB memset"); @@ -26,5 +76,27 @@ CONSOLE_APP_MAIN while(s < e) *s++ = 31515927; } + { + RTIMING("32KB memcpy"); + memcpy(h, h1, 1024 * 32); + } + { + RTIMING("32KB memcpy8"); + memcpy8(h, h, 1024 * 32); + } + } + + for(int i = 0; i < 1000; i++) { + { + RTIMING("32MB memset32"); + memset32(h, 31525874, 1024*1024 * 32 / 4); + } + { + RTIMING("32MB loop 32"); + dword *s = (dword *)~h; + dword *e = s + 1024*1024 * 32 / 4; + while(s < e) + *s++ = 31515927; + } } } diff --git a/uppsrc/Core/Mem.cpp b/uppsrc/Core/Mem.cpp index 8428e6a99..006ca5952 100644 --- a/uppsrc/Core/Mem.cpp +++ b/uppsrc/Core/Mem.cpp @@ -18,8 +18,9 @@ void memset8__(void *p, i16x8 data_, size_t len) t = (byte *)(((uintptr_t)t | 15) + 1); len = e - t; e -= 128; +#if 0 // streaming does not seem to be beneficial anymore #ifdef CPU_SSE2 - if(len >= 1024*1024) { // for really huge data, bypass the cache + if(len >= 1024*1024 && 0) { // for really huge data, bypass the cache auto Set4S = [&](int at) { data.Stream(t + at); }; while(len >= 64) { Set4S(0*16); Set4S(1*16); Set4S(2*16); Set4S(3*16); @@ -29,6 +30,7 @@ void memset8__(void *p, i16x8 data_, size_t len) _mm_sfence(); e = t - 1; } +#endif #endif while(t <= e) { Set4(0*16); Set4(1*16); Set4(2*16); Set4(3*16); @@ -55,7 +57,7 @@ void memcpy8__(void *p, const void *q, size_t len) byte *t = (byte *)p; const byte *s = (const byte *)q; - if(len > 4*1024*1024) { // for really huge data, call memcpy to bypass the cache + if(len > 4*1024*1024) { // for really huge data, call memcpy to use possible CPU magic memcpy(t, s, len); return; }