diff --git a/benchmarks/PainterClear/PainterClear.upp b/benchmarks/PainterClear/PainterClear.upp
index 5872304d3..ca5121f3b 100644
--- a/benchmarks/PainterClear/PainterClear.upp
+++ b/benchmarks/PainterClear/PainterClear.upp
@@ -2,7 +2,8 @@ uses
 	CtrlLib;
 
 file
-	main.cpp;
+	main.cpp,
+	info.txt;
 
 mainconfig
 	"" = "GUI";
diff --git a/benchmarks/PainterClear/info.txt b/benchmarks/PainterClear/info.txt
new file mode 100644
index 000000000..fee37d26d
--- /dev/null
+++ b/benchmarks/PainterClear/info.txt
@@ -0,0 +1,7 @@
+TIMING Blend          : 435.99 ms - 435.99 us (436.00 ms / 1000 ), min:  0.00 ns, max:  1.00 ms, nesting: 0 - 1000
+TIMING Stroke         : 236.99 ms - 236.99 us (237.00 ms / 1000 ), min:  0.00 ns, max:  1.00 ms, nesting: 0 - 1000
+TIMING Fill           : 471.99 ms - 471.99 us (472.00 ms / 1000 ), min:  0.00 ns, max:  1.00 ms, nesting: 0 - 1000
+TIMING Rect           : 596.99 ms - 596.99 us (597.00 ms / 1000 ), min:  0.00 ns, max:  2.00 ms, nesting: 0 - 1000
+TIMING Clear 2        : 704.99 ms - 704.99 us (705.00 ms / 1000 ), min:  0.00 ns, max:  1.00 ms, nesting: 0 - 1000
+TIMING Clear          : 604.99 ms - 604.99 us (605.00 ms / 1000 ), min:  0.00 ns, max:  2.00 ms, nesting: 0 - 1000
+
diff --git a/benchmarks/memset/memset.cpp b/benchmarks/memset/memset.cpp
index d62c5204a..7964d0c3f 100644
--- a/benchmarks/memset/memset.cpp
+++ b/benchmarks/memset/memset.cpp
@@ -2,10 +2,60 @@
 
 using namespace Upp;
 
+Buffer<byte> h(1024*1024*32);
+const byte *h1 = h;
+
 CONSOLE_APP_MAIN
 {
-	Buffer<byte> h(1024*1024*30);
+	for(int i = 0; i < 1000; i++) {
+		{
+			RTIMING("32MB memset");
+			memset(h, 0, 1024*1024 * 32);
+		}
+		{
+			RTIMING("32MB memset8");
+			memset8(h, 0, 1024*1024 * 32);
+		}
+		{
+			RTIMING("32MB memcpy");
+			memcpy(h, h1, 1024*1024 * 32);
+		}
+		{
+			RTIMING("32MB memcpy8");
+			memcpy8(h, h, 1024*1024 * 32);
+		}
+	}
 	
+	for(int i = 0; i < 100000000; i++) {
+		{
+			RTIMING("32B memset");
+			memset(h, 0,  32);
+		}
+		{
+			RTIMING("32B memset8");
+			memset8(h, 0, 32);
+		}
+		{
+			RTIMING("32B memset32");
+			memset32(h, 31525874, 32 / 4);
+		}
+		{
+			RTIMING("32B loop 32");
+			dword *s = (dword *)~h;
+			dword *e = s + 32 / 4;
+			while(s < e)
+				*s++ = 31515927;
+		}
+		{
+			RTIMING("32B memcpy");
+			memcpy(h, h1, 32);
+		}
+		{
+			RTIMING("32B memcpy8");
+			memcpy8(h, h, 32);
+		}
+	}
+
 	for(int i = 0; i < 1000000; i++) {
 		{
 			RTIMING("32KB memset");
@@ -26,5 +76,27 @@ CONSOLE_APP_MAIN
 			while(s < e)
 				*s++ = 31515927;
 		}
+		{
+			RTIMING("32KB memcpy");
+			memcpy(h, h1, 1024 * 32);
+		}
+		{
+			RTIMING("32KB memcpy8");
+			memcpy8(h, h, 1024 * 32);
+		}
+	}
+
+	for(int i = 0; i < 1000; i++) {
+		{
+			RTIMING("32MB memset32");
+			memset32(h, 31525874, 1024*1024 * 32 / 4);
+		}
+		{
+			RTIMING("32MB loop 32");
+			dword *s = (dword *)~h;
+			dword *e = s + 1024*1024 * 32 / 4;
+			while(s < e)
+				*s++ = 31515927;
+		}
 	}
 }
diff --git a/uppsrc/Core/Mem.cpp b/uppsrc/Core/Mem.cpp
index 8428e6a99..006ca5952 100644
--- a/uppsrc/Core/Mem.cpp
+++ b/uppsrc/Core/Mem.cpp
@@ -18,8 +18,9 @@ void memset8__(void *p, i16x8 data_, size_t len)
 	t = (byte *)(((uintptr_t)t | 15) + 1);
 	len = e - t;
 	e -= 128;
+#if 0 // streaming does not seem to be beneficial anymore
 #ifdef CPU_SSE2
-	if(len >= 1024*1024) { // for really huge data, bypass the cache
+	if(len >= 1024*1024 && 0) { // for really huge data, bypass the cache
 		auto Set4S = [&](int at) { data.Stream(t + at); };
 		while(len >= 64) {
 			Set4S(0*16); Set4S(1*16); Set4S(2*16); Set4S(3*16);
@@ -29,6 +30,7 @@ void memset8__(void *p, i16x8 data_, size_t len)
 		_mm_sfence();
 		e = t - 1;
 	}
+#endif
 #endif
 	while(t <= e) {
 		Set4(0*16); Set4(1*16); Set4(2*16); Set4(3*16);
@@ -55,7 +57,7 @@ void memcpy8__(void *p, const void *q, size_t len)
 	byte *t = (byte *)p;
 	const byte *s = (const byte *)q;
 
-	if(len > 4*1024*1024) { // for really huge data, call memcpy to bypass the cache
+	if(len > 4*1024*1024) { // for really huge data, defer to memcpy, which may use CPU-specific optimizations
 		memcpy(t, s, len);
 		return;
 	}