diff --git a/uppsrc/Core/Cpu.cpp b/uppsrc/Core/Cpu.cpp index ced544207..fb7ff015a 100644 --- a/uppsrc/Core/Cpu.cpp +++ b/uppsrc/Core/Cpu.cpp @@ -140,28 +140,6 @@ bool IsDecentMachine() } #endif -#ifndef CPU_X86 -int64 PeekI64(const void *ptr) { - const byte *p = (const byte *)ptr; - dword a = p[0] | (p[1] << 8) | (p[2] << 16) | (p[3] << 24); - dword b = p[4] | (p[5] << 8) | (p[6] << 16) | (p[7] << 24); - return (int64)a | ((int64)b << 32); -} -#endif - -#ifndef CPU_X86 -void PokeI64(void *ptr, int64 value) { - byte *p = (byte *)ptr; - p[0] = (byte)(value >> 8 * 0); - p[1] = (byte)(value >> 8 * 1); - p[2] = (byte)(value >> 8 * 2); - p[3] = (byte)(value >> 8 * 3); - p[4] = (byte)(value >> 8 * 4); - p[5] = (byte)(value >> 8 * 5); - p[6] = (byte)(value >> 8 * 6); - p[7] = (byte)(value >> 8 * 7); -} -#endif #define ENDIAN_SWAP { while(count--) { EndianSwap(*v++); } } void EndianSwap(word *v, size_t count) ENDIAN_SWAP @@ -171,4 +149,30 @@ void EndianSwap(int *v, size_t count) ENDIAN_SWAP void EndianSwap(int64 *v, size_t count) ENDIAN_SWAP void EndianSwap(uint64 *v, size_t count) ENDIAN_SWAP +#ifdef CPU_X86 +void huge_memsetd(void *p, dword c, int len) +{ // bypasses the cache, good for >4MB + dword *t = (dword *)p; + if(((uintptr_t)t & 3) == 0 && len > 64) { + __m128i val4 = _mm_set1_epi32(c); + auto Set4S = [&](int at) { _mm_stream_si128((__m128i *)(t + at), val4); }; + while((uintptr_t)t & 15) { // align to 16 bytes for SSE + *t++ = c; + len--; + } + while(len >= 16) { + Set4S(0); + Set4S(4); + Set4S(8); + Set4S(12); + t += 16; + len -= 16; + } + _mm_sfence(); + } + while(len--) + *t++ = c; +} +#endif + } diff --git a/uppsrc/Core/Ops.h b/uppsrc/Core/Ops.h index 3d1a2c071..959c977ac 100644 --- a/uppsrc/Core/Ops.h +++ b/uppsrc/Core/Ops.h @@ -250,11 +250,7 @@ force_inline void fast_copy128(void *t, const void *s) tt[3] = ss[3]; } -#if defined(CPU_UNALIGNED) && defined(CPU_LE) && (defined(COMPILER_MSC) || defined(COMPILER_GCC)) -#define FAST_STRING_COMPARE -#endif - -#ifdef FAST_STRING_COMPARE +#if defined(CPU_UNALIGNED) && defined(CPU_LE) force_inline int fast_memcmp(const char *a, const char *b, size_t len) { @@ -345,3 +341,81 @@ inline bool FitsInInt64(double x) { return x >= -9223372036854775808.0 && x < 9223372036854775808.0; } + +#ifdef CPU_X86 + +#include + +void huge_memsetd(void *p, dword data, int len); + +inline +void memsetd(void *p, dword data, int len) +{ + dword *t = (dword *)p; + if(len < 4) { + if(len & 2) { + t[0] = t[1] = data; + t += 2; + } + if(len & 1) + t[0] = data; + return; + } + + __m128i val4 = _mm_set1_epi32(data); + auto Set4 = [&](int at) { _mm_storeu_si128((__m128i *)(t + at), val4); }; + + Set4(len - 4); // fill tail + if(len >= 32) { + if(len >= 1024*1024) { // for really huge data, bypass the cache + huge_memsetd(t, data, len); + return; + } + const dword *e = t + len - 32; + do { + Set4(0); Set4(4); Set4(8); Set4(12); + Set4(16); Set4(20); Set4(24); Set4(28); + t += 32; + } + while(t <= e); + } + if(len & 16) { + Set4(0); Set4(4); Set4(8); Set4(12); + t += 16; + } + if(len & 8) { + Set4(0); Set4(4); + t += 8; + } + if(len & 4) + Set4(0); +} +#else +inline +void memsetd(void *p, RGBA c, int len) +{ + dword *t = (dword *)p; + while(len >= 16) { + t[0] = c; t[1] = c; t[2] = c; t[3] = c; + t[4] = c; t[5] = c; t[6] = c; t[7] = c; + t[8] = c; t[9] = c; t[10] = c; t[11] = c; + t[12] = c; t[13] = c; t[14] = c; t[15] = c; + t += 16; + len -= 16; + } + if(len & 8) { + t[0] = t[1] = t[2] = t[3] = t[4] = t[5] = t[6] = t[7] = c; + t += 8; + } + if(len & 4) { + t[0] = t[1] = t[2] = t[3] = c; + t += 4; + } + if(len & 2) { + t[0] = t[1] = c; + t += 2; + } + if(len & 1) + t[0] = c; +} +#endif \ No newline at end of file diff --git a/uppsrc/Core/Util.h b/uppsrc/Core/Util.h index 1abdd4ad1..d7b5e9ce0 100644 --- a/uppsrc/Core/Util.h +++ b/uppsrc/Core/Util.h @@ -196,15 +196,6 @@ void memsetw(void *t, word value, int count) *w++ = value; } -inline -void memsetd(void *t, dword value, int count) -{ - dword *w = (dword *)t; - dword *lim = w + count; - while(w < lim) - *w++ = value; -} - inline void memsetex(void *t, const void *item, int item_size, int count) { ASSERT(count >= 0); diff --git a/uppsrc/Draw/Image.h b/uppsrc/Draw/Image.h index 2e12674f5..a2d0bb5b4 100644 --- a/uppsrc/Draw/Image.h +++ b/uppsrc/Draw/Image.h @@ -20,7 +20,7 @@ inline bool operator!=(const RGBA& a, const RGBA& b) inline RGBA RGBAZero() { RGBA c; c.r = c.g = c.b = c.a = 0; return c; } -void Fill(RGBA *t, RGBA c, int n); +inline void Fill(RGBA *t, RGBA c, int n) { memsetd(t, *(dword *)&c, n); } void Copy(RGBA *t, const RGBA *s, int n); diff --git a/uppsrc/Draw/ImageBlit.cpp b/uppsrc/Draw/ImageBlit.cpp index 8bcd7d6f8..b95e89773 100644 --- a/uppsrc/Draw/ImageBlit.cpp +++ b/uppsrc/Draw/ImageBlit.cpp @@ -2,42 +2,6 @@ namespace Upp { -void Fill(RGBA *t, RGBA c, int len) -{ - while(len >= 16) { - t[0] = c; t[1] = c; t[2] = c; t[3] = c; - t[4] = c; t[5] = c; t[6] = c; t[7] = c; - t[8] = c; t[9] = c; t[10] = c; t[11] = c; - t[12] = c; t[13] = c; t[14] = c; t[15] = c; - t += 16; - len -= 16; - } - switch(len) { - case 15: t[14] = c; - case 14: t[13] = c; - case 13: t[12] = c; - case 12: t[11] = c; - case 11: t[10] = c; - case 10: t[9] = c; - case 9: t[8] = c; - case 8: t[7] = c; - case 7: t[6] = c; - case 6: t[5] = c; - case 5: t[4] = c; - case 4: t[3] = c; - case 3: t[2] = c; - case 2: t[1] = c; - case 1: t[0] = c; - } -} -/* -void Fill(RGBA *t, const RGBA& src, int n) -{ - while(n--) - *t++ = src; -} -*/ - void Copy(RGBA *t, const RGBA *s, int n) { while(n--) diff --git a/uppsrc/Draw/iml_source.h b/uppsrc/Draw/iml_source.h index 8e6aa53be..82c99d8dc 100644 --- a/uppsrc/Draw/iml_source.h +++ b/uppsrc/Draw/iml_source.h @@ -15,7 +15,6 @@ #define IMAGE_PACKED(n, d) }; #define IMAGE_DATA_BEGIN - #include IMAGEFILE #undef IMAGE_BEGIN diff --git a/uppsrc/Painter/Fillers.cpp b/uppsrc/Painter/Fillers.cpp index f4a80a15f..f44dacf5b 100644 --- a/uppsrc/Painter/Fillers.cpp +++ b/uppsrc/Painter/Fillers.cpp @@ -3,35 +3,6 @@ namespace Upp { -void FillRGBA(RGBA *t, RGBA c, int len) -{ - while(len >= 16) { - t[0] = c; t[1] = c; t[2] = c; t[3] = c; - t[4] = c; t[5] = c; t[6] = c; t[7] = c; - t[8] = c; t[9] = c; t[10] = c; t[11] = c; - t[12] = c; t[13] = c; t[14] = c; t[15] = c; - t += 16; - len -= 16; - } - switch(len) { - case 15: t[14] = c; - case 14: t[13] = c; - case 13: t[12] = c; - case 12: t[11] = c; - case 11: t[10] = c; - case 10: t[9] = c; - case 9: t[8] = c; - case 8: t[7] = c; - case 7: t[6] = c; - case 6: t[5] = c; - case 5: t[4] = c; - case 4: t[3] = c; - case 3: t[2] = c; - case 2: t[1] = c; - case 1: t[0] = c; - } -} - void SolidFiller::Start(int minx, int maxx) { t += minx; @@ -78,7 +49,7 @@ void SolidFiller::Render(int val, int len) } else { if(((val - 256) | (c.a - 255)) == 0) { - FillRGBA(t, c, len); + Fill(t, c, len); t += len; } else { @@ -238,7 +209,7 @@ void SubpixelFiller::Render(int val, int len) else { if(val == 256) if(!ss && color.a == 255) { - FillRGBA(t, color, int(e - t)); + Fill(t, color, int(e - t)); t = e; } else