Core, Draw, Painter: memsetd and RGBAFill optimized

git-svn-id: svn://ultimatepp.org/upp/trunk@14486 f0d560ea-af0d-0410-9eb7-867de7ffcac7
This commit is contained in:
cxl 2020-05-20 14:39:18 +00:00
parent c4d0d63aab
commit 8ec50747cf
7 changed files with 108 additions and 105 deletions

View file

@ -140,28 +140,6 @@ bool IsDecentMachine()
}
#endif
#ifndef CPU_X86
// Reads eight bytes at 'ptr' as a little-endian 64-bit value (byte 0 is the
// least significant) without requiring any particular alignment of 'ptr'.
int64 PeekI64(const void *ptr)
{
	const byte *src = (const byte *)ptr;
	dword lo = 0;
	dword hi = 0;
	// Walk from the most significant byte of each half down to the least,
	// shifting the accumulated value up by one byte per step.
	for(int i = 3; i >= 0; i--) {
		lo = (lo << 8) | src[i];
		hi = (hi << 8) | src[i + 4];
	}
	return ((int64)hi << 32) | (int64)lo;
}
#endif
#ifndef CPU_X86
void PokeI64(void *ptr, int64 value) {
byte *p = (byte *)ptr;
p[0] = (byte)(value >> 8 * 0);
p[1] = (byte)(value >> 8 * 1);
p[2] = (byte)(value >> 8 * 2);
p[3] = (byte)(value >> 8 * 3);
p[4] = (byte)(value >> 8 * 4);
p[5] = (byte)(value >> 8 * 5);
p[6] = (byte)(value >> 8 * 6);
p[7] = (byte)(value >> 8 * 7);
}
#endif
// Function body shared by the array overloads of EndianSwap: for each of the
// 'count' elements starting at 'v' it calls EndianSwap on the element itself
// (an overload declared elsewhere), advancing 'v' and counting 'count' down.
// The macro relies on the enclosing function naming its parameters 'v' and 'count'.
#define ENDIAN_SWAP { while(count--) { EndianSwap(*v++); } }
// Array overload for 'word' elements; the body is supplied by ENDIAN_SWAP.
void EndianSwap(word *v, size_t count) ENDIAN_SWAP
@ -171,4 +149,30 @@ void EndianSwap(int *v, size_t count) ENDIAN_SWAP
// Array overloads for int64 and uint64 elements; each body is the ENDIAN_SWAP
// macro, i.e. a loop calling the per-element EndianSwap 'count' times.
void EndianSwap(int64 *v, size_t count) ENDIAN_SWAP
void EndianSwap(uint64 *v, size_t count) ENDIAN_SWAP
#ifdef CPU_X86
// Fills 'len' consecutive dwords starting at 'p' with the value 'c'.
// When 'p' is dword-aligned and 'len' > 64, the bulk of the range is written
// with non-temporal (streaming) SSE2 stores so the data bypasses the cache;
// the original note rates this as good for fills larger than 4MB.
// Otherwise, and for the final remainder, plain scalar stores are used.
// A zero or negative 'len' writes nothing.
void huge_memsetd(void *p, dword c, int len)
{
	dword *t = (dword *)p;
	if(((uintptr_t)t & 3) == 0 && len > 64) { // a dword-aligned start is needed to reach 16-byte alignment
		const __m128i val4 = _mm_set1_epi32(c);
		auto Set4S = [&](int at) { _mm_stream_si128((__m128i *)(t + at), val4); };
		while((uintptr_t)t & 15) { // align to 16 bytes for SSE (at most 3 scalar stores)
			*t++ = c;
			len--;
		}
		while(len >= 16) { // 16 dwords (64 bytes) per pass
			Set4S(0);
			Set4S(4);
			Set4S(8);
			Set4S(12);
			t += 16;
			len -= 16;
		}
		_mm_sfence(); // order the streaming stores before anything that follows
	}
	// Scalar tail (or the whole range when the streaming path was not taken).
	// '> 0' rather than a bare 'len--' so a negative length cannot run away.
	while(len-- > 0)
		*t++ = c;
}
#endif
}

View file

@ -250,11 +250,7 @@ force_inline void fast_copy128(void *t, const void *s)
tt[3] = ss[3];
}
#if defined(CPU_UNALIGNED) && defined(CPU_LE) && (defined(COMPILER_MSC) || defined(COMPILER_GCC))
#define FAST_STRING_COMPARE
#endif
#ifdef FAST_STRING_COMPARE
#if defined(CPU_UNALIGNED) && defined(CPU_LE)
force_inline
int fast_memcmp(const char *a, const char *b, size_t len)
{
@ -345,3 +341,81 @@ inline bool FitsInInt64(double x)
{
return x >= -9223372036854775808.0 && x < 9223372036854775808.0;
}
#ifdef CPU_X86
#include <smmintrin.h>
// Declaration only (defined out of line); the x86 memsetd below forwards to it
// when len >= 1024*1024, in its "bypass the cache" branch.
void huge_memsetd(void *p, dword data, int len);
// Fills 'len' consecutive dwords starting at 'p' with 'data' (x86 version).
// Lengths below 4 use scalar stores; longer ranges use unaligned 128-bit SSE2
// stores, and ranges of 1024*1024 dwords or more are handed to huge_memsetd.
// A zero or negative 'len' writes nothing.
inline
void memsetd(void *p, dword data, int len)
{
	dword *t = (dword *)p;
	if(len < 4) { // too short for a single 4-dword store
		if(len <= 0) // the bit tests below would store for a negative len
			return;
		if(len & 2) {
			t[0] = t[1] = data;
			t += 2;
		}
		if(len & 1)
			t[0] = data;
		return;
	}
	__m128i val4 = _mm_set1_epi32(data);
	auto Set4 = [&](int at) { _mm_storeu_si128((__m128i *)(t + at), val4); };
	// Write the last 4 dwords first, while t still points at the start; this
	// covers the 1..3 element remainder that the 4-wide steps below leave over.
	Set4(len - 4); // fill tail
	if(len >= 32) {
		if(len >= 1024*1024) { // for really huge data, bypass the cache
			huge_memsetd(t, data, len);
			return;
		}
		const dword *e = t + len - 32; // last start position where a full 32-dword pass fits
		do {
			Set4(0); Set4(4); Set4(8); Set4(12);
			Set4(16); Set4(20); Set4(24); Set4(28);
			t += 32;
		}
		while(t <= e);
	}
	// The remaining multiple of 4 below 32 is handled via the bits of len.
	if(len & 16) {
		Set4(0); Set4(4); Set4(8); Set4(12);
		t += 16;
	}
	if(len & 8) {
		Set4(0); Set4(4);
		t += 8;
	}
	if(len & 4)
		Set4(0);
}
#else
inline
void memsetd(void *p, RGBA c, int len)
{
dword *t = (dword *)p;
while(len >= 16) {
t[0] = c; t[1] = c; t[2] = c; t[3] = c;
t[4] = c; t[5] = c; t[6] = c; t[7] = c;
t[8] = c; t[9] = c; t[10] = c; t[11] = c;
t[12] = c; t[13] = c; t[14] = c; t[15] = c;
t += 16;
len -= 16;
}
if(len & 8) {
t[0] = t[1] = t[2] = t[3] = t[4] = t[5] = t[6] = t[7] = c;
t += 8;
}
if(len & 4) {
t[0] = t[1] = t[2] = t[3] = c;
t += 4;
}
if(len & 2) {
t[0] = t[1] = c;
t += 2;
}
if(len & 1)
t[0] = c;
}
#endif

View file

@ -196,15 +196,6 @@ void memsetw(void *t, word value, int count)
*w++ = value;
}
inline
void memsetd(void *t, dword value, int count)
{
dword *w = (dword *)t;
dword *lim = w + count;
while(w < lim)
*w++ = value;
}
inline
void memsetex(void *t, const void *item, int item_size, int count) {
ASSERT(count >= 0);

View file

@ -20,7 +20,7 @@ inline bool operator!=(const RGBA& a, const RGBA& b)
// Returns an RGBA value whose r, g, b and a members are all zero.
inline RGBA RGBAZero()
{
	RGBA zero;
	zero.a = 0;
	zero.b = 0;
	zero.g = 0;
	zero.r = 0;
	return zero;
}
void Fill(RGBA *t, RGBA c, int n);
// Fills 'n' RGBA elements starting at 't' with 'c' by reinterpreting 'c' as a
// single dword and forwarding to memsetd.
inline void Fill(RGBA *t, RGBA c, int n)
{
	const dword packed = *(dword *)&c;
	memsetd(t, packed, n);
}
void Copy(RGBA *t, const RGBA *s, int n);

View file

@ -2,42 +2,6 @@
namespace Upp {
// Stores 'c' into 'len' consecutive RGBA elements starting at 't'.
// Nothing is written when 'len' is zero or negative.
void Fill(RGBA *t, RGBA c, int len)
{
	// Bulk part: whole groups of 16 elements.
	for(; len >= 16; len -= 16, t += 16)
		for(int i = 0; i < 16; i++)
			t[i] = c;
	// Remainder of at most 15 elements, written from the highest index down.
	while(len > 0) {
		--len;
		t[len] = c;
	}
}
/*
void Fill(RGBA *t, const RGBA& src, int n)
{
while(n--)
*t++ = src;
}
*/
void Copy(RGBA *t, const RGBA *s, int n)
{
while(n--)

View file

@ -15,7 +15,6 @@
#define IMAGE_PACKED(n, d) };
#define IMAGE_DATA_BEGIN
#include IMAGEFILE
#undef IMAGE_BEGIN

View file

@ -3,35 +3,6 @@
namespace Upp {
// Writes the color 'c' into 'len' consecutive RGBA elements starting at 't'.
// A zero or negative 'len' leaves the target untouched.
void FillRGBA(RGBA *t, RGBA c, int len)
{
	// Full blocks of 16 elements first.
	while(len >= 16) {
		RGBA *block_end = t + 16;
		while(t != block_end)
			*t++ = c;
		len -= 16;
	}
	// Then the leftover 0..15 elements, highest index first.
	while(len > 0) {
		--len;
		t[len] = c;
	}
}
void SolidFiller::Start(int minx, int maxx)
{
t += minx;
@ -78,7 +49,7 @@ void SolidFiller::Render(int val, int len)
}
else {
if(((val - 256) | (c.a - 255)) == 0) {
FillRGBA(t, c, len);
Fill(t, c, len);
t += len;
}
else {
@ -238,7 +209,7 @@ void SubpixelFiller::Render(int val, int len)
else {
if(val == 256)
if(!ss && color.a == 255) {
FillRGBA(t, color, int(e - t));
Fill(t, color, int(e - t));
t = e;
}
else