Core: memcpyd, memsetd now reduced in code size, Vector::Grow now using new memory routines

git-svn-id: svn://ultimatepp.org/upp/trunk@14491 f0d560ea-af0d-0410-9eb7-867de7ffcac7
This commit is contained in:
cxl 2020-05-22 09:36:44 +00:00
parent 224edd7634
commit 175eda6c88
3 changed files with 82 additions and 21 deletions

View file

@ -1,8 +1,9 @@
#ifdef CPU_X86
#include <smmintrin.h>
#include <emmintrin.h>
void huge_memsetd(void *p, dword data, size_t len);
void memsetd_l(dword *t, dword data, size_t len);
inline
void memsetd(void *p, dword data, size_t len)
@ -18,25 +19,14 @@ void memsetd(void *p, dword data, size_t len)
return;
}
__m128i val4 = _mm_set1_epi32(data);
auto Set4 = [&](int at) { _mm_storeu_si128((__m128i *)(t + at), val4); };
Set4(len - 4); // fill tail
if(len >= 16) {
Set4(0); // align up on next 16 bytes boundary
const dword *e = t + len;
t = (dword *)(((uintptr_t)t | 15) + 1);
len = e - t;
e -= 16;
if(len >= 1024*1024) { // for really huge data, bypass the cache
huge_memsetd(t, data, len);
return;
}
while(t <= e) {
Set4(0); Set4(4); Set4(8); Set4(12);
t += 16;
}
memsetd_l(t, data, len);
return;
}
__m128i val4 = _mm_set1_epi32(data);
auto Set4 = [&](size_t at) { _mm_storeu_si128((__m128i *)(t + at), val4); };
Set4(len - 4); // fill tail
if(len & 8) {
Set4(0); Set4(4);
t += 8;
@ -45,6 +35,8 @@ void memsetd(void *p, dword data, size_t len)
Set4(0);
}
void memcpyd_l(dword *t, const dword *s, size_t len);
inline
void memcpyd(dword *t, const dword *s, size_t len)
{
@ -75,10 +67,12 @@ void memcpyd(dword *t, const dword *s, size_t len)
}
#endif
auto Copy4 = [&](int at) { _mm_storeu_si128((__m128i *)(t + at), _mm_loadu_si128((__m128i *)(s + at))); };
auto Copy4 = [&](size_t at) { _mm_storeu_si128((__m128i *)(t + at), _mm_loadu_si128((__m128i *)(s + at))); };
Copy4(len - 4); // copy tail
if(len >= 16) {
memcpyd_l(t, s, len);
#if 0
Copy4(0); // align target data up on next 16 bytes boundary
const dword *e = t + len;
dword *t1 = (dword *)(((uintptr_t)t | 15) + 1);
@ -95,6 +89,7 @@ void memcpyd(dword *t, const dword *s, size_t len)
t += 16;
s += 16;
}
#endif
}
if(len & 8) {
Copy4(0); Copy4(4);

View file

@ -170,6 +170,68 @@ void huge_memsetd(void *p, dword c, size_t len)
while(len--)
*t++ = c;
}
// Out-of-line worker behind the inline memsetd(): stores `len` copies of the
// 32-bit value `data` starting at `t`, using unaligned 16-byte SSE2 stores.
//
// t    - destination, counted in dwords
// data - 32-bit pattern to replicate
// len  - number of dwords to write
//
// The last four dwords are written first, so the block steps further down only
// have to cover whole groups of four; the (len & 3) leftover is already in place.
// NOTE(review): the 16-byte align-up assumes `t` is itself 4-byte aligned;
// otherwise the pattern stored after the align-up would be shifted in phase.
void memsetd_l(dword *t, dword data, size_t len)
{
	if(len < 4) { // too short for a 16-byte store; `len - 4` below would wrap around
		while(len--)
			*t++ = data;
		return;
	}

	__m128i val4 = _mm_set1_epi32(data);
	auto Set4 = [&](size_t at) { _mm_storeu_si128((__m128i *)(t + at), val4); };

	Set4(len - 4); // fill tail

	if(len >= 32) {
		if(len >= 1024*1024) { // really huge fill: delegate (original note: "bypass the cache")
			huge_memsetd(t, data, len);
			return;
		}
		Set4(0); // covers the 1..4 dwords skipped by the align-up below
		const dword *e = t + len;
		t = (dword *)(((uintptr_t)t | 15) + 1); // next 16-byte boundary, always moves forward
		len = e - t; // dwords still to be written from the aligned position
		for(size_t n = len >> 5; n; n--) { // 32 dwords (128 bytes) per pass
			Set4(0); Set4(4); Set4(8); Set4(12);
			Set4(16); Set4(20); Set4(24); Set4(28);
			t += 32;
		}
	}

	// Remainder below 32 dwords: the bits of `len` select 16, 8 and 4 dword steps.
	if(len & 16) {
		Set4(0); Set4(4); Set4(8); Set4(12);
		t += 16;
	}
	if(len & 8) {
		Set4(0); Set4(4);
		t += 8;
	}
	if(len & 4)
		Set4(0);
}
// Out-of-line worker behind the inline memcpyd(): copies `len` dwords from `s`
// to `t` using unaligned 16-byte SSE2 loads and stores.
//
// t   - destination, counted in dwords
// s   - source, counted in dwords; must not overlap the destination, because
//       head and tail are copied up front and parts of the range are copied twice
// len - number of dwords to copy
//
// The function copies the tail itself, so it no longer depends on the caller
// having copied the last four dwords beforehand.
void memcpyd_l(dword *t, const dword *s, size_t len)
{
	if(len < 4) { // too short for a 16-byte transfer
		while(len--)
			*t++ = *s++;
		return;
	}

	auto Copy4 = [&](size_t at) { _mm_storeu_si128((__m128i *)(t + at), _mm_loadu_si128((__m128i *)(s + at))); };

	Copy4(len - 4); // copy tail; the block steps below only cover whole groups of four
	Copy4(0); // covers the 1..4 dwords skipped by aligning the target below

	const dword *e = t + len;
	dword *t1 = (dword *)(((uintptr_t)t | 15) + 1); // target rounded up to the next 16-byte boundary
	s += t1 - t; // keep the source in step with the target
	t = t1;
	len = e - t; // dwords still to be copied from the aligned position

	if(len >= 1024*1024) { // really huge copy: delegate to memcpy (original note: "to bypass the cache")
		memcpy(t, s, 4 * len);
		return;
	}

	for(size_t n = len >> 4; n; n--) { // 16 dwords (64 bytes) per pass
		Copy4(0); Copy4(4); Copy4(8); Copy4(12);
		t += 16;
		s += 16;
	}
	if(len & 8) {
		Copy4(0); Copy4(4);
		t += 8;
		s += 8;
	}
	if(len & 4)
		Copy4(0);
}
#endif
#ifdef CPU_UNALIGNED

View file

@ -66,8 +66,12 @@ bool Vector<T>::ReAlloc(int newalloc)
newvector = newalloc ? MemoryAllocSz(sz) : NULL;
alloc = newalloc == INT_MAX ? INT_MAX // maximum alloc reached
: (int)((sz - sz0) / sizeof(T) + newalloc); // adjust alloc to real memory size
if(vector && newvector)
memcpy(newvector, vector, (size_t)items * sizeof(T));
if(vector && newvector) {
if((sizeof(T) & 3) == 0)
memcpyd((dword *)newvector, (dword *)vector, (size_t)items * (sizeof(T) >> 2));
else
svo_memcpy(newvector, vector, (size_t)items * sizeof(T));
}
vector = (T *)newvector;
return alloced;
}