mirror of
https://github.com/ultimatepp/ultimatepp.git
synced 2026-05-15 14:16:07 -06:00
Core: memcpyd, memsetd now reduced in code size, Vector::Grow now using new memory routines
git-svn-id: svn://ultimatepp.org/upp/trunk@14491 f0d560ea-af0d-0410-9eb7-867de7ffcac7
This commit is contained in:
parent
224edd7634
commit
175eda6c88
3 changed files with 82 additions and 21 deletions
|
|
@ -1,8 +1,9 @@
|
|||
#ifdef CPU_X86
|
||||
|
||||
#include <smmintrin.h>
|
||||
#include <emmintrin.h>
|
||||
|
||||
void huge_memsetd(void *p, dword data, size_t len);
|
||||
void memsetd_l(dword *t, dword data, size_t len);
|
||||
|
||||
inline
|
||||
void memsetd(void *p, dword data, size_t len)
|
||||
|
|
@ -18,25 +19,14 @@ void memsetd(void *p, dword data, size_t len)
|
|||
return;
|
||||
}
|
||||
|
||||
__m128i val4 = _mm_set1_epi32(data);
|
||||
auto Set4 = [&](int at) { _mm_storeu_si128((__m128i *)(t + at), val4); };
|
||||
|
||||
Set4(len - 4); // fill tail
|
||||
if(len >= 16) {
|
||||
Set4(0); // align up on next 16 bytes boundary
|
||||
const dword *e = t + len;
|
||||
t = (dword *)(((uintptr_t)t | 15) + 1);
|
||||
len = e - t;
|
||||
e -= 16;
|
||||
if(len >= 1024*1024) { // for really huge data, bypass the cache
|
||||
huge_memsetd(t, data, len);
|
||||
return;
|
||||
}
|
||||
while(t <= e) {
|
||||
Set4(0); Set4(4); Set4(8); Set4(12);
|
||||
t += 16;
|
||||
}
|
||||
memsetd_l(t, data, len);
|
||||
return;
|
||||
}
|
||||
|
||||
__m128i val4 = _mm_set1_epi32(data);
|
||||
auto Set4 = [&](size_t at) { _mm_storeu_si128((__m128i *)(t + at), val4); };
|
||||
Set4(len - 4); // fill tail
|
||||
if(len & 8) {
|
||||
Set4(0); Set4(4);
|
||||
t += 8;
|
||||
|
|
@ -45,6 +35,8 @@ void memsetd(void *p, dword data, size_t len)
|
|||
Set4(0);
|
||||
}
|
||||
|
||||
void memcpyd_l(dword *t, const dword *s, size_t len);
|
||||
|
||||
inline
|
||||
void memcpyd(dword *t, const dword *s, size_t len)
|
||||
{
|
||||
|
|
@ -75,10 +67,12 @@ void memcpyd(dword *t, const dword *s, size_t len)
|
|||
}
|
||||
#endif
|
||||
|
||||
auto Copy4 = [&](int at) { _mm_storeu_si128((__m128i *)(t + at), _mm_loadu_si128((__m128i *)(s + at))); };
|
||||
auto Copy4 = [&](size_t at) { _mm_storeu_si128((__m128i *)(t + at), _mm_loadu_si128((__m128i *)(s + at))); };
|
||||
|
||||
Copy4(len - 4); // copy tail
|
||||
if(len >= 16) {
|
||||
memcpyd_l(t, s, len);
|
||||
#if 0
|
||||
Copy4(0); // align target data up on next 16 bytes boundary
|
||||
const dword *e = t + len;
|
||||
dword *t1 = (dword *)(((uintptr_t)t | 15) + 1);
|
||||
|
|
@ -95,6 +89,7 @@ void memcpyd(dword *t, const dword *s, size_t len)
|
|||
t += 16;
|
||||
s += 16;
|
||||
}
|
||||
#endif
|
||||
}
|
||||
if(len & 8) {
|
||||
Copy4(0); Copy4(4);
|
||||
|
|
|
|||
|
|
@ -170,6 +170,68 @@ void huge_memsetd(void *p, dword c, size_t len)
|
|||
while(len--)
|
||||
*t++ = c;
|
||||
}
|
||||
|
||||
// Fills `len` dwords starting at `t` with the value `data`, using unaligned
// 128-bit SSE2 stores (4 dwords per store).
//
//   t    - first dword to fill
//   data - 32-bit value replicated into every dword
//   len  - number of dwords (not bytes) to fill; any value is accepted
//
// Strategy: the last 4 dwords are stored up front, so the 4/8/16/32-dword
// steps below never have to deal with a remainder of 1..3 dwords. For longer
// runs the first 4 dwords are stored as well, which allows `t` to be moved up
// to the next 16-byte boundary before the main loop.
void memsetd_l(dword *t, dword data, size_t len)
{
	if(len < 4) { // a 128-bit store spans 4 dwords; `len - 4` would wrap around for shorter runs
		while(len--)
			*t++ = data;
		return;
	}

	__m128i val4 = _mm_set1_epi32(data);
	auto Set4 = [&](size_t at) { _mm_storeu_si128((__m128i *)(t + at), val4); };

	Set4(len - 4); // fill tail
	if(len >= 32) {
		Set4(0); // fill head, then align up on 16 bytes boundary
		const dword *e = t + len;
		t = (dword *)(((uintptr_t)t | 15) + 1);
		len = e - t; // dwords left between the aligned `t` and the end
		if(len >= 1024*1024) { // for really huge data, bypass the cache
			// Dispatch after aligning so the bulk routine always receives a
			// 16-byte aligned pointer.
			// NOTE(review): huge_memsetd's own alignment handling is not visible
			// here; an aligned pointer is safe either way.
			huge_memsetd(t, data, len);
			return;
		}
		for(size_t n = len >> 5; n; n--) { // whole 32-dword (128 byte) chunks
			Set4(0); Set4(4); Set4(8); Set4(12);
			Set4(16); Set4(20); Set4(24); Set4(28);
			t += 32;
		}
	}
	// The loop consumed a multiple of 32 dwords, so the low bits of `len`
	// still describe what remains to be filled.
	if(len & 16) {
		Set4(0); Set4(4); Set4(8); Set4(12);
		t += 16;
	}
	if(len & 8) {
		Set4(0); Set4(4);
		t += 8;
	}
	if(len & 4)
		Set4(0);
}
|
||||
|
||||
// Copies `len` dwords from `s` to `t` using unaligned 128-bit SSE2 loads and
// stores (4 dwords per step), with the target advanced to a 16-byte boundary
// for the main loop.
//
//   t   - destination
//   s   - source; must not overlap the destination (the huge path delegates
//         to memcpy)
//   len - number of dwords (not bytes) to copy; any value is accepted
//
// The function is self-contained: it copies the trailing 4 dwords itself, so
// a remainder of 1..3 dwords is never lost even when the caller has not
// copied the tail beforehand. A caller that already did so only causes one
// redundant 16-byte copy.
void memcpyd_l(dword *t, const dword *s, size_t len)
{
	if(len < 4) { // a 128-bit copy spans 4 dwords; shorter runs are copied one by one
		while(len--)
			*t++ = *s++;
		return;
	}

	auto Copy4 = [&](size_t at) { _mm_storeu_si128((__m128i *)(t + at), _mm_loadu_si128((__m128i *)(s + at))); };

	Copy4(len - 4); // copy tail
	Copy4(0); // copy head, then align target data up on next 16 bytes boundary
	const dword *e = t + len;
	dword *t1 = (dword *)(((uintptr_t)t | 15) + 1);
	s += t1 - t; // keep the source in step with the realigned target
	t = t1;
	len = e - t; // dwords left between the aligned `t` and the end
	if(len >= 1024*1024) { // for really huge data, call memcpy to bypass the cache
		memcpy(t, s, 4 * len);
		return;
	}
	for(size_t n = len >> 4; n; n--) { // whole 16-dword (64 byte) chunks
		Copy4(0); Copy4(4); Copy4(8); Copy4(12);
		t += 16;
		s += 16;
	}
	// The loop consumed a multiple of 16 dwords, so the low bits of `len`
	// still describe what remains to be copied.
	if(len & 8) {
		Copy4(0); Copy4(4);
		t += 8;
		s += 8;
	}
	if(len & 4)
		Copy4(0);
}
|
||||
#endif
|
||||
|
||||
#ifdef CPU_UNALIGNED
|
||||
|
|
|
|||
|
|
@ -66,8 +66,12 @@ bool Vector<T>::ReAlloc(int newalloc)
|
|||
newvector = newalloc ? MemoryAllocSz(sz) : NULL;
|
||||
alloc = newalloc == INT_MAX ? INT_MAX // maximum alloc reached
|
||||
: (int)((sz - sz0) / sizeof(T) + newalloc); // adjust alloc to real memory size
|
||||
if(vector && newvector)
|
||||
memcpy(newvector, vector, (size_t)items * sizeof(T));
|
||||
if(vector && newvector) {
|
||||
if((sizeof(T) & 3) == 0)
|
||||
memcpyd((dword *)newvector, (dword *)vector, (size_t)items * (sizeof(T) >> 2));
|
||||
else
|
||||
svo_memcpy(newvector, vector, (size_t)items * sizeof(T));
|
||||
}
|
||||
vector = (T *)newvector;
|
||||
return alloced;
|
||||
}
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue