Core: memcpyq, memcpydq, memcpy is using them...

git-svn-id: svn://ultimatepp.org/upp/trunk@14494 f0d560ea-af0d-0410-9eb7-867de7ffcac7
This commit is contained in:
cxl 2020-05-22 17:00:36 +00:00
parent 461b920c61
commit 3c4d3a67eb
2 changed files with 128 additions and 0 deletions

View file

@ -80,6 +80,70 @@ void memcpyd(dword *t, const dword *s, size_t len)
Copy4(0);
}
// Out-of-line worker used by memcpyq for blocks of 8 or more qwords.
void memcpyq_l(qword *t, const qword *s, size_t len);

// Copies len qwords (8-byte elements) from s to t.
// Lengths 0..2 are handled with plain qword assignments, lengths 3..7 with
// unaligned 128-bit moves, and anything longer is forwarded to memcpyq_l.
inline
void memcpyq(qword *t, const qword *s, size_t len)
{
	if(len == 0)
		return;
	if(len == 1) {
		*t = *s;
		return;
	}
	if(len == 2) {
		t[0] = s[0];
		t[1] = s[1];
		return;
	}

	// Moves one 128-bit chunk (two qwords) located 'at' qwords from the start.
	auto CopyPair = [&](size_t at) {
		__m128i chunk = _mm_loadu_si128((const __m128i *)(s + at));
		_mm_storeu_si128((__m128i *)(t + at), chunk);
	};

	// The last two qwords are copied up front, so an odd length needs no
	// special handling below; memcpyq_l is called after this as well.
	CopyPair(len - 2);

	if(len >= 8) {
		memcpyq_l(t, s, len);
		return;
	}

	size_t done = 0; // qwords already copied from the front
	if(len & 4) {
		CopyPair(0);
		CopyPair(2);
		done = 4;
	}
	if(len & 2)
		CopyPair(done);
}
// 16-byte element made of two qwords; memcpydq / memcpydq_l count their
// length in units of this type.
struct dqword {
	qword a;
	qword b;
};

// The copy routines move one 128-bit chunk per dqword, so the size must be exact.
static_assert(sizeof(dqword) == 16, "dqword sizeof");
// Out-of-line worker used by memcpydq for blocks of 8 or more dqwords.
void memcpydq_l(dqword *t, const dqword *s, size_t len);

// Copies len dqwords (16-byte elements) from s to t.
// Blocks of 8 or more elements are forwarded to memcpydq_l; shorter ones are
// copied here by decomposing len into its 4, 2 and 1 bits.
inline
void memcpydq(dqword *t, const dqword *s, size_t len)
{
	if(len >= 8) {
		memcpydq_l(t, s, len);
		return;
	}

	// Moves the single dqword at index 'at' with unaligned 128-bit load/store.
	auto CopyOne = [&](size_t at) {
		__m128i chunk = _mm_loadu_si128((const __m128i *)(s + at));
		_mm_storeu_si128((__m128i *)(t + at), chunk);
	};

	size_t done = 0; // dqwords already copied from the front
	if(len & 4) {
		CopyOne(0);
		CopyOne(1);
		CopyOne(2);
		CopyOne(3);
		done = 4;
	}
	if(len & 2) {
		CopyOne(done);
		CopyOne(done + 1);
		done += 2;
	}
	if(len & 1)
		CopyOne(done);
}
#else
inline
void memsetd(void *p, dword c, size_t len)
@ -261,6 +325,12 @@ void svo_memcpy(void *p, const void *q, size_t len)
template <class T>
void memcpy_t(T *t, const T *s, size_t count)
{
if((sizeof(T) & 15) == 0)
memcpydq((dqword *)t, (const dqword *)s, count * (sizeof(T) >> 4));
else
if((sizeof(T) & 7) == 0)
memcpyq((qword *)t, (const qword *)s, count * (sizeof(T) >> 3));
else
if((sizeof(T) & 3) == 0)
memcpyd((dword *)t, (const dword *)s, count * (sizeof(T) >> 2));
else

View file

@ -235,6 +235,64 @@ void memcpyd_l(dword *t, const dword *s, size_t len)
if(len & 4)
Copy4(0);
}
// Out-of-line worker behind memcpyq: copies len qwords (8 * len bytes) from s to t.
// Requires len >= 2 (memcpyq only forwards len >= 8). Overlapping ranges are not supported.
//
// The target may have any byte alignment: memcpy_t casts arbitrary element types
// whose size is a multiple of 8 to qword*, so such a pointer can be e.g. only
// 4-byte aligned. All offsets are therefore computed in bytes; doing the
// alignment step in qword units would truncate the distance to the 16-byte
// boundary and advance source and target by different amounts.
void memcpyq_l(qword *t, const qword *s, size_t len)
{
	if(len >= 512*1024) { // for really huge data, call memcpy (original note: to bypass the cache)
		memcpy(t, s, 8 * len);
		return;
	}

	char *dst = (char *)t;
	const char *src = (const char *)s;
	size_t bytes = 8 * len;

	// Moves 16 bytes located 'at' bytes past the current dst/src position.
	auto Copy16 = [&](size_t at) {
		_mm_storeu_si128((__m128i *)(dst + at), _mm_loadu_si128((const __m128i *)(src + at)));
	};

	Copy16(0);          // head: covers the bytes skipped by the alignment step below
	Copy16(bytes - 16); // tail: covers whatever (< 16 bytes) the block loop leaves behind

	// Advance target up to the next 16-byte boundary (1..16 bytes), keeping
	// the source in lockstep.
	size_t skip = 16 - ((uintptr_t)dst & 15);
	dst += skip;
	src += skip;
	bytes -= skip;

	while(bytes >= 64) {
		Copy16(0); Copy16(16); Copy16(32); Copy16(48);
		dst += 64;
		src += 64;
		bytes -= 64;
	}
	if(bytes & 32) {
		Copy16(0); Copy16(16);
		dst += 32;
		src += 32;
	}
	if(bytes & 16)
		Copy16(0);
}
// Out-of-line worker behind memcpydq: copies len dqwords (16 * len bytes) from s to t.
// Requires len >= 1 (memcpydq only forwards len >= 8). Overlapping ranges are not supported.
//
// dqword consists of two qwords, so a valid dqword* is not necessarily 16-byte
// aligned. The distance to the next 16-byte boundary can thus be smaller than
// one dqword; measured in dqword units it would truncate to 0 and the target
// would advance while the source would not. All offsets are therefore computed
// in bytes, and the last 16 bytes are copied explicitly because the aligned
// block loop can leave a partial (< 16 byte) remainder.
void memcpydq_l(dqword *t, const dqword *s, size_t len)
{
	if(len >= 256*1024) { // for really huge data, call memcpy (original note: to bypass the cache)
		memcpy(t, s, 16 * len);
		return;
	}

	char *dst = (char *)t;
	const char *src = (const char *)s;
	size_t bytes = 16 * len;

	// Moves 16 bytes located 'at' bytes past the current dst/src position.
	auto Copy16 = [&](size_t at) {
		_mm_storeu_si128((__m128i *)(dst + at), _mm_loadu_si128((const __m128i *)(src + at)));
	};

	Copy16(0);          // head: covers the bytes skipped by the alignment step below
	Copy16(bytes - 16); // tail: covers whatever (< 16 bytes) the block loop leaves behind

	// Advance target up to the next 16-byte boundary (1..16 bytes), keeping
	// the source in lockstep.
	size_t skip = 16 - ((uintptr_t)dst & 15);
	dst += skip;
	src += skip;
	bytes -= skip;

	while(bytes >= 64) {
		Copy16(0); Copy16(16); Copy16(32); Copy16(48);
		dst += 64;
		src += 64;
		bytes -= 64;
	}
	if(bytes & 32) {
		Copy16(0); Copy16(16);
		dst += 32;
		src += 32;
	}
	if(bytes & 16)
		Copy16(0);
}
#endif
#ifdef CPU_UNALIGNED