mirror of
https://github.com/ultimatepp/ultimatepp.git
synced 2026-05-15 14:16:07 -06:00
Core: memcpyq, memcpydq, memcpy is using them...
git-svn-id: svn://ultimatepp.org/upp/trunk@14494 f0d560ea-af0d-0410-9eb7-867de7ffcac7
This commit is contained in:
parent
461b920c61
commit
3c4d3a67eb
2 changed files with 128 additions and 0 deletions
|
|
@ -80,6 +80,70 @@ void memcpyd(dword *t, const dword *s, size_t len)
|
|||
Copy4(0);
|
||||
}
|
||||
|
||||
// Out-of-line qword copy routine; memcpyq forwards to it when len >= 8.
void memcpyq_l(qword *t, const qword *s, size_t len);
|
||||
|
||||
inline
|
||||
void memcpyq(qword *t, const qword *s, size_t len)
|
||||
{
|
||||
if(len <= 2) {
|
||||
if(len) {
|
||||
if(len > 1) {
|
||||
*(int64 *)t = *(int64 *)s;
|
||||
*(int64 *)(t + len - 1) = *(int64 *)(s + len - 1);
|
||||
return;
|
||||
}
|
||||
*t = *s;
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
auto Copy4 = [&](size_t at) { _mm_storeu_si128((__m128i *)(t + at), _mm_loadu_si128((__m128i *)(s + at))); };
|
||||
|
||||
Copy4(len - 2); // copy tail
|
||||
if(len >= 8) {
|
||||
memcpyq_l(t, s, len);
|
||||
return;
|
||||
}
|
||||
if(len & 4) {
|
||||
Copy4(0); Copy4(2);
|
||||
t += 4;
|
||||
s += 4;
|
||||
}
|
||||
if(len & 2)
|
||||
Copy4(0);
|
||||
}
|
||||
|
||||
struct dqword {
|
||||
qword a, b;
|
||||
};
|
||||
|
||||
static_assert(sizeof(dqword) == 16, "dqword sizeof");
|
||||
|
||||
// Out-of-line dqword copy routine; memcpydq forwards to it when len >= 8.
void memcpydq_l(dqword *t, const dqword *s, size_t len);
|
||||
|
||||
inline
|
||||
void memcpydq(dqword *t, const dqword *s, size_t len)
|
||||
{
|
||||
auto Copy4 = [&](size_t at) { _mm_storeu_si128((__m128i *)(t + at), _mm_loadu_si128((__m128i *)(s + at))); };
|
||||
|
||||
if(len >= 8) {
|
||||
memcpydq_l(t, s, len);
|
||||
return;
|
||||
}
|
||||
if(len & 4) {
|
||||
Copy4(0); Copy4(1); Copy4(2); Copy4(3);
|
||||
t += 4;
|
||||
s += 4;
|
||||
}
|
||||
if(len & 2) {
|
||||
Copy4(0); Copy4(1);
|
||||
t += 2;
|
||||
s += 2;
|
||||
}
|
||||
if(len & 1)
|
||||
Copy4(0);
|
||||
}
|
||||
|
||||
#else
|
||||
inline
|
||||
void memsetd(void *p, dword c, size_t len)
|
||||
|
|
@ -261,6 +325,12 @@ void svo_memcpy(void *p, const void *q, size_t len)
|
|||
template <class T>
|
||||
void memcpy_t(T *t, const T *s, size_t count)
|
||||
{
|
||||
if((sizeof(T) & 15) == 0)
|
||||
memcpydq((dqword *)t, (const dqword *)s, count * (sizeof(T) >> 4));
|
||||
else
|
||||
if((sizeof(T) & 7) == 0)
|
||||
memcpyq((qword *)t, (const qword *)s, count * (sizeof(T) >> 3));
|
||||
else
|
||||
if((sizeof(T) & 3) == 0)
|
||||
memcpyd((dword *)t, (const dword *)s, count * (sizeof(T) >> 2));
|
||||
else
|
||||
|
|
|
|||
|
|
@ -235,6 +235,64 @@ void memcpyd_l(dword *t, const dword *s, size_t len)
|
|||
if(len & 4)
|
||||
Copy4(0);
|
||||
}
|
||||
|
||||
// Copies `len` 64-bit words from `s` to `t`; the regions must not overlap.
// Intended for the larger sizes (memcpyq calls it for len >= 8).
//
// All address arithmetic is done in bytes. The previous version computed the
// distance to the next 16-byte boundary as a difference of qword pointers; when
// `t` was not 8-byte aligned (memcpy_t casts any T with sizeof(T) % 8 == 0 to
// qword *), that distance was not a whole number of qwords, the division
// truncated, and source and target went out of step, corrupting the copy.
void memcpyq_l(qword *t, const qword *s, size_t len)
{
	if(len < 2 || len >= 512*1024) {
		// Less than one 16-byte block, or really huge data: leave it to memcpy
		// (for huge data the original intent is to bypass the cache).
		memcpy(t, s, 8 * len);
		return;
	}

	char *dst = (char *)t;
	const char *src = (const char *)s;
	size_t bytes = 8 * len;

	// Moves the 16 bytes located `at` bytes past the current dst/src position.
	auto Copy128 = [&](size_t at) {
		_mm_storeu_si128((__m128i *)(dst + at), _mm_loadu_si128((const __m128i *)(src + at)));
	};

	Copy128(bytes - 16); // tail: covers the < 16 bytes left after the whole blocks below
	Copy128(0);          // head: covers the bytes skipped while aligning the target

	// Advance both pointers by the same byte count (1..16) so that dst lands on
	// the next 16-byte boundary.
	size_t skip = 16 - ((uintptr_t)dst & 15);
	dst += skip;
	src += skip;
	bytes -= skip;

	while(bytes >= 64) {
		Copy128(0); Copy128(16); Copy128(32); Copy128(48);
		dst += 64;
		src += 64;
		bytes -= 64;
	}
	if(bytes & 32) {
		Copy128(0); Copy128(16);
		dst += 32;
		src += 32;
	}
	if(bytes & 16)
		Copy128(0);
}
|
||||
|
||||
// Copies `len` 16-byte dqwords from `s` to `t`; the regions must not overlap.
// Intended for the larger sizes (memcpydq calls it for len >= 8).
//
// All address arithmetic is done in bytes. dqword is only 8-byte aligned, so
// `t` may legitimately sit 8 bytes off a 16-byte boundary. The previous version
// computed the alignment step as a difference of dqword pointers, which became
// 0 elements in that case: `t` moved forward 8 bytes while `s` did not, and the
// rest of the copy read from the wrong offset. Aligning by bytes also leaves a
// partial block at the end, which the explicit tail copy below takes care of.
void memcpydq_l(dqword *t, const dqword *s, size_t len)
{
	if(len == 0)
		return;
	if(len >= 256*1024) { // for really huge data, call memcpy to bypass the cache
		memcpy(t, s, 16 * len);
		return;
	}

	char *dst = (char *)t;
	const char *src = (const char *)s;
	size_t bytes = 16 * len;

	// Moves the 16 bytes located `at` bytes past the current dst/src position.
	auto Copy128 = [&](size_t at) {
		_mm_storeu_si128((__m128i *)(dst + at), _mm_loadu_si128((const __m128i *)(src + at)));
	};

	Copy128(bytes - 16); // tail: covers the < 16 bytes left after the whole blocks below
	Copy128(0);          // head: covers the bytes skipped while aligning the target

	// Advance both pointers by the same byte count (1..16) so that dst lands on
	// the next 16-byte boundary.
	size_t skip = 16 - ((uintptr_t)dst & 15);
	dst += skip;
	src += skip;
	bytes -= skip;

	while(bytes >= 64) {
		Copy128(0); Copy128(16); Copy128(32); Copy128(48);
		dst += 64;
		src += 64;
		bytes -= 64;
	}
	if(bytes & 32) {
		Copy128(0); Copy128(16);
		dst += 32;
		src += 32;
	}
	if(bytes & 16)
		Copy128(0);
}
|
||||
#endif
|
||||
|
||||
#ifdef CPU_UNALIGNED
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue