From 3c4d3a67eb8f72e40cc9979bc4bce9cf2aed14dc Mon Sep 17 00:00:00 2001
From: cxl
Date: Fri, 22 May 2020 17:00:36 +0000
Subject: [PATCH] Core: memcpyq, memcpydq, memcpy is using them...

git-svn-id: svn://ultimatepp.org/upp/trunk@14494 f0d560ea-af0d-0410-9eb7-867de7ffcac7
---
Review notes (ignored by git am / git apply):
 * memcpyq_l and memcpydq_l aligned the target with element-typed pointer
   arithmetic. memcpy_t casts any T* to qword* / dqword*, so the target
   need not be 8 / 16 byte aligned; the pointer difference then truncated,
   the source advanced less than the target and the copy came out shifted.
   memcpydq_l also had no tail copy for the bytes left after realignment.
   Both now delegate to a byte-based helper that copies head and tail.
 * Hunk line counts and the diffstat reflect the revised content; the blob
   ids on the "index" lines are still those of the original commit.

 uppsrc/Core/Blit.h  | 75 ++++++++++++++++++++++++++++++++++++++++++++++++
 uppsrc/Core/Cpu.cpp | 49 +++++++++++++++++++++++++++++++
 2 files changed, 124 insertions(+)

diff --git a/uppsrc/Core/Blit.h b/uppsrc/Core/Blit.h
index 231c0c3f0..9253aee51 100644
--- a/uppsrc/Core/Blit.h
+++ b/uppsrc/Core/Blit.h
@@ -80,6 +80,75 @@ void memcpyd(dword *t, const dword *s, size_t len)
 		Copy4(0);
 }
 
+void memcpyq_l(qword *t, const qword *s, size_t len); // out-of-line path, used for len >= 8
+
+// Copies 'len' qwords (8 * len bytes) from 's' to 't'; overlapping regions are not handled.
+inline
+void memcpyq(qword *t, const qword *s, size_t len)
+{
+	if(len <= 2) {
+		if(len) {
+			if(len > 1) {
+				*(int64 *)t = *(int64 *)s;
+				*(int64 *)(t + len - 1) = *(int64 *)(s + len - 1);
+				return;
+			}
+			*t = *s;
+		}
+		return;
+	}
+
+	// copies 16 bytes (2 qwords) at qword offset 'at'
+	auto Copy4 = [&](size_t at) { _mm_storeu_si128((__m128i *)(t + at), _mm_loadu_si128((__m128i *)(s + at))); };
+
+	Copy4(len - 2); // copy tail
+	if(len >= 8) {
+		memcpyq_l(t, s, len);
+		return;
+	}
+	if(len & 4) {
+		Copy4(0); Copy4(2);
+		t += 4;
+		s += 4;
+	}
+	if(len & 2)
+		Copy4(0);
+	// an odd last qword, if any, was already written by the tail copy
+}
+
+struct dqword {
+	qword a, b;
+};
+
+static_assert(sizeof(dqword) == 16, "dqword sizeof");
+
+void memcpydq_l(dqword *t, const dqword *s, size_t len); // out-of-line path, used for len >= 8
+
+// Copies 'len' dqwords (16 * len bytes) from 's' to 't'; overlapping regions are not handled.
+inline
+void memcpydq(dqword *t, const dqword *s, size_t len)
+{
+	// copies one dqword (16 bytes) at dqword offset 'at'
+	auto Copy4 = [&](size_t at) { _mm_storeu_si128((__m128i *)(t + at), _mm_loadu_si128((__m128i *)(s + at))); };
+
+	if(len >= 8) {
+		memcpydq_l(t, s, len);
+		return;
+	}
+	if(len & 4) {
+		Copy4(0); Copy4(1); Copy4(2); Copy4(3);
+		t += 4;
+		s += 4;
+	}
+	if(len & 2) {
+		Copy4(0); Copy4(1);
+		t += 2;
+		s += 2;
+	}
+	if(len & 1)
+		Copy4(0);
+}
+
 #else
 inline
 void memsetd(void *p, dword c, size_t len)
@@ -261,6 +330,12 @@ void svo_memcpy(void *p, const void *q, size_t len)
 template <class T>
 void memcpy_t(T *t, const T *s, size_t count)
 {
+	if((sizeof(T) & 15) == 0)
+		memcpydq((dqword *)t, (const dqword *)s, count * (sizeof(T) >> 4));
+	else
+	if((sizeof(T) & 7) == 0)
+		memcpyq((qword *)t, (const qword *)s, count * (sizeof(T) >> 3));
+	else
 	if((sizeof(T) & 3) == 0)
 		memcpyd((dword *)t, (const dword *)s, count * (sizeof(T) >> 2));
 	else
diff --git a/uppsrc/Core/Cpu.cpp b/uppsrc/Core/Cpu.cpp
index 2fa203d57..2c5a90747 100644
--- a/uppsrc/Core/Cpu.cpp
+++ b/uppsrc/Core/Cpu.cpp
@@ -235,6 +235,55 @@ void memcpyd_l(dword *t, const dword *s, size_t len)
 	if(len & 4)
 		Copy4(0);
 }
+
+// Copies 'size' bytes (size >= 16) from 's' to 't' using unaligned 16-byte SSE2 loads and stores.
+// All pointer arithmetic is done in bytes: memcpy_t casts arbitrary T* to qword* / dqword*, so
+// the target is not guaranteed to be 8 or 16 byte aligned and element-based pointer differences
+// would truncate, shifting source against target.
+static void sMemcpy16_l(char *t, const char *s, size_t size)
+{
+	auto Copy16 = [&](size_t at) { _mm_storeu_si128((__m128i *)(t + at), _mm_loadu_si128((const __m128i *)(s + at))); };
+
+	Copy16(size - 16); // copy tail, covers what is left after the last whole 16-byte chunk
+	Copy16(0); // copy head, then align target data up on next 16 bytes boundary
+	const char *e = t + size;
+	char *t1 = (char *)(((uintptr_t)t | 15) + 1);
+	s += t1 - t;
+	t = t1;
+	size = e - t; // bytes left after the head, 0 is possible
+	for(size_t n = size >> 6; n; n--) {
+		Copy16(0); Copy16(16); Copy16(32); Copy16(48);
+		t += 64;
+		s += 64;
+	}
+	if(size & 32) {
+		Copy16(0); Copy16(16);
+		t += 32;
+		s += 32;
+	}
+	if(size & 16)
+		Copy16(0);
+}
+
+// Out-of-line part of memcpyq for larger blocks; 'len' is in qwords and must be >= 2.
+void memcpyq_l(qword *t, const qword *s, size_t len)
+{
+	if(len >= 512*1024) { // for really huge data, call memcpy to bypass the cache
+		memcpy(t, s, 8 * len);
+		return;
+	}
+	sMemcpy16_l((char *)t, (const char *)s, 8 * len);
+}
+
+// Out-of-line part of memcpydq for larger blocks; 'len' is in dqwords and must be >= 1.
+void memcpydq_l(dqword *t, const dqword *s, size_t len)
+{
+	if(len >= 256*1024) { // for really huge data, call memcpy to bypass the cache
+		memcpy(t, s, 16 * len);
+		return;
+	}
+	sMemcpy16_l((char *)t, (const char *)s, 16 * len);
+}
 #endif
 
 #ifdef CPU_UNALIGNED