#include "Core.h"

// NOTE(review): the header names of the system #include directives in this file were
// missing; they are restored from the functions the code calls (__cpuid, __get_cpuid,
// sysctl, get_nprocs, android_getCpuCount) -- confirm against the project history.
// All system headers are included here, outside of namespace Upp, so that their
// declarations can never end up inside the namespace.

#ifdef CPU_X86
#ifdef COMPILER_MSC
#include <intrin.h> // __cpuid
#else
#include <cpuid.h> // __get_cpuid
#endif
#endif

#ifdef CPU_X86
#ifdef PLATFORM_POSIX
#ifdef PLATFORM_BSD
#include <sys/param.h>
#include <sys/sysctl.h> // sysctl, CTL_HW, HW_NCPU
#else
#include <sys/sysinfo.h> // get_nprocs
#endif
#endif
#else
#ifdef PLATFORM_LINUX
#ifdef PLATFORM_ANDROID
#include <cpu-features.h> // android_getCpuCount
#else
#include <sys/sysinfo.h> // get_nprocs
#endif
#endif
#endif

namespace Upp {

#ifdef CPU_X86

// CPU feature flags; written once by sCheckCPU(), read by the Cpu*() accessors.
static bool sHasMMX;
static bool sHasSSE;
static bool sHasSSE2;
static bool sHasSSE3;
static bool sHasAVX;
static bool sHypervisor;

// Reads CPUID leaf 1 and stores the feature bits into the flags above.
// The body is guarded by ONCELOCK only (the same way CPU_Cores guards its
// detection). There is deliberately no separate "done" flag set up front: such a
// flag becomes true before the feature flags are written, so a concurrent caller
// could return early and read flags that are still unset.
static void sCheckCPU()
{
	ONCELOCK {
		// Zero-initialized so that all features read as "absent" if __get_cpuid
		// reports that leaf 1 is not available and leaves the registers untouched.
		unsigned int eax = 0, ebx = 0, ecx = 0, edx = 0;
#ifdef COMPILER_MSC
		int cpuInfo[4];
		__cpuid(cpuInfo, 1);
		eax = cpuInfo[0];
		ebx = cpuInfo[1];
		ecx = cpuInfo[2];
		edx = cpuInfo[3];
#else
		__get_cpuid(1, &eax, &ebx, &ecx, &edx);
#endif
		// https://en.wikipedia.org/wiki/CPUID#EAX.3D1:_Processor_Info_and_Feature_Bits
		// Unsigned masks: 1 << 31 on a signed int would shift into the sign bit.
		sHasMMX = (edx & (1u << 23)) != 0;
		sHasSSE = (edx & (1u << 25)) != 0;
		sHasSSE2 = (edx & (1u << 26)) != 0;
		sHasSSE3 = (ecx & 1u) != 0;
		sHasAVX = (ecx & (1u << 28)) != 0;
		sHypervisor = (ecx & (1u << 31)) != 0;
	}
}

INITBLOCK {
	// Intentionally empty: eager detection (sCheckCPU()) is disabled, each accessor
	// below triggers the detection itself on first use.
}

// Accessors for the cached CPUID feature bits; each one makes sure the detection ran.
bool CpuMMX()        { sCheckCPU(); return sHasMMX; }
bool CpuSSE()        { sCheckCPU(); return sHasSSE; }
bool CpuSSE2()       { sCheckCPU(); return sHasSSE2; }
bool CpuSSE3()       { sCheckCPU(); return sHasSSE3; }
bool CpuAVX()        { sCheckCPU(); return sHasAVX; }
bool CpuHypervisor() { sCheckCPU(); return sHypervisor; }

// Returns the number of processors, determined once and cached.
//   Win32:   number of bits set in the system affinity mask (at least 1)
//   BSD:     sysctl(CTL_HW, HW_NCPU), clamped to 1..256
//   Solaris: sysconf(_SC_NPROCESSORS_ONLN), clamped to 1..256
//   other POSIX: get_nprocs(), clamped to 1..256
//   otherwise: 1
int CPU_Cores()
{
	static int n;
	ONCELOCK {
#ifdef PLATFORM_WIN32
		// DWORD_PTR is the type GetProcessAffinityMask takes on both 32 and 64 bit
		// Windows. Masks start at zero so a failed call contributes no bits
		// instead of reading uninitialized memory.
		DWORD_PTR pa = 0, sa = 0;
		GetProcessAffinityMask(GetCurrentProcess(), &pa, &sa);
		for(int i = 0; i < (int)(8 * sizeof(sa)); i++)
			n += !!(sa & ((DWORD_PTR)1 << i));
		if(n < 1) // never report zero processors
			n = 1;
#elif defined(PLATFORM_POSIX)
#ifdef PLATFORM_BSD
		int mib[2];
		size_t len = sizeof(n);
		mib[0] = CTL_HW;
		mib[1] = HW_NCPU;
		sysctl(mib, 2, &n, &len, NULL, 0); // on failure n stays 0 and is clamped to 1
		n = minmax(n, 1, 256);
#elif defined(PLATFORM_SOLARIS)
		n = minmax((int)sysconf(_SC_NPROCESSORS_ONLN), 1, 256);
#else
		n = minmax(get_nprocs(), 1, 256);
#endif
#else
		n = 1;
#endif
	}
	return n;
}

#else

#ifdef PLATFORM_LINUX
#ifdef PLATFORM_ANDROID

// Returns the processor count reported by the Android cpu-features helper.
int CPU_Cores()
{
	return android_getCpuCount();
}

#else

// Returns get_nprocs() clamped to 1..256.
int CPU_Cores()
{
	return minmax(get_nprocs(), 1, 256);
}

#endif
#else

// No detection available for this platform: report a single processor.
int CPU_Cores()
{
	return 1;
}

#endif
#endif

#ifdef PLATFORM_WIN32
// Returns true when IsWin2K() holds and the reported total physical memory
// exceeds 500 MB.
bool IsDecentMachine()
{
	if(!IsWin2K())
		return false;
	MEMORYSTATUS m;
	GlobalMemoryStatus(&m);
	return m.dwTotalPhys > 500 * 1024 * 1024;
}
#else
// Non-Windows platforms are always considered decent.
bool IsDecentMachine()
{
	return true;
}
#endif

// Shared body of the array EndianSwap overloads: applies the single-value
// EndianSwap to each of the `count` elements starting at `v`.
#define ENDIAN_SWAP { while(count--) { EndianSwap(*v++); } }

void EndianSwap(word *v, size_t count) ENDIAN_SWAP
void EndianSwap(int16 *v, size_t count) ENDIAN_SWAP
void EndianSwap(dword *v, size_t count) ENDIAN_SWAP
void EndianSwap(int *v, size_t count) ENDIAN_SWAP
void EndianSwap(int64 *v, size_t count) ENDIAN_SWAP
void EndianSwap(uint64 *v, size_t count) ENDIAN_SWAP

#ifdef CPU_X86

// Fills `len` dwords at `p` with `c` using non-temporal (streaming) stores.
// The streaming path is taken only when `p` is 4-byte aligned and len > 64;
// otherwise, and for the leftover after the streaming loop, a plain loop is used.
void huge_memsetd(void *p, dword c, size_t len)
{ // bypasses the cache, good for >4MB
	dword *t = (dword *)p;
	if(((uintptr_t)t & 3) == 0 && len > 64) {
		__m128i val4 = _mm_set1_epi32(c);
		auto Set4S = [&](int at) { _mm_stream_si128((__m128i *)(t + at), val4); };
		while((uintptr_t)t & 15) { // align to 16 bytes for SSE
			*t++ = c;
			len--;
		}
		while(len >= 16) {
			Set4S(0); Set4S(4); Set4S(8); Set4S(12);
			t += 16;
			len -= 16;
		}
		_mm_sfence(); // order the streaming stores before the function returns
	}
	while(len--)
		*t++ = c;
}

// Fills `len` dwords at `t` with `data` using unaligned 16-byte stores.
// Requires len >= 4: the first statement stores the last four dwords.
void memsetd_l(dword *t, dword data, size_t len)
{
	__m128i val4 = _mm_set1_epi32(data);
	auto Set4 = [&](size_t at) { _mm_storeu_si128((__m128i *)(t + at), val4); };
	Set4(len - 4); // fill tail
	if(len >= 32) {
		if(len >= 1024*1024) { // for really huge data, bypass the cache
			huge_memsetd(t, data, len);
			return;
		}
		Set4(0); // align up on 16 bytes boundary
		const dword *e = t + len;
		t = (dword *)(((uintptr_t)t | 15) + 1);
		len = e - t;
		e -= 32;
		while(t <= e) {
			Set4(0); Set4(4); Set4(8); Set4(12);
			Set4(16); Set4(20); Set4(24); Set4(28);
			t += 32;
		}
	}
	// Remaining 16 / 8 / 4 dword blocks; anything smaller is covered by the tail store.
	if(len & 16) {
		Set4(0); Set4(4); Set4(8); Set4(12);
		t += 16;
	}
	if(len & 8) {
		Set4(0); Set4(4);
		t += 8;
	}
	if(len & 4)
		Set4(0);
}

// Copies `len` dwords from `s` to `t` using unaligned 16-byte loads/stores.
// Requires len >= 4 and non-overlapping ranges.
void memcpyd_l(dword *t, const dword *s, size_t len)
{
	auto Copy4 = [&](size_t at) {
		_mm_storeu_si128((__m128i *)(t + at), _mm_loadu_si128((const __m128i *)(s + at)));
	};

	// Copy tail: the block copies below move multiples of 4 dwords only, so without
	// this the last (len & 3) dwords after alignment would never be copied.
	Copy4(len - 4);
	Copy4(0); // align target data up on next 16 bytes boundary
	const dword *e = t + len;
	dword *t1 = (dword *)(((uintptr_t)t | 15) + 1);
	s += t1 - t;
	t = t1;
	len = e - t;
	e -= 16;
	if(len >= 1024*1024) { // for really huge data, call memcpy to bypass the cache
		memcpy(t, s, 4 * len);
		return;
	}
	while(t <= e) {
		Copy4(0); Copy4(4); Copy4(8); Copy4(12);
		t += 16;
		s += 16;
	}
	if(len & 8) {
		Copy4(0); Copy4(4);
		t += 8;
		s += 8;
	}
	if(len & 4)
		Copy4(0);
}

// Copies `len` qwords from `s` to `t`; each Copy4 call moves 16 bytes (2 qwords).
// Requires len >= 2 and non-overlapping ranges.
void memcpyq_l(qword *t, const qword *s, size_t len)
{
	auto Copy4 = [&](size_t at) {
		_mm_storeu_si128((__m128i *)(t + at), _mm_loadu_si128((const __m128i *)(s + at)));
	};

	if(len >= 512*1024) { // for really huge data, call memcpy to bypass the cache
		memcpy(t, s, 8 * len);
		return;
	}
	// Copy tail: the block copies below move multiples of 2 qwords only, so without
	// this a single trailing qword after alignment would never be copied.
	Copy4(len - 2);
	Copy4(0); // align target data up on next 16 bytes boundary
	const qword *e = t + len;
	qword *t1 = (qword *)(((uintptr_t)t | 15) + 1);
	s += t1 - t;
	t = t1;
	len = e - t;
	e -= 8;
	while(t <= e) {
		Copy4(0); Copy4(2); Copy4(4); Copy4(6);
		t += 8;
		s += 8;
	}
	if(len & 4) {
		Copy4(0); Copy4(2);
		t += 4;
		s += 4;
	}
	if(len & 2)
		Copy4(0);
}

// Copies `len` dqwords from `s` to `t`; each Copy4 call moves one 16-byte dqword.
// Requires len >= 1 and non-overlapping ranges.
// NOTE(review): the realignment step advances `s` by (t1 - t) whole dqwords, which
// only matches the advance of `t` when `t` is 16-byte aligned on entry -- confirm
// that callers guarantee this alignment.
void memcpydq_l(dqword *t, const dqword *s, size_t len)
{
	auto Copy4 = [&](size_t at) {
		_mm_storeu_si128((__m128i *)(t + at), _mm_loadu_si128((const __m128i *)(s + at)));
	};

	if(len >= 256*1024) { // for really huge data, call memcpy to bypass the cache
		memcpy(t, s, 16 * len);
		return;
	}
	Copy4(0); // align target data up on next 16 bytes boundary
	const dqword *e = t + len;
	dqword *t1 = (dqword *)(((uintptr_t)t | 15) + 1);
	s += t1 - t;
	t = t1;
	len = e - t;
	e -= 4;
	while(t <= e) {
		Copy4(0); Copy4(1); Copy4(2); Copy4(3);
		t += 4;
		s += 4;
	}
	if(len & 2) {
		Copy4(0); Copy4(1);
		t += 2;
		s += 2;
	}
	if(len & 1)
		Copy4(0);
}

#endif

#ifdef CPU_UNALIGNED

// Moves `t` forward to the next 4-byte boundary (always by 1..4 bytes) and fills
// the whole dwords that fit before the original end with `val4` via memsetd.
// The skipped leading bytes and the trailing (len & 3) bytes are not written here;
// presumably the caller has already handled them -- verify against the caller.
never_inline
void svo_memset_l(byte *t, dword val4, size_t len)
{
	const byte *e = t + len;
	t = (byte *)(((uintptr_t)t | 3) + 1);
	len = e - t;
	memsetd(t, val4, len >> 2);
}

// Moves `t` forward to the next 4-byte boundary (always by 1..4 bytes), advances
// `s` by the same amount and copies the whole dwords that fit before the original
// end via memcpyd. The skipped leading bytes and the trailing (len & 3) bytes are
// not copied here; presumably the caller has already handled them -- verify
// against the caller.
never_inline
void svo_memcpy_l(byte *t, byte *s, size_t len)
{
	const byte *e = t + len;
	byte *t2 = (byte *)(((uintptr_t)t | 3) + 1);
	s += t2 - t;
	t = t2;
	len = e - t;
	memcpyd((dword *)t, (dword *)s, len >> 2);
}

#endif

}