ultimatepp/uppsrc/Core/Cpu.cpp
cxl 78cf875ad8 Core: Fixed to compile with ARM
git-svn-id: svn://ultimatepp.org/upp/trunk@14502 f0d560ea-af0d-0410-9eb7-867de7ffcac7
2020-05-25 21:27:25 +00:00

322 lines
6.4 KiB
C++

#include "Core.h"
#ifdef CPU_X86
#ifdef COMPILER_MSC
#include <intrin.h>
#else
#include <cpuid.h>
#endif
#endif
namespace Upp {
#ifdef CPU_X86
// CPU feature flags cached by sCheckCPU() from CPUID leaf 1.
static bool sHasMMX;
static bool sHasSSE;
static bool sHasSSE2;
static bool sHasSSE3;
static bool sHasAVX;
static bool sHypervisor;

// Queries CPUID leaf 1 and caches the feature bits in the flags above.
//
// The work is guarded solely by ONCELOCK. A separate plain "done" flag set
// before the flags are filled would let a concurrent caller return early and
// read flags that have not been written yet.
static void sCheckCPU()
{
	ONCELOCK {
		// Zero-initialized so that every feature reads as absent if the query fails.
		unsigned int eax = 0, ebx = 0, ecx = 0, edx = 0;
#ifdef COMPILER_MSC
		int cpuInfo[4] = { 0, 0, 0, 0 };
		__cpuid(cpuInfo, 1);
		eax = cpuInfo[0];
		ebx = cpuInfo[1];
		ecx = cpuInfo[2];
		edx = cpuInfo[3];
#else
		// __get_cpuid returns 0 when the requested leaf is not supported.
		if(!__get_cpuid(1, &eax, &ebx, &ecx, &edx))
			eax = ebx = ecx = edx = 0;
#endif
		// https://en.wikipedia.org/wiki/CPUID#EAX.3D1:_Processor_Info_and_Feature_Bits
		// Unsigned masks avoid shifting a signed 1 into the sign bit (bit 31).
		sHasMMX = (edx & (1u << 23)) != 0;
		sHasSSE = (edx & (1u << 25)) != 0;
		sHasSSE2 = (edx & (1u << 26)) != 0;
		sHasSSE3 = (ecx & 1u) != 0;
		sHasAVX = (ecx & (1u << 28)) != 0;
		sHypervisor = (ecx & (1u << 31)) != 0;
	}
}
// Eager CPU detection at startup is disabled (the call below is commented out);
// sCheckCPU() is instead invoked on demand by the Cpu*() accessors.
INITBLOCK {
// sCheckCPU();
}
// Feature queries: each one makes sure CPU detection has run and then
// reports the corresponding cached flag.

// MMX flag.
bool CpuMMX()
{
	sCheckCPU();
	return sHasMMX;
}

// SSE flag.
bool CpuSSE()
{
	sCheckCPU();
	return sHasSSE;
}

// SSE2 flag.
bool CpuSSE2()
{
	sCheckCPU();
	return sHasSSE2;
}

// SSE3 flag.
bool CpuSSE3()
{
	sCheckCPU();
	return sHasSSE3;
}

// AVX flag.
bool CpuAVX()
{
	sCheckCPU();
	return sHasAVX;
}

// Hypervisor-present flag.
bool CpuHypervisor()
{
	sCheckCPU();
	return sHypervisor;
}
#ifdef PLATFORM_POSIX
#ifdef PLATFORM_BSD
#include <sys/param.h>
#include <sys/sysctl.h>
#else
#include <sys/sysinfo.h>
#endif
#endif
// Returns the number of CPU cores; the value is computed once (under ONCELOCK)
// and cached in a function-local static. The result is always at least 1.
//
// Win32: counts the bits set in the system affinity mask.
// BSD:   reads hw.ncpu through sysctl.
// Other POSIX: sysconf / get_nprocs.
// All POSIX results are clamped to the range 1..256.
int CPU_Cores()
{
	static int n;
	ONCELOCK {
#ifdef PLATFORM_WIN32
		// DWORD_PTR is the mask type the API expects and has the right width on
		// both 32-bit and 64-bit targets, so a single code path serves both.
		DWORD_PTR pa = 0, sa = 0;
		int count = 0;
		if(GetProcessAffinityMask(GetCurrentProcess(), &pa, &sa))
			for(DWORD_PTR m = sa; m; m >>= 1) // shifting the mask avoids a signed 1 << 31
				count += (int)(m & 1);
		n = count > 0 ? count : 1; // never report zero cores, even if the query failed
#elif defined(PLATFORM_POSIX)
#ifdef PLATFORM_BSD
		int mib[2];
		size_t len = sizeof(n);
		mib[0] = CTL_HW;
		mib[1] = HW_NCPU;
		if(sysctl(mib, 2, &n, &len, NULL, 0) != 0)
			n = 1; // query failed, fall back to a single core
		n = minmax(n, 1, 256);
#elif defined(PLATFORM_SOLARIS)
		n = minmax((int)sysconf(_SC_NPROCESSORS_ONLN), 1, 256);
#else
		n = minmax(get_nprocs(), 1, 256);
#endif
#else
		n = 1;
#endif
	}
	return n;
}
#else
#ifdef PLATFORM_LINUX
#ifdef PLATFORM_ANDROID
#include <cpu-features.h>
// Android variant: forwards the count reported by android_getCpuCount().
int CPU_Cores()
{
	int cores = android_getCpuCount();
	return cores;
}
#else
#include <sys/sysinfo.h>
// Linux (non-x86) variant: get_nprocs() clamped to the range 1..256.
int CPU_Cores()
{
	int reported = get_nprocs();
	return minmax(reported, 1, 256);
}
#endif
#else
// Fallback variant for platforms without a detection path: always one core.
int CPU_Cores()
{
	const int single_core = 1;
	return single_core;
}
#endif
#endif
#ifdef PLATFORM_WIN32
bool IsDecentMachine()
{
if(!IsWin2K())
return false;
MEMORYSTATUS m;
GlobalMemoryStatus(&m);
return m.dwTotalPhys > 500 * 1024 * 1024;
}
#else
// Non-Win32 variant: no check is performed, every machine counts as decent.
bool IsDecentMachine()
{
	const bool decent = true;
	return decent;
}
#endif
// Shared body of the array overloads below: applies the single-value
// EndianSwap to each of the `count` elements starting at `v`, in order.
#define ENDIAN_SWAP { for(size_t i = 0; i < count; i++) EndianSwap(v[i]); }

void EndianSwap(word *v, size_t count)   ENDIAN_SWAP
void EndianSwap(int16 *v, size_t count)  ENDIAN_SWAP
void EndianSwap(dword *v, size_t count)  ENDIAN_SWAP
void EndianSwap(int *v, size_t count)    ENDIAN_SWAP
void EndianSwap(int64 *v, size_t count)  ENDIAN_SWAP
void EndianSwap(uint64 *v, size_t count) ENDIAN_SWAP
#ifdef CPU_X86
void huge_memsetd(void *p, dword c, size_t len)
{ // bypasses the cache, good for >4MB
dword *t = (dword *)p;
if(((uintptr_t)t & 3) == 0 && len > 64) {
__m128i val4 = _mm_set1_epi32(c);
auto Set4S = [&](int at) { _mm_stream_si128((__m128i *)(t + at), val4); };
while((uintptr_t)t & 15) { // align to 16 bytes for SSE
*t++ = c;
len--;
}
while(len >= 16) {
Set4S(0); Set4S(4); Set4S(8); Set4S(12);
t += 16;
len -= 16;
}
_mm_sfence();
}
while(len--)
*t++ = c;
}
// Fills len dwords at t with data using unaligned 16-byte SSE stores.
//
// The very first store writes t[len - 4 .. len - 1], so len must be at least 4.
// That tail store also covers the final (len & 3) dwords which the 4-dword
// groups further below do not reach.
// NOTE(review): presumably called only from an inline memsetd() wrapper for
// larger lengths - confirm against the caller.
void memsetd_l(dword *t, dword data, size_t len)
{
__m128i val4 = _mm_set1_epi32(data);
// Writes four copies of data to t[at .. at + 3]. t is captured by reference,
// so advancing t later moves where subsequent calls store.
auto Set4 = [&](size_t at) { _mm_storeu_si128((__m128i *)(t + at), val4); };
Set4(len - 4); // fill tail
if(len >= 32) {
if(len >= 1024*1024) { // for really huge data, bypass the cache
huge_memsetd(t, data, len);
return;
}
// Fill the first 16 bytes, then move t up to the next 16-byte boundary (this
// always advances, by a full 16 bytes if t was already aligned); the bytes
// skipped over are covered by this store.
Set4(0); // align up on 16 bytes boundary
const dword *e = t + len;
t = (dword *)(((uintptr_t)t | 15) + 1);
// len becomes the number of dwords from the aligned position to the end.
len = e - t;
// e now marks the last position at which a full 32-dword group still fits.
e -= 32;
while(t <= e) {
Set4(0); Set4(4); Set4(8); Set4(12);
Set4(16); Set4(20); Set4(24); Set4(28);
t += 32;
}
}
// Remaining dwords (fewer than 32): handle the 16, 8 and 4 dword groups
// according to the bits of len.
if(len & 16) {
Set4(0); Set4(4); Set4(8); Set4(12);
t += 16;
}
if(len & 8) {
Set4(0); Set4(4);
t += 8;
}
if(len & 4)
Set4(0);
}
// Copies dwords from s to t using unaligned 16-byte SSE loads and stores.
//
// The first copy unconditionally transfers t[0..3], so len must be at least 4.
// Only whole 4-dword groups are copied here; the trailing (len & 3) dwords
// are not written by this function.
// NOTE(review): presumably the caller copies the last 4 dwords before calling
// this function - confirm against the inline memcpyd() wrapper.
// NOTE(review): s is advanced by (t1 - t) counted in dwords, which matches the
// byte advance of t only if t is 4-byte aligned - confirm callers ensure this.
void memcpyd_l(dword *t, const dword *s, size_t len)
{
// Copies 16 bytes from s + at to t + at; t and s are captured by reference.
auto Copy4 = [&](size_t at) { _mm_storeu_si128((__m128i *)(t + at), _mm_loadu_si128((__m128i *)(s + at))); };
Copy4(0); // align target data up on next 16 bytes boundary
const dword *e = t + len;
// Next 16-byte boundary strictly above t; the bytes skipped were copied above.
dword *t1 = (dword *)(((uintptr_t)t | 15) + 1);
s += t1 - t;
t = t1;
// len becomes the number of dwords from the aligned target to the end.
len = e - t;
// e now marks the last position at which a full 16-dword group still fits.
e -= 16;
if(len >= 1024*1024) { // for really huge data, call memcpy to bypass the cache
memcpy(t, s, 4 * len);
return;
}
while(t <= e) {
Copy4(0); Copy4(4); Copy4(8); Copy4(12);
t += 16;
s += 16;
}
// Remaining dwords (fewer than 16): 8 and 4 dword groups by the bits of len.
if(len & 8) {
Copy4(0); Copy4(4);
t += 8;
s += 8;
}
if(len & 4)
Copy4(0);
}
// Copies qwords from s to t using unaligned 16-byte SSE loads and stores
// (each Copy4 call moves 16 bytes, i.e. two qwords assuming 8-byte qword).
//
// Requests of 512*1024 qwords or more are handed to memcpy as a whole.
// Otherwise the first copy unconditionally transfers 16 bytes, so len must be
// at least 2. Only whole 2-qword groups are copied here; a trailing odd qword
// (len & 1) is not written by this function.
// NOTE(review): presumably the caller copies the tail before calling this
// function - confirm against the inline memcpyq() wrapper.
// NOTE(review): s is advanced by (t1 - t) counted in qwords, which matches the
// byte advance of t only if t is 8-byte aligned - confirm callers ensure this.
void memcpyq_l(qword *t, const qword *s, size_t len)
{
// Copies 16 bytes from s + at to t + at; t and s are captured by reference.
auto Copy4 = [&](size_t at) { _mm_storeu_si128((__m128i *)(t + at), _mm_loadu_si128((__m128i *)(s + at))); };
if(len >= 512*1024) { // for really huge data, call memcpy to bypass the cache
memcpy(t, s, 8 * len);
return;
}
Copy4(0); // align target data up on next 16 bytes boundary
const qword *e = t + len;
// Next 16-byte boundary strictly above t; the bytes skipped were copied above.
qword *t1 = (qword *)(((uintptr_t)t | 15) + 1);
s += t1 - t;
t = t1;
// len becomes the number of qwords from the aligned target to the end.
len = e - t;
// e now marks the last position at which a full 8-qword group still fits.
e -= 8;
while(t <= e) {
Copy4(0); Copy4(2); Copy4(4); Copy4(6);
t += 8;
s += 8;
}
// Remaining qwords (fewer than 8): 4 and 2 qword groups by the bits of len.
if(len & 4) {
Copy4(0); Copy4(2);
t += 4;
s += 4;
}
if(len & 2)
Copy4(0);
}
// Copies dqwords from s to t using unaligned 16-byte SSE loads and stores
// (each Copy4 call moves 16 bytes - presumably exactly one dqword; confirm
// sizeof(dqword) == 16).
//
// Requests of 256*1024 dqwords or more are handed to memcpy as a whole.
// Otherwise the first copy unconditionally transfers 16 bytes, so len must be
// at least 1.
// NOTE(review): s is advanced by (t1 - t) counted in dqwords. If t is not
// 16-byte aligned, that difference truncates and s no longer tracks t -
// confirm that dqword targets are always 16-byte aligned.
void memcpydq_l(dqword *t, const dqword *s, size_t len)
{
// Copies 16 bytes from s + at to t + at; t and s are captured by reference.
auto Copy4 = [&](size_t at) { _mm_storeu_si128((__m128i *)(t + at), _mm_loadu_si128((__m128i *)(s + at))); };
if(len >= 256*1024) { // for really huge data, call memcpy to bypass the cache
memcpy(t, s, 16 * len);
return;
}
Copy4(0); // align target data up on next 16 bytes boundary
const dqword *e = t + len;
// Next 16-byte boundary strictly above t; the bytes skipped were copied above.
dqword *t1 = (dqword *)(((uintptr_t)t | 15) + 1);
s += t1 - t;
t = t1;
// len becomes the number of dqwords from the advanced target to the end.
len = e - t;
// e now marks the last position at which a full 4-dqword group still fits.
e -= 4;
while(t <= e) {
Copy4(0); Copy4(1); Copy4(2); Copy4(3);
t += 4;
s += 4;
}
// Remaining dqwords (fewer than 4): 2 and 1 element groups by the bits of len.
if(len & 2) {
Copy4(0); Copy4(1);
t += 2;
s += 2;
}
if(len & 1)
Copy4(0);
}
#endif
#ifdef CPU_UNALIGNED
// Fills the 4-byte-aligned interior of the byte range [t, t + len) with val4
// by delegating to memsetd.
//
// The start is moved up to the next 4-byte boundary (always advancing by 1..4
// bytes) and only whole dwords from there to the end are written. The skipped
// head bytes and any trailing bytes beyond the last whole dword are not
// written here.
// NOTE(review): presumably the caller fills head and tail itself - confirm.
never_inline
void svo_memset_l(byte *t, dword val4, size_t len)
{
	byte *aligned = (byte *)(((uintptr_t)t | 3) + 1);
	size_t bytes_left = (t + len) - aligned;
	memsetd(aligned, val4, bytes_left >> 2);
}
// Copies the interior of the byte range [s, s + len) to [t, t + len) as whole
// dwords by delegating to memcpyd.
//
// The target is moved up to the next 4-byte boundary (always advancing by 1..4
// bytes), the source is shifted by the same number of bytes, and only whole
// dwords from there to the end are copied. The skipped head bytes and any
// trailing bytes beyond the last whole dword are not copied here.
// NOTE(review): presumably the caller copies head and tail itself - confirm.
never_inline
void svo_memcpy_l(byte *t, byte *s, size_t len)
{
	byte *t_aligned = (byte *)(((uintptr_t)t | 3) + 1);
	byte *s_shifted = s + (t_aligned - t);
	size_t bytes_left = (t + len) - t_aligned;
	memcpyd((dword *)t_aligned, (dword *)s_shifted, bytes_left >> 2);
}
#endif
}