mirror of
https://github.com/ultimatepp/ultimatepp.git
synced 2026-05-15 14:16:07 -06:00
Core: Developing NEON SIMD
git-svn-id: svn://ultimatepp.org/upp/trunk@14701 f0d560ea-af0d-0410-9eb7-867de7ffcac7
This commit is contained in:
parent
54422c1bdb
commit
e035de2dbb
6 changed files with 286 additions and 12 deletions
|
|
@ -291,8 +291,8 @@ class JsonIO;
|
|||
#endif
|
||||
|
||||
#ifdef CPU_NEON
|
||||
//#include "SIMD_NEON.h"
|
||||
//#define CPU_SIMD 1
|
||||
#include "SIMD_NEON.h"
|
||||
#define CPU_SIMD 1
|
||||
#endif
|
||||
|
||||
#include "Mem.h"
|
||||
|
|
|
|||
|
|
@ -35,6 +35,7 @@ file
|
|||
Mem.h,
|
||||
Mem.cpp,
|
||||
SIMD_SSE2.h,
|
||||
SIMD_NEON.h,
|
||||
SIMD.cpp,
|
||||
Atomic.h,
|
||||
Mt.h,
|
||||
|
|
|
|||
|
|
@ -17,6 +17,7 @@ void memset8__(void *p, i16x8 data, size_t len)
|
|||
t = (byte *)(((uintptr_t)t | 15) + 1);
|
||||
len = e - t;
|
||||
e -= 128;
|
||||
#ifdef CPU_SSE2
|
||||
if(len >= 1024*1024) { // for really huge data, bypass the cache
|
||||
auto Set4S = [&](int at) { data.Stream(t + at); };
|
||||
while(len >= 64) {
|
||||
|
|
@ -27,6 +28,7 @@ void memset8__(void *p, i16x8 data, size_t len)
|
|||
_mm_sfence();
|
||||
e = t - 1;
|
||||
}
|
||||
#endif
|
||||
while(t <= e) {
|
||||
Set4(0*16); Set4(1*16); Set4(2*16); Set4(3*16);
|
||||
Set4(4*16); Set4(5*16); Set4(6*16); Set4(7*16);
|
||||
|
|
|
|||
|
|
@ -453,11 +453,17 @@ void memset32(void *p, dword data, size_t len)
|
|||
}
|
||||
|
||||
inline
|
||||
void memset64(void *p, dword data, size_t len)
|
||||
void memset64(void *p, qword data, size_t len)
|
||||
{
|
||||
memset__<qword>(p, data, len);
|
||||
}
|
||||
|
||||
inline
|
||||
void memset128(void *p, m128 data, size_t len)
|
||||
{
|
||||
memset__<m128>(p, data, len);
|
||||
}
|
||||
|
||||
inline
|
||||
void memcpy8(void *p, const void *q, size_t len)
|
||||
{
|
||||
|
|
@ -567,6 +573,12 @@ bool memeq64(const void *p, const void *q, size_t len)
|
|||
return memcmp(p, q, 8 * len) == 0;
|
||||
}
|
||||
|
||||
inline
|
||||
bool memeq128(const void *p, const void *q, size_t len)
|
||||
{
|
||||
return memcmp(p, q, 16 * len) == 0;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
#if defined(CPU_LE)
|
||||
|
|
|
|||
259
uppsrc/Core/SIMD_NEON.h
Normal file
259
uppsrc/Core/SIMD_NEON.h
Normal file
|
|
@ -0,0 +1,259 @@
|
|||
#include <arm_neon.h>
|
||||
|
||||
using namespace Upp;
|
||||
|
||||
// 4 x float SIMD vector backed by a NEON q-register.
// Mirrors the SSE2 f32x4 wrapper: lane 0 is the "scalar" lane, and the
// four-argument constructor stores d into lane 0 (same ordering as
// _mm_set_ps(a, b, c, d) in the SSE2 backend).
struct f32x4 {
	float32x4_t data;

	// Unaligned loads: full 128 bits; low 64 bits (upper lanes zeroed);
	// low 32 bits (other lanes zeroed).
	f32x4& Load(const void *ptr)   { data = vld1q_f32((float *)ptr); return *this; }
	f32x4& Load64(const void *ptr) { data = vreinterpretq_f32_s64(vsetq_lane_s64(*(int64_t *)ptr, vdupq_n_s64(0), 0)); return *this; }
	f32x4& Load32(const void *ptr) { data = vsetq_lane_f32(*(float *)ptr, vdupq_n_f32(0), 0); return *this; }

	// Unaligned stores of 128 / low 64 / low 32 bits.
	void Store(void *ptr)   { vst1q_f32((float32_t *)ptr, data); }
	void Store64(void *ptr) { vst1_f32((float32_t *)ptr, vget_low_f32(data)); }
	void Store32(void *ptr) { *(float32_t *)ptr = vgetq_lane_f32(data, 0); }

	f32x4() {}
	f32x4(const void *ptr) { Load(ptr); }
	f32x4(float32x4_t d)   { data = d; }

	// Scalar constructors: value goes to lane 0, remaining lanes are zero
	// (same contract as _mm_set_ss in the SSE2 backend).
	f32x4(double f) { data = vsetq_lane_f32((float)f, vdupq_n_f32(0), 0); }
	f32x4(float f)  { data = vsetq_lane_f32((float)f, vdupq_n_f32(0), 0); }
	f32x4(int f)    { data = vsetq_lane_f32((float)f, vdupq_n_f32(0), 0); }
	f32x4(double a, double b, double c, double d) {
		// order reversed so that d lands in lane 0, matching _mm_set_ps
		float __attribute__((aligned(16))) w[4] = { (float)d, (float)c, (float)b, (float)a };
		data = vld1q_f32(w);
	}

	operator float32x4_t() { return data; }
};
|
||||
|
||||
// Broadcast one scalar across all four lanes.
force_inline f32x4 f32all(double f) { return vdupq_n_f32((float)f); }

// Lane-wise arithmetic.
force_inline f32x4 operator+(f32x4 a, f32x4 b) { return vaddq_f32(a, b); }
force_inline f32x4& operator+=(f32x4& a, f32x4 b) { return a = a + b; }
force_inline f32x4 operator-(f32x4 a, f32x4 b) { return vsubq_f32(a, b); }
force_inline f32x4& operator-=(f32x4& a, f32x4 b) { return a = a - b; }
force_inline f32x4 operator*(f32x4 a, f32x4 b) { return vmulq_f32(a, b); }
force_inline f32x4& operator*=(f32x4& a, f32x4 b) { return a = a * b; }

// NEON (ARMv7) has no vector divide: start from a reciprocal estimate
// (vrecpeq) and refine it with two Newton-Raphson steps (vrecpsq), then
// multiply. NOTE(review): this is an approximation — slightly less exact
// than the SSE2 backend's true divide; confirm the precision is acceptable.
force_inline f32x4 operator/(f32x4 a, f32x4 b) {
	float32x4_t r = vrecpeq_f32(b);
	r = vmulq_f32(vrecpsq_f32(b, r), r);
	r = vmulq_f32(vrecpsq_f32(b, r), r);
	return vmulq_f32(a, r);
}

force_inline f32x4& operator/=(f32x4& a, f32x4 b) { return a = a / b; }

// Lane-wise comparisons; each lane of the result is an all-ones or
// all-zeros mask, reinterpreted as float (test with AllTrue()).
force_inline f32x4 operator==(f32x4 a, f32x4 b) { return vreinterpretq_f32_u32(vceqq_f32(a, b)); }
force_inline f32x4 operator!=(f32x4 a, f32x4 b) { return vreinterpretq_f32_u32(vmvnq_u32(vceqq_f32(a, b))); }
force_inline f32x4 operator<(f32x4 a, f32x4 b)  { return vreinterpretq_f32_u32(vcltq_f32(a, b)); }
force_inline f32x4 operator>(f32x4 a, f32x4 b)  { return vreinterpretq_f32_u32(vcgtq_f32(a, b)); }
force_inline f32x4 operator<=(f32x4 a, f32x4 b) { return vreinterpretq_f32_u32(vcleq_f32(a, b)); }
force_inline f32x4 operator>=(f32x4 a, f32x4 b) { return vreinterpretq_f32_u32(vcgeq_f32(a, b)); }
|
||||
|
||||
// True when every uint32 lane of v is 0xffffffff, i.e. when a lane-wise
// comparison produced a full mask in all four lanes.
force_inline bool AllTrue(uint32x4_t v) {
	// AND the low and high halves together: a lane of the 64-bit result is
	// all-ones only if both source lanes were. Then vpmin collapses the two
	// remaining lanes to their minimum, which is 0xffffffff only if both are.
	uint32x2_t tmp = vand_u32(vget_low_u32(v), vget_high_u32(v));
	return vget_lane_u32(vpmin_u32(tmp, tmp), 0) == 0xffffffff; // fixed: stray ';;' removed
}

// Float-mask overload: reinterprets the comparison result as uint32 lanes.
force_inline bool AllTrue(f32x4 a) {
	return AllTrue(vreinterpretq_u32_f32(a));
}
|
||||
|
||||
// Lane-wise minimum / maximum.
force_inline f32x4 min(f32x4 a, f32x4 b) { return vminq_f32(a, b); }
force_inline f32x4 max(f32x4 a, f32x4 b) { return vmaxq_f32(a, b); }

// Copy lane n of a into all four lanes.
force_inline f32x4 Broadcast0(f32x4 a) { return vdupq_n_f32(vgetq_lane_f32(a, 0)); }
force_inline f32x4 Broadcast1(f32x4 a) { return vdupq_n_f32(vgetq_lane_f32(a, 1)); }
force_inline f32x4 Broadcast2(f32x4 a) { return vdupq_n_f32(vgetq_lane_f32(a, 2)); }
force_inline f32x4 Broadcast3(f32x4 a) { return vdupq_n_f32(vgetq_lane_f32(a, 3)); }
|
||||
|
||||
// 8 x int16 SIMD vector backed by a NEON q-register.
// Mirrors the SSE2 i16x8 wrapper; the eight-argument constructor stores h
// into lane 0 (same ordering as _mm_set_epi16(a, ..., h)).
struct i16x8 { // 8xint16
	int16x8_t data;

	// Unaligned loads of 128 / low 64 / low 32 bits (upper lanes zeroed).
	i16x8& Load(const void *ptr)   { data = vld1q_s16((int16_t *)ptr); return *this; }
	i16x8& Load64(const void *ptr) { data = vreinterpretq_s16_s64(vsetq_lane_s64(*(int64_t *)ptr, vdupq_n_s64(0), 0)); return *this; }
	i16x8& Load32(const void *ptr) { data = vreinterpretq_s16_s32(vsetq_lane_s32(*(int32_t *)ptr, vdupq_n_s32(0), 0)); return *this; }

	// Unaligned stores of 128 / low 64 / low 32 bits.
	void Store(void *ptr)   { vst1q_s16((int16_t *)ptr, data); }
	void Store64(void *ptr) { vst1_s16((int16_t *)ptr, vget_low_s16(data)); }
	void Store32(void *ptr) { *(int32_t *)ptr = vgetq_lane_s32(vreinterpretq_s32_s16(data), 0); }

	i16x8() {}
	i16x8(const void *ptr) { Load(ptr); }
	i16x8(int16x8_t d)     { data = d; }
	i16x8(int8x16_t d)     { data = vreinterpretq_s16_s8(d); }
	i16x8(int32x4_t d)     { data = vreinterpretq_s16_s32(d); }
	// scalar goes to lane 0, remaining lanes zero
	i16x8(int v)           { data = vsetq_lane_s16(v, vdupq_n_s16(0), 0); }
	i16x8(int a, int b, int c, int d, int e, int f, int g, int h) {
		// order reversed so that h lands in lane 0, matching _mm_set_epi16
		int16_t __attribute__((aligned(16))) val[8] = { (int16_t)h, (int16_t)g, (int16_t)f, (int16_t)e, (int16_t)d, (int16_t)c, (int16_t)b, (int16_t)a };
		data = vld1q_s16(val);
	}

	operator int16x8_t() { return data; }
};
|
||||
|
||||
|
||||
// Broadcast one scalar across all eight lanes.
force_inline i16x8 i16all(int v) { return vdupq_n_s16(v); }

// Lane-wise arithmetic (wrap-around on overflow, like the SSE2 backend).
force_inline i16x8 operator+(i16x8 a, i16x8 b) { return vaddq_s16(a, b); }
force_inline i16x8& operator+=(i16x8& a, i16x8 b) { return a = a + b; }
force_inline i16x8 operator-(i16x8 a, i16x8 b) { return vsubq_s16(a, b); }
force_inline i16x8& operator-=(i16x8& a, i16x8 b) { return a = a - b; }
force_inline i16x8 operator*(i16x8 a, i16x8 b) { return vmulq_s16(a, b); }
force_inline i16x8& operator*=(i16x8& a, i16x8 b) { return a = a * b; }

// Bitwise operations.
force_inline i16x8 operator&(i16x8 a, i16x8 b) { return vandq_s16(a, b); }
force_inline i16x8& operator&=(i16x8& a, i16x8 b) { return a = a & b; }
force_inline i16x8 operator|(i16x8 a, i16x8 b) { return vorrq_s16(a, b); }
force_inline i16x8& operator|=(i16x8& a, i16x8 b) { return a = a | b; }
force_inline i16x8 operator^(i16x8 a, i16x8 b) { return veorq_s16(a, b); }
force_inline i16x8& operator^=(i16x8& a, i16x8 b) { return a = a ^ b; }
force_inline i16x8 operator~(i16x8 a) { return vmvnq_s16(a); }

// Shifts by a runtime count: NEON only shifts left by a vector, so a right
// shift is expressed as a left shift by the negated count.
force_inline i16x8 operator>>(i16x8 a, int b) { return vshlq_s16(a, vdupq_n_s16(-b)); }
force_inline i16x8& operator>>=(i16x8& a, int b) { return a = a >> b; }
force_inline i16x8 operator<<(i16x8 a, int b) { return vshlq_s16(a, vdupq_n_s16(b)); }
force_inline i16x8& operator<<=(i16x8& a, int b) { return a = a << b; }

// Lane-wise comparisons; each lane becomes an all-ones / all-zeros mask.
force_inline i16x8 operator==(i16x8 a, i16x8 b) { return vreinterpretq_s16_u16(vceqq_s16(a, b)); }
force_inline i16x8 operator<(i16x8 a, i16x8 b)  { return vreinterpretq_s16_u16(vcltq_s16(a, b)); }
force_inline i16x8 operator>(i16x8 a, i16x8 b)  { return vreinterpretq_s16_u16(vcgtq_s16(a, b)); }

// True when every lane of a comparison mask is all-ones.
force_inline bool AllTrue(i16x8 a) {
	return AllTrue(vreinterpretq_u32_s16(a));
}
|
||||
|
||||
// 4 x int32 SIMD vector backed by a NEON q-register.
// Mirrors the SSE2 i32x4 wrapper; the four-argument constructor stores d
// into lane 0 (same ordering as _mm_set_epi32(a, b, c, d)).
struct i32x4 { // 4xint32
	int32x4_t data;

	// Unaligned loads of 128 / low 64 / low 32 bits (upper lanes zeroed).
	i32x4& Load(const void *ptr)   { data = vld1q_s32((int32_t *)ptr); return *this; }
	i32x4& Load64(const void *ptr) { data = vreinterpretq_s32_s64(vsetq_lane_s64(*(int64_t *)ptr, vdupq_n_s64(0), 0)); return *this; }
	i32x4& Load32(const void *ptr) { data = vsetq_lane_s32(*(int32_t *)ptr, vdupq_n_s32(0), 0); return *this; }

	// Unaligned stores of 128 / low 64 / low 32 bits.
	void Store(void *ptr)   { vst1q_s32((int32_t *)ptr, data); }
	void Store64(void *ptr) { vst1_s32((int32_t *)ptr, vget_low_s32(data)); }
	void Store32(void *ptr) { *(int32_t *)ptr = vgetq_lane_s32(data, 0); }

	i32x4() {}
	i32x4(const void *ptr) { Load(ptr); }
	i32x4(int32x4_t d)     { data = d; }
	i32x4(int8x16_t d)     { data = vreinterpretq_s32_s8(d); }
	i32x4(int16x8_t d)     { data = vreinterpretq_s32_s16(d); }
	// scalar goes to lane 0, remaining lanes zero
	i32x4(int v)           { data = vsetq_lane_s32(v, vdupq_n_s32(0), 0); }
	i32x4(int a, int b, int c, int d) {
		// order reversed so that d lands in lane 0, matching _mm_set_epi32.
		// BUG FIX: the elements were cast to (int16_t) — a copy-paste from
		// i16x8 — which silently truncated any value outside the 16-bit range.
		int32_t __attribute__((aligned(16))) val[4] = { (int32_t)d, (int32_t)c, (int32_t)b, (int32_t)a };
		data = vld1q_s32(val);
	}
	operator int32x4_t() { return data; }
	operator int()       { return vgetq_lane_s32(data, 0); } // lane 0 as scalar
	operator i16x8() const { return i16x8(data); }           // bit-cast view
};
|
||||
|
||||
// Broadcast one scalar across all four lanes.
force_inline i32x4 i32all(int v) { return vdupq_n_s32(v); }

// Lane-wise arithmetic (wrap-around on overflow, like the SSE2 backend).
force_inline i32x4 operator+(i32x4 a, i32x4 b) { return vaddq_s32(a, b); }
force_inline i32x4& operator+=(i32x4& a, i32x4 b) { return a = a + b; }
force_inline i32x4 operator-(i32x4 a, i32x4 b) { return vsubq_s32(a, b); }
force_inline i32x4& operator-=(i32x4& a, i32x4 b) { return a = a - b; }

// Bitwise operations.
force_inline i32x4 operator&(i32x4 a, i32x4 b) { return vandq_s32(a, b); }
force_inline i32x4& operator&=(i32x4& a, i32x4 b) { return a = a & b; }
force_inline i32x4 operator|(i32x4 a, i32x4 b) { return vorrq_s32(a, b); }
force_inline i32x4& operator|=(i32x4& a, i32x4 b) { return a = a | b; }
force_inline i32x4 operator^(i32x4 a, i32x4 b) { return veorq_s32(a, b); }
force_inline i32x4& operator^=(i32x4& a, i32x4 b) { return a = a ^ b; }
force_inline i32x4 operator~(i32x4 a) { return vmvnq_s32(a); }

// Shifts by a runtime count: NEON only shifts left by a vector, so a right
// shift is expressed as a left shift by the negated count.
force_inline i32x4 operator>>(i32x4 a, int b) { return vshlq_s32(a, vdupq_n_s32(-b)); }
force_inline i32x4& operator>>=(i32x4& a, int b) { return a = a >> b; }
force_inline i32x4 operator<<(i32x4 a, int b) { return vshlq_s32(a, vdupq_n_s32(b)); }
force_inline i32x4& operator<<=(i32x4& a, int b) { return a = a << b; }

// Lane-wise comparisons; each lane becomes an all-ones / all-zeros mask.
force_inline i32x4 operator==(i32x4 a, i32x4 b) { return vreinterpretq_s32_u32(vceqq_s32(a, b)); }
force_inline i32x4 operator<(i32x4 a, i32x4 b)  { return vreinterpretq_s32_u32(vcltq_s32(a, b)); }
force_inline i32x4 operator>(i32x4 a, i32x4 b)  { return vreinterpretq_s32_u32(vcgtq_s32(a, b)); }

// True when every lane of a comparison mask is all-ones.
force_inline bool AllTrue(i32x4 a) {
	return AllTrue(vreinterpretq_u32_s32(a));
}
|
||||
|
||||
// 16 x int8 SIMD vector backed by a NEON q-register.
// Mirrors the SSE2 i8x16 wrapper; the sixteen-argument constructor stores p
// into lane 0 (same ordering as _mm_set_epi8(a, ..., p)).
struct i8x16 { // 16*int8
	int8x16_t data;

	// Unaligned loads of 128 / low 64 / low 32 bits (upper lanes zeroed).
	i8x16& Load(const void *ptr)   { data = vld1q_s8((int8_t *)ptr); return *this; }
	i8x16& Load64(const void *ptr) { data = vreinterpretq_s8_s64(vsetq_lane_s64(*(int64_t *)ptr, vdupq_n_s64(0), 0)); return *this; }
	i8x16& Load32(const void *ptr) { data = vreinterpretq_s8_s32(vsetq_lane_s32(*(int32_t *)ptr, vdupq_n_s32(0), 0)); return *this; }

	// Unaligned stores of 128 / low 64 / low 32 bits.
	void Store(void *ptr)   { vst1q_s8((int8_t *)ptr, data); }
	void Store64(void *ptr) { vst1_s32((int32_t *)ptr, vget_low_s32(vreinterpretq_s32_s8(data))); }
	void Store32(void *ptr) { *(int32_t *)ptr = vgetq_lane_s32(vreinterpretq_s32_s8(data), 0); }

	i8x16() {}
	i8x16(const void *ptr) { Load(ptr); }
	i8x16(int8x16_t d)     { data = d; }
	i8x16(int16x8_t d)     { data = vreinterpretq_s8_s16(d); }
	i8x16(int32x4_t d)     { data = vreinterpretq_s8_s32(d); }
	// scalar goes to lane 0, remaining lanes zero
	i8x16(int v)           { data = vsetq_lane_s8(v, vdupq_n_s8(0), 0); }
	i8x16(int a, int b, int c, int d, int e, int f, int g, int h, int i, int j, int k, int l, int m, int n, int o, int p)
	{
		// order reversed so that p lands in lane 0, matching _mm_set_epi8
		int8_t __attribute__((aligned(16))) val[16] = {
			(int8_t)p, (int8_t)o, (int8_t)n, (int8_t)m,
			(int8_t)l, (int8_t)k, (int8_t)j, (int8_t)i,
			(int8_t)h, (int8_t)g, (int8_t)f, (int8_t)e,
			(int8_t)d, (int8_t)c, (int8_t)b, (int8_t)a,
		};
		data = vld1q_s8(val);
	}
	operator int8x16_t() const { return data; }
	operator i16x8() const     { return i16x8(data); } // bit-cast view
};
|
||||
|
||||
// Broadcast one scalar across all sixteen lanes.
force_inline i8x16 i8all(int v) { return vdupq_n_s8(v); }

// Lane-wise arithmetic (wrap-around on overflow, like the SSE2 backend).
force_inline i8x16 operator+(i8x16 a, i8x16 b) { return vaddq_s8(a, b); }
force_inline i8x16& operator+=(i8x16& a, i8x16 b) { return a = a + b; }
force_inline i8x16 operator-(i8x16 a, i8x16 b) { return vsubq_s8(a, b); }
force_inline i8x16& operator-=(i8x16& a, i8x16 b) { return a = a - b; }

// Bitwise operations.
force_inline i8x16 operator&(i8x16 a, i8x16 b) { return vandq_s8(a, b); }
force_inline i8x16& operator&=(i8x16& a, i8x16 b) { return a = a & b; }
force_inline i8x16 operator|(i8x16 a, i8x16 b) { return vorrq_s8(a, b); }
force_inline i8x16& operator|=(i8x16& a, i8x16 b) { return a = a | b; }
force_inline i8x16 operator^(i8x16 a, i8x16 b) { return veorq_s8(a, b); }
force_inline i8x16& operator^=(i8x16& a, i8x16 b) { return a = a ^ b; }
force_inline i8x16 operator~(i8x16 a) { return vmvnq_s8(a); }
|
||||
|
||||
// Lane-wise conversions between int32 and float.
force_inline f32x4 ToFloat(i32x4 a)  { return vcvtq_f32_s32(a); }
force_inline i32x4 Truncate(f32x4 a) { return vcvtq_s32_f32(a); } // truncates toward zero

// Interleave 8-bit lanes with zeros, widening to 16-bit lanes
// (counterpart of _mm_unpacklo/unpackhi_epi8 against a zero operand).
force_inline i16x8 Unpack8L(i16x8 a) { return vzipq_s8(vreinterpretq_s8_s16(a), vdupq_n_s8(0)).val[0]; }
force_inline i16x8 Unpack8H(i16x8 a) { return vzipq_s8(vreinterpretq_s8_s16(a), vdupq_n_s8(0)).val[1]; }

// Interleave 16-bit lanes with zeros, widening to 32-bit lanes.
force_inline i32x4 Unpack16L(i16x8 a) { return vzipq_s16(a, vdupq_n_s16(0)).val[0]; }
force_inline i32x4 Unpack16H(i16x8 a) { return vzipq_s16(a, vdupq_n_s16(0)).val[1]; }

// Narrow 16-bit lanes to 8-bit with unsigned saturation (vqmovun),
// matching _mm_packus_epi16; single-argument form zeroes the upper half.
force_inline i8x16 Pack16(i16x8 l, i16x8 h) { return vreinterpretq_s8_u8(vcombine_u8(vqmovun_s16(l), vqmovun_s16(h))); }
force_inline i8x16 Pack16(i16x8 l) { return vreinterpretq_s8_u8(vcombine_u8(vqmovun_s16(l), vdup_n_u8(0))); }

// Narrow 32-bit lanes to 16-bit with signed saturation (vqmovn), upper
// half of the result zeroed.
force_inline i16x8 Pack32(i32x4 a) { return vcombine_s16(vqmovn_s32(a), vdup_n_s16(0)); }

// BroadcastLHn: copy lane n of the low 64-bit half across the low four
// 16-bit lanes, and lane n of the high half across the high four lanes.
force_inline i16x8 BroadcastLH0(i16x8 a) {
	return vcombine_s16(vdup_n_s16(vgetq_lane_s16(a, 0)), vdup_n_s16(vgetq_lane_s16(a, 4)));
}

force_inline i16x8 BroadcastLH1(i16x8 a) {
	return vcombine_s16(vdup_n_s16(vgetq_lane_s16(a, 1)), vdup_n_s16(vgetq_lane_s16(a, 5)));
}

force_inline i16x8 BroadcastLH2(i16x8 a) {
	return vcombine_s16(vdup_n_s16(vgetq_lane_s16(a, 2)), vdup_n_s16(vgetq_lane_s16(a, 6)));
}

force_inline i16x8 BroadcastLH3(i16x8 a) {
	return vcombine_s16(vdup_n_s16(vgetq_lane_s16(a, 3)), vdup_n_s16(vgetq_lane_s16(a, 7)));
}

// Broadcast one 64-bit value into both halves of the register, viewed as i16x8.
force_inline i16x8 i64all(qword data) { return vreinterpretq_s16_u64(vdupq_n_u64(data)); }
|
||||
|
|
@ -1,16 +1,16 @@
|
|||
struct f32x4 {
|
||||
__m128 data;
|
||||
|
||||
f32x4& Load(void *ptr) { data = _mm_loadu_ps((float *)ptr); return *this; }
|
||||
f32x4& Load64(void *ptr) { data = _mm_castpd_ps(_mm_load_sd((double *)ptr)); return *this; }
|
||||
f32x4& Load32(void *ptr) { data = _mm_load_ss((float *)ptr); return *this; }
|
||||
f32x4& Load(const void *ptr) { data = _mm_loadu_ps((float *)ptr); return *this; }
|
||||
f32x4& Load64(const void *ptr) { data = _mm_castpd_ps(_mm_load_sd((double *)ptr)); return *this; }
|
||||
f32x4& Load32(const void *ptr) { data = _mm_load_ss((float *)ptr); return *this; }
|
||||
|
||||
void Store(void *ptr) { _mm_storeu_ps((float *)ptr, data); }
|
||||
void Store64(void *ptr) { _mm_store_sd((double *)ptr, _mm_castps_pd(data)); }
|
||||
void Store32(void *ptr) { _mm_store_ss((float *)ptr, data); }
|
||||
void Store(void *ptr) { _mm_storeu_ps((float *)ptr, data); }
|
||||
void Store64(void *ptr) { _mm_store_sd((double *)ptr, _mm_castps_pd(data)); }
|
||||
void Store32(void *ptr) { _mm_store_ss((float *)ptr, data); }
|
||||
|
||||
f32x4() {}
|
||||
f32x4(void *ptr) { Load(ptr); }
|
||||
f32x4(const void *ptr) { Load(ptr); }
|
||||
f32x4(__m128 d) { data = d; }
|
||||
f32x4(double f) { data = _mm_set_ss((float)f); }
|
||||
f32x4(float f) { data = _mm_set_ss(f); }
|
||||
|
|
@ -98,7 +98,7 @@ force_inline bool AllTrue(i16x8 a) { return _mm_movemask_epi8(a.
|
|||
|
||||
struct i32x4 : i16x8 { // 4xint32
|
||||
i32x4() {}
|
||||
i32x4(void *ptr) { Load(ptr); }
|
||||
i32x4(const void *ptr) { Load(ptr); }
|
||||
i32x4(__m128i d) { data = d; }
|
||||
i32x4(int v) { data = _mm_set_epi32(0, 0, 0, v); }
|
||||
i32x4(int a, int b, int c, int d) { data = _mm_set_epi32(a, b, c, d); }
|
||||
|
|
@ -132,7 +132,7 @@ force_inline bool AllTrue(i32x4 a) { return _mm_movemask_epi8(a.
|
|||
|
||||
struct i8x16 : i16x8 { // 16xint8
|
||||
i8x16() {}
|
||||
i8x16(void *ptr) { Load(ptr); }
|
||||
i8x16(const void *ptr) { Load(ptr); }
|
||||
i8x16(__m128i d) { data = d; }
|
||||
i8x16(int v) { data = _mm_set_epi8(0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,v); }
|
||||
i8x16(int a, int b, int c, int d, int e, int f, int g, int h, int i, int j, int k, int l, int m, int n, int o, int p)
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue