Core: SSE iTxN cleanup

This commit is contained in:
Mirek Fidler 2025-04-10 10:50:50 +02:00
parent fb1deb732f
commit 3f37110ce6

View file

@ -54,18 +54,23 @@ force_inline f32x4 Broadcast1(f32x4 a) { return _mm_shuffle_ps(a.data
// Shuffles `a` with itself using the _MM_BCAST(2) selector and returns the result.
// (_MM_BCAST is defined outside this view; presumably it replicates lane 2 into every lane.)
force_inline f32x4 Broadcast2(f32x4 a)
{
	const auto lanes = a.data;
	return _mm_shuffle_ps(lanes, lanes, _MM_BCAST(2));
}
// Shuffles `a` with itself using the _MM_BCAST(3) selector and returns the result.
// (_MM_BCAST is defined outside this view; presumably it replicates lane 3 into every lane.)
force_inline f32x4 Broadcast3(f32x4 a)
{
	const auto lanes = a.data;
	return _mm_shuffle_ps(lanes, lanes, _MM_BCAST(3));
}
struct i16x8 { // 8xint16
// Shared base of the 128-bit integer SIMD wrappers visible in this file
// (i16x8, i32x4 and i8x16 each derive from iTxN<Self>). It owns the raw
// __m128i register and the load/store helpers; T is the deriving type so
// that the Load* helpers can hand back a reference to the concrete wrapper.
template <class T>
struct iTxN { // generic base: lane width is decided by the deriving type T
__m128i data; // raw 128-bit integer register; not initialized here
// NOTE(review): the next seven members return i16x8& / duplicate the
// Store*/Stream members further down. They appear to be the pre-change
// i16x8 members carried over by the diff view this text came from -
// confirm against the real header before relying on them.
i16x8& Load(const void *ptr) { data = _mm_loadu_si128((__m128i *)ptr); return *this; }
i16x8& Load64(const void *ptr) { data = _mm_castpd_si128(_mm_load_sd((double *)ptr)); return *this; }
i16x8& Load32(const void *ptr) { data = _mm_castps_si128(_mm_load_ss((float *)ptr)); return *this; }
void Store(void *ptr) { _mm_storeu_si128((__m128i *)ptr, data); }
void Store64(void *ptr) { _mm_store_sd((double *)ptr, _mm_castsi128_pd(data)); }
void Store32(void *ptr) { _mm_store_ss((float *)ptr, _mm_castsi128_ps(data)); }
void Stream(void *ptr) { _mm_stream_si128((__m128i *)ptr, data); };
// Downcast to the deriving wrapper type (CRTP); valid only when T really derives from iTxN<T>.
T& AsT() { return *static_cast<T *>(this); }
// Loads 128 bits from ptr with an unaligned load and returns the wrapper for chaining.
T& Load(const void *ptr) { data = _mm_loadu_si128((__m128i *)ptr); return AsT(); }
// Loads 64 bits from ptr into the low half via _mm_load_sd (which zeroes the upper half).
T& Load64(const void *ptr) { data = _mm_castpd_si128(_mm_load_sd((double *)ptr)); return AsT(); }
// Loads 32 bits from ptr into the lowest lane via _mm_load_ss (which zeroes the remaining lanes).
T& Load32(const void *ptr) { data = _mm_castps_si128(_mm_load_ss((float *)ptr)); return AsT(); }
// Writes all 128 bits to ptr with an unaligned store.
void Store(void *ptr) { _mm_storeu_si128((__m128i *)ptr, data); }
// Writes the low 64 bits to ptr.
void Store64(void *ptr) { _mm_store_sd((double *)ptr, _mm_castsi128_pd(data)); }
// Writes the low 32 bits to ptr.
void Store32(void *ptr) { _mm_store_ss((float *)ptr, _mm_castsi128_ps(data)); }
// Writes all 128 bits with a non-temporal store; _mm_stream_si128 requires ptr to be 16-byte aligned.
void Stream(void *ptr) { _mm_stream_si128((__m128i *)ptr, data); };
};
struct i16x8 : iTxN<i16x8> { // 8xint16
i16x8() {}
i16x8(const void *ptr) { Load(ptr); }
i16x8(__m128i d) { data = d; }
@ -106,13 +111,14 @@ force_inline int FirstTrue(i16x8 a) { return CountTrailingZeroBit
// Index of the first 16-bit lane of `a` whose mask bytes are not set.
// _mm_movemask_epi8 yields one bit per byte (its top bit); inverting turns
// cleared bytes into set bits, and the result of CountTrailingZeroBits
// (defined elsewhere) is halved to turn a byte position into a lane position.
force_inline int FirstFalse(i16x8 a)
{
	const int cleared_bytes = ~_mm_movemask_epi8(a.data);
	return CountTrailingZeroBits(cleared_bytes) >> 1;
}
// Tests 16-bit lane `i` of mask `a` by checking the byte-mask bit at
// position 2 * i (the low byte of that lane). Returns the raw masked bit,
// i.e. non-zero when set - the value is not normalized to 0/1.
force_inline int IsTrue(i16x8 a, int i)
{
	const int byte_mask = _mm_movemask_epi8(a.data);
	const int lane_bit = 1 << (2 * i);
	return byte_mask & lane_bit;
}
struct i32x4 : i16x8 { // 4xint32
struct i32x4 : iTxN<i32x4> { // 4xint32
	// Default construction does not assign `data`.
	i32x4() {}

	// Loads 128 bits from ptr through the base Load().
	i32x4(const void *ptr)
	{
		this->Load(ptr);
	}

	// Wraps an existing register value.
	i32x4(__m128i d)
	{
		this->data = d;
	}

	// Places v in the lowest 32-bit lane; the other three lanes are zero.
	i32x4(int v)
	{
		this->data = _mm_set_epi32(0, 0, 0, v);
	}

	// Arguments go to _mm_set_epi32 in order, so `a` is the highest lane and `d` the lowest.
	i32x4(int a, int b, int c, int d)
	{
		this->data = _mm_set_epi32(a, b, c, d);
	}

	// Extracts the lowest 32-bit lane.
	operator int()
	{
		return _mm_cvtsi128_si32(this->data);
	}

	// Reinterprets the same 128 bits as an i16x8.
	operator i16x8() const
	{
		return i16x8(this->data);
	}
};
// Builds an i32x4 with every 32-bit lane set to v.
force_inline i32x4 i32all(int v)
{
	return i32x4(_mm_set1_epi32(v));
}
@ -145,13 +151,14 @@ force_inline int FirstTrue(i32x4 a) { return CountTrailingZeroBit
// Index of the first 32-bit lane of `a` whose sign bit is not set.
// _mm_movemask_ps yields one bit per 32-bit lane; inverting turns cleared
// lanes into set bits, which CountTrailingZeroBits (defined elsewhere) scans.
force_inline int FirstFalse(i32x4 a)
{
	const int cleared_lanes = ~_mm_movemask_ps(_mm_castsi128_ps(a.data));
	return CountTrailingZeroBits(cleared_lanes);
}
// True when the sign bit of 32-bit lane `i` of mask `a` is set.
force_inline bool IsTrue(i32x4 a, int i)
{
	const int lane_mask = _mm_movemask_ps(_mm_castsi128_ps(a.data));
	return (lane_mask & (1 << i)) != 0;
}
struct i8x16 : i16x8 { // 16xint8
struct i8x16 : iTxN<i8x16> { // 16xint8
	// Default construction does not assign `data`.
	i8x16() {}

	// Loads 128 bits from ptr through the base Load().
	i8x16(const void *ptr)
	{
		this->Load(ptr);
	}

	// Wraps an existing register value.
	i8x16(__m128i d)
	{
		this->data = d;
	}

	// Places v in the lowest byte; the other fifteen bytes are zero.
	i8x16(int v)
	{
		this->data = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, v);
	}

	// Arguments go to _mm_set_epi8 in order, so `a` is the highest byte and `p` the lowest.
	i8x16(int a, int b, int c, int d, int e, int f, int g, int h,
	      int i, int j, int k, int l, int m, int n, int o, int p)
	{
		this->data = _mm_set_epi8(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p);
	}

	// Reinterprets the same 128 bits as an i16x8.
	operator i16x8() const
	{
		return i16x8(this->data);
	}
};
// Builds an i8x16 with every byte set to v (v is narrowed to 8 bits by _mm_set1_epi8).
force_inline i8x16 i8all(int v)
{
	return i8x16(_mm_set1_epi8(v));
}