ui32 avx2_find_max_val(ui32* address)
{
  // OR-reduce 8 ui32 values (two 128-bit loads) down to a single value
  __m128i x0 = _mm_loadu_si128((__m128i*)address);
  __m128i x1 = _mm_loadu_si128((__m128i*)address + 1);
  x0 = _mm_or_si128(x0, x1);            // 8 lanes -> 4 lanes
  x1 = _mm_shuffle_epi32(x0, 0xEE);     // bring lanes 2,3 down
  x0 = _mm_or_si128(x0, x1);            // 4 lanes -> 2 lanes
  x1 = _mm_shuffle_epi32(x0, 0x55);     // bring lane 1 down
  x0 = _mm_or_si128(x0, x1);            // 2 lanes -> 1 lane
  ui32 t = (ui32)_mm_extract_epi32(x0, 0);
  return t;
}
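// The following scalar sketch is not part of the original source; it is a
// hypothetical reference (name scalar_find_max_val is invented) showing what
// the shuffle/OR tree above computes: the bitwise OR of the 8 ui32 values
// starting at `address`.
static inline ui32 scalar_find_max_val(const ui32 *address)
{
  ui32 t = 0;
  for (int i = 0; i < 8; ++i)
    t |= address[i];   // OR-accumulate, same result as the SIMD reduction
  return t;
}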
void avx2_rev_tx_to_cb(const void *sp, ui32 *dp, ui32 K_max,
                       float delta_inv, ui32 count, ui32* max_val)
{
  (void)delta_inv; // no quantization on the reversible path
  // convert signed samples to sign-magnitude and accumulate max_val
  ui32 shift = 31 - K_max;
  __m256i m0 = _mm256_set1_epi32((int)0x80000000);
  __m256i tmax = _mm256_loadu_si256((__m256i*)max_val);
  __m256i *p = (__m256i*)sp;
  for (ui32 i = 0; i < count; i += 8, p += 1, dp += 8)
  {
    __m256i v = _mm256_loadu_si256(p);
    __m256i sign = _mm256_and_si256(v, m0);    // keep the sign bits
    __m256i val = _mm256_abs_epi32(v);         // magnitudes
    val = _mm256_slli_epi32(val, (int)shift);  // left-align below bit 31
    tmax = _mm256_or_si256(tmax, val);         // track occupied bitplanes
    val = _mm256_or_si256(val, sign);          // sign-magnitude samples
    _mm256_storeu_si256((__m256i*)dp, val);
  }
  _mm256_storeu_si256((__m256i*)max_val, tmax);
}
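// Hypothetical scalar reference for one iteration of the loop above (not in
// the original source; scalar_rev_to_cb is an invented name and si32 is
// assumed to be the signed counterpart of ui32): a signed sample becomes a
// left-aligned magnitude with the sign kept in bit 31, and the magnitude is
// OR-ed into the running maximum. `shift` corresponds to 31 - K_max.
static inline void scalar_rev_to_cb(si32 v, ui32 shift, ui32 *dp, ui32 *tmax)
{
  ui32 sign = (ui32)v & 0x80000000u;             // sign bit only
  ui32 mag  = (ui32)(v < 0 ? -v : v) << shift;   // |v| aligned below bit 31
  *tmax |= mag;                                  // record occupied bitplanes
  *dp    = mag | sign;                           // sign-magnitude sample
}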
void avx2_irv_tx_to_cb(const void *sp, ui32 *dp, ui32 K_max,
                       float delta_inv, ui32 count, ui32* max_val)
{
  (void)K_max; // unused on the irreversible path
  // quantize by delta_inv, convert to sign-magnitude, accumulate max_val
  __m256 d = _mm256_set1_ps(delta_inv);
  __m256i m0 = _mm256_set1_epi32((int)0x80000000);
  __m256i tmax = _mm256_loadu_si256((__m256i*)max_val);
  float *p = (float*)sp;
  for (ui32 i = 0; i < count; i += 8, p += 8, dp += 8)
  {
    __m256 vf = _mm256_loadu_ps(p);
    vf = _mm256_mul_ps(vf, d);                 // scale by 1/delta
    __m256i val = _mm256_cvtps_epi32(vf);      // round to nearest integer
    __m256i sign = _mm256_and_si256(val, m0);  // keep the sign bits
    val = _mm256_abs_epi32(val);               // magnitudes
    tmax = _mm256_or_si256(tmax, val);         // track occupied bitplanes
    val = _mm256_or_si256(val, sign);          // sign-magnitude samples
    _mm256_storeu_si256((__m256i*)dp, val);
  }
  _mm256_storeu_si256((__m256i*)max_val, tmax);
}
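// Hypothetical scalar reference for one iteration of the loop above (not in
// the original source; scalar_irv_to_cb is an invented name and lrintf
// assumes <cmath> is available): scale a float sample by delta_inv, round it
// to the nearest integer, then store it as sign-magnitude while OR-ing the
// magnitude into the running maximum.
static inline void scalar_irv_to_cb(float f, float delta_inv,
                                    ui32 *dp, ui32 *tmax)
{
  si32 q    = (si32)lrintf(f * delta_inv);  // round-to-nearest, like cvtps
  ui32 sign = (ui32)q & 0x80000000u;        // sign bit of the quantized value
  ui32 mag  = (ui32)(q < 0 ? -q : q);       // magnitude
  *tmax |= mag;
  *dp    = mag | sign;
}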
void avx2_rev_tx_from_cb(const ui32 *sp, void *dp, ui32 K_max,
                         float delta, ui32 count)
{
  (void)delta; // unused on the reversible path
  // convert sign-magnitude samples back to two's-complement integers
  ui32 shift = 31 - K_max;
  __m256i m1 = _mm256_set1_epi32(0x7FFFFFFF);
  si32 *p = (si32*)dp;
  for (ui32 i = 0; i < count; i += 8, sp += 8, p += 8)
  {
    __m256i v = _mm256_load_si256((__m256i*)sp);
    __m256i val = _mm256_and_si256(v, m1);     // drop the sign bit
    val = _mm256_srli_epi32(val, (int)shift);  // undo the left-alignment
    val = _mm256_sign_epi32(val, v);           // reapply the sign
    _mm256_storeu_si256((__m256i*)p, val);
  }
}
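// Hypothetical scalar reference for one iteration of the loop above (not in
// the original source; scalar_rev_from_cb is an invented name): strip the
// sign bit, shift the magnitude back down, and reapply the sign to recover a
// two's-complement sample. `shift` corresponds to 31 - K_max.
static inline si32 scalar_rev_from_cb(ui32 v, ui32 shift)
{
  ui32 mag = (v & 0x7FFFFFFFu) >> shift;              // undo left-alignment
  return (v & 0x80000000u) ? -(si32)mag : (si32)mag;  // reapply the sign
}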
void avx2_irv_tx_from_cb(const ui32 *sp, void *dp, ui32 K_max,
                         float delta, ui32 count)
{
  (void)K_max; // unused on the irreversible path
  // convert sign-magnitude samples back to floats and dequantize by delta
  __m256i m1 = _mm256_set1_epi32(0x7FFFFFFF);
  __m256 d = _mm256_set1_ps(delta);
  float *p = (float*)dp;
  for (ui32 i = 0; i < count; i += 8, sp += 8, p += 8)
  {
    __m256i v = _mm256_load_si256((__m256i*)sp);
    __m256i vali = _mm256_and_si256(v, m1);       // magnitude
    __m256 valf = _mm256_cvtepi32_ps(vali);       // convert to float
    valf = _mm256_mul_ps(valf, d);                // scale by delta
    __m256i sign = _mm256_andnot_si256(m1, v);    // isolate the sign bit
    valf = _mm256_or_ps(valf, _mm256_castsi256_ps(sign)); // reattach sign
    _mm256_storeu_ps(p, valf);
  }
}
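// Hypothetical scalar reference for one iteration of the loop above (not in
// the original source; scalar_irv_from_cb is an invented name): convert the
// stored magnitude to float, scale it by delta, and copy the stored sign into
// the float's sign.
static inline float scalar_irv_from_cb(ui32 v, float delta)
{
  float f = (float)(v & 0x7FFFFFFFu) * delta;   // dequantized magnitude
  return (v & 0x80000000u) ? -f : f;            // reattach the sign
}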
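// Hedged usage sketch, not from the original source. The function name
// example_irv_round_trip, the 64-sample buffers, and the 0.03125f step size
// are illustrative assumptions. The layout is inferred from the intrinsics
// above: count is processed 8 samples at a time, the *_from_cb loads are
// aligned (_mm256_load_si256), and max_val is read and written as 8 ui32
// lanes. alignas assumes C++11.
static void example_irv_round_trip()
{
  alignas(32) float src[64];          // irreversible path works on floats
  alignas(32) ui32  cb[64];           // code-block samples, sign-magnitude
  alignas(32) float dst[64];
  alignas(32) ui32  max_val[8] = { 0 };

  for (int i = 0; i < 64; ++i)
    src[i] = (float)(i - 32);

  float delta = 0.03125f;             // hypothetical quantization step
  avx2_irv_tx_to_cb(src, cb, 0, 1.0f / delta, 64, max_val);
  ui32 top = avx2_find_max_val(max_val);  // OR of the 8 running-max lanes
  avx2_irv_tx_from_cb(cb, dst, 0, delta, 64);
  (void)top;
}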