// Fragment: horizontal bitwise-OR reduction of the four 32-bit words stored
// at `address`. The enclosing function header and any return statement are
// not visible here (presumably the body of sse2_find_max_val -- confirm).
// Note that the reduction is a bitwise OR of the lanes, not an arithmetic
// maximum.
47 __m128i x1, x0 = _mm_loadu_si128((__m128i*)address);
// 0xEE selects source lanes {2,3,2,3}: the upper pair is copied onto the
// lower pair, so after the OR lane 0 = w0|w2 and lane 1 = w1|w3.
48 x1 = _mm_shuffle_epi32(x0, 0xEE);
49 x0 = _mm_or_si128(x0, x1);
// 0x55 broadcasts lane 1 to every lane; after the OR lane 0 holds
// w0|w1|w2|w3.
50 x1 = _mm_shuffle_epi32(x0, 0x55);
51 x0 = _mm_or_si128(x0, x1);
// All four words at `address` are overwritten with the partially reduced
// vector; only lane 0 is guaranteed to hold the OR of all four inputs.
52 _mm_storeu_si128((__m128i*)address, x0);
// Fragment: tail of a parameter list followed by a function body whose
// braces and leading parameters are not visible (presumably
// sse2_rev_tx_to_cb -- confirm). The visible code converts 32-bit signed
// integers read from `sp` into a sign-magnitude form (sign in bit 31,
// magnitude shifted left by 31 - K_max) written to `dp`, and ORs every
// shifted magnitude into the four-lane accumulator at `max_val`.
// `delta_inv` is not referenced in the visible lines.
63 float delta_inv,
ui32 count,
ui32* max_val)
68 ui32 shift = 31 - K_max;
// m0 isolates bit 31 of each lane (the sign bit of the output format).
69 __m128i m0 = _mm_set1_epi32((
int)0x80000000);
70 __m128i zero = _mm_setzero_si128();
71 __m128i one = _mm_set1_epi32(1);
// Start from the caller's current accumulator: 4 x 32 bits are read from
// (and later written back to) max_val, so it must point at >= 4 words.
72 __m128i tmax = _mm_loadu_si128((__m128i*)max_val);
73 __m128i *p = (__m128i*)sp;
// Four samples per iteration. The loop condition only tests i < count, so
// the last iteration still loads/stores a full 16 bytes even when count is
// not a multiple of 4 -- NOTE(review): buffers presumably padded; verify.
74 for (
ui32 i = 0; i < count; i += 4, p += 1, dp += 4)
76 __m128i v = _mm_loadu_si128(p);
// sign = all-ones in lanes where v < 0, zero elsewhere.
77 __m128i sign = _mm_cmplt_epi32(v, zero);
// (v ^ sign) + (sign & 1): two's-complement negation of the negative
// lanes only, i.e. the magnitude of v.
78 __m128i val = _mm_xor_si128(v, sign);
79 __m128i ones = _mm_and_si128(sign, one);
80 val = _mm_add_epi32(val, ones);
// Reduce the all-ones mask to just bit 31.
81 sign = _mm_and_si128(sign, m0);
82 val = _mm_slli_epi32(val, (
int)shift);
// Accumulate the shifted magnitude before the sign bit is merged in.
83 tmax = _mm_or_si128(tmax, val);
84 val = _mm_or_si128(val, sign);
85 _mm_storeu_si128((__m128i*)dp, val);
// Publish the per-lane OR accumulator back to the caller.
87 _mm_storeu_si128((__m128i*)max_val, tmax);
// Fragment: tail of a parameter list followed by a function body whose
// braces and leading parameters are not visible (presumably
// sse2_irv_tx_to_cb -- confirm). The visible code multiplies floats read
// from `sp` by `delta_inv`, converts them to 32-bit integers, and stores
// them to `dp` in sign-magnitude form (sign in bit 31), ORing every
// magnitude into the four-lane accumulator at `max_val`. No K_max-based
// shift appears in the visible lines.
92 float delta_inv,
ui32 count,
ui32* max_val)
98 __m128 d = _mm_set1_ps(delta_inv);
99 __m128i zero = _mm_setzero_si128();
100 __m128i one = _mm_set1_epi32(1);
// 4 x 32 bits are read from (and later written back to) max_val.
101 __m128i tmax = _mm_loadu_si128((__m128i*)max_val);
102 float *p = (
float*)sp;
// Four samples per iteration; the final iteration touches a full group of
// four even when count is not a multiple of 4 -- NOTE(review): buffers
// presumably padded; verify at the caller.
103 for (
ui32 i = 0; i < count; i += 4, p += 4, dp += 4)
105 __m128 vf = _mm_loadu_ps(p);
106 vf = _mm_mul_ps(vf, d);
// Float -> int32 conversion; rounding follows the current MXCSR rounding
// mode (round-to-nearest-even unless the caller changed it).
107 __m128i val = _mm_cvtps_epi32(vf);
// sign = all-ones in lanes where the converted value is negative.
108 __m128i sign = _mm_cmplt_epi32(val, zero);
// (val ^ sign) + (sign & 1): magnitude via conditional negation.
109 val = _mm_xor_si128(val, sign);
110 __m128i ones = _mm_and_si128(sign, one);
111 val = _mm_add_epi32(val, ones);
112 tmax = _mm_or_si128(tmax, val);
// Shifting the all-ones mask left by 31 leaves only bit 31 set.
113 sign = _mm_slli_epi32(sign, 31);
114 val = _mm_or_si128(val, sign);
115 _mm_storeu_si128((__m128i*)dp, val);
// Publish the per-lane OR accumulator back to the caller.
117 _mm_storeu_si128((__m128i*)max_val, tmax);
// Fragment: tail of a parameter list followed by a function body whose
// braces and leading parameters are not visible (presumably
// sse2_rev_tx_from_cb -- confirm). The visible code converts sign-magnitude
// words read from `sp` (sign in bit 31, magnitude shifted left by
// 31 - K_max) back into two's-complement integers written through `p`.
// `delta` is not referenced in the visible lines, and the declaration of
// `p` is not visible in this fragment (presumably derived from dp -- verify).
122 float delta,
ui32 count)
125 ui32 shift = 31 - K_max;
// m1 keeps the 31 magnitude bits and clears the sign bit.
126 __m128i m1 = _mm_set1_epi32(0x7FFFFFFF);
127 __m128i zero = _mm_setzero_si128();
128 __m128i one = _mm_set1_epi32(1);
// Four samples per iteration; the final iteration touches a full group of
// four even when count is not a multiple of 4.
130 for (
ui32 i = 0; i < count; i += 4, sp += 4, p += 4)
// Aligned load: sp must be 16-byte aligned here (the store below is
// unaligned).
132 __m128i v = _mm_load_si128((__m128i*)sp);
133 __m128i val = _mm_and_si128(v, m1);
// Logical right shift undoes the left shift applied by the encoder side.
134 val = _mm_srli_epi32(val, (
int)shift);
// Bit 31 set makes v negative as a signed int, so this yields an all-ones
// mask exactly for lanes whose sign bit is set.
135 __m128i sign = _mm_cmplt_epi32(v, zero);
// (val ^ sign) + (sign & 1): negate the magnitude where the sign is set.
136 val = _mm_xor_si128(val, sign);
137 __m128i ones = _mm_and_si128(sign, one);
138 val = _mm_add_epi32(val, ones);
139 _mm_storeu_si128((__m128i*)p, val);
// Fragment: tail of a parameter list followed by a function body whose
// braces and leading parameters are not visible (presumably
// sse2_irv_tx_from_cb -- confirm). The visible code converts sign-magnitude
// words read from `sp` into floats: the 31-bit magnitude is converted to
// float, multiplied by `delta`, given the original sign bit, and stored to
// `dp`. K_max is not referenced in the visible lines.
145 float delta,
ui32 count)
// m1 keeps the 31 magnitude bits and clears the sign bit.
148 __m128i m1 = _mm_set1_epi32(0x7FFFFFFF);
149 __m128 d = _mm_set1_ps(delta);
150 float *p = (
float*)dp;
// Four samples per iteration; the final iteration touches a full group of
// four even when count is not a multiple of 4.
151 for (
ui32 i = 0; i < count; i += 4, sp += 4, p += 4)
// Aligned load: sp must be 16-byte aligned here (the store below is
// unaligned).
153 __m128i v = _mm_load_si128((__m128i*)sp);
154 __m128i vali = _mm_and_si128(v, m1);
155 __m128 valf = _mm_cvtepi32_ps(vali);
156 valf = _mm_mul_ps(valf, d);
// ~m1 & v leaves only bit 31 of v, which lines up with the IEEE-754 sign
// bit of a 32-bit float, so the OR below applies the sign directly.
157 __m128i sign = _mm_andnot_si128(m1, v);
158 valf = _mm_or_ps(valf, _mm_castsi128_ps(sign));
159 _mm_storeu_ps(p, valf);
void sse2_irv_tx_to_cb(const void *sp, ui32 *dp, ui32 K_max, float delta_inv, ui32 count, ui32 *max_val)
void sse2_irv_tx_from_cb(const ui32 *sp, void *dp, ui32 K_max, float delta, ui32 count)
void sse2_rev_tx_from_cb(const ui32 *sp, void *dp, ui32 K_max, float delta, ui32 count)
void sse2_rev_tx_to_cb(const void *sp, ui32 *dp, ui32 K_max, float delta_inv, ui32 count, ui32 *max_val)
ui32 sse2_find_max_val(ui32 *address)