// Fragment, presumably from the body of avx_irrev_vert_wvlt_step (the pointer
// names match that signature's parameters). The function header, the braces
// and the definition of `factor` are on lines not shown here.
//
// Raw float pointers into the destination line and the two source lines.
57 float *dst = line_dst->
f32;
58 const float *src1 = line_src1->
f32, *src2 = line_src2->
f32;
// Runs (repeat + 7) >> 3 passes, i.e. ceil(repeat / 8); every pass moves all
// three pointers forward by 8 floats. When repeat is not a multiple of 8 the
// last pass also touches elements past index repeat - 1.
// NOTE(review): assumes the line buffers are padded to a multiple of 8 floats
// - TODO confirm against the allocator.
61 for (
ui32 i = (repeat + 7) >> 3; i > 0; --i, dst+=8, src1+=8, src2+=8)
// One pass: dst[0..7] += factor * (src1[0..7] + src2[0..7]).
// _mm256_load_ps / _mm256_store_ps are the aligned variants, so all three
// pointers must be 32-byte aligned here.
63 __m256 s1 = _mm256_load_ps(src1);
64 __m256 s2 = _mm256_load_ps(src2);
65 __m256 d = _mm256_load_ps(dst);
66 d = _mm256_add_ps(d, _mm256_mul_ps(factor, _mm256_add_ps(s1, s2)));
67 _mm256_store_ps(dst, d);
// Tail of a parameter list followed by a body fragment, presumably of
// avx_irrev_vert_wvlt_K (the parameter names match that signature). The start
// of the header, the braces and the definition of `f` are on lines not shown.
73 bool L_analysis_or_H_synthesis,
ui32 repeat)
// Raw float pointers into the destination and source lines.
75 float *dst = line_dst->
f32;
76 const float *src = line_src->
f32;
// Broadcast the scalar f into all 8 lanes of a 256-bit register.
80 __m256 factor = _mm256_set1_ps(f);
// ceil(repeat / 8) passes, 8 floats per pass; the final pass may run past
// index repeat - 1 when repeat is not a multiple of 8.
// NOTE(review): assumes padded buffers - TODO confirm.
81 for (
ui32 i = (repeat + 7) >> 3; i > 0; --i, dst+=8, src+=8)
// One pass: dst[0..7] = factor * src[0..7], using aligned load and store
// (both pointers must be 32-byte aligned).
83 __m256 s = _mm256_load_ps(src);
84 _mm256_store_ps(dst, _mm256_mul_ps(factor, s));
// Fragment, presumably from the body of avx_irrev_horz_wvlt_fwd_tx (the names
// line_src / line_ldst / line_hdst / width / even match that signature).
// Braces, branch conditions and the definitions of `factor`, `factor128`,
// `dp`, `dph` and `dpl` are on lines not shown here.
//
// Raw float pointers into the input line and the two output lines.
96 float *src = line_src->
f32;
97 float *ldst = line_ldst->
f32, *hdst = line_hdst->
f32;
// Split `width` samples into two halves whose sizes sum to width:
// even  -> L_width = ceil(width / 2),  H_width = floor(width / 2)
// !even -> L_width = floor(width / 2), H_width = ceil(width / 2)
99 const ui32 L_width = (width + (even ? 1 : 0)) >> 1;
100 const ui32 H_width = (width + (even ? 0 : 1)) >> 1;
// Extend the input by one sample on the right: index `width` receives a copy
// of the sample at width - 2 (a mirror about the last sample, width - 1).
104 src[width] = src[width-2];
// Start at src[1] when `even`, otherwise at src[0].
106 const float* sp = src + (even ? 1 : 0);
// ceil(H_width / 4) passes. Neither sp nor dph is advanced in this loop
// header - presumably they are advanced inside the body on lines not shown.
109 for (
ui32 i = (H_width + 3) >> 2; i > 0; --i)
// For k = 0..7: d1[k] = sp[k] + factor * (sp[k - 1] + sp[k + 1]),
// computed with unaligned 8-float loads.
112 __m256 s1 = _mm256_loadu_ps(sp - 1);
113 __m256 s2 = _mm256_loadu_ps(sp + 1);
114 __m256 d = _mm256_loadu_ps(sp);
115 s1 = _mm256_mul_ps(factor, _mm256_add_ps(s1, s2));
116 __m256 d1 = _mm256_add_ps(d, s1);
// Split d1 into its low and high 128-bit halves; _MM_SHUFFLE(2, 0, 2, 0)
// then picks elements 0 and 2 of each half, i.e. lanes 0, 2, 4, 6 of d1.
// Only those four results are kept and written (aligned store) to dph.
118 __m128 t1 = _mm256_extractf128_ps(d1, 0);
119 __m128 t2 = _mm256_extractf128_ps(d1, 1);
120 __m128 t = _mm_shuffle_ps(t1, t2, _MM_SHUFFLE(2, 0, 2, 0));
121 _mm_store_ps(dph, t);
// Replicate the last H sample one slot past the end of the H data.
127 hdst[H_width] = hdst[H_width-1];
// Restart sp at src[0] when `even`, else src[1]; sph at hdst[0] when `even`,
// else hdst[1].
130 sp = src + (even ? 0 : 1);
131 const float* sph = hdst + (even ? 0 : 1);
// ceil(L_width / 4) passes; sp advances by 8 input floats per pass while sph
// and dpl advance by 4.
133 for (
ui32 i = (L_width + 3) >> 2; i > 0; --i, sp+=8, sph+=4, dpl+=4)
// Load 8 input floats and keep lanes 0, 2, 4, 6 (same extract + shuffle
// pattern as above) in d.
135 __m256 d1 = _mm256_loadu_ps(sp);
136 __m128 t1 = _mm256_extractf128_ps(d1, 0);
137 __m128 t2 = _mm256_extractf128_ps(d1, 1);
138 __m128 d = _mm_shuffle_ps(t1, t2, _MM_SHUFFLE(2, 0, 2, 0));
// For k = 0..3: d[k] += factor128 * (sph[k - 1] + sph[k]); aligned store to
// dpl. When `even`, sph - 1 addresses hdst[-1].
// NOTE(review): presumably hdst[-1] is filled on a line not shown - confirm.
140 __m128 s1 = _mm_loadu_ps(sph - 1);
141 __m128 s2 = _mm_loadu_ps(sph);
142 s1 = _mm_mul_ps(factor128, _mm_add_ps(s1, s2));
143 d = _mm_add_ps(d, s1);
144 _mm_store_ps(dpl, d);
// Replicate the last L sample one slot past the end of the L data.
149 ldst[L_width] = ldst[L_width-1];
// spl starts at ldst[1] when `even`, else ldst[0].
152 const float* spl = ldst + (even ? 1 : 0);
// ceil(H_width / 8) passes, 8 floats each:
// dph[k] += factor * (spl[k - 1] + spl[k]) for k = 0..7.
154 for (
ui32 i = (H_width + 7) >> 3; i > 0; --i, spl+=8, dph+=8)
156 __m256 s1 = _mm256_loadu_ps(spl - 1);
157 __m256 s2 = _mm256_loadu_ps(spl);
158 __m256 d = _mm256_loadu_ps(dph);
159 s1 = _mm256_mul_ps(factor, _mm256_add_ps(s1, s2));
160 d = _mm256_add_ps(d, s1);
161 _mm256_store_ps(dph, d);
// Re-extend the H data (it was modified by the loop above) and reset sph.
166 hdst[H_width] = hdst[H_width-1];
169 sph = hdst + (even ? 0 : 1);
// ceil(L_width / 8) passes, 8 floats each:
// dpl[k] += factor * (sph[k - 1] + sph[k]) for k = 0..7.
171 for (
ui32 i = (L_width + 7) >> 3; i > 0; --i, sph+=8, dpl+=8)
173 __m256 s1 = _mm256_loadu_ps(sph - 1);
174 __m256 s2 = _mm256_loadu_ps(sph);
175 __m256 d = _mm256_loadu_ps(dpl);
176 s1 = _mm256_mul_ps(factor, _mm256_add_ps(s1, s2));
177 d = _mm256_add_ps(d, s1);
178 _mm256_store_ps(dpl, d);
// In-place scaling, 8 floats per pass with aligned load/store:
// dp[k] *= factor over ceil(L_width / 8) passes. `dp` and `factor` are set on
// lines not shown.
184 for (
ui32 i = (L_width + 7) >> 3; i > 0; --i, dp+=8)
186 __m256 d = _mm256_load_ps(dp);
187 _mm256_store_ps(dp, _mm256_mul_ps(factor, d));
// Same in-place scaling over ceil(H_width / 8) passes.
191 for (
ui32 i = (H_width + 7) >> 3; i > 0; --i, dp+=8)
193 __m256 d = _mm256_load_ps(dp);
194 _mm256_store_ps(dp, _mm256_mul_ps(factor, d));
// Scalar assignments on element 0 only; the conditions selecting between
// them are on lines not shown.
// NOTE(review): presumably the width == 1 special case - confirm.
// First form: copy src[0] into the L output.
200 line_ldst->
f32[0] = line_src->
f32[0];
// Second form: write src[0] + src[0] (twice the sample) into the H output.
202 line_hdst->
f32[0] = line_src->
f32[0] + line_src->
f32[0];
// Fragment, presumably from the body of avx_irrev_horz_wvlt_bwd_tx (the names
// line_dst / line_lsrc / line_hsrc / width / even match that signature).
// Braces, branch conditions and the definitions of `factor`, `dp`, `dph` and
// `dpl` are on lines not shown here.
//
// Raw float pointers into the two input lines and the output line.
213 float *lsrc = line_lsrc->
f32, *hsrc = line_hsrc->
f32;
214 float *dst = line_dst->
f32;
// Sizes of the two halves; they sum to width:
// even  -> L_width = ceil(width / 2),  H_width = floor(width / 2)
// !even -> L_width = floor(width / 2), H_width = ceil(width / 2)
216 const ui32 L_width = (width + (even ? 1 : 0)) >> 1;
217 const ui32 H_width = (width + (even ? 0 : 1)) >> 1;
// In-place scaling, 8 floats per pass with aligned load/store:
// dp[k] *= factor over ceil(L_width / 8) passes. `dp` and `factor` are set on
// lines not shown.
222 for (
ui32 i = (L_width + 7) >> 3; i > 0; --i, dp+=8)
224 __m256 d = _mm256_load_ps(dp);
225 _mm256_store_ps(dp, _mm256_mul_ps(factor, d));
// Same in-place scaling over ceil(H_width / 8) passes.
229 for (
ui32 i = (H_width + 7) >> 3; i > 0; --i, dp+=8)
231 __m256 d = _mm256_load_ps(dp);
232 _mm256_store_ps(dp, _mm256_mul_ps(factor, d));
// Replicate the last H sample one slot past the end of the H data.
237 hsrc[H_width] = hsrc[H_width-1];
// sph starts at hsrc[0] when `even`, else hsrc[1].
240 const float *sph = hsrc + (even ? 0 : 1);
// ceil(L_width / 8) passes, 8 floats each:
// dpl[k] += factor * (sph[k - 1] + sph[k]) for k = 0..7.
// When `even`, sph - 1 addresses hsrc[-1].
// NOTE(review): presumably hsrc[-1] is filled on a line not shown - confirm.
242 for (
ui32 i = (L_width + 7) >> 3; i > 0; --i, sph+=8, dpl+=8)
244 __m256 s1 = _mm256_loadu_ps(sph - 1);
245 __m256 s2 = _mm256_loadu_ps(sph);
246 __m256 d = _mm256_loadu_ps(dpl);
247 s1 = _mm256_mul_ps(factor, _mm256_add_ps(s1, s2));
248 d = _mm256_add_ps(d, s1);
249 _mm256_store_ps(dpl, d);
// Replicate the last L sample one slot past the end of the L data.
254 lsrc[L_width] = lsrc[L_width-1];
// spl starts at lsrc[0] when `even`, else lsrc[-1].
// NOTE(review): presumably lsrc[-1] is filled on a line not shown - confirm.
257 const float *spl = lsrc + (even ? 0 : -1);
// ceil(H_width / 8) passes, 8 floats each:
// dph[k] += factor * (spl[k] + spl[k + 1]) for k = 0..7.
259 for (
ui32 i = (H_width + 7) >> 3; i > 0; --i, dph+=8, spl+=8)
261 __m256 s1 = _mm256_loadu_ps(spl);
262 __m256 s2 = _mm256_loadu_ps(spl + 1);
263 __m256 d = _mm256_loadu_ps(dph);
264 s1 = _mm256_mul_ps(factor, _mm256_add_ps(s1, s2));
265 d = _mm256_add_ps(d, s1);
266 _mm256_store_ps(dph, d);
// Re-extend the H data (it was modified by the loop above) and reset sph.
271 hsrc[H_width] = hsrc[H_width-1];
274 sph = hsrc + (even ? 0 : 1);
// ceil(L_width / 8) passes, 8 floats each:
// dpl[k] += factor * (sph[k - 1] + sph[k]) for k = 0..7.
276 for (
ui32 i = (L_width + 7) >> 3; i > 0; --i, dpl+=8, sph+=8)
278 __m256 s1 = _mm256_loadu_ps(sph - 1);
279 __m256 s2 = _mm256_loadu_ps(sph);
280 __m256 d = _mm256_loadu_ps(dpl);
281 s1 = _mm256_mul_ps(factor, _mm256_add_ps(s1, s2));
282 d = _mm256_add_ps(d, s1);
283 _mm256_store_ps(dpl, d);
// Re-extend the L data (it was modified by the loop above).
288 lsrc[L_width] = lsrc[L_width-1];
// Output and L pointers both start one element before their buffers when
// !even, and at element 0 when `even`.
291 dp = dst + (even ? 0 : -1);
292 spl = lsrc + (even ? 0 : -1);
// Local pass count basis: L_width, plus one when !even. This local carries
// the same name as the `width` parameter in the signature.
// NOTE(review): presumably declared in a nested scope on lines not shown.
294 ui32 width = L_width + (even ? 0 : 1);
// ceil(width / 8) passes; the header advances spl and sph by 8 floats, while
// dp is advanced by 4 after each of the four stores in the body (16 total).
295 for (
ui32 i = (width + 7) >> 3; i > 0; --i, spl+=8, sph+=8)
// s1 = spl[0..7]; d = sph[0..7] + factor * (spl[k] + spl[k + 1]).
// The load from sph is the aligned variant.
// NOTE(review): the last visible assignment leaves sph past hsrc after the
// loop above, so sph is presumably reassigned on a line not shown - confirm.
297 __m256 s1 = _mm256_loadu_ps(spl);
298 __m256 s2 = _mm256_loadu_ps(spl + 1);
299 __m256 d = _mm256_load_ps(sph);
300 s2 = _mm256_mul_ps(factor, _mm256_add_ps(s1, s2));
301 d = _mm256_add_ps(d, s2);
// Interleave the unmodified s1 values with the d values: the four unaligned
// stores emit s1[0], d[0], s1[1], d[1], ..., s1[7], d[7] (16 floats).
303 __m128 a0 = _mm256_extractf128_ps(s1, 0);
304 __m128 a1 = _mm256_extractf128_ps(s1, 1);
305 __m128 a2 = _mm256_extractf128_ps(d, 0);
306 __m128 a3 = _mm256_extractf128_ps(d, 1);
307 _mm_storeu_ps(dp, _mm_unpacklo_ps(a0, a2)); dp += 4;
308 _mm_storeu_ps(dp, _mm_unpackhi_ps(a0, a2)); dp += 4;
309 _mm_storeu_ps(dp, _mm_unpacklo_ps(a1, a3)); dp += 4;
310 _mm_storeu_ps(dp, _mm_unpackhi_ps(a1, a3)); dp += 4;
// Scalar assignments on element 0 only; the conditions selecting between
// them are on lines not shown.
// NOTE(review): presumably the width == 1 special case - confirm.
// First form: copy lsrc[0] into the output.
323 line_dst->
f32[0] = line_lsrc->
f32[0];
// Second form: write half of hsrc[0] into the output.
325 line_dst->
f32[0] = line_hsrc->
f32[0] * 0.5f;
// Signature listing only: no bodies or terminators appear on these lines.
// Takes an output line, two input lines (named L and H), a sample count and
// an `even` flag.
void avx_irrev_horz_wvlt_bwd_tx(line_buf *line_dst, line_buf *line_lsrc, line_buf *line_hsrc, ui32 width, bool even)
// Takes an input line, two output lines (named L and H), a sample count and
// an `even` flag.
void avx_irrev_horz_wvlt_fwd_tx(line_buf *line_src, line_buf *line_ldst, line_buf *line_hdst, ui32 width, bool even)
// Takes a read-only input line, an output line, a boolean selector and a
// sample count.
void avx_irrev_vert_wvlt_K(const line_buf *line_src, line_buf *line_dst, bool L_analysis_or_H_synthesis, ui32 repeat)
// Takes two read-only input lines, an output line, an integer step number and
// a sample count.
void avx_irrev_vert_wvlt_step(const line_buf *line_src1, const line_buf *line_src2, line_buf *line_dst, int step_num, ui32 repeat)
// Eight-entry float table; its initializer is not shown here.
static const float steps[8]