39#include <wasm_simd128.h>
56 const si32 *src1 = line_src1->
i32, *src2 = line_src2->
i32;
58 for (
ui32 i = (repeat + 3) >> 2; i > 0; --i, dst+=4, src1+=4, src2+=4)
60 v128_t s1 = wasm_v128_load(src1);
61 v128_t s2 = wasm_v128_load(src2);
62 v128_t d = wasm_v128_load(dst);
63 s1 = wasm_i32x4_shr(wasm_i32x4_add(s1, s2), 1);
64 d = wasm_i32x4_sub(d, s1);
65 wasm_v128_store(dst, d);
75 const si32 *src1 = line_src1->
i32, *src2 = line_src2->
i32;
77 v128_t offset = wasm_i32x4_splat(2);
78 for (
ui32 i = (repeat + 3) >> 2; i > 0; --i, dst+=4, src1+=4, src2+=4)
80 v128_t s1 = wasm_v128_load(src1);
81 s1 = wasm_i32x4_add(s1, offset);
82 v128_t s2 = wasm_v128_load(src2);
83 s2 = wasm_i32x4_add(s2, s1);
84 v128_t d = wasm_v128_load(dst);
85 d = wasm_i32x4_add(d, wasm_i32x4_shr(s2, 2));
86 wasm_v128_store(dst, d);
97 si32 *ldst = line_ldst->
i32, *hdst = line_hdst->
i32;
99 const ui32 L_width = (width + (even ? 1 : 0)) >> 1;
100 const ui32 H_width = (width + (even ? 0 : 1)) >> 1;
104 src[width] = src[width-2];
106 const si32* sp = src + (even ? 1 : 0);
108 for (
ui32 i = (H_width + 3) >> 2; i > 0; --i, dph+=4)
111 v128_t s1 = wasm_v128_load(sp - 1);
112 v128_t s2 = wasm_v128_load(sp + 1);
113 v128_t d = wasm_v128_load(sp);
114 s1 = wasm_i32x4_shr(wasm_i32x4_add(s1, s2), 1);
115 v128_t d1 = wasm_i32x4_sub(d, s1);
117 s1 = wasm_v128_load(sp - 1);
118 s2 = wasm_v128_load(sp + 1);
119 d = wasm_v128_load(sp);
120 s1 = wasm_i32x4_shr(wasm_i32x4_add(s1, s2), 1);
121 v128_t d2 = wasm_i32x4_sub(d, s1);
123 d = wasm_i32x4_shuffle(d1, d2, 0, 2, 4, 6);
124 wasm_v128_store(dph, d);
129 hdst[H_width] = hdst[H_width-1];
131 sp = src + (even ? 0 : 1);
132 const si32* sph = hdst + (even ? 0 : 1);
134 v128_t offset = wasm_i32x4_splat(2);
135 for (
ui32 i = (L_width + 3) >> 2; i > 0; --i, sp+=8, sph+=4, dpl+=4)
137 v128_t s1 = wasm_v128_load(sph - 1);
138 s1 = wasm_i32x4_add(s1, offset);
139 v128_t s2 = wasm_v128_load(sph);
140 s2 = wasm_i32x4_add(s2, s1);
141 v128_t d1 = wasm_v128_load(sp);
142 v128_t d2 = wasm_v128_load(sp + 4);
143 v128_t d = wasm_i32x4_shuffle(d1, d2, 0, 2, 4, 6);
144 d = wasm_i32x4_add(d, wasm_i32x4_shr(s2, 2));
145 wasm_v128_store(dpl, d);
151 line_ldst->
i32[0] = line_src->
i32[0];
153 line_hdst->
i32[0] = line_src->
i32[0] << 1;
163 const si32 *src1 = line_src1->
i32, *src2 = line_src2->
i32;
165 for (
ui32 i = (repeat + 3) >> 2; i > 0; --i, dst+=4, src1+=4, src2+=4)
167 v128_t s1 = wasm_v128_load(src1);
168 v128_t s2 = wasm_v128_load(src2);
169 v128_t d = wasm_v128_load(dst);
170 s1 = wasm_i32x4_shr(wasm_i32x4_add(s1, s2), 1);
171 d = wasm_i32x4_add(d, s1);
172 wasm_v128_store(dst, d);
182 const si32 *src1 = line_src1->
i32, *src2 = line_src2->
i32;
184 v128_t offset = wasm_i32x4_splat(2);
185 for (
ui32 i = (repeat + 3) >> 2; i > 0; --i, dst+=4, src1+=4, src2+=4)
187 v128_t s1 = wasm_v128_load(src1);
188 s1 = wasm_i32x4_add(s1, offset);
189 v128_t s2 = wasm_v128_load(src2);
190 s2 = wasm_i32x4_add(s2, s1);
191 v128_t d = wasm_v128_load(dst);
192 d = wasm_i32x4_sub(d, wasm_i32x4_shr(s2, 2));
193 wasm_v128_store(dst, d);
203 si32 *lsrc = line_lsrc->
i32, *hsrc = line_hsrc->
i32;
206 const ui32 L_width = (width + (even ? 1 : 0)) >> 1;
207 const ui32 H_width = (width + (even ? 0 : 1)) >> 1;
211 hsrc[H_width] = hsrc[H_width-1];
213 const si32 *sph = hsrc + (even ? 0 : 1);
215 v128_t offset = wasm_i32x4_splat(2);
216 for (
ui32 i = (L_width + 3) >> 2; i > 0; --i, sph+=4, spl+=4)
218 v128_t s1 = wasm_v128_load(sph - 1);
219 s1 = wasm_i32x4_add(s1, offset);
220 v128_t s2 = wasm_v128_load(sph);
221 s2 = wasm_i32x4_add(s2, s1);
222 v128_t d = wasm_v128_load(spl);
223 d = wasm_i32x4_sub(d, wasm_i32x4_shr(s2, 2));
224 wasm_v128_store(spl, d);
229 lsrc[L_width] = lsrc[L_width - 1];
231 si32 *dp = dst + (even ? 0 : -1);
232 spl = lsrc + (even ? 0 : -1);
234 ui32 width = L_width + (even ? 0 : 1);
235 for (
ui32 i = (width + 3) >> 2; i > 0; --i, sph+=4, spl+=4, dp+=8)
237 v128_t s1 = wasm_v128_load(spl);
238 v128_t s2 = wasm_v128_load(spl + 1);
239 v128_t d = wasm_v128_load(sph);
240 s2 = wasm_i32x4_shr(wasm_i32x4_add(s1, s2), 1);
241 d = wasm_i32x4_add(d, s2);
242 wasm_v128_store(dp, wasm_i32x4_shuffle(s1, d, 0, 4, 1, 5));
243 wasm_v128_store(dp + 4, wasm_i32x4_shuffle(s1, d, 2, 6, 3, 7));
249 line_dst->
i32[0] = line_lsrc->
i32[0];
251 line_dst->
i32[0] = line_hsrc->
i32[0] >> 1;
261 float *dst = line_dst->
f32;
262 const float *src1 = line_src1->
f32, *src2 = line_src2->
f32;
265 for (
ui32 i = (repeat + 3) >> 2; i > 0; --i, dst+=4, src1+=4, src2+=4)
267 v128_t s1 = wasm_v128_load(src1);
268 v128_t s2 = wasm_v128_load(src2);
269 v128_t d = wasm_v128_load(dst);
270 d = wasm_f32x4_add(d, wasm_f32x4_mul(factor, wasm_f32x4_add(s1, s2)));
271 wasm_v128_store(dst, d);
277 bool L_analysis_or_H_synthesis,
ui32 repeat)
279 float *dst = line_dst->
f32;
280 const float *src = line_src->
f32;
284 v128_t factor = wasm_f32x4_splat(f);
285 for (
ui32 i = (repeat + 3) >> 2; i > 0; --i, dst+=4, src+=4)
287 v128_t s = wasm_v128_load(src);
288 wasm_v128_store(dst, wasm_f32x4_mul(factor, s));
299 float *src = line_src->
f32;
300 float *ldst = line_ldst->
f32, *hdst = line_hdst->
f32;
302 const ui32 L_width = (width + (even ? 1 : 0)) >> 1;
303 const ui32 H_width = (width + (even ? 0 : 1)) >> 1;
307 src[width] = src[width-2];
309 const float* sp = src + (even ? 1 : 0);
312 for (
ui32 i = (H_width + 3) >> 2; i > 0; --i, dph+=4)
315 v128_t s1 = wasm_v128_load(sp - 1);
316 v128_t s2 = wasm_v128_load(sp + 1);
317 v128_t d = wasm_v128_load(sp);
318 s1 = wasm_f32x4_mul(factor, wasm_f32x4_add(s1, s2));
319 v128_t d1 = wasm_f32x4_add(d, s1);
321 s1 = wasm_v128_load(sp - 1);
322 s2 = wasm_v128_load(sp + 1);
323 d = wasm_v128_load(sp);
324 s1 = wasm_f32x4_mul(factor, wasm_f32x4_add(s1, s2));
325 v128_t d2 = wasm_f32x4_add(d, s1);
327 d = wasm_i32x4_shuffle(d1, d2, 0, 2, 4, 6);
328 wasm_v128_store(dph, d);
333 hdst[H_width] = hdst[H_width-1];
336 sp = src + (even ? 0 : 1);
337 const float* sph = hdst + (even ? 0 : 1);
339 for (
ui32 i = (L_width + 3) >> 2; i > 0; --i, sp+=8, sph+=4, dpl+=4)
341 v128_t s1 = wasm_v128_load(sph - 1);
342 v128_t s2 = wasm_v128_load(sph);
343 s1 = wasm_f32x4_mul(factor, wasm_f32x4_add(s1, s2));
344 v128_t d1 = wasm_v128_load(sp);
345 v128_t d2 = wasm_v128_load(sp + 4);
346 v128_t d = wasm_i32x4_shuffle(d1, d2, 0, 2, 4, 6);
347 d = wasm_f32x4_add(d, s1);
348 wasm_v128_store(dpl, d);
353 ldst[L_width] = ldst[L_width-1];
356 const float* spl = ldst + (even ? 1 : 0);
358 for (
ui32 i = (H_width + 3) >> 2; i > 0; --i, spl+=4, dph+=4)
360 v128_t s1 = wasm_v128_load(spl - 1);
361 v128_t s2 = wasm_v128_load(spl);
362 v128_t d = wasm_v128_load(dph);
363 s1 = wasm_f32x4_mul(factor, wasm_f32x4_add(s1, s2));
364 d = wasm_f32x4_add(d, s1);
365 wasm_v128_store(dph, d);
370 hdst[H_width] = hdst[H_width-1];
373 sph = hdst + (even ? 0 : 1);
375 for (
ui32 i = (L_width + 3) >> 2; i > 0; --i, sph+=4, dpl+=4)
377 v128_t s1 = wasm_v128_load(sph - 1);
378 v128_t s2 = wasm_v128_load(sph);
379 v128_t d = wasm_v128_load(dpl);
380 s1 = wasm_f32x4_mul(factor, wasm_f32x4_add(s1, s2));
381 d = wasm_f32x4_add(d, s1);
382 wasm_v128_store(dpl, d);
388 for (
ui32 i = (L_width + 3) >> 2; i > 0; --i, dp+=4)
390 v128_t d = wasm_v128_load(dp);
391 wasm_v128_store(dp, wasm_f32x4_mul(factor, d));
395 for (
int i = (H_width + 3) >> 2; i > 0; --i, dp+=4)
397 v128_t d = wasm_v128_load(dp);
398 wasm_v128_store(dp, wasm_f32x4_mul(factor, d));
404 line_ldst->
f32[0] = line_src->
f32[0];
406 line_hdst->
f32[0] = line_src->
f32[0] + line_src->
f32[0];
417 float *lsrc = line_lsrc->
f32, *hsrc = line_hsrc->
f32;
418 float *dst = line_dst->
f32;
420 const ui32 L_width = (width + (even ? 1 : 0)) >> 1;
421 const ui32 H_width = (width + (even ? 0 : 1)) >> 1;
426 for (
ui32 i = (L_width + 3) >> 2; i > 0; --i, dp+=4)
428 v128_t d = wasm_v128_load(dp);
429 wasm_v128_store(dp, wasm_f32x4_mul(factor, d));
433 for (
ui32 i = (H_width + 3) >> 2; i > 0; --i, dp+=4)
435 v128_t d = wasm_v128_load(dp);
436 wasm_v128_store(dp, wasm_f32x4_mul(factor, d));
441 hsrc[H_width] = hsrc[H_width-1];
444 const float *sph = hsrc + (even ? 0 : 1);
446 for (
ui32 i = (L_width + 3) >> 2; i > 0; --i, dpl+=4, sph+=4)
448 v128_t s1 = wasm_v128_load(sph - 1);
449 v128_t s2 = wasm_v128_load(sph);
450 v128_t d = wasm_v128_load(dpl);
451 s1 = wasm_f32x4_mul(factor, wasm_f32x4_add(s1, s2));
452 d = wasm_f32x4_add(d, s1);
453 wasm_v128_store(dpl, d);
458 lsrc[L_width] = lsrc[L_width-1];
461 const float *spl = lsrc + (even ? 0 : -1);
463 for (
ui32 i = (H_width + 3) >> 2; i > 0; --i, dph+=4, spl+=4)
465 v128_t s1 = wasm_v128_load(spl);
466 v128_t s2 = wasm_v128_load(spl + 1);
467 v128_t d = wasm_v128_load(dph);
468 s1 = wasm_f32x4_mul(factor, wasm_f32x4_add(s1, s2));
469 d = wasm_f32x4_add(d, s1);
470 wasm_v128_store(dph, d);
475 hsrc[H_width] = hsrc[H_width-1];
478 sph = hsrc + (even ? 0 : 1);
480 for (
ui32 i = (L_width + 3) >> 2; i > 0; --i, dpl+=4, sph+=4)
482 v128_t s1 = wasm_v128_load(sph - 1);
483 v128_t s2 = wasm_v128_load(sph);
484 v128_t d = wasm_v128_load(dpl);
485 s1 = wasm_f32x4_mul(factor, wasm_f32x4_add(s1, s2));
486 d = wasm_f32x4_add(d, s1);
487 wasm_v128_store(dpl, d);
492 lsrc[L_width] = lsrc[L_width-1];
495 dp = dst + (even ? 0 : -1);
496 spl = lsrc + (even ? 0 : -1);
498 ui32 width = L_width + (even ? 0 : 1);
499 for (
ui32 i = (width + 3) >> 2; i > 0; --i, spl+=4, sph+=4, dp+=8)
501 v128_t s1 = wasm_v128_load(spl);
502 v128_t s2 = wasm_v128_load(spl + 1);
503 v128_t d = wasm_v128_load(sph);
504 s2 = wasm_f32x4_mul(factor, wasm_f32x4_add(s1, s2));
505 d = wasm_f32x4_add(d, s2);
506 wasm_v128_store(dp, wasm_i32x4_shuffle(s1, d, 0, 4, 1, 5));
507 wasm_v128_store(dp + 4, wasm_i32x4_shuffle(s1, d, 2, 6, 3, 7));
513 line_dst->
f32[0] = line_lsrc->
f32[0];
515 line_dst->
f32[0] = line_hsrc->
f32[0] * 0.5f;
void wasm_rev_horz_wvlt_bwd_tx(line_buf *line_dst, line_buf *line_lsrc, line_buf *line_hsrc, ui32 width, bool even)
void wasm_rev_vert_wvlt_fwd_update(const line_buf *line_src1, const line_buf *line_src2, line_buf *line_dst, ui32 repeat)
void wasm_rev_vert_wvlt_bwd_predict(const line_buf *line_src1, const line_buf *line_src2, line_buf *line_dst, ui32 repeat)
void wasm_irrev_horz_wvlt_bwd_tx(line_buf *line_src, line_buf *line_ldst, line_buf *line_hdst, ui32 width, bool even)
void wasm_irrev_vert_wvlt_K(const line_buf *line_src, line_buf *line_dst, bool L_analysis_or_H_synthesis, ui32 repeat)
void wasm_irrev_vert_wvlt_step(const line_buf *line_src1, const line_buf *line_src2, line_buf *line_dst, int step_num, ui32 repeat)
void wasm_rev_horz_wvlt_fwd_tx(line_buf *line_src, line_buf *line_ldst, line_buf *line_hdst, ui32 width, bool even)
void wasm_irrev_horz_wvlt_fwd_tx(line_buf *line_src, line_buf *line_ldst, line_buf *line_hdst, ui32 width, bool even)
void wasm_rev_vert_wvlt_fwd_predict(const line_buf *line_src1, const line_buf *line_src2, line_buf *line_dst, ui32 repeat)
void wasm_rev_vert_wvlt_bwd_update(const line_buf *line_src1, const line_buf *line_src2, line_buf *line_dst, ui32 repeat)
static const float steps[8]