39#include <wasm_simd128.h>
52 v128_t shift = wasm_f32x4_splat(0.5f);
53 v128_t m = wasm_f32x4_splat(mul);
54 for (
ui32 i = (width + 3) >> 2; i > 0; --i, sp+=4, dp+=4)
56 v128_t t = wasm_v128_load(sp);
57 v128_t s = wasm_f32x4_convert_i32x4(t);
58 s = wasm_f32x4_mul(s, m);
59 s = wasm_f32x4_sub(s, shift);
60 wasm_v128_store(dp, s);
68 v128_t m = wasm_f32x4_splat(mul);
69 for (
ui32 i = (width + 3) >> 2; i > 0; --i, sp+=4, dp+=4)
71 v128_t t = wasm_v128_load(sp);
72 v128_t s = wasm_f32x4_convert_i32x4(t);
73 s = wasm_f32x4_mul(s, m);
74 wasm_v128_store(dp, s);
83 v128_t shift = wasm_f32x4_splat(0.5f);
84 v128_t m = wasm_f32x4_splat(mul);
85 for (
int i = (width + 3) >> 2; i > 0; --i, sp+=4, dp+=4)
87 v128_t t = wasm_v128_load(sp);
88 v128_t s = wasm_f32x4_add(t, shift);
89 s = wasm_f32x4_mul(s, m);
90 s = wasm_f32x4_add(s, shift);
91 wasm_v128_store(dp, wasm_i32x4_trunc_sat_f32x4(s));
100 v128_t shift = wasm_f32x4_splat(0.5f);
101 v128_t m = wasm_f32x4_splat(mul);
102 for (
int i = (width + 3) >> 2; i > 0; --i, sp+=4, dp+=4)
104 v128_t t = wasm_v128_load(sp);
105 v128_t s = wasm_f32x4_mul(t, m);
106 s = wasm_f32x4_add(s, shift);
107 wasm_v128_store(dp, wasm_i32x4_trunc_sat_f32x4(s));
116 v128_t sh = wasm_i32x4_splat(shift);
117 for (
int i = (width + 3) >> 2; i > 0; --i, sp+=4, dp+=4)
119 v128_t s = wasm_v128_load(sp);
120 s = wasm_i32x4_add(s, sh);
121 wasm_v128_store(dp, s);
129 for (
int i = (repeat + 3) >> 2; i > 0; --i)
131 v128_t mr = wasm_v128_load(r);
132 v128_t mg = wasm_v128_load(g);
133 v128_t mb = wasm_v128_load(b);
134 v128_t t = wasm_i32x4_add(mr, mb);
135 t = wasm_i32x4_add(t, wasm_i32x4_shl(mg, 1));
136 wasm_v128_store(y, wasm_i32x4_shr(t, 2));
137 t = wasm_i32x4_sub(mb, mg);
138 wasm_v128_store(cb, t);
139 t = wasm_i32x4_sub(mr, mg);
140 wasm_v128_store(cr, t);
142 r += 4; g += 4; b += 4;
143 y += 4; cb += 4; cr += 4;
151 for (
int i = (repeat + 3) >> 2; i > 0; --i)
153 v128_t my = wasm_v128_load(y);
154 v128_t mcb = wasm_v128_load(cb);
155 v128_t mcr = wasm_v128_load(cr);
157 v128_t t = wasm_i32x4_add(mcb, mcr);
158 t = wasm_i32x4_sub(my, wasm_i32x4_shr(t, 2));
159 wasm_v128_store(g, t);
160 v128_t u = wasm_i32x4_add(mcb, t);
161 wasm_v128_store(b, u);
162 u = wasm_i32x4_add(mcr, t);
163 wasm_v128_store(r, u);
165 y += 4; cb += 4; cr += 4;
166 r += 4; g += 4; b += 4;
172 float *y,
float *cb,
float *cr,
ui32 repeat)
179 for (
ui32 i = (repeat + 3) >> 2; i > 0; --i)
181 v128_t mr = wasm_v128_load(r);
182 v128_t mb = wasm_v128_load(b);
183 v128_t my = wasm_f32x4_mul(alpha_rf, mr);
184 my = wasm_f32x4_add(my, wasm_f32x4_mul(alpha_gf, wasm_v128_load(g)));
185 my = wasm_f32x4_add(my, wasm_f32x4_mul(alpha_bf, mb));
186 wasm_v128_store(y, my);
187 wasm_v128_store(cb, wasm_f32x4_mul(beta_cbf, wasm_f32x4_sub(mb, my)));
188 wasm_v128_store(cr, wasm_f32x4_mul(beta_crf, wasm_f32x4_sub(mr, my)));
190 r += 4; g += 4; b += 4;
191 y += 4; cb += 4; cr += 4;
197 float *r,
float *g,
float *b,
ui32 repeat)
203 for (
ui32 i = (repeat + 3) >> 2; i > 0; --i)
205 v128_t my = wasm_v128_load(y);
206 v128_t mcr = wasm_v128_load(cr);
207 v128_t mcb = wasm_v128_load(cb);
208 v128_t mg = wasm_f32x4_sub(my, wasm_f32x4_mul(gamma_cr2g, mcr));
209 wasm_v128_store(g, wasm_f32x4_sub(mg, wasm_f32x4_mul(gamma_cb2g, mcb)));
210 wasm_v128_store(r, wasm_f32x4_add(my, wasm_f32x4_mul(gamma_cr2r, mcr)));
211 wasm_v128_store(b, wasm_f32x4_add(my, wasm_f32x4_mul(gamma_cb2b, mcb)));
213 y += 4; cb += 4; cr += 4;
214 r += 4; g += 4; b += 4;
void wasm_cnvrt_float_to_si32_shftd(const float *sp, si32 *dp, float mul, ui32 width)
void wasm_ict_backward(const float *y, const float *cb, const float *cr, float *r, float *g, float *b, ui32 repeat)
void wasm_cnvrt_si32_to_float_shftd(const si32 *sp, float *dp, float mul, ui32 width)
void wasm_cnvrt_float_to_si32(const float *sp, si32 *dp, float mul, ui32 width)
void wasm_ict_forward(const float *r, const float *g, const float *b, float *y, float *cb, float *cr, ui32 repeat)
void wasm_rct_forward(const si32 *r, const si32 *g, const si32 *b, si32 *y, si32 *cb, si32 *cr, ui32 repeat)
void wasm_cnvrt_si32_to_si32_shftd(const si32 *sp, si32 *dp, int shift, ui32 width)
void wasm_cnvrt_si32_to_float(const si32 *sp, float *dp, float mul, ui32 width)
void wasm_rct_backward(const si32 *y, const si32 *cb, const si32 *cr, si32 *r, si32 *g, si32 *b, ui32 repeat)
static const float GAMMA_CR2R
static const float BETA_CbF
static const float GAMMA_CB2B
static const float ALPHA_RF
static const float GAMMA_CB2G
static const float GAMMA_CR2G
static const float ALPHA_BF
static const float BETA_CrF
static const float ALPHA_GF