// Fragment: body of avx_cnvrt_si32_to_float_shftd (signature appears detached
// at the end of this extract). Per iteration: load 8 si32 samples, convert to
// float, scale by `mul`, subtract 0.5 (DC level shift), store 8 floats.
// NOTE(review): the loop rounds `width` up to a multiple of 8, so it reads and
// writes up to 7 samples past `width` — presumably buffers are allocated with
// 8-sample padding; confirm against callers.
// NOTE(review): _mm256_store_ps requires 32-byte alignment of dp, while sp is
// loaded unaligned — verify the destination alignment guarantee.
54 __m256 shift = _mm256_set1_ps(0.5f);
55 __m256 m = _mm256_set1_ps(mul);
56 for (
int i = (width + 7) >> 3; i > 0; --i, sp+=8, dp+=8)
58 __m256i t = _mm256_loadu_si256((__m256i*)sp);
59 __m256 s = _mm256_cvtepi32_ps(t);
60 s = _mm256_mul_ps(s, m);
61 s = _mm256_sub_ps(s, shift);
62 _mm256_store_ps(dp, s);
// Fragment: body of avx_cnvrt_si32_to_float (signature detached at end of
// extract). Same as the _shftd variant above but with no 0.5 level shift:
// per iteration, load 8 si32 samples, convert to float, scale by `mul`, store.
// NOTE(review): same 8-sample over-run past `width` and same aligned-store
// requirement on dp as the shifted variant — confirm buffer padding/alignment.
70 __m256 m = _mm256_set1_ps(mul);
71 for (
int i = (width + 7) >> 3; i > 0; --i, sp+=8, dp+=8)
73 __m256i t = _mm256_loadu_si256((__m256i*)sp);
74 __m256 s = _mm256_cvtepi32_ps(t);
75 s = _mm256_mul_ps(s, m);
76 _mm256_store_ps(dp, s);
// Fragment: body of avx_cnvrt_float_to_si32_shftd (signature detached at end
// of extract). Inverse of the si32->float shifted conversion: per iteration,
// load 8 floats, add back the 0.5 level shift, scale by `mul`, round to
// nearest (ties-to-even per _MM_FROUND_TO_NEAREST_INT), convert to si32,
// store 8 integers.
// NOTE(review): source load is aligned (_mm256_load_ps) while the integer
// store is unaligned — the mirror image of the forward conversion; confirm
// which buffers carry the 32-byte alignment guarantee.
84 __m256 shift = _mm256_set1_ps(0.5f);
85 __m256 m = _mm256_set1_ps(mul);
86 for (
int i = (width + 7) >> 3; i > 0; --i, sp+=8, dp+=8)
88 __m256 t = _mm256_load_ps(sp);
89 __m256 s = _mm256_add_ps(t, shift);
90 s = _mm256_mul_ps(s, m);
91 s = _mm256_round_ps(s, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
92 _mm256_storeu_si256((__m256i*)dp, _mm256_cvtps_epi32(s));
// Fragment: body of avx_cnvrt_float_to_si32 (signature detached at end of
// extract). Same as the _shftd variant above but without the 0.5 offset:
// per iteration, load 8 floats, scale by `mul`, round to nearest, convert
// to si32, store 8 integers (unaligned store).
100 __m256 m = _mm256_set1_ps(mul);
101 for (
int i = (width + 7) >> 3; i > 0; --i, sp+=8, dp+=8)
103 __m256 t = _mm256_load_ps(sp);
104 __m256 s = _mm256_mul_ps(t, m);
105 s = _mm256_round_ps(s, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
106 _mm256_storeu_si256((__m256i*)dp, _mm256_cvtps_epi32(s));
// Fragment: tail of the parameter list and body of avx_ict_forward (the
// leading parameters r, g, b and the opening of the function are not visible
// in this extract). Forward irreversible color transform, RGB -> YCbCr,
// 8 samples per iteration:
//   y  = alpha_rf*r + alpha_gf*g + alpha_bf*b
//   cb = beta_cbf * (b - y)
//   cr = beta_crf * (r - y)
// NOTE(review): alpha_rf/alpha_gf/alpha_bf/beta_cbf/beta_crf are __m256
// broadcasts defined outside this fragment — presumably splat from the
// ALPHA_*/BETA_* constants declared later in the extract; confirm.
// NOTE(review): loop rounds `repeat` up to a multiple of 8 and uses aligned
// loads/stores throughout — buffers must be padded and 32-byte aligned.
float *y,
float *cb,
float *cr,
ui32 repeat)
119 for (
int i = (repeat + 7) >> 3; i > 0; --i)
121 __m256 mr = _mm256_load_ps(r);
122 __m256 mb = _mm256_load_ps(b);
123 __m256 my = _mm256_mul_ps(alpha_rf, mr);
124 my = _mm256_add_ps(my, _mm256_mul_ps(alpha_gf, _mm256_load_ps(g)));
125 my = _mm256_add_ps(my, _mm256_mul_ps(alpha_bf, mb));
126 _mm256_store_ps(y, my);
127 _mm256_store_ps(cb, _mm256_mul_ps(beta_cbf, _mm256_sub_ps(mb, my)));
128 _mm256_store_ps(cr, _mm256_mul_ps(beta_crf, _mm256_sub_ps(mr, my)));
130 r += 8; g += 8; b += 8;
131 y += 8; cb += 8; cr += 8;
// Fragment: tail of the parameter list and body of avx_ict_backward (the
// leading parameters y, cb, cr and the opening of the function are not
// visible in this extract). Inverse irreversible color transform,
// YCbCr -> RGB, 8 samples per iteration:
//   g = y - gamma_cr2g*cr - gamma_cb2g*cb
//   r = y + gamma_cr2r*cr
//   b = y + gamma_cb2b*cb
// NOTE(review): gamma_* are __m256 broadcasts defined outside this fragment —
// presumably splat from the GAMMA_* constants declared later in the extract;
// confirm.
// NOTE(review): same padding/alignment requirements as the forward transform
// (repeat rounded up to a multiple of 8; aligned loads and stores).
float *r,
float *g,
float *b,
ui32 repeat)
143 for (
int i = (repeat + 7) >> 3; i > 0; --i)
145 __m256 my = _mm256_load_ps(y);
146 __m256 mcr = _mm256_load_ps(cr);
147 __m256 mcb = _mm256_load_ps(cb);
148 __m256 mg = _mm256_sub_ps(my, _mm256_mul_ps(gamma_cr2g, mcr));
149 _mm256_store_ps(g, _mm256_sub_ps(mg, _mm256_mul_ps(gamma_cb2g, mcb)));
150 _mm256_store_ps(r, _mm256_add_ps(my, _mm256_mul_ps(gamma_cr2r, mcr)));
151 _mm256_store_ps(b, _mm256_add_ps(my, _mm256_mul_ps(gamma_cb2b, mcb)));
153 y += 8; cb += 8; cr += 8;
154 r += 8; g += 8; b += 8;
// Detached function signatures for the AVX routines whose bodies appear as
// fragments above (extraction separated signatures from bodies; no semicolons
// or bodies are attached here).
// float -> si32, no level shift; rounds to nearest.
void avx_cnvrt_float_to_si32(const float *sp, si32 *dp, float mul, ui32 width)
// Forward irreversible color transform: (r,g,b) -> (y,cb,cr), `repeat` samples.
void avx_ict_forward(const float *r, const float *g, const float *b, float *y, float *cb, float *cr, ui32 repeat)
// si32 -> float, scaled by `mul`, then level-shifted by -0.5.
void avx_cnvrt_si32_to_float_shftd(const si32 *sp, float *dp, float mul, ui32 width)
// si32 -> float, scaled by `mul`, no level shift.
void avx_cnvrt_si32_to_float(const si32 *sp, float *dp, float mul, ui32 width)
// float -> si32 with +0.5 level shift restored before scaling and rounding.
void avx_cnvrt_float_to_si32_shftd(const float *sp, si32 *dp, float mul, ui32 width)
// Inverse irreversible color transform: (y,cb,cr) -> (r,g,b), `repeat` samples.
void avx_ict_backward(const float *y, const float *cb, const float *cr, float *r, float *g, float *b, ui32 repeat)
// Detached coefficient declarations (initializers and semicolons not visible
// in this extract). Names match the forward/backward irreversible color
// transform factors used in the bodies above:
//   ALPHA_* : RGB -> Y luma weights; BETA_* : Cb/Cr scale factors;
//   GAMMA_* : inverse-transform chroma contributions to R, G, B.
// NOTE(review): presumably the standard JPEG 2000 Part 1 ICT constants
// (e.g. ALPHA_RF ~ 0.2990, ALPHA_GF ~ 0.5870, ALPHA_BF ~ 0.1140) — the
// values are outside this view; confirm against the full source.
static const float GAMMA_CR2R
static const float BETA_CbF
static const float GAMMA_CB2B
static const float ALPHA_RF
static const float GAMMA_CB2G
static const float GAMMA_CR2G
static const float ALPHA_BF
static const float BETA_CrF
static const float ALPHA_GF