53 uint32_t rounding_mode = _MM_GET_ROUNDING_MODE();
54 _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST);
55 __m128 shift = _mm_set1_ps(0.5f);
56 __m128 m = _mm_set1_ps(mul);
57 for (
int i = (width + 3) >> 2; i > 0; --i, sp+=4, dp+=4)
59 __m128 t = _mm_loadu_ps(sp);
60 __m128 s = _mm_add_ps(t, shift);
62 _mm_storeu_si128((__m128i*)dp, _mm_cvtps_epi32(s));
64 _MM_SET_ROUNDING_MODE(rounding_mode);
// Converts floats read from sp into 32-bit integers written to dp, four lanes
// per iteration, multiplying each value by mul before the conversion.
// NOTE(review): no signature or braces are visible for this span; sp, dp, mul
// and width are presumably parameters of the enclosing function -- confirm.
// Save the current SSE rounding mode so it can be restored at the end.
71 uint32_t rounding_mode = _MM_GET_ROUNDING_MODE();
72 _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST);
// Broadcast the scale factor to all four lanes.
73 __m128 m = _mm_set1_ps(mul);
// (width + 3) >> 2 is the number of 4-element groups, rounded up; when width
// is not a multiple of 4 the last group extends past width.
74 for (
int i = (width + 3) >> 2; i > 0; --i, sp+=4, dp+=4)
// Unaligned load of four floats, scale, convert, unaligned store.
76 __m128 t = _mm_loadu_ps(sp);
77 __m128 s = _mm_mul_ps(t, m);
// _mm_cvtps_epi32 converts using the rounding mode selected above.
78 _mm_storeu_si128((__m128i*)dp, _mm_cvtps_epi32(s));
// Restore the rounding mode saved on entry.
80 _MM_SET_ROUNDING_MODE(rounding_mode);
// Adds the constant shift to every 32-bit integer read from sp and writes the
// result to dp, four lanes per iteration.
// NOTE(review): no signature or braces are visible for this span; sp, dp,
// shift and width are presumably parameters of the enclosing function -- confirm.
// Broadcast the offset to all four 32-bit lanes.
88 __m128i sh = _mm_set1_epi32(shift);
// (width + 3) >> 2 is the number of 4-element groups, rounded up; when width
// is not a multiple of 4 the last group extends past width.
89 for (
int i = (width + 3) >> 2; i > 0; --i, sp+=4, dp+=4)
// Unaligned load, lane-wise 32-bit add, unaligned store.
91 __m128i s = _mm_loadu_si128((__m128i*)sp);
92 s = _mm_add_epi32(s, sh);
93 _mm_storeu_si128((__m128i*)dp, s);
// Integer colour transform over four 32-bit lanes per iteration:
//   y  = (r + b + 2*g) >> 2   (arithmetic shift)
//   cb = b - g
//   cr = r - g
// NOTE(review): no signature or braces are visible for this span; r, g, b,
// y, cb, cr and repeat are presumably parameters of the enclosing function,
// and the pointer increments at the end presumably belong to the loop body
// -- confirm.
// (repeat + 3) >> 2 is the number of 4-element groups, rounded up.
101 for (
int i = (repeat + 3) >> 2; i > 0; --i)
// _mm_load_si128 / _mm_store_si128 are the aligned forms, so all six
// pointers must be 16-byte aligned.
103 __m128i mr = _mm_load_si128((__m128i*)r);
104 __m128i mg = _mm_load_si128((__m128i*)g);
105 __m128i mb = _mm_load_si128((__m128i*)b);
// t = r + b + (g << 1), then y = t >> 2.
106 __m128i t = _mm_add_epi32(mr, mb);
107 t = _mm_add_epi32(t, _mm_slli_epi32(mg, 1));
108 _mm_store_si128((__m128i*)y, _mm_srai_epi32(t, 2));
// cb = b - g
109 t = _mm_sub_epi32(mb, mg);
110 _mm_store_si128((__m128i*)cb, t);
// cr = r - g
111 t = _mm_sub_epi32(mr, mg);
112 _mm_store_si128((__m128i*)cr, t);
// Advance every input and output pointer by one 4-element group.
114 r += 4; g += 4; b += 4;
115 y += 4; cb += 4; cr += 4;
// Integer colour transform over four 32-bit lanes per iteration:
//   g = y - ((cb + cr) >> 2)   (arithmetic shift)
//   b = cb + g
//   r = cr + g
// NOTE(review): no signature or braces are visible for this span; y, cb, cr,
// r, g, b and repeat are presumably parameters of the enclosing function,
// and the pointer increments at the end presumably belong to the loop body
// -- confirm.
// (repeat + 3) >> 2 is the number of 4-element groups, rounded up.
123 for (
int i = (repeat + 3) >> 2; i > 0; --i)
// _mm_load_si128 / _mm_store_si128 are the aligned forms, so all six
// pointers must be 16-byte aligned.
125 __m128i my = _mm_load_si128((__m128i*)y);
126 __m128i mcb = _mm_load_si128((__m128i*)cb);
127 __m128i mcr = _mm_load_si128((__m128i*)cr);
// g = y - ((cb + cr) >> 2); t holds g for the two sums below.
129 __m128i t = _mm_add_epi32(mcb, mcr);
130 t = _mm_sub_epi32(my, _mm_srai_epi32(t, 2));
131 _mm_store_si128((__m128i*)g, t);
// b = cb + g
132 __m128i u = _mm_add_epi32(mcb, t);
133 _mm_store_si128((__m128i*)b, u);
// r = cr + g
134 u = _mm_add_epi32(mcr, t);
135 _mm_store_si128((__m128i*)r, u);
// Advance every input and output pointer by one 4-element group.
137 y += 4; cb += 4; cr += 4;
138 r += 4; g += 4; b += 4;
// NOTE(review): the five signatures below appear without bodies or
// terminators in this view. They presumably belong to the statement runs
// earlier in the file (matched by parameter names), but the order here does
// not follow the order of those runs -- confirm the pairing against the
// original file.
void sse2_rct_forward(const si32 *r, const si32 *g, const si32 *b, si32 *y, si32 *cb, si32 *cr, ui32 repeat)
void sse2_cnvrt_float_to_si32_shftd(const float *sp, si32 *dp, float mul, ui32 width)
void sse2_cnvrt_si32_to_si32_shftd(const si32 *sp, si32 *dp, int shift, ui32 width)
void sse2_rct_backward(const si32 *y, const si32 *cb, const si32 *cr, si32 *r, si32 *g, si32 *b, ui32 repeat)
void sse2_cnvrt_float_to_si32(const float *sp, si32 *dp, float mul, ui32 width)