void sse2_rev_vert_wvlt_fwd_predict(const line_buf *line_src1,
                                    const line_buf *line_src2,
                                    line_buf *line_dst, ui32 repeat)
{
  si32 *dst = line_dst->i32;
  const si32 *src1 = line_src1->i32, *src2 = line_src2->i32;

  // predict step of the reversible 5/3 lifting: dst -= (src1 + src2) >> 1
  for (ui32 i = (repeat + 3) >> 2; i > 0; --i, dst += 4, src1 += 4, src2 += 4)
  {
    __m128i s1 = _mm_load_si128((__m128i*)src1);
    __m128i s2 = _mm_load_si128((__m128i*)src2);
    __m128i d = _mm_load_si128((__m128i*)dst);
    s1 = _mm_srai_epi32(_mm_add_epi32(s1, s2), 1);
    d = _mm_sub_epi32(d, s1);
    _mm_store_si128((__m128i*)dst, d);
  }
}
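
// For reference, a scalar sketch of the predict step above (the helper name
// is ours, not part of the original file): each destination sample loses the
// floored average of its two vertical neighbors, which _mm_srai_epi32
// computes four lanes at a time.
static inline void rev_vert_fwd_predict_ref(const si32 *src1,
                                            const si32 *src2,
                                            si32 *dst, ui32 repeat)
{
  for (ui32 i = 0; i < repeat; ++i)
    dst[i] -= (src1[i] + src2[i]) >> 1;  // arithmetic shift = floored average
}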

void sse2_rev_vert_wvlt_fwd_update(const line_buf *line_src1,
                                   const line_buf *line_src2,
                                   line_buf *line_dst, ui32 repeat)
{
  si32 *dst = line_dst->i32;
  const si32 *src1 = line_src1->i32, *src2 = line_src2->i32;

  // update step: dst += (src1 + src2 + 2) >> 2
  __m128i offset = _mm_set1_epi32(2);
  for (ui32 i = (repeat + 3) >> 2; i > 0; --i, dst += 4, src1 += 4, src2 += 4)
  {
    __m128i s1 = _mm_load_si128((__m128i*)src1);
    s1 = _mm_add_epi32(s1, offset);
    __m128i s2 = _mm_load_si128((__m128i*)src2);
    s2 = _mm_add_epi32(s2, s1);
    __m128i d = _mm_load_si128((__m128i*)dst);
    d = _mm_add_epi32(d, _mm_srai_epi32(s2, 2));
    _mm_store_si128((__m128i*)dst, d);
  }
}
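
// A matching scalar sketch of the update step (hypothetical helper): the
// +2 offset before the shift rounds (src1 + src2) / 4 to nearest, which is
// exactly what the vector code does by adding the `offset` register first.
static inline void rev_vert_fwd_update_ref(const si32 *src1,
                                           const si32 *src2,
                                           si32 *dst, ui32 repeat)
{
  for (ui32 i = 0; i < repeat; ++i)
    dst[i] += (src1[i] + src2[i] + 2) >> 2;  // rounded quarter-sum
}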

void sse2_rev_horz_wvlt_fwd_tx(line_buf *line_src, line_buf *line_ldst,
                               line_buf *line_hdst, ui32 width, bool even)
{
  if (width > 1)
  {
    si32 *src = line_src->i32;
    si32 *ldst = line_ldst->i32, *hdst = line_hdst->i32;

    const ui32 L_width = (width + (even ? 1 : 0)) >> 1;
    const ui32 H_width = (width + (even ? 0 : 1)) >> 1;

    // symmetric extension, then the predict step
    src[-1] = src[1];
    src[width] = src[width - 2];
    const si32 *sp = src + (even ? 1 : 0);
    si32 *dph = hdst;
    for (ui32 i = (H_width + 3) >> 2; i > 0; --i, dph += 4)
    {
      // process two unaligned quads, then keep the odd-position results
      __m128i s1 = _mm_loadu_si128((__m128i*)(sp - 1));
      __m128i s2 = _mm_loadu_si128((__m128i*)(sp + 1));
      __m128i d = _mm_loadu_si128((__m128i*)sp);
      s1 = _mm_srai_epi32(_mm_add_epi32(s1, s2), 1);
      __m128i d1 = _mm_sub_epi32(d, s1);
      sp += 4;
      s1 = _mm_loadu_si128((__m128i*)(sp - 1));
      s2 = _mm_loadu_si128((__m128i*)(sp + 1));
      d = _mm_loadu_si128((__m128i*)sp);
      s1 = _mm_srai_epi32(_mm_add_epi32(s1, s2), 1);
      __m128i d2 = _mm_sub_epi32(d, s1);
      sp += 4;
      // 0x88 selects lanes 0 and 2 of each quad, deinterleaving the H samples
      d = _mm_castps_si128(_mm_shuffle_ps(
        _mm_castsi128_ps(d1), _mm_castsi128_ps(d2), 0x88));
      _mm_store_si128((__m128i*)dph, d);
    }

    // symmetric extension, then the update step
    hdst[-1] = hdst[0];
    hdst[H_width] = hdst[H_width - 1];
    sp = src + (even ? 0 : 1);
    const si32 *sph = hdst + (even ? 0 : 1);
    si32 *dpl = ldst;
    __m128i offset = _mm_set1_epi32(2);
    for (ui32 i = (L_width + 3) >> 2; i > 0; --i, sp += 8, sph += 4, dpl += 4)
    {
      __m128i s1 = _mm_loadu_si128((__m128i*)(sph - 1));
      s1 = _mm_add_epi32(s1, offset);
      __m128i s2 = _mm_loadu_si128((__m128i*)sph);
      s2 = _mm_add_epi32(s2, s1);
      // gather the even-position input samples with the same lane shuffle
      __m128i d1 = _mm_loadu_si128((__m128i*)sp);
      __m128i d2 = _mm_loadu_si128((__m128i*)sp + 1);
      __m128i d = _mm_castps_si128(_mm_shuffle_ps(
        _mm_castsi128_ps(d1), _mm_castsi128_ps(d2), 0x88));
      d = _mm_add_epi32(d, _mm_srai_epi32(s2, 2));
      _mm_store_si128((__m128i*)dpl, d);
    }
  }
  else // width == 1: pass the single sample through
  {
    if (even)
      line_ldst->i32[0] = line_src->i32[0];
    else
      line_hdst->i32[0] = line_src->i32[0] << 1;
  }
}
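
// A scalar sketch of the horizontal analysis above (hypothetical helper; it
// assumes the caller has applied the same one-sample symmetric extension to
// src and hdst as the SIMD path does). Odd positions become H samples via
// predict, even positions become L samples via update; the SIMD loops get
// the same deinterleaving from _mm_shuffle_ps with 0x88.
static inline void rev_horz_fwd_ref(const si32 *src, si32 *ldst, si32 *hdst,
                                    ui32 L_width, ui32 H_width, bool even)
{
  const si32 *sp = src + (even ? 1 : 0);            // odd-position samples
  for (ui32 i = 0; i < H_width; ++i, sp += 2)
    hdst[i] = sp[0] - ((sp[-1] + sp[1]) >> 1);      // predict
  sp = src + (even ? 0 : 1);                        // even-position samples
  const si32 *sph = hdst + (even ? 0 : 1);
  for (ui32 i = 0; i < L_width; ++i, sp += 2, ++sph)
    ldst[i] = sp[0] + ((sph[-1] + sph[0] + 2) >> 2);  // update
}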

void sse2_rev_vert_wvlt_bwd_predict(const line_buf *line_src1,
                                    const line_buf *line_src2,
                                    line_buf *line_dst, ui32 repeat)
{
  si32 *dst = line_dst->i32;
  const si32 *src1 = line_src1->i32, *src2 = line_src2->i32;

  // inverse predict: dst += (src1 + src2) >> 1
  for (ui32 i = (repeat + 3) >> 2; i > 0; --i, dst += 4, src1 += 4, src2 += 4)
  {
    __m128i s1 = _mm_load_si128((__m128i*)src1);
    __m128i s2 = _mm_load_si128((__m128i*)src2);
    __m128i d = _mm_load_si128((__m128i*)dst);
    s1 = _mm_srai_epi32(_mm_add_epi32(s1, s2), 1);
    d = _mm_add_epi32(d, s1);
    _mm_store_si128((__m128i*)dst, d);
  }
}
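
// Scalar sketch of the inverse predict (hypothetical helper): it adds back
// the same floored neighbor average the forward predict subtracted, so the
// pair is lossless for any si32 input.
static inline void rev_vert_bwd_predict_ref(const si32 *src1,
                                            const si32 *src2,
                                            si32 *dst, ui32 repeat)
{
  for (ui32 i = 0; i < repeat; ++i)
    dst[i] += (src1[i] + src2[i]) >> 1;
}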

void sse2_rev_vert_wvlt_bwd_update(const line_buf *line_src1,
                                   const line_buf *line_src2,
                                   line_buf *line_dst, ui32 repeat)
{
  si32 *dst = line_dst->i32;
  const si32 *src1 = line_src1->i32, *src2 = line_src2->i32;

  // inverse update: dst -= (src1 + src2 + 2) >> 2
  __m128i offset = _mm_set1_epi32(2);
  for (ui32 i = (repeat + 3) >> 2; i > 0; --i, dst += 4, src1 += 4, src2 += 4)
  {
    __m128i s1 = _mm_load_si128((__m128i*)src1);
    s1 = _mm_add_epi32(s1, offset);
    __m128i s2 = _mm_load_si128((__m128i*)src2);
    s2 = _mm_add_epi32(s2, s1);
    __m128i d = _mm_load_si128((__m128i*)dst);
    d = _mm_sub_epi32(d, _mm_srai_epi32(s2, 2));
    _mm_store_si128((__m128i*)dst, d);
  }
}
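
// Scalar sketch of the inverse update (hypothetical helper). Because each
// backward step undoes its forward counterpart exactly, running update then
// predict here reverses the forward predict-then-update order bit for bit.
static inline void rev_vert_bwd_update_ref(const si32 *src1,
                                           const si32 *src2,
                                           si32 *dst, ui32 repeat)
{
  for (ui32 i = 0; i < repeat; ++i)
    dst[i] -= (src1[i] + src2[i] + 2) >> 2;
}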

void sse2_rev_horz_wvlt_bwd_tx(line_buf *line_dst, line_buf *line_lsrc,
                               line_buf *line_hsrc, ui32 width, bool even)
{
  if (width > 1)
  {
    si32 *lsrc = line_lsrc->i32, *hsrc = line_hsrc->i32;
    si32 *dst = line_dst->i32;

    const ui32 L_width = (width + (even ? 1 : 0)) >> 1;
    const ui32 H_width = (width + (even ? 0 : 1)) >> 1;

    // symmetric extension, then the inverse update step
    hsrc[-1] = hsrc[0];
    hsrc[H_width] = hsrc[H_width - 1];
    const si32 *sph = hsrc + (even ? 0 : 1);
    si32 *spl = lsrc;
    __m128i offset = _mm_set1_epi32(2);
    for (ui32 i = (L_width + 3) >> 2; i > 0; --i, sph += 4, spl += 4)
    {
      __m128i s1 = _mm_loadu_si128((__m128i*)(sph - 1));
      s1 = _mm_add_epi32(s1, offset);
      __m128i s2 = _mm_loadu_si128((__m128i*)sph);
      s2 = _mm_add_epi32(s2, s1);
      __m128i d = _mm_load_si128((__m128i*)spl);
      d = _mm_sub_epi32(d, _mm_srai_epi32(s2, 2));
      _mm_store_si128((__m128i*)spl, d);
    }

    // symmetric extension, then the inverse predict step, interleaving the
    // restored even samples with the reconstructed odd samples
    lsrc[-1] = lsrc[0];
    lsrc[L_width] = lsrc[L_width - 1];
    si32 *dp = dst + (even ? 0 : -1);
    spl = lsrc + (even ? 0 : -1);
    sph = hsrc;
    ui32 aug_width = L_width + (even ? 0 : 1);  // samples the loop must cover
    for (ui32 i = (aug_width + 3) >> 2; i > 0; --i, sph += 4, spl += 4, dp += 8)
    {
      __m128i s1 = _mm_loadu_si128((__m128i*)spl);
      __m128i s2 = _mm_loadu_si128((__m128i*)(spl + 1));
      __m128i d = _mm_load_si128((__m128i*)sph);
      s2 = _mm_srai_epi32(_mm_add_epi32(s1, s2), 1);
      d = _mm_add_epi32(d, s2);
      _mm_storeu_si128((__m128i*)dp, _mm_unpacklo_epi32(s1, d));
      _mm_storeu_si128((__m128i*)dp + 1, _mm_unpackhi_epi32(s1, d));
    }
  }
  else // width == 1: pass the single sample through
  {
    if (even)
      line_dst->i32[0] = line_lsrc->i32[0];
    else
      line_dst->i32[0] = line_hsrc->i32[0] >> 1;
  }
}
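
// A scalar sketch of the synthesis above (hypothetical helper; lsrc is
// assumed to already hold the even samples restored by the inverse update,
// with the same one-sample extension as the SIMD path). Each H sample plus
// the floored average of its restored neighbors yields an odd sample, and
// the two streams are interleaved, which the vector loop does with
// _mm_unpacklo_epi32 / _mm_unpackhi_epi32.
static inline void rev_horz_bwd_combine_ref(si32 *dst, const si32 *lsrc,
                                            const si32 *hsrc,
                                            ui32 width, bool even)
{
  const si32 *spl = lsrc, *sph = hsrc;
  si32 *dp = dst;
  if (!even)                          // first output sample is H-derived
    *dp++ = *sph++ + ((spl[-1] + spl[0]) >> 1);
  while (dp < dst + width)
  {
    *dp++ = *spl;                     // even sample straight from L
    if (dp < dst + width)             // inverse predict for the odd sample
      *dp++ = *sph++ + ((spl[0] + spl[1]) >> 1);
    ++spl;
  }
}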