// Byte swap of v: (v<<8) moves the low byte up and (v>>8) moves the high byte
// down; the cast keeps only the ui16 part of the OR-ed result.
// NOTE(review): assumes ui16 is a 16-bit unsigned type -- confirm its typedef.
54 return (
ui16)((v<<8) | (v>>8));
// Clamp 32-bit samples to [0, (1 << bit_depth) - 1] and pack one byte per
// sample. NOTE(review): this is an excerpt -- the declarations of sp, p, a, t
// and val, and the scalar load/store statements, are not visible here.
// Upper clamp bound replicated into all four 32-bit lanes.
// NOTE(review): 1 << bit_depth is evaluated as int; confirm bit_depth stays small.
65 __m128i max_val_vec = _mm_set1_epi32((1 << bit_depth) - 1);
// Lower clamp bound (all-zero register).
66 __m128i zero = _mm_setzero_si128();
// Byte-shuffle control: output takes byte 0 of each 32-bit lane (values from
// the first load), then byte 1 of each lane (second load), then byte 2, then
// byte 3, i.e. it restores the order in which the input was loaded.
67 __m128i mask = _mm_set_epi64x(0x0F0B07030E0A0602, 0x0D0905010C080400);
// Vector loop: consumes 16 elements of sp and advances p by 16 per iteration.
72 for ( ; count >= 16; count -= 16, sp += 16, p += 16)
// First 16 input bytes: aligned load (requires 16-byte aligned sp), clamp to
// [0, max]; the result occupies the low byte of each lane of t.
// NOTE(review): lanes only stay disjoint after the shifts below if the clamped
// values fit in 8 bits (bit_depth <= 8) -- confirm the caller guarantees this.
75 a = _mm_load_si128((__m128i*)sp);
76 a = _mm_max_epi32(a, zero);
77 t = _mm_min_epi32(a, max_val_vec);
// Next 16 input bytes: clamp, then move into byte 1 of each lane and merge.
79 a = _mm_load_si128((__m128i*)sp + 1);
80 a = _mm_max_epi32(a, zero);
81 a = _mm_min_epi32(a, max_val_vec);
82 a = _mm_slli_epi32(a, 8);
83 t = _mm_or_si128(t, a);
// Third group of 16 input bytes: clamp, move into byte 2 of each lane, merge.
85 a = _mm_load_si128((__m128i*)sp + 2);
86 a = _mm_max_epi32(a, zero);
87 a = _mm_min_epi32(a, max_val_vec);
88 a = _mm_slli_epi32(a, 16);
89 t = _mm_or_si128(t, a);
// Fourth group of 16 input bytes: clamp, move into byte 3 of each lane, merge.
91 a = _mm_load_si128((__m128i*)sp + 3);
92 a = _mm_max_epi32(a, zero);
93 a = _mm_min_epi32(a, max_val_vec);
94 a = _mm_slli_epi32(a, 24);
95 t = _mm_or_si128(t, a);
// Reorder the 16 packed bytes into load order and write them with an
// unaligned store (p needs no particular alignment).
97 t = _mm_shuffle_epi8(t, mask);
98 _mm_storeu_si128((__m128i*)p, t);
// Scalar path for whatever count is left after the vector loop; uses the same
// clamp bounds. NOTE(review): the statements that read and write val are not
// visible in this excerpt.
101 int max_val = (1 << bit_depth) - 1;
102 for ( ; count > 0; --count)
// Clamp val to [0, max_val].
105 val = val >= 0 ? val : 0;
106 val = val <= max_val ? val : max_val;
// Clamp three streams of 32-bit samples (sp0, sp1, sp2) to
// [0, (1 << bit_depth) - 1] and write them as interleaved bytes.
// NOTE(review): this is an excerpt -- the declarations of sp0/sp1/sp2, p and
// val, and the scalar load/store statements, are not visible here.
// Upper clamp bound replicated into all four 32-bit lanes.
121 __m128i max_val_vec = _mm_set1_epi32((1 << bit_depth) - 1);
// Lower clamp bound (all-zero register).
122 __m128i zero = _mm_setzero_si128();
// Byte-shuffle control: keeps bytes 0..2 of each 32-bit lane and drops byte 3,
// producing 12 contiguous bytes; the four 0xFF entries zero the top 4 bytes.
123 __m128i m0 = _mm_set_epi64x((
si64)0xFFFFFFFF0E0D0C0A,
124 (
si64)0x0908060504020100);
// Vector loop: consumes 16 elements from each of sp0/sp1/sp2 and advances p
// by 48 per iteration (three 16-byte stores).
127 for ( ; count >= 16; count -= 16, sp0 += 16, sp1 += 16, sp2 += 16, p += 48)
129 __m128i a, t, u, v, w;
// --- t: first 16 bytes of each stream ---
// sp0 values clamped into byte 0 of each lane.
// NOTE(review): byte lanes only stay disjoint if clamped values fit in 8 bits
// (bit_depth <= 8) -- confirm the caller guarantees this.
130 a = _mm_load_si128((__m128i*)sp0);
131 a = _mm_max_epi32(a, zero);
132 t = _mm_min_epi32(a, max_val_vec);
// sp1 values clamped and merged into byte 1 of each lane.
134 a = _mm_load_si128((__m128i*)sp1);
135 a = _mm_max_epi32(a, zero);
136 a = _mm_min_epi32(a, max_val_vec);
137 a = _mm_slli_epi32(a, 8);
138 t = _mm_or_si128(t, a);
// sp2 values clamped and merged into byte 2 of each lane.
140 a = _mm_load_si128((__m128i*)sp2);
141 a = _mm_max_epi32(a, zero);
142 a = _mm_min_epi32(a, max_val_vec);
143 a = _mm_slli_epi32(a, 16);
144 t = _mm_or_si128(t, a);
// Squeeze out the unused 4th byte of every lane: 12 valid bytes, top 4 zero.
145 t = _mm_shuffle_epi8(t, m0);
// --- u: second 16 bytes of each stream, same clamp/merge/squeeze sequence ---
147 a = _mm_load_si128((__m128i*)sp0 + 1);
148 a = _mm_max_epi32(a, zero);
149 u = _mm_min_epi32(a, max_val_vec);
151 a = _mm_load_si128((__m128i*)sp1 + 1);
152 a = _mm_max_epi32(a, zero);
153 a = _mm_min_epi32(a, max_val_vec);
154 a = _mm_slli_epi32(a, 8);
155 u = _mm_or_si128(u, a);
157 a = _mm_load_si128((__m128i*)sp2 + 1);
158 a = _mm_max_epi32(a, zero);
159 a = _mm_min_epi32(a, max_val_vec);
160 a = _mm_slli_epi32(a, 16);
161 u = _mm_or_si128(u, a);
162 u = _mm_shuffle_epi8(u, m0);
// --- v: third 16 bytes of each stream ---
164 a = _mm_load_si128((__m128i*)sp0 + 2);
165 a = _mm_max_epi32(a, zero);
166 v = _mm_min_epi32(a, max_val_vec);
168 a = _mm_load_si128((__m128i*)sp1 + 2);
169 a = _mm_max_epi32(a, zero);
170 a = _mm_min_epi32(a, max_val_vec);
171 a = _mm_slli_epi32(a, 8);
172 v = _mm_or_si128(v, a);
174 a = _mm_load_si128((__m128i*)sp2 + 2);
175 a = _mm_max_epi32(a, zero);
176 a = _mm_min_epi32(a, max_val_vec);
177 a = _mm_slli_epi32(a, 16);
178 v = _mm_or_si128(v, a);
179 v = _mm_shuffle_epi8(v, m0);
// --- w: fourth 16 bytes of each stream ---
181 a = _mm_load_si128((__m128i*)sp0 + 3);
182 a = _mm_max_epi32(a, zero);
183 w = _mm_min_epi32(a, max_val_vec);
185 a = _mm_load_si128((__m128i*)sp1 + 3);
186 a = _mm_max_epi32(a, zero);
187 a = _mm_min_epi32(a, max_val_vec);
188 a = _mm_slli_epi32(a, 8);
189 w = _mm_or_si128(w, a);
191 a = _mm_load_si128((__m128i*)sp2 + 3);
192 a = _mm_max_epi32(a, zero);
193 a = _mm_min_epi32(a, max_val_vec);
194 a = _mm_slli_epi32(a, 16);
195 w = _mm_or_si128(w, a);
196 w = _mm_shuffle_epi8(w, m0);
// Concatenate the four 12-byte groups (t, u, v, w) into three full 16-byte
// registers using whole-register byte shifts:
//   out0 = t[0..11]  + u[0..3]
//   out1 = u[4..11]  + v[0..7]
//   out2 = v[8..11]  + w[0..11]
198 t = _mm_or_si128(t, _mm_bslli_si128(u, 12));
199 u = _mm_or_si128(_mm_bsrli_si128(u, 4), _mm_bslli_si128(v, 8));
200 v = _mm_or_si128(_mm_bsrli_si128(v, 8), _mm_bslli_si128(w, 4));
// Unaligned stores of the 48 output bytes.
202 _mm_storeu_si128((__m128i*)p + 0, t);
203 _mm_storeu_si128((__m128i*)p + 1, u);
204 _mm_storeu_si128((__m128i*)p + 2, v);
// Scalar path for whatever count is left after the vector loop.
// NOTE(review): the statements that read and write val are not visible in this
// excerpt; the three clamp pairs below presumably handle one value from each
// of the three input streams -- confirm against the full source.
207 int max_val = (1<<bit_depth) - 1;
208 for ( ; count > 0; --count)
// Clamp val to [0, max_val].
212 val = val >= 0 ? val : 0;
213 val = val <= max_val ? val : max_val;
// Clamp val to [0, max_val].
216 val = val >= 0 ? val : 0;
217 val = val <= max_val ? val : max_val;
// Clamp val to [0, max_val].
220 val = val >= 0 ? val : 0;
221 val = val <= max_val ? val : max_val;
// Clamp 32-bit samples to [0, (1 << bit_depth) - 1] and pack them as 16-bit
// values. NOTE(review): this is an excerpt -- the declarations of sp, p, a, t
// and val, and the scalar load/store statements, are not visible here.
// Upper clamp bound replicated into all four 32-bit lanes.
234 __m128i max_val_vec = _mm_set1_epi32((1 << bit_depth) - 1);
// Lower clamp bound (all-zero register).
235 __m128i zero = _mm_setzero_si128();
// Byte-shuffle control: output takes the low 16 bits of each 32-bit lane
// (values from the first load) followed by the high 16 bits of each lane
// (values from the second load); the two bytes of each 16-bit value keep
// their register order.
236 __m128i mask = _mm_set_epi64x(0x0F0E0B0A07060302, 0x0D0C090805040100);
// Vector loop: consumes 8 elements of sp and advances p by 8 per iteration.
// NOTE(review): with a 16-byte store per iteration, p presumably points to
// 16-bit elements -- confirm against its declaration.
241 for ( ; count >= 8; count -= 8, sp += 8, p += 8)
// First 16 input bytes: aligned load (requires 16-byte aligned sp), clamp;
// result sits in the low half of each lane.
// NOTE(review): halves only stay disjoint if clamped values fit in 16 bits
// (bit_depth <= 16) -- confirm the caller guarantees this.
244 a = _mm_load_si128((__m128i*)sp);
245 a = _mm_max_epi32(a, zero);
246 t = _mm_min_epi32(a, max_val_vec);
// Next 16 input bytes: clamp, move into the high half of each lane, merge.
248 a = _mm_load_si128((__m128i*)sp + 1);
249 a = _mm_max_epi32(a, zero);
250 a = _mm_min_epi32(a, max_val_vec);
251 a = _mm_slli_epi32(a, 16);
252 t = _mm_or_si128(t, a);
// Reorder into load order and write 16 bytes with an unaligned store.
254 t = _mm_shuffle_epi8(t, mask);
255 _mm_storeu_si128((__m128i*)p, t);
// Scalar path for whatever count is left after the vector loop.
// NOTE(review): the statements that read and write val are not visible here.
258 int max_val = (1<<bit_depth) - 1;
259 for ( ; count > 0; --count)
// Clamp val to [0, max_val].
262 val = val >= 0 ? val : 0;
263 val = val <= max_val ? val : max_val;
// Clamp three streams of 32-bit samples (sp0, sp1, sp2) to
// [0, (1 << bit_depth) - 1] and write them as interleaved 16-bit values.
// NOTE(review): this is an excerpt -- the declarations of sp0/sp1/sp2, p and
// val, and the scalar load/store statements, are not visible here.
// Upper clamp bound replicated into all four 32-bit lanes.
278 __m128i max_val_vec = _mm_set1_epi32((1 << bit_depth) - 1);
// Lower clamp bound (all-zero register).
279 __m128i zero = _mm_setzero_si128();
// m0..m6 are byte-shuffle controls for _mm_shuffle_epi8. A 0xFF entry yields a
// zero output byte, so the shuffled registers have non-overlapping holes and
// can simply be OR-ed together. Byte pairs are listed low byte first
// (e.g. 0x00 then 0x01), so each 16-bit value keeps its register byte order.
// m0: places the 16-bit values of t into output register 0.
281 __m128i m0 = _mm_set_epi64x((
si64)0x0B0A0908FFFF0706,
282 (
si64)0x0504FFFF03020100);
// m1: places 16-bit values of u into the holes that m0 leaves in output 0.
283 __m128i m1 = _mm_set_epi64x((
si64)0xFFFFFFFF0504FFFF,
284 (
si64)0xFFFF0100FFFFFFFF);
// m2: places the last two 16-bit values of t into output register 1.
285 __m128i m2 = _mm_set_epi64x((
si64)0xFFFFFFFFFFFFFFFF,
286 (
si64)0xFFFF0F0E0D0CFFFF);
// m3: places 16-bit values of u into output register 1.
287 __m128i m3 = _mm_set_epi64x((
si64)0x0706FFFFFFFF0302,
288 (
si64)0x0D0CFFFFFFFF0908);
// m4: places the first two 16-bit values of v into output register 1.
289 __m128i m4 = _mm_set_epi64x((
si64)0xFFFF03020100FFFF,
290 (
si64)0xFFFFFFFFFFFFFFFF);
// m5: places 16-bit values of u into output register 2.
291 __m128i m5 = _mm_set_epi64x((
si64)0xFFFFFFFF0F0EFFFF,
292 (
si64)0xFFFF0B0AFFFFFFFF);
// m6: places the remaining 16-bit values of v into output register 2.
293 __m128i m6 = _mm_set_epi64x((
si64)0x0F0E0D0CFFFF0B0A,
294 (
si64)0x0908FFFF07060504);
// Vector loop: consumes 8 elements from each of sp0/sp1/sp2 and advances p by
// 24 per iteration. NOTE(review): with three 16-byte stores per iteration, p
// presumably points to 16-bit elements -- confirm against its declaration.
297 for ( ; count >= 8; count -= 8, sp0 += 8, sp1 += 8, sp2 += 8, p += 24)
299 __m128i a, b, t, u, v;
// t: low halves = first 16 bytes of sp0, high halves = first 16 bytes of sp1
// (both clamped). NOTE(review): halves only stay disjoint if clamped values
// fit in 16 bits (bit_depth <= 16) -- confirm the caller guarantees this.
300 a = _mm_load_si128((__m128i*)sp0);
301 a = _mm_max_epi32(a, zero);
302 t = _mm_min_epi32(a, max_val_vec);
304 a = _mm_load_si128((__m128i*)sp1);
305 a = _mm_max_epi32(a, zero);
306 a = _mm_min_epi32(a, max_val_vec);
307 a = _mm_slli_epi32(a, 16);
308 t = _mm_or_si128(t, a);
// u: low halves = first 16 bytes of sp2, high halves = second 16 bytes of sp0.
310 a = _mm_load_si128((__m128i*)sp2);
311 a = _mm_max_epi32(a, zero);
312 u = _mm_min_epi32(a, max_val_vec);
314 a = _mm_load_si128((__m128i*)sp0 + 1);
315 a = _mm_max_epi32(a, zero);
316 a = _mm_min_epi32(a, max_val_vec);
317 a = _mm_slli_epi32(a, 16);
318 u = _mm_or_si128(u, a);
// v: low halves = second 16 bytes of sp1, high halves = second 16 bytes of sp2.
320 a = _mm_load_si128((__m128i*)sp1 + 1);
321 a = _mm_max_epi32(a, zero);
322 v = _mm_min_epi32(a, max_val_vec);
324 a = _mm_load_si128((__m128i*)sp2 + 1);
325 a = _mm_max_epi32(a, zero);
326 a = _mm_min_epi32(a, max_val_vec);
327 a = _mm_slli_epi32(a, 16);
328 v = _mm_or_si128(v, a);
// Output register 0: values from t (via m0) and u (via m1), interleaved in
// sp0, sp1, sp2 order.
330 a = _mm_shuffle_epi8(t, m0);
331 b = _mm_shuffle_epi8(u, m1);
332 a = _mm_or_si128(a, b);
333 _mm_storeu_si128((__m128i*)p, a);
// Output register 1: values from t (m2), u (m3) and v (m4).
335 a = _mm_shuffle_epi8(t, m2);
336 b = _mm_shuffle_epi8(u, m3);
337 a = _mm_or_si128(a, b);
338 b = _mm_shuffle_epi8(v, m4);
339 a = _mm_or_si128(a, b);
340 _mm_storeu_si128((__m128i*)p + 1, a);
// Output register 2: values from u (m5) and v (m6).
342 a = _mm_shuffle_epi8(u, m5);
343 b = _mm_shuffle_epi8(v, m6);
344 a = _mm_or_si128(a, b);
345 _mm_storeu_si128((__m128i*)p + 2, a);
// Scalar path for whatever count is left after the vector loop.
// NOTE(review): the statements that read and write val are not visible in this
// excerpt; the three clamp pairs below presumably handle one value from each
// of the three input streams -- confirm against the full source.
348 int max_val = (1<<bit_depth) - 1;
349 for ( ; count > 0; --count)
// Clamp val to [0, max_val].
353 val = val >= 0 ? val : 0;
354 val = val <= max_val ? val : max_val;
// Clamp val to [0, max_val].
357 val = val >= 0 ? val : 0;
358 val = val <= max_val ? val : max_val;
// Clamp val to [0, max_val].
361 val = val >= 0 ? val : 0;
362 val = val <= max_val ? val : max_val;
// Clamp 32-bit samples to [0, (1 << bit_depth) - 1] and pack them as 16-bit
// values with the two bytes of each value swapped.
// NOTE(review): this is an excerpt -- the declarations of sp, p, a, t and val,
// and the scalar load/store statements, are not visible here.
// Upper clamp bound replicated into all four 32-bit lanes.
375 __m128i max_val_vec = _mm_set1_epi32((1 << bit_depth) - 1);
// Lower clamp bound (all-zero register).
376 __m128i zero = _mm_setzero_si128();
// Byte-shuffle control: gathers the low 16 bits of each 32-bit lane (first
// load) followed by the high 16 bits of each lane (second load), and within
// every 16-bit value emits the high byte before the low byte (index 0x01
// precedes 0x00, etc.), i.e. big-endian byte order in memory.
377 __m128i mask = _mm_set_epi64x(0x0E0F0A0B06070203, 0x0C0D080904050001);
// Vector loop: consumes 8 elements of sp and advances p by 8 per iteration.
// NOTE(review): with a 16-byte store per iteration, p presumably points to
// 16-bit elements -- confirm against its declaration.
382 for ( ; count >= 8; count -= 8, sp += 8, p += 8)
// First 16 input bytes: aligned load (requires 16-byte aligned sp), clamp;
// result sits in the low half of each lane.
// NOTE(review): halves only stay disjoint if clamped values fit in 16 bits
// (bit_depth <= 16) -- confirm the caller guarantees this.
385 a = _mm_load_si128((__m128i*)sp);
386 a = _mm_max_epi32(a, zero);
387 t = _mm_min_epi32(a, max_val_vec);
// Next 16 input bytes: clamp, move into the high half of each lane, merge.
389 a = _mm_load_si128((__m128i*)sp + 1);
390 a = _mm_max_epi32(a, zero);
391 a = _mm_min_epi32(a, max_val_vec);
392 a = _mm_slli_epi32(a, 16);
393 t = _mm_or_si128(t, a);
// Reorder into load order with swapped bytes and write 16 bytes (unaligned).
395 t = _mm_shuffle_epi8(t, mask);
396 _mm_storeu_si128((__m128i*)p, t);
// Scalar path for whatever count is left after the vector loop.
// NOTE(review): the statements that read and write val (and any byte swap
// applied to it) are not visible here.
399 int max_val = (1<<bit_depth) - 1;
400 for ( ; count > 0; --count)
// Clamp val to [0, max_val].
403 val = val >= 0 ? val : 0;
404 val = val <= max_val ? val : max_val;
// Clamp three streams of 32-bit samples (sp0, sp1, sp2) to
// [0, (1 << bit_depth) - 1] and write them as interleaved 16-bit values with
// the two bytes of each value swapped.
// NOTE(review): this is an excerpt -- the declarations of sp0/sp1/sp2, p and
// val, and the scalar load/store statements, are not visible here.
// Upper clamp bound replicated into all four 32-bit lanes.
419 __m128i max_val_vec = _mm_set1_epi32((1 << bit_depth) - 1);
// Lower clamp bound (all-zero register).
420 __m128i zero = _mm_setzero_si128();
// m0..m6 are byte-shuffle controls for _mm_shuffle_epi8. A 0xFF entry yields a
// zero output byte, so the shuffled registers have non-overlapping holes and
// can simply be OR-ed together. Byte pairs are listed high byte first
// (e.g. 0x01 then 0x00), so every 16-bit value is emitted big-endian.
// m0: places the 16-bit values of t into output register 0.
422 __m128i m0 = _mm_set_epi64x((
si64)0x0A0B0809FFFF0607,
423 (
si64)0x0405FFFF02030001);
// m1: places 16-bit values of u into the holes that m0 leaves in output 0.
424 __m128i m1 = _mm_set_epi64x((
si64)0xFFFFFFFF0405FFFF,
425 (
si64)0xFFFF0001FFFFFFFF);
// m2: places the last two 16-bit values of t into output register 1.
426 __m128i m2 = _mm_set_epi64x((
si64)0xFFFFFFFFFFFFFFFF,
427 (
si64)0xFFFF0E0F0C0DFFFF);
// m3: places 16-bit values of u into output register 1.
428 __m128i m3 = _mm_set_epi64x((
si64)0x0607FFFFFFFF0203,
429 (
si64)0x0C0DFFFFFFFF0809);
// m4: places the first two 16-bit values of v into output register 1.
430 __m128i m4 = _mm_set_epi64x((
si64)0xFFFF02030001FFFF,
431 (
si64)0xFFFFFFFFFFFFFFFF);
// m5: places 16-bit values of u into output register 2.
432 __m128i m5 = _mm_set_epi64x((
si64)0xFFFFFFFF0E0FFFFF,
433 (
si64)0xFFFF0A0BFFFFFFFF);
// m6: places the remaining 16-bit values of v into output register 2.
434 __m128i m6 = _mm_set_epi64x((
si64)0x0E0F0C0DFFFF0A0B,
435 (
si64)0x0809FFFF06070405);
// Vector loop: consumes 8 elements from each of sp0/sp1/sp2 and advances p by
// 24 per iteration. NOTE(review): with three 16-byte stores per iteration, p
// presumably points to 16-bit elements -- confirm against its declaration.
438 for ( ; count >= 8; count -= 8, sp0 += 8, sp1 += 8, sp2 += 8, p += 24)
440 __m128i a, b, t, u, v;
// t: low halves = first 16 bytes of sp0, high halves = first 16 bytes of sp1
// (both clamped). NOTE(review): halves only stay disjoint if clamped values
// fit in 16 bits (bit_depth <= 16) -- confirm the caller guarantees this.
441 a = _mm_load_si128((__m128i*)sp0);
442 a = _mm_max_epi32(a, zero);
443 t = _mm_min_epi32(a, max_val_vec);
445 a = _mm_load_si128((__m128i*)sp1);
446 a = _mm_max_epi32(a, zero);
447 a = _mm_min_epi32(a, max_val_vec);
448 a = _mm_slli_epi32(a, 16);
449 t = _mm_or_si128(t, a);
// u: low halves = first 16 bytes of sp2, high halves = second 16 bytes of sp0.
451 a = _mm_load_si128((__m128i*)sp2);
452 a = _mm_max_epi32(a, zero);
453 u = _mm_min_epi32(a, max_val_vec);
455 a = _mm_load_si128((__m128i*)sp0 + 1);
456 a = _mm_max_epi32(a, zero);
457 a = _mm_min_epi32(a, max_val_vec);
458 a = _mm_slli_epi32(a, 16);
459 u = _mm_or_si128(u, a);
// v: low halves = second 16 bytes of sp1, high halves = second 16 bytes of sp2.
461 a = _mm_load_si128((__m128i*)sp1 + 1);
462 a = _mm_max_epi32(a, zero);
463 v = _mm_min_epi32(a, max_val_vec);
465 a = _mm_load_si128((__m128i*)sp2 + 1);
466 a = _mm_max_epi32(a, zero);
467 a = _mm_min_epi32(a, max_val_vec);
468 a = _mm_slli_epi32(a, 16);
469 v = _mm_or_si128(v, a);
// Output register 0: values from t (via m0) and u (via m1), interleaved in
// sp0, sp1, sp2 order.
471 a = _mm_shuffle_epi8(t, m0);
472 b = _mm_shuffle_epi8(u, m1);
473 a = _mm_or_si128(a, b);
474 _mm_storeu_si128((__m128i*)p, a);
// Output register 1: values from t (m2), u (m3) and v (m4).
476 a = _mm_shuffle_epi8(t, m2);
477 b = _mm_shuffle_epi8(u, m3);
478 a = _mm_or_si128(a, b);
479 b = _mm_shuffle_epi8(v, m4);
480 a = _mm_or_si128(a, b);
481 _mm_storeu_si128((__m128i*)p + 1, a);
// Output register 2: values from u (m5) and v (m6).
483 a = _mm_shuffle_epi8(u, m5);
484 b = _mm_shuffle_epi8(v, m6);
485 a = _mm_or_si128(a, b);
486 _mm_storeu_si128((__m128i*)p + 2, a);
// Scalar path for whatever count is left after the vector loop.
// NOTE(review): the statements that read and write val (and any byte swap
// applied to it) are not visible in this excerpt; the three clamp pairs below
// presumably handle one value from each of the three input streams -- confirm
// against the full source.
489 int max_val = (1<<bit_depth) - 1;
490 for ( ; count > 0; --count)
// Clamp val to [0, max_val].
494 val = val >= 0 ? val : 0;
495 val = val <= max_val ? val : max_val;
// Clamp val to [0, max_val].
498 val = val >= 0 ? val : 0;
499 val = val <= max_val ? val : max_val;
// Clamp val to [0, max_val].
502 val = val >= 0 ? val : 0;
503 val = val <= max_val ? val : max_val;
// Detached list of function signatures. NOTE(review): only the signatures are
// visible here; their bodies are not attached, so the per-function notes below
// are inferred from the names and must be confirmed against the full source.
// All sse41_cvrt_* entries share one parameter list: three input line buffers
// (ln0, ln1, ln2), an output pointer dp, a bit depth and an element count.
// Presumably 32-bit, 3 components -> 8-bit unsigned, 3 components.
void sse41_cvrt_32b3c_to_8ub3c(const line_buf *ln0, const line_buf *ln1, const line_buf *ln2, void *dp, ui32 bit_depth, ui32 count)
// Presumably 32-bit, 3 components -> 16-bit unsigned little-endian, 3 components.
void sse41_cvrt_32b3c_to_16ub3c_le(const line_buf *ln0, const line_buf *ln1, const line_buf *ln2, void *dp, ui32 bit_depth, ui32 count)
// Presumably 32-bit, 1 component -> 16-bit unsigned big-endian, 1 component.
void sse41_cvrt_32b1c_to_16ub1c_be(const line_buf *ln0, const line_buf *ln1, const line_buf *ln2, void *dp, ui32 bit_depth, ui32 count)
// Presumably 32-bit, 1 component -> 8-bit unsigned, 1 component.
void sse41_cvrt_32b1c_to_8ub1c(const line_buf *ln0, const line_buf *ln1, const line_buf *ln2, void *dp, ui32 bit_depth, ui32 count)
// File-local helper taking and returning a ui16; presumably the byte-swap
// routine whose return statement appears at the top of this excerpt.
static ui16 be2le(const ui16 v)
// Presumably 32-bit, 1 component -> 16-bit unsigned little-endian, 1 component.
void sse41_cvrt_32b1c_to_16ub1c_le(const line_buf *ln0, const line_buf *ln1, const line_buf *ln2, void *dp, ui32 bit_depth, ui32 count)
// Presumably 32-bit, 3 components -> 16-bit unsigned big-endian, 3 components.
void sse41_cvrt_32b3c_to_16ub3c_be(const line_buf *ln0, const line_buf *ln1, const line_buf *ln2, void *dp, ui32 bit_depth, ui32 count)