OpenJPH
Open-source implementation of JPEG2000 Part-15
ojph_img_io_sse41.cpp
Go to the documentation of this file.
1//***************************************************************************/
2// This software is released under the 2-Clause BSD license, included
3// below.
4//
5// Copyright (c) 2019, Aous Naman
6// Copyright (c) 2019, Kakadu Software Pty Ltd, Australia
7// Copyright (c) 2019, The University of New South Wales, Australia
8//
9// Redistribution and use in source and binary forms, with or without
10// modification, are permitted provided that the following conditions are
11// met:
12//
13// 1. Redistributions of source code must retain the above copyright
14// notice, this list of conditions and the following disclaimer.
15//
16// 2. Redistributions in binary form must reproduce the above copyright
17// notice, this list of conditions and the following disclaimer in the
18// documentation and/or other materials provided with the distribution.
19//
20// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
21// IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
22// TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
23// PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
24// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
25// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
26// TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
27// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
28// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
29// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
30// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31//***************************************************************************/
32// This file is part of the OpenJPH software implementation.
33// File: ojph_img_io_sse41.cpp
34// Author: Aous Naman
35// Date: 23 May 2022
36//***************************************************************************/
37
38
39#include <cstdlib>
40#include <cstring>
41#include <immintrin.h>
42
43#include "ojph_file.h"
44#include "ojph_img_io.h"
45#include "ojph_mem.h"
46#include "ojph_message.h"
47
48namespace ojph {
49
51 static
52 ui16 be2le(const ui16 v)
53 {
54 return (ui16)((v<<8) | (v>>8));
55 }
56
58 void sse41_cvrt_32b1c_to_8ub1c(const line_buf *ln0, const line_buf *ln1,
59 const line_buf *ln2, void *dp,
60 ui32 bit_depth, ui32 count)
61 {
62 ojph_unused(ln1);
63 ojph_unused(ln2);
64
65 __m128i max_val_vec = _mm_set1_epi32((1 << bit_depth) - 1);
66 __m128i zero = _mm_setzero_si128();
67 __m128i mask = _mm_set_epi64x(0x0F0B07030E0A0602, 0x0D0905010C080400);
68 const si32 *sp = ln0->i32;
69 ui8* p = (ui8 *)dp;
70
71 // 16 bytes or entries in each loop
72 for ( ; count >= 16; count -= 16, sp += 16, p += 16)
73 {
74 __m128i a, t;
75 a = _mm_load_si128((__m128i*)sp);
76 a = _mm_max_epi32(a, zero);
77 t = _mm_min_epi32(a, max_val_vec);
78
79 a = _mm_load_si128((__m128i*)sp + 1);
80 a = _mm_max_epi32(a, zero);
81 a = _mm_min_epi32(a, max_val_vec);
82 a = _mm_slli_epi32(a, 8);
83 t = _mm_or_si128(t, a);
84
85 a = _mm_load_si128((__m128i*)sp + 2);
86 a = _mm_max_epi32(a, zero);
87 a = _mm_min_epi32(a, max_val_vec);
88 a = _mm_slli_epi32(a, 16);
89 t = _mm_or_si128(t, a);
90
91 a = _mm_load_si128((__m128i*)sp + 3);
92 a = _mm_max_epi32(a, zero);
93 a = _mm_min_epi32(a, max_val_vec);
94 a = _mm_slli_epi32(a, 24);
95 t = _mm_or_si128(t, a);
96
97 t = _mm_shuffle_epi8(t, mask);
98 _mm_storeu_si128((__m128i*)p, t);
99 }
100
101 int max_val = (1 << bit_depth) - 1;
102 for ( ; count > 0; --count)
103 {
104 int val = *sp++;
105 val = val >= 0 ? val : 0;
106 val = val <= max_val ? val : max_val;
107 *p++ = (ui8)val;
108 }
109 }
110
112 void sse41_cvrt_32b3c_to_8ub3c(const line_buf *ln0, const line_buf *ln1,
113 const line_buf *ln2, void *dp,
114 ui32 bit_depth, ui32 count)
115 {
116 const si32 *sp0 = ln0->i32;
117 const si32 *sp1 = ln1->i32;
118 const si32 *sp2 = ln2->i32;
119 ui8* p = (ui8 *)dp;
120
121 __m128i max_val_vec = _mm_set1_epi32((1 << bit_depth) - 1);
122 __m128i zero = _mm_setzero_si128();
123 __m128i m0 = _mm_set_epi64x((si64)0xFFFFFFFF0E0D0C0A,
124 (si64)0x0908060504020100);
125
126 // 16 entries in each loop
127 for ( ; count >= 16; count -= 16, sp0 += 16, sp1 += 16, sp2 += 16, p += 48)
128 {
129 __m128i a, t, u, v, w;
130 a = _mm_load_si128((__m128i*)sp0);
131 a = _mm_max_epi32(a, zero);
132 t = _mm_min_epi32(a, max_val_vec);
133
134 a = _mm_load_si128((__m128i*)sp1);
135 a = _mm_max_epi32(a, zero);
136 a = _mm_min_epi32(a, max_val_vec);
137 a = _mm_slli_epi32(a, 8);
138 t = _mm_or_si128(t, a);
139
140 a = _mm_load_si128((__m128i*)sp2);
141 a = _mm_max_epi32(a, zero);
142 a = _mm_min_epi32(a, max_val_vec);
143 a = _mm_slli_epi32(a, 16);
144 t = _mm_or_si128(t, a);
145 t = _mm_shuffle_epi8(t, m0);
146
147 a = _mm_load_si128((__m128i*)sp0 + 1);
148 a = _mm_max_epi32(a, zero);
149 u = _mm_min_epi32(a, max_val_vec);
150
151 a = _mm_load_si128((__m128i*)sp1 + 1);
152 a = _mm_max_epi32(a, zero);
153 a = _mm_min_epi32(a, max_val_vec);
154 a = _mm_slli_epi32(a, 8);
155 u = _mm_or_si128(u, a);
156
157 a = _mm_load_si128((__m128i*)sp2 + 1);
158 a = _mm_max_epi32(a, zero);
159 a = _mm_min_epi32(a, max_val_vec);
160 a = _mm_slli_epi32(a, 16);
161 u = _mm_or_si128(u, a);
162 u = _mm_shuffle_epi8(u, m0);
163
164 a = _mm_load_si128((__m128i*)sp0 + 2);
165 a = _mm_max_epi32(a, zero);
166 v = _mm_min_epi32(a, max_val_vec);
167
168 a = _mm_load_si128((__m128i*)sp1 + 2);
169 a = _mm_max_epi32(a, zero);
170 a = _mm_min_epi32(a, max_val_vec);
171 a = _mm_slli_epi32(a, 8);
172 v = _mm_or_si128(v, a);
173
174 a = _mm_load_si128((__m128i*)sp2 + 2);
175 a = _mm_max_epi32(a, zero);
176 a = _mm_min_epi32(a, max_val_vec);
177 a = _mm_slli_epi32(a, 16);
178 v = _mm_or_si128(v, a);
179 v = _mm_shuffle_epi8(v, m0);
180
181 a = _mm_load_si128((__m128i*)sp0 + 3);
182 a = _mm_max_epi32(a, zero);
183 w = _mm_min_epi32(a, max_val_vec);
184
185 a = _mm_load_si128((__m128i*)sp1 + 3);
186 a = _mm_max_epi32(a, zero);
187 a = _mm_min_epi32(a, max_val_vec);
188 a = _mm_slli_epi32(a, 8);
189 w = _mm_or_si128(w, a);
190
191 a = _mm_load_si128((__m128i*)sp2 + 3);
192 a = _mm_max_epi32(a, zero);
193 a = _mm_min_epi32(a, max_val_vec);
194 a = _mm_slli_epi32(a, 16);
195 w = _mm_or_si128(w, a);
196 w = _mm_shuffle_epi8(w, m0);
197
198 t = _mm_or_si128(t, _mm_bslli_si128(u, 12));
199 u = _mm_or_si128(_mm_bsrli_si128(u, 4), _mm_bslli_si128(v, 8));
200 v = _mm_or_si128(_mm_bsrli_si128(v, 8), _mm_bslli_si128(w, 4));
201
202 _mm_storeu_si128((__m128i*)p + 0, t);
203 _mm_storeu_si128((__m128i*)p + 1, u);
204 _mm_storeu_si128((__m128i*)p + 2, v);
205 }
206
207 int max_val = (1<<bit_depth) - 1;
208 for ( ; count > 0; --count)
209 {
210 int val;
211 val = *sp0++;
212 val = val >= 0 ? val : 0;
213 val = val <= max_val ? val : max_val;
214 *p++ = (ui8) val;
215 val = *sp1++;
216 val = val >= 0 ? val : 0;
217 val = val <= max_val ? val : max_val;
218 *p++ = (ui8) val;
219 val = *sp2++;
220 val = val >= 0 ? val : 0;
221 val = val <= max_val ? val : max_val;
222 *p++ = (ui8) val;
223 }
224 }
225
227 void sse41_cvrt_32b1c_to_16ub1c_le(const line_buf *ln0, const line_buf *ln1,
228 const line_buf *ln2, void *dp,
229 ui32 bit_depth, ui32 count)
230 {
231 ojph_unused(ln1);
232 ojph_unused(ln2);
233
234 __m128i max_val_vec = _mm_set1_epi32((1 << bit_depth) - 1);
235 __m128i zero = _mm_setzero_si128();
236 __m128i mask = _mm_set_epi64x(0x0F0E0B0A07060302, 0x0D0C090805040100);
237 const si32 *sp = ln0->i32;
238 ui16* p = (ui16 *)dp;
239
240 // 8 entries in each loop
241 for ( ; count >= 8; count -= 8, sp += 8, p += 8)
242 {
243 __m128i a, t;
244 a = _mm_load_si128((__m128i*)sp);
245 a = _mm_max_epi32(a, zero);
246 t = _mm_min_epi32(a, max_val_vec);
247
248 a = _mm_load_si128((__m128i*)sp + 1);
249 a = _mm_max_epi32(a, zero);
250 a = _mm_min_epi32(a, max_val_vec);
251 a = _mm_slli_epi32(a, 16);
252 t = _mm_or_si128(t, a);
253
254 t = _mm_shuffle_epi8(t, mask);
255 _mm_storeu_si128((__m128i*)p, t);
256 }
257
258 int max_val = (1<<bit_depth) - 1;
259 for ( ; count > 0; --count)
260 {
261 int val = *sp++;
262 val = val >= 0 ? val : 0;
263 val = val <= max_val ? val : max_val;
264 *p++ = (ui16) val;
265 }
266 }
267
269 void sse41_cvrt_32b3c_to_16ub3c_le(const line_buf *ln0, const line_buf *ln1,
270 const line_buf *ln2, void *dp,
271 ui32 bit_depth, ui32 count)
272 {
273 const si32 *sp0 = ln0->i32;
274 const si32 *sp1 = ln1->i32;
275 const si32 *sp2 = ln2->i32;
276 ui16* p = (ui16*)dp;
277
278 __m128i max_val_vec = _mm_set1_epi32((1 << bit_depth) - 1);
279 __m128i zero = _mm_setzero_si128();
280
281 __m128i m0 = _mm_set_epi64x((si64)0x0B0A0908FFFF0706,
282 (si64)0x0504FFFF03020100);
283 __m128i m1 = _mm_set_epi64x((si64)0xFFFFFFFF0504FFFF,
284 (si64)0xFFFF0100FFFFFFFF);
285 __m128i m2 = _mm_set_epi64x((si64)0xFFFFFFFFFFFFFFFF,
286 (si64)0xFFFF0F0E0D0CFFFF);
287 __m128i m3 = _mm_set_epi64x((si64)0x0706FFFFFFFF0302,
288 (si64)0x0D0CFFFFFFFF0908);
289 __m128i m4 = _mm_set_epi64x((si64)0xFFFF03020100FFFF,
290 (si64)0xFFFFFFFFFFFFFFFF);
291 __m128i m5 = _mm_set_epi64x((si64)0xFFFFFFFF0F0EFFFF,
292 (si64)0xFFFF0B0AFFFFFFFF);
293 __m128i m6 = _mm_set_epi64x((si64)0x0F0E0D0CFFFF0B0A,
294 (si64)0x0908FFFF07060504);
295
296 // 24 entries in each loop
297 for ( ; count >= 8; count -= 8, sp0 += 8, sp1 += 8, sp2 += 8, p += 24)
298 {
299 __m128i a, b, t, u, v;
300 a = _mm_load_si128((__m128i*)sp0);
301 a = _mm_max_epi32(a, zero);
302 t = _mm_min_epi32(a, max_val_vec);
303
304 a = _mm_load_si128((__m128i*)sp1);
305 a = _mm_max_epi32(a, zero);
306 a = _mm_min_epi32(a, max_val_vec);
307 a = _mm_slli_epi32(a, 16);
308 t = _mm_or_si128(t, a);
309
310 a = _mm_load_si128((__m128i*)sp2);
311 a = _mm_max_epi32(a, zero);
312 u = _mm_min_epi32(a, max_val_vec);
313
314 a = _mm_load_si128((__m128i*)sp0 + 1);
315 a = _mm_max_epi32(a, zero);
316 a = _mm_min_epi32(a, max_val_vec);
317 a = _mm_slli_epi32(a, 16);
318 u = _mm_or_si128(u, a);
319
320 a = _mm_load_si128((__m128i*)sp1 + 1);
321 a = _mm_max_epi32(a, zero);
322 v = _mm_min_epi32(a, max_val_vec);
323
324 a = _mm_load_si128((__m128i*)sp2 + 1);
325 a = _mm_max_epi32(a, zero);
326 a = _mm_min_epi32(a, max_val_vec);
327 a = _mm_slli_epi32(a, 16);
328 v = _mm_or_si128(v, a);
329
330 a = _mm_shuffle_epi8(t, m0);
331 b = _mm_shuffle_epi8(u, m1);
332 a = _mm_or_si128(a, b);
333 _mm_storeu_si128((__m128i*)p, a);
334
335 a = _mm_shuffle_epi8(t, m2);
336 b = _mm_shuffle_epi8(u, m3);
337 a = _mm_or_si128(a, b);
338 b = _mm_shuffle_epi8(v, m4);
339 a = _mm_or_si128(a, b);
340 _mm_storeu_si128((__m128i*)p + 1, a);
341
342 a = _mm_shuffle_epi8(u, m5);
343 b = _mm_shuffle_epi8(v, m6);
344 a = _mm_or_si128(a, b);
345 _mm_storeu_si128((__m128i*)p + 2, a);
346 }
347
348 int max_val = (1<<bit_depth) - 1;
349 for ( ; count > 0; --count)
350 {
351 int val;
352 val = *sp0++;
353 val = val >= 0 ? val : 0;
354 val = val <= max_val ? val : max_val;
355 *p++ = be2le((ui16) val);
356 val = *sp1++;
357 val = val >= 0 ? val : 0;
358 val = val <= max_val ? val : max_val;
359 *p++ = be2le((ui16) val);
360 val = *sp2++;
361 val = val >= 0 ? val : 0;
362 val = val <= max_val ? val : max_val;
363 *p++ = (ui16) val;
364 }
365 }
366
368 void sse41_cvrt_32b1c_to_16ub1c_be(const line_buf *ln0, const line_buf *ln1,
369 const line_buf *ln2, void *dp,
370 ui32 bit_depth, ui32 count)
371 {
372 ojph_unused(ln1);
373 ojph_unused(ln2);
374
375 __m128i max_val_vec = _mm_set1_epi32((1 << bit_depth) - 1);
376 __m128i zero = _mm_setzero_si128();
377 __m128i mask = _mm_set_epi64x(0x0E0F0A0B06070203, 0x0C0D080904050001);
378 const si32 *sp = ln0->i32;
379 ui16* p = (ui16 *)dp;
380
381 // 8 entries in each loop
382 for ( ; count >= 8; count -= 8, sp += 8, p += 8)
383 {
384 __m128i a, t;
385 a = _mm_load_si128((__m128i*)sp);
386 a = _mm_max_epi32(a, zero);
387 t = _mm_min_epi32(a, max_val_vec);
388
389 a = _mm_load_si128((__m128i*)sp + 1);
390 a = _mm_max_epi32(a, zero);
391 a = _mm_min_epi32(a, max_val_vec);
392 a = _mm_slli_epi32(a, 16);
393 t = _mm_or_si128(t, a);
394
395 t = _mm_shuffle_epi8(t, mask);
396 _mm_storeu_si128((__m128i*)p, t);
397 }
398
399 int max_val = (1<<bit_depth) - 1;
400 for ( ; count > 0; --count)
401 {
402 int val = *sp++;
403 val = val >= 0 ? val : 0;
404 val = val <= max_val ? val : max_val;
405 *p++ = be2le((ui16) val);
406 }
407 }
408
410 void sse41_cvrt_32b3c_to_16ub3c_be(const line_buf *ln0, const line_buf *ln1,
411 const line_buf *ln2, void *dp,
412 ui32 bit_depth, ui32 count)
413 {
414 const si32 *sp0 = ln0->i32;
415 const si32 *sp1 = ln1->i32;
416 const si32 *sp2 = ln2->i32;
417 ui16* p = (ui16*)dp;
418
419 __m128i max_val_vec = _mm_set1_epi32((1 << bit_depth) - 1);
420 __m128i zero = _mm_setzero_si128();
421
422 __m128i m0 = _mm_set_epi64x((si64)0x0A0B0809FFFF0607,
423 (si64)0x0405FFFF02030001);
424 __m128i m1 = _mm_set_epi64x((si64)0xFFFFFFFF0405FFFF,
425 (si64)0xFFFF0001FFFFFFFF);
426 __m128i m2 = _mm_set_epi64x((si64)0xFFFFFFFFFFFFFFFF,
427 (si64)0xFFFF0E0F0C0DFFFF);
428 __m128i m3 = _mm_set_epi64x((si64)0x0607FFFFFFFF0203,
429 (si64)0x0C0DFFFFFFFF0809);
430 __m128i m4 = _mm_set_epi64x((si64)0xFFFF02030001FFFF,
431 (si64)0xFFFFFFFFFFFFFFFF);
432 __m128i m5 = _mm_set_epi64x((si64)0xFFFFFFFF0E0FFFFF,
433 (si64)0xFFFF0A0BFFFFFFFF);
434 __m128i m6 = _mm_set_epi64x((si64)0x0E0F0C0DFFFF0A0B,
435 (si64)0x0809FFFF06070405);
436
437 // 24 entries in each loop
438 for ( ; count >= 8; count -= 8, sp0 += 8, sp1 += 8, sp2 += 8, p += 24)
439 {
440 __m128i a, b, t, u, v;
441 a = _mm_load_si128((__m128i*)sp0);
442 a = _mm_max_epi32(a, zero);
443 t = _mm_min_epi32(a, max_val_vec);
444
445 a = _mm_load_si128((__m128i*)sp1);
446 a = _mm_max_epi32(a, zero);
447 a = _mm_min_epi32(a, max_val_vec);
448 a = _mm_slli_epi32(a, 16);
449 t = _mm_or_si128(t, a);
450
451 a = _mm_load_si128((__m128i*)sp2);
452 a = _mm_max_epi32(a, zero);
453 u = _mm_min_epi32(a, max_val_vec);
454
455 a = _mm_load_si128((__m128i*)sp0 + 1);
456 a = _mm_max_epi32(a, zero);
457 a = _mm_min_epi32(a, max_val_vec);
458 a = _mm_slli_epi32(a, 16);
459 u = _mm_or_si128(u, a);
460
461 a = _mm_load_si128((__m128i*)sp1 + 1);
462 a = _mm_max_epi32(a, zero);
463 v = _mm_min_epi32(a, max_val_vec);
464
465 a = _mm_load_si128((__m128i*)sp2 + 1);
466 a = _mm_max_epi32(a, zero);
467 a = _mm_min_epi32(a, max_val_vec);
468 a = _mm_slli_epi32(a, 16);
469 v = _mm_or_si128(v, a);
470
471 a = _mm_shuffle_epi8(t, m0);
472 b = _mm_shuffle_epi8(u, m1);
473 a = _mm_or_si128(a, b);
474 _mm_storeu_si128((__m128i*)p, a);
475
476 a = _mm_shuffle_epi8(t, m2);
477 b = _mm_shuffle_epi8(u, m3);
478 a = _mm_or_si128(a, b);
479 b = _mm_shuffle_epi8(v, m4);
480 a = _mm_or_si128(a, b);
481 _mm_storeu_si128((__m128i*)p + 1, a);
482
483 a = _mm_shuffle_epi8(u, m5);
484 b = _mm_shuffle_epi8(v, m6);
485 a = _mm_or_si128(a, b);
486 _mm_storeu_si128((__m128i*)p + 2, a);
487 }
488
489 int max_val = (1<<bit_depth) - 1;
490 for ( ; count > 0; --count)
491 {
492 int val;
493 val = *sp0++;
494 val = val >= 0 ? val : 0;
495 val = val <= max_val ? val : max_val;
496 *p++ = be2le((ui16) val);
497 val = *sp1++;
498 val = val >= 0 ? val : 0;
499 val = val <= max_val ? val : max_val;
500 *p++ = be2le((ui16) val);
501 val = *sp2++;
502 val = val >= 0 ? val : 0;
503 val = val <= max_val ? val : max_val;
504 *p++ = be2le((ui16) val);
505 }
506 }
507}
void sse41_cvrt_32b3c_to_8ub3c(const line_buf *ln0, const line_buf *ln1, const line_buf *ln2, void *dp, ui32 bit_depth, ui32 count)
void sse41_cvrt_32b3c_to_16ub3c_le(const line_buf *ln0, const line_buf *ln1, const line_buf *ln2, void *dp, ui32 bit_depth, ui32 count)
int64_t si64
Definition: ojph_defs.h:57
uint16_t ui16
Definition: ojph_defs.h:52
void sse41_cvrt_32b1c_to_16ub1c_be(const line_buf *ln0, const line_buf *ln1, const line_buf *ln2, void *dp, ui32 bit_depth, ui32 count)
void sse41_cvrt_32b1c_to_8ub1c(const line_buf *ln0, const line_buf *ln1, const line_buf *ln2, void *dp, ui32 bit_depth, ui32 count)
static ui16 be2le(const ui16 v)
Definition: ojph_img_io.cpp:55
void sse41_cvrt_32b1c_to_16ub1c_le(const line_buf *ln0, const line_buf *ln1, const line_buf *ln2, void *dp, ui32 bit_depth, ui32 count)
int32_t si32
Definition: ojph_defs.h:55
void sse41_cvrt_32b3c_to_16ub3c_be(const line_buf *ln0, const line_buf *ln1, const line_buf *ln2, void *dp, ui32 bit_depth, ui32 count)
uint32_t ui32
Definition: ojph_defs.h:54
uint8_t ui8
Definition: ojph_defs.h:50
#define ojph_unused(x)
Definition: ojph_defs.h:78
si32 * i32
Definition: ojph_mem.h:155