OpenJPH
Open-source implementation of JPEG2000 Part-15
ojph_img_io_avx2.cpp
Go to the documentation of this file.
1//***************************************************************************/
2// This software is released under the 2-Clause BSD license, included
3// below.
4//
5// Copyright (c) 2019, Aous Naman
6// Copyright (c) 2019, Kakadu Software Pty Ltd, Australia
7// Copyright (c) 2019, The University of New South Wales, Australia
8//
9// Redistribution and use in source and binary forms, with or without
10// modification, are permitted provided that the following conditions are
11// met:
12//
13// 1. Redistributions of source code must retain the above copyright
14// notice, this list of conditions and the following disclaimer.
15//
16// 2. Redistributions in binary form must reproduce the above copyright
17// notice, this list of conditions and the following disclaimer in the
18// documentation and/or other materials provided with the distribution.
19//
20// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
21// IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
22// TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
23// PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
24// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
25// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
26// TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
27// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
28// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
29// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
30// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31//***************************************************************************/
32// This file is part of the OpenJPH software implementation.
33// File: ojph_img_io_avx2.cpp
34// Author: Aous Naman
35// Date: 23 May 2022
36//***************************************************************************/
37
38
39#include <cstdlib>
40#include <cstring>
41#include <immintrin.h>
42
43#include "ojph_file.h"
44#include "ojph_img_io.h"
45#include "ojph_mem.h"
46#include "ojph_message.h"
47
48namespace ojph {
49
51 static
52 ui16 be2le(const ui16 v)
53 {
54 return (ui16)((v<<8) | (v>>8));
55 }
56
58 void avx2_cvrt_32b1c_to_8ub1c(const line_buf *ln0, const line_buf *ln1,
59 const line_buf *ln2, void *dp,
60 ui32 bit_depth, ui32 count)
61 {
62 ojph_unused(ln1);
63 ojph_unused(ln2);
64
65 __m256i max_val_vec = _mm256_set1_epi32((1 << bit_depth) - 1);
66 __m256i zero = _mm256_setzero_si256();
67 __m256i mask = _mm256_set_epi64x(0x0F0B07030E0A0602, 0x0D0905010C080400,
68 0x0F0B07030E0A0602, 0x0D0905010C080400);
69 const si32 *sp = ln0->i32;
70 ui8* p = (ui8 *)dp;
71
72 // 32 bytes or entries in each loop
73 for ( ; count >= 32; count -= 32, sp += 32, p += 32)
74 {
75 __m256i a, t, u, v0, v1;
76 a = _mm256_load_si256((__m256i*)sp);
77 a = _mm256_max_epi32(a, zero);
78 t = _mm256_min_epi32(a, max_val_vec);
79
80 a = _mm256_load_si256((__m256i*)sp + 1);
81 a = _mm256_max_epi32(a, zero);
82 a = _mm256_min_epi32(a, max_val_vec);
83 a = _mm256_slli_epi32(a, 16);
84 t = _mm256_or_si256(t, a);
85
86 a = _mm256_load_si256((__m256i*)sp + 2);
87 a = _mm256_max_epi32(a, zero);
88 u = _mm256_min_epi32(a, max_val_vec);
89
90 a = _mm256_load_si256((__m256i*)sp + 3);
91 a = _mm256_max_epi32(a, zero);
92 a = _mm256_min_epi32(a, max_val_vec);
93 a = _mm256_slli_epi32(a, 16);
94 u = _mm256_or_si256(u, a);
95
96 v0 = _mm256_permute2x128_si256(t, u, 0x20);
97 v1 = _mm256_permute2x128_si256(t, u, 0x31);
98 v1 = _mm256_slli_epi32(v1, 8);
99 v0 = _mm256_or_si256(v0, v1);
100
101 v0 = _mm256_shuffle_epi8(v0, mask);
102 _mm256_storeu_si256((__m256i*)p, v0);
103 }
104
105 int max_val = (1 << bit_depth) - 1;
106 for ( ; count > 0; --count)
107 {
108 int val = *sp++;
109 val = val >= 0 ? val : 0;
110 val = val <= max_val ? val : max_val;
111 *p++ = (ui8)val;
112 }
113 }
114
116 void avx2_cvrt_32b3c_to_8ub3c(const line_buf *ln0, const line_buf *ln1,
117 const line_buf *ln2, void *dp,
118 ui32 bit_depth, ui32 count)
119 {
120 int max_val = (1 << bit_depth) - 1;
121 __m256i max_val_vec = _mm256_set1_epi32(max_val);
122 __m256i zero = _mm256_setzero_si256();
123 __m256i m0 = _mm256_set_epi64x((si64)0xFFFFFFFF0E0D0C0A,
124 (si64)0x0908060504020100,
125 (si64)0xFFFFFFFF0E0D0C0A,
126 (si64)0x0908060504020100);
127
128 // 32 entries in each loop
129 const __m256i* sp0 = (__m256i*)ln0->i32;
130 const __m256i* sp1 = (__m256i*)ln1->i32;
131 const __m256i* sp2 = (__m256i*)ln2->i32;
132 ui8* p = (ui8*)dp;
133 for ( ; count >= 32; count -= 32, sp0 += 4, sp1 += 4, sp2 += 4, p += 96)
134 {
135 __m256i a, t, u, v, w;
136
137 a = _mm256_load_si256(sp0);
138 a = _mm256_max_epi32(a, zero);
139 t = _mm256_min_epi32(a, max_val_vec);
140
141 a = _mm256_load_si256(sp1);
142 a = _mm256_max_epi32(a, zero);
143 a = _mm256_min_epi32(a, max_val_vec);
144 a = _mm256_slli_epi32(a, 8);
145 t = _mm256_or_si256(t, a);
146
147 a = _mm256_load_si256(sp2);
148 a = _mm256_max_epi32(a, zero);
149 a = _mm256_min_epi32(a, max_val_vec);
150 a = _mm256_slli_epi32(a, 16);
151 t = _mm256_or_si256(t, a);
152 t = _mm256_shuffle_epi8(t, m0);
153
154
155 a = _mm256_load_si256(sp0 + 1);
156 a = _mm256_max_epi32(a, zero);
157 u = _mm256_min_epi32(a, max_val_vec);
158
159 a = _mm256_load_si256(sp1 + 1);
160 a = _mm256_max_epi32(a, zero);
161 a = _mm256_min_epi32(a, max_val_vec);
162 a = _mm256_slli_epi32(a, 8);
163 u = _mm256_or_si256(u, a);
164
165 a = _mm256_load_si256(sp2 + 1);
166 a = _mm256_max_epi32(a, zero);
167 a = _mm256_min_epi32(a, max_val_vec);
168 a = _mm256_slli_epi32(a, 16);
169 u = _mm256_or_si256(u, a);
170 u = _mm256_shuffle_epi8(u, m0);
171
172
173 a = _mm256_load_si256(sp0 + 2);
174 a = _mm256_max_epi32(a, zero);
175 v = _mm256_min_epi32(a, max_val_vec);
176
177 a = _mm256_load_si256(sp1 + 2);
178 a = _mm256_max_epi32(a, zero);
179 a = _mm256_min_epi32(a, max_val_vec);
180 a = _mm256_slli_epi32(a, 8);
181 v = _mm256_or_si256(v, a);
182
183 a = _mm256_load_si256(sp2 + 2);
184 a = _mm256_max_epi32(a, zero);
185 a = _mm256_min_epi32(a, max_val_vec);
186 a = _mm256_slli_epi32(a, 16);
187 v = _mm256_or_si256(v, a);
188 v = _mm256_shuffle_epi8(v, m0);
189
190
191 a = _mm256_load_si256(sp0 + 3);
192 a = _mm256_max_epi32(a, zero);
193 w = _mm256_min_epi32(a, max_val_vec);
194
195 a = _mm256_load_si256(sp1 + 3);
196 a = _mm256_max_epi32(a, zero);
197 a = _mm256_min_epi32(a, max_val_vec);
198 a = _mm256_slli_epi32(a, 8);
199 w = _mm256_or_si256(w, a);
200
201 a = _mm256_load_si256(sp2 + 3);
202 a = _mm256_max_epi32(a, zero);
203 a = _mm256_min_epi32(a, max_val_vec);
204 a = _mm256_slli_epi32(a, 16);
205 w = _mm256_or_si256(w, a);
206 w = _mm256_shuffle_epi8(w, m0);
207
208 _mm_storeu_si128((__m128i*)(p ), _mm256_castsi256_si128(t));
209 _mm_storeu_si128((__m128i*)(p + 12), _mm256_extracti128_si256(t,1));
210 _mm_storeu_si128((__m128i*)(p + 24), _mm256_castsi256_si128(u));
211 _mm_storeu_si128((__m128i*)(p + 36), _mm256_extracti128_si256(u,1));
212 _mm_storeu_si128((__m128i*)(p + 48), _mm256_castsi256_si128(v));
213 _mm_storeu_si128((__m128i*)(p + 60), _mm256_extracti128_si256(v,1));
214 _mm_storeu_si128((__m128i*)(p + 72), _mm256_castsi256_si128(w));
215 *((si64*)(p + 84)) = _mm256_extract_epi64(w, 2);
216 *((si32*)(p + 92)) = _mm256_extract_epi32(w, 6);
217
218 // this is an alterative slower implementation
219 //__m256i tx, ux, vx, wx;
220 //tx = _mm256_permute2x128_si256(t, v, 0x20);
221 //ux = _mm256_permute2x128_si256(t, v, 0x31);
222 //vx = _mm256_permute2x128_si256(u, w, 0x20);
223 //wx = _mm256_permute2x128_si256(u, w, 0x31);
224
225 //tx = _mm256_or_si256(tx, _mm256_bslli_epi128(ux, 12));
226 //ux = _mm256_or_si256(_mm256_bsrli_epi128(ux, 4),
227 // _mm256_bslli_epi128(vx, 8));
228 //vx = _mm256_or_si256(_mm256_bsrli_epi128(vx, 8),
229 // _mm256_bslli_epi128(wx, 4));
230
231 //a = _mm256_permute2x128_si256(tx, ux, 0x20);
232 //_mm256_storeu_si256(p, a);
233 //a = _mm256_permute2x128_si256(vx, tx, 0x30);
234 //_mm256_storeu_si256(p + 1, a);
235 //a = _mm256_permute2x128_si256(ux, vx, 0x31);
236 //_mm256_storeu_si256(p + 2, a);
237 }
238
239 const si32* ssp0 = (si32*)sp0;
240 const si32* ssp1 = (si32*)sp1;
241 const si32* ssp2 = (si32*)sp2;
242 for ( ; count > 0; --count)
243 {
244 int val;
245 val = *ssp0++;
246 val = val >= 0 ? val : 0;
247 val = val <= max_val ? val : max_val;
248 *p++ = (ui8) val;
249 val = *ssp1++;
250 val = val >= 0 ? val : 0;
251 val = val <= max_val ? val : max_val;
252 *p++ = (ui8) val;
253 val = *ssp2++;
254 val = val >= 0 ? val : 0;
255 val = val <= max_val ? val : max_val;
256 *p++ = (ui8) val;
257 }
258 }
259
261 void avx2_cvrt_32b1c_to_16ub1c_le(const line_buf *ln0, const line_buf *ln1,
262 const line_buf *ln2, void *dp,
263 ui32 bit_depth, ui32 count)
264 {
265 ojph_unused(ln1);
266 ojph_unused(ln2);
267
268 __m256i max_val_vec = _mm256_set1_epi32((1 << bit_depth) - 1);
269 __m256i zero = _mm256_setzero_si256();
270 __m256i mask = _mm256_set_epi64x(0x0F0E0B0A07060302, 0x0D0C090805040100,
271 0x0F0E0B0A07060302, 0x0D0C090805040100);
272 const si32 *sp = ln0->i32;
273 ui16* p = (ui16 *)dp;
274
275 // 16 entries in each loop
276 for ( ; count >= 16; count -= 16, sp += 16, p += 16)
277 {
278 __m256i a, t;
279 a = _mm256_load_si256((__m256i*)sp);
280 a = _mm256_max_epi32(a, zero);
281 t = _mm256_min_epi32(a, max_val_vec);
282
283 a = _mm256_load_si256((__m256i*)sp + 1);
284 a = _mm256_max_epi32(a, zero);
285 a = _mm256_min_epi32(a, max_val_vec);
286 a = _mm256_slli_epi32(a, 16);
287 t = _mm256_or_si256(t, a);
288
289 t = _mm256_shuffle_epi8(t, mask);
290 t = _mm256_permute4x64_epi64(t, 0xD8);
291 _mm256_storeu_si256((__m256i*)p, t);
292 }
293
294 int max_val = (1<<bit_depth) - 1;
295 for ( ; count > 0; --count)
296 {
297 int val = *sp++;
298 val = val >= 0 ? val : 0;
299 val = val <= max_val ? val : max_val;
300 *p++ = (ui16) val;
301 }
302 }
303
305 void avx2_cvrt_32b1c_to_16ub1c_be(const line_buf *ln0, const line_buf *ln1,
306 const line_buf *ln2, void *dp,
307 ui32 bit_depth, ui32 count)
308 {
309 ojph_unused(ln1);
310 ojph_unused(ln2);
311
312 __m256i max_val_vec = _mm256_set1_epi32((1 << bit_depth) - 1);
313 __m256i zero = _mm256_setzero_si256();
314 __m256i mask = _mm256_set_epi64x(0x0E0F0A0B06070203, 0x0C0D080904050001,
315 0x0E0F0A0B06070203, 0x0C0D080904050001);
316 const si32 *sp = ln0->i32;
317 ui16* p = (ui16 *)dp;
318
319 // 16 entries in each loop
320 for ( ; count >= 16; count -= 16, sp += 16, p += 16)
321 {
322 __m256i a, t;
323 a = _mm256_load_si256((__m256i*)sp);
324 a = _mm256_max_epi32(a, zero);
325 t = _mm256_min_epi32(a, max_val_vec);
326
327 a = _mm256_load_si256((__m256i*)sp + 1);
328 a = _mm256_max_epi32(a, zero);
329 a = _mm256_min_epi32(a, max_val_vec);
330 a = _mm256_slli_epi32(a, 16);
331 t = _mm256_or_si256(t, a);
332
333 t = _mm256_shuffle_epi8(t, mask);
334 t = _mm256_permute4x64_epi64(t, 0xD8);
335 _mm256_storeu_si256((__m256i*)p, t);
336 }
337
338 int max_val = (1<<bit_depth) - 1;
339 for ( ; count > 0; --count)
340 {
341 int val = *sp++;
342 val = val >= 0 ? val : 0;
343 val = val <= max_val ? val : max_val;
344 *p++ = be2le((ui16) val);
345 }
346 }
347}
int64_t si64
Definition: ojph_defs.h:57
void avx2_cvrt_32b3c_to_8ub3c(const line_buf *ln0, const line_buf *ln1, const line_buf *ln2, void *dp, ui32 bit_depth, ui32 count)
uint16_t ui16
Definition: ojph_defs.h:52
void avx2_cvrt_32b1c_to_16ub1c_be(const line_buf *ln0, const line_buf *ln1, const line_buf *ln2, void *dp, ui32 bit_depth, ui32 count)
void avx2_cvrt_32b1c_to_8ub1c(const line_buf *ln0, const line_buf *ln1, const line_buf *ln2, void *dp, ui32 bit_depth, ui32 count)
static ui16 be2le(const ui16 v)
Definition: ojph_img_io.cpp:55
void avx2_cvrt_32b1c_to_16ub1c_le(const line_buf *ln0, const line_buf *ln1, const line_buf *ln2, void *dp, ui32 bit_depth, ui32 count)
int32_t si32
Definition: ojph_defs.h:55
uint32_t ui32
Definition: ojph_defs.h:54
uint8_t ui8
Definition: ojph_defs.h:50
#define ojph_unused(x)
Definition: ojph_defs.h:78
si32 * i32
Definition: ojph_mem.h:155