OpenJPH
Open-source implementation of JPEG2000 Part-15
ojph_transform_avx.cpp
Go to the documentation of this file.
1//***************************************************************************/
2// This software is released under the 2-Clause BSD license, included
3// below.
4//
5// Copyright (c) 2019, Aous Naman
6// Copyright (c) 2019, Kakadu Software Pty Ltd, Australia
7// Copyright (c) 2019, The University of New South Wales, Australia
8//
9// Redistribution and use in source and binary forms, with or without
10// modification, are permitted provided that the following conditions are
11// met:
12//
13// 1. Redistributions of source code must retain the above copyright
14// notice, this list of conditions and the following disclaimer.
15//
16// 2. Redistributions in binary form must reproduce the above copyright
17// notice, this list of conditions and the following disclaimer in the
18// documentation and/or other materials provided with the distribution.
19//
20// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
21// IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
22// TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
23// PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
24// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
25// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
26// TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
27// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
28// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
29// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
30// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31//***************************************************************************/
32// This file is part of the OpenJPH software implementation.
33// File: ojph_transform_avx.cpp
34// Author: Aous Naman
35// Date: 28 August 2019
36//***************************************************************************/
37
38#include <cstdio>
39
40#include "ojph_defs.h"
41#include "ojph_arch.h"
42#include "ojph_mem.h"
43#include "ojph_transform.h"
45
46#include <immintrin.h>
47
48namespace ojph {
49 namespace local {
50
52 void avx_irrev_vert_wvlt_step(const line_buf* line_src1,
53 const line_buf* line_src2,
54 line_buf *line_dst, int step_num,
55 ui32 repeat)
56 {
57 float *dst = line_dst->f32;
58 const float *src1 = line_src1->f32, *src2 = line_src2->f32;
59
60 __m256 factor = _mm256_set1_ps(LIFTING_FACTORS::steps[step_num]);
61 for (ui32 i = (repeat + 7) >> 3; i > 0; --i, dst+=8, src1+=8, src2+=8)
62 {
63 __m256 s1 = _mm256_load_ps(src1);
64 __m256 s2 = _mm256_load_ps(src2);
65 __m256 d = _mm256_load_ps(dst);
66 d = _mm256_add_ps(d, _mm256_mul_ps(factor, _mm256_add_ps(s1, s2)));
67 _mm256_store_ps(dst, d);
68 }
69 }
70
72 void avx_irrev_vert_wvlt_K(const line_buf* line_src, line_buf* line_dst,
73 bool L_analysis_or_H_synthesis, ui32 repeat)
74 {
75 float *dst = line_dst->f32;
76 const float *src = line_src->f32;
77
78 float f = LIFTING_FACTORS::K_inv;
79 f = L_analysis_or_H_synthesis ? f : LIFTING_FACTORS::K;
80 __m256 factor = _mm256_set1_ps(f);
81 for (ui32 i = (repeat + 7) >> 3; i > 0; --i, dst+=8, src+=8)
82 {
83 __m256 s = _mm256_load_ps(src);
84 _mm256_store_ps(dst, _mm256_mul_ps(factor, s));
85 }
86 }
87
88
    //////////////////////////////////////////////////////////////////////////
    // Forward (analysis) horizontal irreversible (9/7) wavelet transform of
    // one line, using AVX. The input line is split into a low-pass band
    // (ldst, L_width samples) and a high-pass band (hdst, H_width samples)
    // through four lifting steps (predict/update twice) followed by the
    // K_inv/K gain. Buffers are assumed padded on both sides (writes at
    // index -1 and at the band width) and 32-byte aligned where aligned
    // stores are used.
    //
    // line_src  : input samples (modified in place by boundary extension)
    // line_ldst : receives the low-pass band
    // line_hdst : receives the high-pass band
    // width     : number of input samples
    // even      : true when the line starts on an even (low-pass) sample
    void avx_irrev_horz_wvlt_fwd_tx(line_buf *line_src, line_buf *line_ldst,
                                    line_buf *line_hdst, ui32 width,
                                    bool even)
    {
      if (width > 1)
      {
        float *src = line_src->f32;
        float *ldst = line_ldst->f32, *hdst = line_hdst->f32;

        // band sizes: an even start gives the extra sample to the low band
        const ui32 L_width = (width + (even ? 1 : 0)) >> 1;
        const ui32 H_width = (width + (even ? 0 : 1)) >> 1;

        //extension (symmetric, into the padding cells)
        src[-1] = src[1];
        src[width] = src[width-2];
        // predict (lifting step 0): h[i] = odd + steps[0]*(left + right);
        // the 8-wide computation yields interleaved results, and the
        // shuffle below keeps only the 4 odd-position lanes per iteration
        const float* sp = src + (even ? 1 : 0);
        float *dph = hdst;
        __m256 factor = _mm256_set1_ps(LIFTING_FACTORS::steps[0]);
        for (ui32 i = (H_width + 3) >> 2; i > 0; --i)
        { //this is doing twice the work it needs to do
          //it can be definitely written better
          __m256 s1 = _mm256_loadu_ps(sp - 1);
          __m256 s2 = _mm256_loadu_ps(sp + 1);
          __m256 d = _mm256_loadu_ps(sp);
          s1 = _mm256_mul_ps(factor, _mm256_add_ps(s1, s2));
          __m256 d1 = _mm256_add_ps(d, s1);
          sp += 8;
          // compact lanes 0,2 of each 128-bit half -> 4 high-band outputs
          __m128 t1 = _mm256_extractf128_ps(d1, 0);
          __m128 t2 = _mm256_extractf128_ps(d1, 1);
          __m128 t = _mm_shuffle_ps(t1, t2, _MM_SHUFFLE(2, 0, 2, 0));
          _mm_store_ps(dph, t);
          dph += 4;
        }

        // extension
        hdst[-1] = hdst[0];
        hdst[H_width] = hdst[H_width-1];
        // update (lifting step 1): l[i] = even + steps[1]*(h[i-1] + h[i]);
        // SSE width here because only 4 low-band samples are produced per
        // 8 input samples
        __m128 factor128 = _mm_set1_ps(LIFTING_FACTORS::steps[1]);
        sp = src + (even ? 0 : 1);
        const float* sph = hdst + (even ? 0 : 1);
        float *dpl = ldst;
        for (ui32 i = (L_width + 3) >> 2; i > 0; --i, sp+=8, sph+=4, dpl+=4)
        {
          __m256 d1 = _mm256_loadu_ps(sp); //is there an advantage here?
          // gather the 4 even-position input samples from the 8 loaded
          __m128 t1 = _mm256_extractf128_ps(d1, 0);
          __m128 t2 = _mm256_extractf128_ps(d1, 1);
          __m128 d = _mm_shuffle_ps(t1, t2, _MM_SHUFFLE(2, 0, 2, 0));

          __m128 s1 = _mm_loadu_ps(sph - 1);
          __m128 s2 = _mm_loadu_ps(sph);
          s1 = _mm_mul_ps(factor128, _mm_add_ps(s1, s2));
          d = _mm_add_ps(d, s1);
          _mm_store_ps(dpl, d);
        }

        //extension
        ldst[-1] = ldst[0];
        ldst[L_width] = ldst[L_width-1];
        //predict (lifting step 2): bands are now deinterleaved, so full
        //8-wide AVX is used from here on
        factor = _mm256_set1_ps(LIFTING_FACTORS::steps[2]);
        const float* spl = ldst + (even ? 1 : 0);
        dph = hdst;
        for (ui32 i = (H_width + 7) >> 3; i > 0; --i, spl+=8, dph+=8)
        {
          __m256 s1 = _mm256_loadu_ps(spl - 1);
          __m256 s2 = _mm256_loadu_ps(spl);
          __m256 d = _mm256_loadu_ps(dph);
          s1 = _mm256_mul_ps(factor, _mm256_add_ps(s1, s2));
          d = _mm256_add_ps(d, s1);
          _mm256_store_ps(dph, d);
        }

        // extension
        hdst[-1] = hdst[0];
        hdst[H_width] = hdst[H_width-1];
        // update (lifting step 3)
        factor = _mm256_set1_ps(LIFTING_FACTORS::steps[3]);
        sph = hdst + (even ? 0 : 1);
        dpl = ldst;
        for (ui32 i = (L_width + 7) >> 3; i > 0; --i, sph+=8, dpl+=8)
        {
          __m256 s1 = _mm256_loadu_ps(sph - 1);
          __m256 s2 = _mm256_loadu_ps(sph);
          __m256 d = _mm256_loadu_ps(dpl);
          s1 = _mm256_mul_ps(factor, _mm256_add_ps(s1, s2));
          d = _mm256_add_ps(d, s1);
          _mm256_store_ps(dpl, d);
        }

        //multipliers: low band scaled by 1/K, high band by K
        float *dp = ldst;
        factor = _mm256_set1_ps(LIFTING_FACTORS::K_inv);
        for (ui32 i = (L_width + 7) >> 3; i > 0; --i, dp+=8)
        {
          __m256 d = _mm256_load_ps(dp);
          _mm256_store_ps(dp, _mm256_mul_ps(factor, d));
        }
        dp = hdst;
        factor = _mm256_set1_ps(LIFTING_FACTORS::K);
        for (ui32 i = (H_width + 7) >> 3; i > 0; --i, dp+=8)
        {
          __m256 d = _mm256_load_ps(dp);
          _mm256_store_ps(dp, _mm256_mul_ps(factor, d));
        }
      }
      else
      {
        // degenerate single-sample line: passes through to the low band when
        // even; otherwise doubled into the high band (mirrored by the 0.5f
        // in the inverse transform)
        if (even)
          line_ldst->f32[0] = line_src->f32[0];
        else
          line_hdst->f32[0] = line_src->f32[0] + line_src->f32[0];
      }
    }
205
207 void avx_irrev_horz_wvlt_bwd_tx(line_buf* line_dst, line_buf *line_lsrc,
208 line_buf *line_hsrc, ui32 width,
209 bool even)
210 {
211 if (width > 1)
212 {
213 float *lsrc = line_lsrc->f32, *hsrc = line_hsrc->f32;
214 float *dst = line_dst->f32;
215
216 const ui32 L_width = (width + (even ? 1 : 0)) >> 1;
217 const ui32 H_width = (width + (even ? 0 : 1)) >> 1;
218
219 //multipliers
220 float *dp = lsrc;
221 __m256 factor = _mm256_set1_ps(LIFTING_FACTORS::K);
222 for (ui32 i = (L_width + 7) >> 3; i > 0; --i, dp+=8)
223 {
224 __m256 d = _mm256_load_ps(dp);
225 _mm256_store_ps(dp, _mm256_mul_ps(factor, d));
226 }
227 dp = hsrc;
228 factor = _mm256_set1_ps(LIFTING_FACTORS::K_inv);
229 for (ui32 i = (H_width + 7) >> 3; i > 0; --i, dp+=8)
230 {
231 __m256 d = _mm256_load_ps(dp);
232 _mm256_store_ps(dp, _mm256_mul_ps(factor, d));
233 }
234
235 //extension
236 hsrc[-1] = hsrc[0];
237 hsrc[H_width] = hsrc[H_width-1];
238 //inverse update
239 factor = _mm256_set1_ps(LIFTING_FACTORS::steps[7]);
240 const float *sph = hsrc + (even ? 0 : 1);
241 float *dpl = lsrc;
242 for (ui32 i = (L_width + 7) >> 3; i > 0; --i, sph+=8, dpl+=8)
243 {
244 __m256 s1 = _mm256_loadu_ps(sph - 1);
245 __m256 s2 = _mm256_loadu_ps(sph);
246 __m256 d = _mm256_loadu_ps(dpl);
247 s1 = _mm256_mul_ps(factor, _mm256_add_ps(s1, s2));
248 d = _mm256_add_ps(d, s1);
249 _mm256_store_ps(dpl, d);
250 }
251
252 //extension
253 lsrc[-1] = lsrc[0];
254 lsrc[L_width] = lsrc[L_width-1];
255 //inverse perdict
256 factor = _mm256_set1_ps(LIFTING_FACTORS::steps[6]);
257 const float *spl = lsrc + (even ? 0 : -1);
258 float *dph = hsrc;
259 for (ui32 i = (H_width + 7) >> 3; i > 0; --i, dph+=8, spl+=8)
260 {
261 __m256 s1 = _mm256_loadu_ps(spl);
262 __m256 s2 = _mm256_loadu_ps(spl + 1);
263 __m256 d = _mm256_loadu_ps(dph);
264 s1 = _mm256_mul_ps(factor, _mm256_add_ps(s1, s2));
265 d = _mm256_add_ps(d, s1);
266 _mm256_store_ps(dph, d);
267 }
268
269 //extension
270 hsrc[-1] = hsrc[0];
271 hsrc[H_width] = hsrc[H_width-1];
272 //inverse update
273 factor = _mm256_set1_ps(LIFTING_FACTORS::steps[5]);
274 sph = hsrc + (even ? 0 : 1);
275 dpl = lsrc;
276 for (ui32 i = (L_width + 7) >> 3; i > 0; --i, dpl+=8, sph+=8)
277 {
278 __m256 s1 = _mm256_loadu_ps(sph - 1);
279 __m256 s2 = _mm256_loadu_ps(sph);
280 __m256 d = _mm256_loadu_ps(dpl);
281 s1 = _mm256_mul_ps(factor, _mm256_add_ps(s1, s2));
282 d = _mm256_add_ps(d, s1);
283 _mm256_store_ps(dpl, d);
284 }
285
286 //extension
287 lsrc[-1] = lsrc[0];
288 lsrc[L_width] = lsrc[L_width-1];
289 //inverse perdict and combine
290 factor = _mm256_set1_ps(LIFTING_FACTORS::steps[4]);
291 dp = dst + (even ? 0 : -1);
292 spl = lsrc + (even ? 0 : -1);
293 sph = hsrc;
294 ui32 width = L_width + (even ? 0 : 1);
295 for (ui32 i = (width + 7) >> 3; i > 0; --i, spl+=8, sph+=8)
296 {
297 __m256 s1 = _mm256_loadu_ps(spl);
298 __m256 s2 = _mm256_loadu_ps(spl + 1);
299 __m256 d = _mm256_load_ps(sph);
300 s2 = _mm256_mul_ps(factor, _mm256_add_ps(s1, s2));
301 d = _mm256_add_ps(d, s2);
302
303 __m128 a0 = _mm256_extractf128_ps(s1, 0);
304 __m128 a1 = _mm256_extractf128_ps(s1, 1);
305 __m128 a2 = _mm256_extractf128_ps(d, 0);
306 __m128 a3 = _mm256_extractf128_ps(d, 1);
307 _mm_storeu_ps(dp, _mm_unpacklo_ps(a0, a2)); dp += 4;
308 _mm_storeu_ps(dp, _mm_unpackhi_ps(a0, a2)); dp += 4;
309 _mm_storeu_ps(dp, _mm_unpacklo_ps(a1, a3)); dp += 4;
310 _mm_storeu_ps(dp, _mm_unpackhi_ps(a1, a3)); dp += 4;
311
312// s2 = _mm256_unpackhi_ps(s1, d);
313// s1 = _mm256_unpacklo_ps(s1, d);
314// d = _mm256_permute2f128_ps(s1, s2, (2 << 4) | 0);
315// _mm256_storeu_ps(dp, d);
316// d = _mm256_permute2f128_ps(s1, s2, (3 << 4) | 1);
317// _mm256_storeu_ps(dp + 1, d);
318 }
319 }
320 else
321 {
322 if (even)
323 line_dst->f32[0] = line_lsrc->f32[0];
324 else
325 line_dst->f32[0] = line_hsrc->f32[0] * 0.5f;
326 }
327 }
328 }
329}
void avx_irrev_horz_wvlt_bwd_tx(line_buf *line_dst, line_buf *line_lsrc, line_buf *line_hsrc, ui32 width, bool even)
void avx_irrev_horz_wvlt_fwd_tx(line_buf *line_src, line_buf *line_ldst, line_buf *line_hdst, ui32 width, bool even)
void avx_irrev_vert_wvlt_K(const line_buf *line_src, line_buf *line_dst, bool L_analysis_or_H_synthesis, ui32 repeat)
void avx_irrev_vert_wvlt_step(const line_buf *line_src1, const line_buf *line_src2, line_buf *line_dst, int step_num, ui32 repeat)
uint32_t ui32
Definition: ojph_defs.h:54
float * f32
Definition: ojph_mem.h:156