OpenJPH
Open-source implementation of JPEG2000 Part-15
ojph_transform_sse2.cpp
Go to the documentation of this file.
1//***************************************************************************/
2// This software is released under the 2-Clause BSD license, included
3// below.
4//
5// Copyright (c) 2019, Aous Naman
6// Copyright (c) 2019, Kakadu Software Pty Ltd, Australia
7// Copyright (c) 2019, The University of New South Wales, Australia
8//
9// Redistribution and use in source and binary forms, with or without
10// modification, are permitted provided that the following conditions are
11// met:
12//
13// 1. Redistributions of source code must retain the above copyright
14// notice, this list of conditions and the following disclaimer.
15//
16// 2. Redistributions in binary form must reproduce the above copyright
17// notice, this list of conditions and the following disclaimer in the
18// documentation and/or other materials provided with the distribution.
19//
20// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
21// IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
22// TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
23// PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
24// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
25// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
26// TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
27// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
28// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
29// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
30// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31//***************************************************************************/
32// This file is part of the OpenJPH software implementation.
33// File: ojph_transform_sse2.cpp
34// Author: Aous Naman
35// Date: 28 August 2019
36//***************************************************************************/
37
38#include <cstdio>
39
40#include "ojph_defs.h"
41#include "ojph_arch.h"
42#include "ojph_mem.h"
43#include "ojph_transform.h"
45
46#include <immintrin.h>
47
48namespace ojph {
49 namespace local {
50
53 const line_buf* line_src2,
54 line_buf *line_dst, ui32 repeat)
55 {
// NOTE(review): the first line of this signature (function name and the
// line_src1 parameter) is missing from this listing. Judging by the
// arithmetic below this looks like sse2_rev_vert_wvlt_fwd_predict --
// confirm against the header.
//
// Element-wise over the i32 samples of the three line buffers:
//   dst[i] -= (src1[i] + src2[i]) >> 1
// The result is written back into line_dst in place.
56 si32 *dst = line_dst->i32;
57 const si32 *src1 = line_src1->i32, *src2 = line_src2->i32;
58
// Four 32-bit lanes per iteration; the iteration count is repeat rounded
// up to a multiple of 4, so up to 3 samples past `repeat` are read and
// written. The aligned load/store intrinsics require all three pointers
// to be 16-byte aligned.
59 for (ui32 i = (repeat + 3) >> 2; i > 0; --i, dst+=4, src1+=4, src2+=4)
60 {
61 __m128i s1 = _mm_load_si128((__m128i*)src1);
62 __m128i s2 = _mm_load_si128((__m128i*)src2);
63 __m128i d = _mm_load_si128((__m128i*)dst);
// arithmetic (sign-preserving) shift: (s1 + s2) >> 1
64 s1 = _mm_srai_epi32(_mm_add_epi32(s1, s2), 1);
65 d = _mm_sub_epi32(d, s1);
66 _mm_store_si128((__m128i*)dst, d);
67 }
68 }
69
72 const line_buf* line_src2,
73 line_buf *line_dst, ui32 repeat)
74 {
// NOTE(review): the first line of this signature (function name and the
// line_src1 parameter) is missing from this listing. Judging by the
// arithmetic below this looks like sse2_rev_vert_wvlt_fwd_update --
// confirm against the header.
//
// Element-wise over the i32 samples of the three line buffers:
//   dst[i] += (src1[i] + src2[i] + 2) >> 2
// The result is written back into line_dst in place.
75 si32 *dst = line_dst->i32;
76 const si32 *src1 = line_src1->i32, *src2 = line_src2->i32;
77
// rounding offset added before the shift by 2
78 __m128i offset = _mm_set1_epi32(2);
// Four 32-bit lanes per iteration; the iteration count is repeat rounded
// up to a multiple of 4, so up to 3 samples past `repeat` are read and
// written. The aligned load/store intrinsics require all three pointers
// to be 16-byte aligned.
79 for (ui32 i = (repeat + 3) >> 2; i > 0; --i, dst+=4, src1+=4, src2+=4)
80 {
81 __m128i s1 = _mm_load_si128((__m128i*)src1);
82 s1 = _mm_add_epi32(s1, offset);
83 __m128i s2 = _mm_load_si128((__m128i*)src2);
84 s2 = _mm_add_epi32(s2, s1);
85 __m128i d = _mm_load_si128((__m128i*)dst);
// arithmetic (sign-preserving) shift: (s1 + s2 + 2) >> 2
86 d = _mm_add_epi32(d, _mm_srai_epi32(s2, 2));
87 _mm_store_si128((__m128i*)dst, d);
88 }
89 }
90
// Forward horizontal lifting of one line of i32 samples, SSE2 version.
//
// For width > 1 the samples of line_src are split into two outputs:
//   predict: hdst[j] = x[2j+o]  - ((x[2j+o-1] + x[2j+o+1]) >> 1)
//            with o = (even ? 1 : 0), x = src
//   update:  ldst[j] = x[2j+o'] + ((h[j-1+o'] + h[j+o'] + 2) >> 2)
//            with o' = (even ? 0 : 1), h = hdst
// ldst receives L_width samples and hdst H_width samples (see below).
// For width <= 1 a single sample is copied (even) or doubled (odd).
//
// Side effects a caller must allow for:
//  - line_src is modified: src[-1] and src[width] are overwritten.
//  - hdst[-1] and hdst[H_width] are overwritten.
//  - Both loops round their trip count up to a multiple of 4 outputs, so
//    they read and write past the nominal widths.
//  - ldst and hdst are written with aligned stores (16-byte alignment).
// NOTE(review): presumably line_buf allocations provide the required
// padding and alignment -- confirm against the allocator.
92 void sse2_rev_horz_wvlt_fwd_tx(line_buf *line_src, line_buf *line_ldst,
93 line_buf *line_hdst, ui32 width, bool even)
94 {
95 if (width > 1)
96 {
97 si32 *src = line_src->i32;
98 si32 *ldst = line_ldst->i32, *hdst = line_hdst->i32;
99
// number of outputs written to ldst / hdst; the extra sample of an odd
// width goes to ldst when `even` is true, otherwise to hdst
100 const ui32 L_width = (width + (even ? 1 : 0)) >> 1;
101 const ui32 H_width = (width + (even ? 0 : 1)) >> 1;
102
103 // extension
// mirror the neighbouring samples across both ends of the line so the
// predict step can read sp[-1] and sp[+1] at the boundaries
104 src[-1] = src[1];
105 src[width] = src[width-2];
106 // predict
107 const si32* sp = src + (even ? 1 : 0);
108 si32 *dph = hdst;
109 for (ui32 i = (H_width + 3) >> 2; i > 0; --i, dph+=4)
110 { //this is doing twice the work it needs to do
111 //it can be definitely written better
// The predict expression is evaluated for 8 consecutive input positions
// (two groups of 4), but only every second result is kept.
112 __m128i s1 = _mm_loadu_si128((__m128i*)(sp-1));
113 __m128i s2 = _mm_loadu_si128((__m128i*)(sp+1));
114 __m128i d = _mm_loadu_si128((__m128i*)sp);
115 s1 = _mm_srai_epi32(_mm_add_epi32(s1, s2), 1);
116 __m128i d1 = _mm_sub_epi32(d, s1);
117 sp += 4;
118 s1 = _mm_loadu_si128((__m128i*)(sp-1));
119 s2 = _mm_loadu_si128((__m128i*)(sp+1));
120 d = _mm_loadu_si128((__m128i*)sp);
121 s1 = _mm_srai_epi32(_mm_add_epi32(s1, s2), 1);
122 __m128i d2 = _mm_sub_epi32(d, s1);
123 sp += 4;
// selector 0x88 keeps lanes 0 and 2 of d1 followed by lanes 0 and 2 of
// d2, i.e. the results for positions 0, 2, 4 and 6 of this group of 8
124 d = _mm_castps_si128(_mm_shuffle_ps(
125 _mm_castsi128_ps(d1), _mm_castsi128_ps(d2), 0x88));
126 _mm_store_si128((__m128i*)dph, d);
127 }
128
129 // extension
// replicate the first and last hdst samples one position outwards so
// the update step can read sph[-1] and sph[0] at the boundaries
130 hdst[-1] = hdst[0];
131 hdst[H_width] = hdst[H_width-1];
132 // update
133 sp = src + (even ? 0 : 1);
134 const si32* sph = hdst + (even ? 0 : 1);
135 si32 *dpl = ldst;
// rounding offset added before the shift by 2
136 __m128i offset = _mm_set1_epi32(2);
137 for (ui32 i = (L_width + 3) >> 2; i > 0; --i, sp+=8, sph+=4, dpl+=4)
138 {
139 __m128i s1 = _mm_loadu_si128((__m128i*)(sph-1));
140 s1 = _mm_add_epi32(s1, offset);
141 __m128i s2 = _mm_loadu_si128((__m128i*)sph);
142 s2 = _mm_add_epi32(s2, s1);
// d1 = sp[0..3]; d2 = sp[4..7] (the +1 is applied to the __m128i pointer,
// so it advances by four samples)
143 __m128i d1 = _mm_loadu_si128((__m128i*)sp);
144 __m128i d2 = _mm_loadu_si128((__m128i*)sp + 1);
// gather sp[0], sp[2], sp[4], sp[6] into one vector
145 __m128i d = _mm_castps_si128(_mm_shuffle_ps(
146 _mm_castsi128_ps(d1), _mm_castsi128_ps(d2), 0x88));
147 d = _mm_add_epi32(d, _mm_srai_epi32(s2, 2));
148 _mm_store_si128((__m128i*)dpl, d);
149 }
150 }
151 else
152 {
// width <= 1: no neighbours to lift against. The sample is copied to
// ldst when `even`, otherwise stored doubled in hdst.
153 if (even)
154 line_ldst->i32[0] = line_src->i32[0];
155 else
156 line_hdst->i32[0] = line_src->i32[0] << 1;
157 }
158 }
159
162 const line_buf* line_src2,
163 line_buf *line_dst, ui32 repeat)
164 {
// NOTE(review): the first line of this signature (function name and the
// line_src1 parameter) is missing from this listing. Judging by the
// arithmetic below this looks like sse2_rev_vert_wvlt_bwd_predict --
// confirm against the header.
//
// Element-wise over the i32 samples of the three line buffers:
//   dst[i] += (src1[i] + src2[i]) >> 1
// The result is written back into line_dst in place.
165 si32 *dst = line_dst->i32;
166 const si32 *src1 = line_src1->i32, *src2 = line_src2->i32;
167
// Four 32-bit lanes per iteration; the iteration count is repeat rounded
// up to a multiple of 4, so up to 3 samples past `repeat` are read and
// written. The aligned load/store intrinsics require all three pointers
// to be 16-byte aligned.
168 for (ui32 i = (repeat + 3) >> 2; i > 0; --i, dst+=4, src1+=4, src2+=4)
169 {
170 __m128i s1 = _mm_load_si128((__m128i*)src1);
171 __m128i s2 = _mm_load_si128((__m128i*)src2);
172 __m128i d = _mm_load_si128((__m128i*)dst);
// arithmetic (sign-preserving) shift: (s1 + s2) >> 1
173 s1 = _mm_srai_epi32(_mm_add_epi32(s1, s2), 1);
174 d = _mm_add_epi32(d, s1);
175 _mm_store_si128((__m128i*)dst, d);
176 }
177 }
178
181 const line_buf* line_src2,
182 line_buf *line_dst, ui32 repeat)
183 {
// NOTE(review): the first line of this signature (function name and the
// line_src1 parameter) is missing from this listing. Judging by the
// arithmetic below this looks like sse2_rev_vert_wvlt_bwd_update --
// confirm against the header.
//
// Element-wise over the i32 samples of the three line buffers:
//   dst[i] -= (src1[i] + src2[i] + 2) >> 2
// The result is written back into line_dst in place.
184 si32 *dst = line_dst->i32;
185 const si32 *src1 = line_src1->i32, *src2 = line_src2->i32;
186
// rounding offset added before the shift by 2
187 __m128i offset = _mm_set1_epi32(2);
// Four 32-bit lanes per iteration; the iteration count is repeat rounded
// up to a multiple of 4, so up to 3 samples past `repeat` are read and
// written. The aligned load/store intrinsics require all three pointers
// to be 16-byte aligned.
188 for (ui32 i = (repeat + 3) >> 2; i > 0; --i, dst+=4, src1+=4, src2+=4)
189 {
190 __m128i s1 = _mm_load_si128((__m128i*)src1);
191 s1 = _mm_add_epi32(s1, offset);
192 __m128i s2 = _mm_load_si128((__m128i*)src2);
193 s2 = _mm_add_epi32(s2, s1);
194 __m128i d = _mm_load_si128((__m128i*)dst);
// arithmetic (sign-preserving) shift: (s1 + s2 + 2) >> 2
195 d = _mm_sub_epi32(d, _mm_srai_epi32(s2, 2));
196 _mm_store_si128((__m128i*)dst, d);
197 }
198 }
199
// Backward horizontal lifting of one line of i32 samples, SSE2 version.
//
// For width > 1 the two inputs are recombined into line_dst in two steps:
//   inverse update (in place on lsrc):
//     l[j] -= (h[j-1+o'] + h[j+o'] + 2) >> 2,  o' = (even ? 0 : 1)
//   inverse predict + interleave, with dp/spl starting one sample earlier
//   when `even` is false:
//     dp[2k]   = spl[k]
//     dp[2k+1] = sph[k] + ((spl[k] + spl[k+1]) >> 1)
// For width <= 1 a single sample is copied (even) or halved (odd).
//
// Side effects a caller must allow for:
//  - line_lsrc is modified in place; lsrc[-1] and lsrc[L_width] are
//    overwritten.
//  - hsrc[-1] and hsrc[H_width] are overwritten.
//  - When `even` is false the first store starts at dst - 1, so dst[-1]
//    is written.
//  - Both loops round their trip count up to a multiple of 4 inputs, so
//    they read and write past the nominal widths.
//  - lsrc and hsrc are accessed with aligned loads/stores (16-byte
//    alignment).
// NOTE(review): presumably line_buf allocations provide the required
// padding and alignment -- confirm against the allocator.
201 void sse2_rev_horz_wvlt_bwd_tx(line_buf *line_dst, line_buf *line_lsrc,
202 line_buf *line_hsrc, ui32 width, bool even)
203 {
204 if (width > 1)
205 {
206 si32 *lsrc = line_lsrc->i32, *hsrc = line_hsrc->i32;
207 si32 *dst = line_dst->i32;
208
// number of samples in lsrc / hsrc; the extra sample of an odd width
// belongs to lsrc when `even` is true, otherwise to hsrc
209 const ui32 L_width = (width + (even ? 1 : 0)) >> 1;
210 const ui32 H_width = (width + (even ? 0 : 1)) >> 1;
211
212 // extension
// replicate the first and last hsrc samples one position outwards so
// the inverse update can read sph[-1] and sph[0] at the boundaries
213 hsrc[-1] = hsrc[0];
214 hsrc[H_width] = hsrc[H_width-1];
215 //inverse update
216 const si32 *sph = hsrc + (even ? 0 : 1);
217 si32 *spl = lsrc;
// rounding offset added before the shift by 2
218 __m128i offset = _mm_set1_epi32(2);
219 for (ui32 i = (L_width + 3) >> 2; i > 0; --i, sph+=4, spl+=4)
220 {
221 __m128i s1 = _mm_loadu_si128((__m128i*)(sph-1));
222 s1 = _mm_add_epi32(s1, offset);
223 __m128i s2 = _mm_loadu_si128((__m128i*)sph);
224 s2 = _mm_add_epi32(s2, s1);
// lsrc is read and written through aligned accesses, in place
225 __m128i d = _mm_load_si128((__m128i*)spl);
226 d = _mm_sub_epi32(d, _mm_srai_epi32(s2, 2));
227 _mm_store_si128((__m128i*)spl, d);
228 }
229
230 // extension
// replicate the first and last (already updated) lsrc samples outwards
// so the inverse predict can read spl[k] and spl[k+1] at the boundaries
231 lsrc[-1] = lsrc[0];
232 lsrc[L_width] = lsrc[L_width - 1];
233 // inverse predict and combine
// when `even` is false both the output and the lsrc cursor start one
// sample earlier, so the first stored value lands at dst[-1]
234 si32 *dp = dst + (even ? 0 : -1);
235 spl = lsrc + (even ? 0 : -1);
236 sph = hsrc;
// NOTE: this local `width` shadows the function parameter of the same
// name; from here on `width` is the number of lsrc/hsrc pairs to process
237 ui32 width = L_width + (even ? 0 : 1);
238 for (ui32 i = (width + 3) >> 2; i > 0; --i, sph+=4, spl+=4, dp+=8)
239 {
240 __m128i s1 = _mm_loadu_si128((__m128i*)spl);
241 __m128i s2 = _mm_loadu_si128((__m128i*)(spl+1));
242 __m128i d = _mm_load_si128((__m128i*)sph);
243 s2 = _mm_srai_epi32(_mm_add_epi32(s1, s2), 1);
244 d = _mm_add_epi32(d, s2);
// interleave: unpacklo gives s1[0], d[0], s1[1], d[1] and unpackhi gives
// s1[2], d[2], s1[3], d[3]; together they fill dp[0..7]
245 _mm_storeu_si128((__m128i*)dp, _mm_unpacklo_epi32(s1, d));
246 _mm_storeu_si128((__m128i*)dp + 1, _mm_unpackhi_epi32(s1, d));
247 }
248 }
249 else
250 {
// width <= 1: nothing to lift against. The single sample is copied from
// lsrc when `even`, otherwise taken from hsrc and halved.
251 if (even)
252 line_dst->i32[0] = line_lsrc->i32[0];
253 else
254 line_dst->i32[0] = line_hsrc->i32[0] >> 1;
255 }
256 }
257 }
258}
void sse2_rev_horz_wvlt_fwd_tx(line_buf *src, line_buf *ldst, line_buf *hdst, ui32 width, bool even)
void sse2_rev_vert_wvlt_fwd_update(const line_buf *src1, const line_buf *src2, line_buf *dst, ui32 repeat)
void sse2_rev_horz_wvlt_bwd_tx(line_buf *dst, line_buf *lsrc, line_buf *hsrc, ui32 width, bool even)
void sse2_rev_vert_wvlt_bwd_predict(const line_buf *src1, const line_buf *src2, line_buf *dst, ui32 repeat)
void sse2_rev_vert_wvlt_fwd_predict(const line_buf *src1, const line_buf *src2, line_buf *dst, ui32 repeat)
void sse2_rev_vert_wvlt_bwd_update(const line_buf *src1, const line_buf *src2, line_buf *dst, ui32 repeat)
int32_t si32
Definition: ojph_defs.h:55
uint32_t ui32
Definition: ojph_defs.h:54
si32 * i32
Definition: ojph_mem.h:155