OpenJPH
Open-source implementation of JPEG2000 Part-15
ojph_block_encoder_avx512.cpp
Go to the documentation of this file.
1//***************************************************************************/
2// This software is released under the 2-Clause BSD license, included
3// below.
4//
5// Copyright (c) 2019, Aous Naman
6// Copyright (c) 2019, Kakadu Software Pty Ltd, Australia
7// Copyright (c) 2019, The University of New South Wales, Australia
8// Copyright (c) 2023, Intel Corporation
9//
10// Redistribution and use in source and binary forms, with or without
11// modification, are permitted provided that the following conditions are
12// met:
13//
14// 1. Redistributions of source code must retain the above copyright
15// notice, this list of conditions and the following disclaimer.
16//
17// 2. Redistributions in binary form must reproduce the above copyright
18// notice, this list of conditions and the following disclaimer in the
19// documentation and/or other materials provided with the distribution.
20//
21// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
22// IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
23// TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
24// PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
27// TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
28// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
29// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
30// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
31// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32//***************************************************************************/
33// This file is part of the OpenJPH software implementation.
34// File: ojph_block_encoder_avx512.cpp
35//***************************************************************************/
36
37#include <cassert>
38#include <cstring>
39#include <cstdint>
40#include <climits>
41#include <immintrin.h>
42
43#include "ojph_mem.h"
44#include "ojph_arch.h"
45#include "ojph_block_encoder.h"
46#include "ojph_message.h"
47
// Branch-prediction hints for conditions.
// The argument is normalised with !! so that every non-zero value becomes 1:
// __builtin_expect() states that its first argument is expected to EQUAL the
// given constant, so a "true" value such as 5 would otherwise not match the
// expectation of 1.  MSVC has no such builtin; there the macros only
// normalise, which keeps the value of the expression identical across
// compilers.
#ifdef OJPH_COMPILER_MSVC
  #define likely(x)   (!!(x))
  #define unlikely(x) (!!(x))
#else
  #define likely(x)   __builtin_expect(!!(x), 1)
  #define unlikely(x) __builtin_expect(!!(x), 0)
#endif
55
56namespace ojph {
57 namespace local {
58
60 // tables
62
63 //VLC encoding
64 // index is (c_q << 8) + (rho << 4) + eps
65 // data is (cwd << 8) + (cwd_len << 4) + eps
66 // table 0 is for the initial line of quads
67 static ui32 vlc_tbl0[2048] = { 0 };
68 static ui32 vlc_tbl1[2048] = { 0 };
69
70 //UVLC encoding
71 static ui32 ulvc_cwd_pre[33];
72 static int ulvc_cwd_pre_len[33];
73 static ui32 ulvc_cwd_suf[33];
74 static int ulvc_cwd_suf_len[33];
75
77 static bool vlc_init_tables()
78 {
79 struct vlc_src_table { int c_q, rho, u_off, e_k, e_1, cwd, cwd_len; };
80 vlc_src_table tbl0[] = {
81 #include "table0.h"
82 };
83 size_t tbl0_size = sizeof(tbl0) / sizeof(vlc_src_table);
84
85 si32 pattern_popcnt[16];
86 for (ui32 i = 0; i < 16; ++i)
87 pattern_popcnt[i] = (si32)population_count(i);
88
89 vlc_src_table* src_tbl = tbl0;
90 ui32 *tgt_tbl = vlc_tbl0;
91 size_t tbl_size = tbl0_size;
92 for (int i = 0; i < 2048; ++i)
93 {
94 int c_q = i >> 8, rho = (i >> 4) & 0xF, emb = i & 0xF;
95 if (((emb & rho) != emb) || (rho == 0 && c_q == 0))
96 tgt_tbl[i] = 0;
97 else
98 {
99 vlc_src_table *best_entry = NULL;
100 if (emb) // u_off = 1
101 {
102 int best_e_k = -1;
103 for (size_t j = 0; j < tbl_size; ++j)
104 {
105 if (src_tbl[j].c_q == c_q && src_tbl[j].rho == rho)
106 if (src_tbl[j].u_off == 1)
107 if ((emb & src_tbl[j].e_k) == src_tbl[j].e_1)
108 {
109 //now we need to find the smallest cwd with the highest
110 // number of bits set in e_k
111 int ones_count = pattern_popcnt[src_tbl[j].e_k];
112 if (ones_count >= best_e_k)
113 {
114 best_entry = src_tbl + j;
115 best_e_k = ones_count;
116 }
117 }
118 }
119 }
120 else // u_off = 0
121 {
122 for (size_t j = 0; j < tbl_size; ++j)
123 {
124 if (src_tbl[j].c_q == c_q && src_tbl[j].rho == rho)
125 if (src_tbl[j].u_off == 0)
126 {
127 best_entry = src_tbl + j;
128 break;
129 }
130 }
131 }
132 assert(best_entry);
133 tgt_tbl[i] = (ui16)((best_entry->cwd<<8) + (best_entry->cwd_len<<4)
134 + best_entry->e_k);
135 }
136 }
137
138 vlc_src_table tbl1[] = {
139 #include "table1.h"
140 };
141 size_t tbl1_size = sizeof(tbl1) / sizeof(vlc_src_table);
142
143 src_tbl = tbl1;
144 tgt_tbl = vlc_tbl1;
145 tbl_size = tbl1_size;
146 for (int i = 0; i < 2048; ++i)
147 {
148 int c_q = i >> 8, rho = (i >> 4) & 0xF, emb = i & 0xF;
149 if (((emb & rho) != emb) || (rho == 0 && c_q == 0))
150 tgt_tbl[i] = 0;
151 else
152 {
153 vlc_src_table *best_entry = NULL;
154 if (emb) // u_off = 1
155 {
156 int best_e_k = -1;
157 for (size_t j = 0; j < tbl_size; ++j)
158 {
159 if (src_tbl[j].c_q == c_q && src_tbl[j].rho == rho)
160 if (src_tbl[j].u_off == 1)
161 if ((emb & src_tbl[j].e_k) == src_tbl[j].e_1)
162 {
163 //now we need to find the smallest cwd with the highest
164 // number of bits set in e_k
165 int ones_count = pattern_popcnt[src_tbl[j].e_k];
166 if (ones_count >= best_e_k)
167 {
168 best_entry = src_tbl + j;
169 best_e_k = ones_count;
170 }
171 }
172 }
173 }
174 else // u_off = 0
175 {
176 for (size_t j = 0; j < tbl_size; ++j)
177 {
178 if (src_tbl[j].c_q == c_q && src_tbl[j].rho == rho)
179 if (src_tbl[j].u_off == 0)
180 {
181 best_entry = src_tbl + j;
182 break;
183 }
184 }
185 }
186 assert(best_entry);
187 tgt_tbl[i] = (ui16)((best_entry->cwd<<8) + (best_entry->cwd_len<<4)
188 + best_entry->e_k);
189 }
190 }
191
192
193 return true;
194 }
195
197 static bool uvlc_init_tables()
198 {
199 //code goes from 0 to 31, extension and 32 are not supported here
200 ulvc_cwd_pre[0] = 0; ulvc_cwd_pre[1] = 1; ulvc_cwd_pre[2] = 2;
201 ulvc_cwd_pre[3] = 4; ulvc_cwd_pre[4] = 4;
202 ulvc_cwd_pre_len[0] = 0; ulvc_cwd_pre_len[1] = 1;
203 ulvc_cwd_pre_len[2] = 2;
204 ulvc_cwd_pre_len[3] = 3; ulvc_cwd_pre_len[4] = 3;
205 ulvc_cwd_suf[0] = 0; ulvc_cwd_suf[1] = 0; ulvc_cwd_suf[2] = 0;
206 ulvc_cwd_suf[3] = 0; ulvc_cwd_suf[4] = 1;
207 ulvc_cwd_suf_len[0] = 0; ulvc_cwd_suf_len[1] = 0;
208 ulvc_cwd_suf_len[2] = 0;
209 ulvc_cwd_suf_len[3] = 1; ulvc_cwd_suf_len[4] = 1;
210 for (int i = 5; i < 33; ++i)
211 {
212 ulvc_cwd_pre[i] = 0;
213 ulvc_cwd_pre_len[i] = 3;
214 ulvc_cwd_suf[i] = (ui32)(i-5);
215 ulvc_cwd_suf_len[i] = 5;
216 }
217 return true;
218 }
219
223 bool result;
224 result = vlc_init_tables();
225 result = result && uvlc_init_tables();
226 return result;
227 }
228 return false;
229 }
230
233
235 //
237 struct mel_struct {
238 //storage
239 ui8* buf; //pointer to data buffer
240 ui32 pos; //position of next writing within buf
241 ui32 buf_size; //size of buffer, which we must not exceed
242
243 // all these can be replaced by bytes
244 int remaining_bits; //number of empty bits in tmp
245 int tmp; //temporary storage of coded bits
246 int run; //number of 0 run
247 int k; //state
248 int threshold; //threshold where one bit must be coded
249 };
250
252 static inline void
253 mel_init(mel_struct* melp, ui32 buffer_size, ui8* data)
254 {
255 melp->buf = data;
256 melp->pos = 0;
257 melp->buf_size = buffer_size;
258 melp->remaining_bits = 8;
259 melp->tmp = 0;
260 melp->run = 0;
261 melp->k = 0;
262 melp->threshold = 1; // this is 1 << mel_exp[melp->k];
263 }
264
266 static inline void
268 {
269 melp->tmp = (melp->tmp << 1) + v;
270 melp->remaining_bits--;
271 if (melp->remaining_bits == 0) {
272 melp->buf[melp->pos++] = (ui8)melp->tmp;
273 melp->remaining_bits = (melp->tmp == 0xFF ? 7 : 8);
274 melp->tmp = 0;
275 }
276 }
277
279 static inline void
280 mel_encode(mel_struct* melp, bool bit)
281 {
282 //MEL exponent
283 static const int mel_exp[13] = {0,0,0,1,1,1,2,2,2,3,3,4,5};
284
285 if (bit == false) {
286 ++melp->run;
287 if (melp->run >= melp->threshold) {
288 mel_emit_bit(melp, 1);
289 melp->run = 0;
290 melp->k = ojph_min(12, melp->k + 1);
291 melp->threshold = 1 << mel_exp[melp->k];
292 }
293 } else {
294 mel_emit_bit(melp, 0);
295 int t = mel_exp[melp->k];
296 while (t > 0) {
297 mel_emit_bit(melp, (melp->run >> --t) & 1);
298 }
299 melp->run = 0;
300 melp->k = ojph_max(0, melp->k - 1);
301 melp->threshold = 1 << mel_exp[melp->k];
302 }
303 }
304
306 //
308 struct vlc_struct {
309 //storage
310 ui8* buf; //pointer to data buffer
311 ui32 pos; //position of next writing within buf
312 ui32 buf_size; //size of buffer, which we must not exceed
313
314 int used_bits; //number of occupied bits in tmp
315 ui64 tmp; //temporary storage of coded bits
316 bool last_greater_than_8F; //true if last byte us greater than 0x8F
317 };
318
320 static inline void
321 vlc_init(vlc_struct* vlcp, ui32 buffer_size, ui8* data)
322 {
323 vlcp->buf = data + buffer_size - 1; //points to last byte
324 vlcp->pos = 1; //locations will be all -pos
325 vlcp->buf_size = buffer_size;
326
327 vlcp->buf[0] = 0xFF;
328 vlcp->used_bits = 4;
329 vlcp->tmp = 0xF;
330 vlcp->last_greater_than_8F = true;
331 }
332
334 static inline void
335 vlc_encode(vlc_struct* vlcp, ui32 cwd, int cwd_len)
336 {
337 vlcp->tmp |= (ui64)cwd << vlcp->used_bits;
338 vlcp->used_bits += cwd_len;
339
340 while (vlcp->used_bits >= 8) {
341 ui8 tmp;
342
343 if (unlikely(vlcp->last_greater_than_8F)) {
344 tmp = vlcp->tmp & 0x7F;
345
346 if (likely(tmp != 0x7F)) {
347 tmp = vlcp->tmp & 0xFF;
348 *(vlcp->buf - vlcp->pos) = tmp;
349 vlcp->last_greater_than_8F = tmp > 0x8F;
350 vlcp->tmp >>= 8;
351 vlcp->used_bits -= 8;
352 } else {
353 *(vlcp->buf - vlcp->pos) = tmp;
354 vlcp->last_greater_than_8F = false;
355 vlcp->tmp >>= 7;
356 vlcp->used_bits -= 7;
357 }
358
359 } else {
360 tmp = vlcp->tmp & 0xFF;
361 *(vlcp->buf - vlcp->pos) = tmp;
362 vlcp->last_greater_than_8F = tmp > 0x8F;
363 vlcp->tmp >>= 8;
364 vlcp->used_bits -= 8;
365 }
366
367 vlcp->pos++;
368 }
369 }
370
372 //
374 static inline void
376 {
377 if (melp->run > 0)
378 mel_emit_bit(melp, 1);
379
380 melp->tmp = melp->tmp << melp->remaining_bits;
381 int mel_mask = (0xFF << melp->remaining_bits) & 0xFF;
382 int vlc_mask = 0xFF >> (8 - vlcp->used_bits);
383 if ((mel_mask | vlc_mask) == 0)
384 return; //last mel byte cannot be 0xFF, since then
385 //melp->remaining_bits would be < 8
386 if (melp->pos >= melp->buf_size)
387 OJPH_ERROR(0x00020003, "mel encoder's buffer is full");
388 ui8 vlcp_tmp = (ui8)vlcp->tmp;
389 int fuse = melp->tmp | vlcp_tmp;
390 if ( ( ((fuse ^ melp->tmp) & mel_mask)
391 | ((fuse ^ vlcp_tmp) & vlc_mask) ) == 0
392 && (fuse != 0xFF) && vlcp->pos > 1)
393 {
394 melp->buf[melp->pos++] = (ui8)fuse;
395 }
396 else
397 {
398 if (vlcp->pos >= vlcp->buf_size)
399 OJPH_ERROR(0x00020004, "vlc encoder's buffer is full");
400 melp->buf[melp->pos++] = (ui8)melp->tmp; //melp->tmp cannot be 0xFF
401 *(vlcp->buf - vlcp->pos) = (ui8)vlcp_tmp;
402 vlcp->pos++;
403 }
404 }
405
407//
409 struct ms_struct {
410 //storage
411 ui8* buf; //pointer to data buffer
412 ui32 pos; //position of next writing within buf
413 ui32 buf_size; //size of buffer, which we must not exceed
414
415 int max_bits; //maximum number of bits that can be store in tmp
416 int used_bits; //number of occupied bits in tmp
417 ui32 tmp; //temporary storage of coded bits
418 };
419
421 static inline void
422 ms_init(ms_struct* msp, ui32 buffer_size, ui8* data)
423 {
424 msp->buf = data;
425 msp->pos = 0;
426 msp->buf_size = buffer_size;
427 msp->max_bits = 8;
428 msp->used_bits = 0;
429 msp->tmp = 0;
430 }
431
433 static inline void
434 ms_encode(ms_struct* msp, ui64 cwd, int cwd_len)
435 {
436 while (cwd_len > 0)
437 {
438 if (msp->pos >= msp->buf_size)
439 OJPH_ERROR(0x00020005, "magnitude sign encoder's buffer is full");
440 int t = ojph_min(msp->max_bits - msp->used_bits, cwd_len);
441 msp->tmp |= ((ui32)(cwd & ((1U << t) - 1))) << msp->used_bits;
442 msp->used_bits += t;
443 cwd >>= t;
444 cwd_len -= t;
445 if (msp->used_bits >= msp->max_bits)
446 {
447 msp->buf[msp->pos++] = (ui8)msp->tmp;
448 msp->max_bits = (msp->tmp == 0xFF) ? 7 : 8;
449 msp->tmp = 0;
450 msp->used_bits = 0;
451 }
452 }
453 }
454
456 static inline void
458 {
459 if (msp->used_bits)
460 {
461 int t = msp->max_bits - msp->used_bits; //unused bits
462 msp->tmp |= (0xFF & ((1U << t) - 1)) << msp->used_bits;
463 msp->used_bits += t;
464 if (msp->tmp != 0xFF)
465 {
466 if (msp->pos >= msp->buf_size)
467 OJPH_ERROR(0x00020006, "magnitude sign encoder's buffer is full");
468 msp->buf[msp->pos++] = (ui8)msp->tmp;
469 }
470 }
471 else if (msp->max_bits == 7)
472 msp->pos--;
473 }
474
475#define ZERO _mm512_setzero_epi32()
476#define ONE _mm512_set1_epi32(1)
477
478#if 0
// Debug helper (compiled out by the surrounding #if 0): prints msg followed
// by the sixteen 32-bit lanes of val in hexadecimal.
static void print_epi32(const char *msg, __m512i &val)
{
  uint32_t A[16] = {0};

  // A is a plain stack array with no 64-byte alignment guarantee, so the
  // unaligned store must be used; the aligned store may fault.
  _mm512_storeu_si512(A, val);

  printf("%s: ", msg);
  for (int i = 0; i < 16; ++i) {
    printf("%X ", A[i]);
  }
  printf("\n");
}
491#endif
492
493static void proc_pixel(__m512i *src_vec, ui32 p,
494 __m512i *eq_vec, __m512i *s_vec,
495 __m512i &rho_vec, __m512i &e_qmax_vec)
496{
497 __m512i val_vec[4];
498 __m512i _eq_vec[4];
499 __m512i _s_vec[4];
500 __m512i _rho_vec[4];
501 ui16 val_mask[4];
502
503 for (ui32 i = 0; i < 4; ++i) {
504 /* val = t + t; //multiply by 2 and get rid of sign */
505 val_vec[i] = _mm512_add_epi32(src_vec[i], src_vec[i]);
506
507 /* val >>= p; // 2 \mu_p + x */
508 val_vec[i] = _mm512_srli_epi32(val_vec[i], p);
509
510 /* val &= ~1u; // 2 \mu_p */
511 val_vec[i] = _mm512_and_epi32(val_vec[i], _mm512_set1_epi32((int)~1u));
512
513 /* if (val) { */
514 val_mask[i] = _mm512_cmpneq_epi32_mask(val_vec[i], ZERO);
515
516 /* rho[i] = 1 << i;
517 * rho is processed below.
518 */
519
520 /* e_q[i] = 32 - (int)count_leading_ZEROs(--val); //2\mu_p - 1 */
521 val_vec[i] = _mm512_mask_sub_epi32(ZERO, val_mask[i], val_vec[i], ONE);
522 _eq_vec[i] = _mm512_mask_lzcnt_epi32(ZERO, val_mask[i], val_vec[i]);
523 _eq_vec[i] = _mm512_mask_sub_epi32(ZERO, val_mask[i],
524 _mm512_set1_epi32(32), _eq_vec[i]);
525
526 /* e_qmax[i] = ojph_max(e_qmax[i], e_q[j]);
527 * e_qmax is processed below
528 */
529
530 /* s[0] = --val + (t >> 31); //v_n = 2(\mu_p-1) + s_n */
531 val_vec[i] = _mm512_mask_sub_epi32(ZERO, val_mask[i], val_vec[i], ONE);
532 _s_vec[i] = _mm512_mask_srli_epi32(ZERO, val_mask[i], src_vec[i], 31);
533 _s_vec[i] =
534 _mm512_mask_add_epi32(ZERO, val_mask[i], _s_vec[i], val_vec[i]);
535 /* } */
536 }
537
538 val_vec[0] = _mm512_mask_mov_epi32(ZERO, val_mask[0], ONE);
539 val_vec[1] = _mm512_mask_mov_epi32(ZERO, val_mask[1], ONE);
540 val_vec[2] = _mm512_mask_mov_epi32(ZERO, val_mask[2], ONE);
541 val_vec[3] = _mm512_mask_mov_epi32(ZERO, val_mask[3], ONE);
542 e_qmax_vec = ZERO;
543
544 const __m512i idx[2] = {
545 _mm512_set_epi32(14, 12, 10, 8, 6, 4, 2, 0, 14, 12, 10, 8, 6, 4, 2, 0),
546 _mm512_set_epi32(15, 13, 11, 9, 7, 5, 3, 1, 15, 13, 11, 9, 7, 5, 3, 1),
547 };
548
549 /* Reorder from
550 * *_vec[0]:[0, 0], [0, 1], [0, 2], [0, 3], [0, 4], [0, 5]...[0,14], [0,15]
551 * *_vec[1]:[1, 0], [1, 1], [1, 2], [1, 3], [1, 4], [1, 5]...[1,14], [1,15]
552 * *_vec[2]:[0,16], [0,17], [0,18], [0,19], [0,20], [0,21]...[0,30], [0,31]
553 * *_vec[3]:[1,16], [1,17], [1,18], [1,19], [1,20], [1,21]...[1,30], [1,31]
554 * to
555 * *_vec[0]:[0, 0], [0, 2] ... [0,14], [0,16], [0,18] ... [0,30]
556 * *_vec[1]:[1, 0], [1, 2] ... [1,14], [1,16], [1,18] ... [1,30]
557 * *_vec[2]:[0, 1], [0, 3] ... [0,15], [0,17], [0,19] ... [0,31]
558 * *_vec[3]:[1, 1], [1, 3] ... [1,15], [1,17], [1,19] ... [1,31]
559 */
560 for (ui32 i = 0; i < 4; ++i) {
561 ui32 e_idx = i >> 1;
562 ui32 o_idx = i & 0x1;
563
564 eq_vec[i] = _mm512_permutexvar_epi32(idx[e_idx], _eq_vec[o_idx]);
565 eq_vec[i] = _mm512_mask_permutexvar_epi32(eq_vec[i], 0xFF00,
566 idx[e_idx],
567 _eq_vec[o_idx + 2]);
568
569 s_vec[i] = _mm512_permutexvar_epi32(idx[e_idx], _s_vec[o_idx]);
570 s_vec[i] = _mm512_mask_permutexvar_epi32(s_vec[i], 0xFF00,
571 idx[e_idx],
572 _s_vec[o_idx + 2]);
573
574 _rho_vec[i] = _mm512_permutexvar_epi32(idx[e_idx], val_vec[o_idx]);
575 _rho_vec[i] = _mm512_mask_permutexvar_epi32(_rho_vec[i], 0xFF00,
576 idx[e_idx],
577 val_vec[o_idx + 2]);
578 _rho_vec[i] = _mm512_slli_epi32(_rho_vec[i], i);
579
580 e_qmax_vec = _mm512_max_epi32(e_qmax_vec, eq_vec[i]);
581 }
582
583 rho_vec = _mm512_or_epi32(_rho_vec[0], _rho_vec[1]);
584 rho_vec = _mm512_or_epi32(rho_vec, _rho_vec[2]);
585 rho_vec = _mm512_or_epi32(rho_vec, _rho_vec[3]);
586}
587
/* Interleaves, in place, the 32-bit elements of four 16-lane vectors.
 *
 * from [0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, ...]
 *      [0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, ...]
 *      [0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, ...]
 *      [0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, ...]
 *
 * to   [0x00, 0x10, 0x20, 0x30, 0x01, 0x11, 0x21, 0x31,
 *       0x02, 0x12, 0x22, 0x32, 0x03, 0x13, 0x23, 0x33]
 *
 *      [0x04, 0x14, 0x24, 0x34, 0x05, 0x15, 0x25, 0x35,
 *       0x06, 0x16, 0x26, 0x36, 0x07, 0x17, 0x27, 0x37]
 *
 *      [..]
 */
static void rotate_matrix(__m512i *matrix)
{
  /* stage 1: interleave 32-bit elements of rows (0,1) and rows (2,3) */
  const __m512i a0 = _mm512_unpacklo_epi32(matrix[0], matrix[1]);
  const __m512i a1 = _mm512_unpackhi_epi32(matrix[0], matrix[1]);
  const __m512i a2 = _mm512_unpacklo_epi32(matrix[2], matrix[3]);
  const __m512i a3 = _mm512_unpackhi_epi32(matrix[2], matrix[3]);

  /* stage 2: interleave the resulting 64-bit pairs */
  const __m512i b0 = _mm512_unpacklo_epi64(a0, a2);
  const __m512i b1 = _mm512_unpackhi_epi64(a0, a2);
  const __m512i b2 = _mm512_unpacklo_epi64(a1, a3);
  const __m512i b3 = _mm512_unpackhi_epi64(a1, a3);

  /* stage 3: gather even (0x88) and odd (0xDD) 128-bit blocks */
  const __m512i c0 = _mm512_shuffle_i32x4(b0, b1, 0x88);
  const __m512i c1 = _mm512_shuffle_i32x4(b2, b3, 0x88);
  const __m512i c2 = _mm512_shuffle_i32x4(b0, b1, 0xDD);
  const __m512i c3 = _mm512_shuffle_i32x4(b2, b3, 0xDD);

  /* stage 4: same gathering once more to reach the final order */
  matrix[0] = _mm512_shuffle_i32x4(c0, c1, 0x88);
  matrix[1] = _mm512_shuffle_i32x4(c2, c3, 0x88);
  matrix[2] = _mm512_shuffle_i32x4(c0, c1, 0xDD);
  matrix[3] = _mm512_shuffle_i32x4(c2, c3, 0xDD);
}
624
625static void proc_ms_encode(ms_struct *msp,
626 __m512i &tuple_vec,
627 __m512i &uq_vec,
628 __m512i &rho_vec,
629 __m512i *s_vec)
630{
631 __m512i m_vec[4];
632
633 /* Prepare parameters for ms_encode */
634 /* m = (rho[i] & 1) ? Uq[i] - ((tuple[i] & 1) >> 0) : 0; */
635 auto tmp = _mm512_and_epi32(tuple_vec, ONE);
636 tmp = _mm512_sub_epi32(uq_vec, tmp);
637 auto tmp1 = _mm512_and_epi32(rho_vec, ONE);
638 auto mask = _mm512_cmpneq_epi32_mask(tmp1, ZERO);
639 m_vec[0] = _mm512_mask_mov_epi32(ZERO, mask, tmp);
640
641 /* m = (rho[i] & 2) ? Uq[i] - ((tuple[i] & 2) >> 1) : 0; */
642 tmp = _mm512_and_epi32(tuple_vec, _mm512_set1_epi32(2));
643 tmp = _mm512_srli_epi32(tmp, 1);
644 tmp = _mm512_sub_epi32(uq_vec, tmp);
645 tmp1 = _mm512_and_epi32(rho_vec, _mm512_set1_epi32(2));
646 mask = _mm512_cmpneq_epi32_mask(tmp1, ZERO);
647 m_vec[1] = _mm512_mask_mov_epi32(ZERO, mask, tmp);
648
649 /* m = (rho[i] & 4) ? Uq[i] - ((tuple[i] & 4) >> 2) : 0; */
650 tmp = _mm512_and_epi32(tuple_vec, _mm512_set1_epi32(4));
651 tmp = _mm512_srli_epi32(tmp, 2);
652 tmp = _mm512_sub_epi32(uq_vec, tmp);
653 tmp1 = _mm512_and_epi32(rho_vec, _mm512_set1_epi32(4));
654 mask = _mm512_cmpneq_epi32_mask(tmp1, ZERO);
655 m_vec[2] = _mm512_mask_mov_epi32(ZERO, mask, tmp);
656
657 /* m = (rho[i] & 8) ? Uq[i] - ((tuple[i] & 8) >> 3) : 0; */
658 tmp = _mm512_and_epi32(tuple_vec, _mm512_set1_epi32(8));
659 tmp = _mm512_srli_epi32(tmp, 3);
660 tmp = _mm512_sub_epi32(uq_vec, tmp);
661 tmp1 = _mm512_and_epi32(rho_vec, _mm512_set1_epi32(8));
662 mask = _mm512_cmpneq_epi32_mask(tmp1, ZERO);
663 m_vec[3] = _mm512_mask_mov_epi32(ZERO, mask, tmp);
664
665 rotate_matrix(m_vec);
666 /* s_vec from
667 * s_vec[0]:[0, 0], [0, 2] ... [0,14], [0, 16], [0, 18] ... [0,30]
668 * s_vec[1]:[1, 0], [1, 2] ... [1,14], [1, 16], [1, 18] ... [1,30]
669 * s_vec[2]:[0, 1], [0, 3] ... [0,15], [0, 17], [0, 19] ... [0,31]
670 * s_vec[3]:[1, 1], [1, 3] ... [1,15], [1, 17], [1, 19] ... [1,31]
671 * to
672 * s_vec[0]:[0, 0], [1, 0], [0, 1], [1, 1], [0, 2], [1, 2]...[0, 7], [1, 7]
673 * s_vec[1]:[0, 8], [1, 8], [0, 9], [1, 9], [0,10], [1,10]...[0,15], [1,15]
674 * s_vec[2]:[0,16], [1,16], [0,17], [1,17], [0,18], [1,18]...[0,23], [1,23]
675 * s_vec[3]:[0,24], [1,24], [0,25], [1,25], [0,26], [1,26]...[0,31], [1,31]
676 */
677 rotate_matrix(s_vec);
678
679 ui32 cwd[16];
680 int cwd_len[16];
681 ui64 _cwd = 0;
682 int _cwd_len = 0;
683
684 /* Each iteration process 8 bytes * 2 lines */
685 for (ui32 i = 0; i < 4; ++i) {
686 /* cwd = s[i * 4 + 0] & ((1U << m) - 1)
687 * cwd_len = m
688 */
689 _mm512_store_epi32(cwd_len, m_vec[i]);
690 tmp = _mm512_sllv_epi32(ONE, m_vec[i]);
691 tmp = _mm512_sub_epi32(tmp, ONE);
692 tmp = _mm512_and_epi32(tmp, s_vec[i]);
693 _mm512_store_epi32(cwd, tmp);
694
695 for (ui32 j = 0; j < 8; ++j) {
696 ui32 idx = j * 2;
697 _cwd = cwd[idx];
698 _cwd_len = cwd_len[idx];
699 _cwd |= ((ui64)cwd[idx + 1]) << _cwd_len;
700 _cwd_len += cwd_len[idx + 1];
701 ms_encode(msp, _cwd, _cwd_len);
702 }
703 }
704}
705
706static __m512i cal_eps_vec(__m512i *eq_vec, __m512i &u_q_vec,
707 __m512i &e_qmax_vec)
708{
709 /* if (u_q[i] > 0) {
710 * eps[i] |= (e_q[i * 4 + 0] == e_qmax[i]);
711 * eps[i] |= (e_q[i * 4 + 1] == e_qmax[i]) << 1;
712 * eps[i] |= (e_q[i * 4 + 2] == e_qmax[i]) << 2;
713 * eps[i] |= (e_q[i * 4 + 3] == e_qmax[i]) << 3;
714 * }
715 */
716 auto u_q_mask = _mm512_cmpgt_epi32_mask(u_q_vec, ZERO);
717
718 auto mask = _mm512_cmpeq_epi32_mask(eq_vec[0], e_qmax_vec);
719 auto tmp = _mm512_mask_mov_epi32(ZERO, mask, ONE);
720 auto eps_vec = _mm512_mask_mov_epi32(ZERO, u_q_mask, tmp);
721
722 mask = _mm512_cmpeq_epi32_mask(eq_vec[1], e_qmax_vec);
723 tmp = _mm512_mask_mov_epi32(ZERO, mask, ONE);
724 tmp = _mm512_slli_epi32(tmp, 1);
725 eps_vec = _mm512_mask_or_epi32(ZERO, u_q_mask, eps_vec, tmp);
726
727 mask = _mm512_cmpeq_epi32_mask(eq_vec[2], e_qmax_vec);
728 tmp = _mm512_mask_mov_epi32(ZERO, mask, ONE);
729 tmp = _mm512_slli_epi32(tmp, 2);
730 eps_vec = _mm512_mask_or_epi32(ZERO, u_q_mask, eps_vec, tmp);
731
732 mask = _mm512_cmpeq_epi32_mask(eq_vec[3], e_qmax_vec);
733 tmp = _mm512_mask_mov_epi32(ZERO, mask, ONE);
734 tmp = _mm512_slli_epi32(tmp, 3);
735
736 return _mm512_mask_or_epi32(ZERO, u_q_mask, eps_vec, tmp);
737}
738
739static void update_lep(ui32 x, __m512i &prev_e_val_vec,
740 __m512i *eq_vec, __m512i *e_val_vec,
741 const __m512i left_shift)
742{
743 /* lep[0] = ojph_max(lep[0], (ui8)e_q[1]); lep++;
744 * lep[0] = (ui8)e_q[3];
745 * Compare e_q[1] with e_q[3] of the prevous round.
746 */
747 auto tmp = _mm512_mask_permutexvar_epi32(prev_e_val_vec, 0xFFFE,
748 left_shift, eq_vec[3]);
749 prev_e_val_vec = _mm512_mask_permutexvar_epi32(ZERO, 0x1, left_shift,
750 eq_vec[3]);
751 e_val_vec[x] = _mm512_max_epi32(eq_vec[1], tmp);
752}
753
754
755static void update_lcxp(ui32 x, __m512i &prev_cx_val_vec,
756 __m512i &rho_vec, __m512i *cx_val_vec,
757 const __m512i left_shift)
758{
759 /* lcxp[0] = (ui8)(lcxp[0] | (ui8)((rho[0] & 2) >> 1)); lcxp++;
760 * lcxp[0] = (ui8)((rho[0] & 8) >> 3);
761 * Or (rho[0] & 2) and (rho[0] of the previous round & 8).
762 */
763 auto tmp = _mm512_mask_permutexvar_epi32(prev_cx_val_vec, 0xFFFE,
764 left_shift, rho_vec);
765 prev_cx_val_vec = _mm512_mask_permutexvar_epi32(ZERO, 0x1, left_shift,
766 rho_vec);
767
768 tmp = _mm512_and_epi32(tmp, _mm512_set1_epi32(8));
769 tmp = _mm512_srli_epi32(tmp, 3);
770
771 auto tmp1 = _mm512_and_epi32(rho_vec, _mm512_set1_epi32(2));
772 tmp1 = _mm512_srli_epi32(tmp1, 1);
773 cx_val_vec[x] = _mm512_or_epi32(tmp, tmp1);
774}
775
776static __m512i cal_tuple(__m512i &cq_vec, __m512i &rho_vec,
777 __m512i &eps_vec, ui32 *vlc_tbl)
778{
779 /* tuple[i] = vlc_tbl1[(c_q[i] << 8) + (rho[i] << 4) + eps[i]]; */
780 auto tmp = _mm512_slli_epi32(cq_vec, 8);
781 auto tmp1 = _mm512_slli_epi32(rho_vec, 4);
782 tmp = _mm512_add_epi32(tmp, tmp1);
783 tmp = _mm512_add_epi32(tmp, eps_vec);
784 return _mm512_i32gather_epi32(tmp, vlc_tbl, 4);
785}
786
787static __m512i proc_cq1(ui32 x, __m512i *cx_val_vec, __m512i &rho_vec,
788 const __m512i right_shift)
789{
790 ojph_unused(x);
791 ojph_unused(cx_val_vec);
792 ojph_unused(right_shift);
793
794 /* c_q[i + 1] = (rho[i] >> 1) | (rho[i] & 1); */
795 auto tmp = _mm512_srli_epi32(rho_vec, 1);
796 auto tmp1 = _mm512_and_epi32(rho_vec, _mm512_set1_epi32(1));
797 return _mm512_or_epi32(tmp, tmp1);
798}
799
800static __m512i proc_cq2(ui32 x, __m512i *cx_val_vec, __m512i &rho_vec,
801 const __m512i right_shift)
802{
803 // c_q[i + 1] = (lcxp[i + 1] + (lcxp[i + 2] << 2))
804 // | (((rho[i] & 4) >> 1) | ((rho[i] & 8) >> 2));
805 auto lcxp1_vec = _mm512_permutexvar_epi32(right_shift, cx_val_vec[x]);
806 auto lcxp2_vec = _mm512_permutexvar_epi32(right_shift, cx_val_vec[x + 1]);
807 auto tmp = _mm512_permutexvar_epi32(right_shift, lcxp1_vec);
808 tmp = _mm512_mask_permutexvar_epi32(tmp, 0xC000, right_shift, lcxp2_vec);
809 tmp = _mm512_slli_epi32(tmp, 2);
810 auto tmp1 = _mm512_mask_mov_epi32(lcxp1_vec, 0x8000, lcxp2_vec);
811 tmp = _mm512_add_epi32(tmp1, tmp);
812
813 tmp1 = _mm512_and_epi32(rho_vec, _mm512_set1_epi32(4));
814 tmp1 = _mm512_srli_epi32(tmp1, 1);
815 tmp = _mm512_or_epi32(tmp, tmp1);
816
817 tmp1 = _mm512_and_epi32(rho_vec, _mm512_set1_epi32(8));
818 tmp1 = _mm512_srli_epi32(tmp1, 2);
819
820 return _mm512_or_epi32(tmp, tmp1);
821}
822
823using fn_proc_cq = __m512i (*)(ui32, __m512i *, __m512i &, const __m512i);
824
825static void proc_mel_encode1(mel_struct *melp, __m512i &cq_vec,
826 __m512i &rho_vec, __m512i u_q_vec, ui32 ignore,
827 const __m512i right_shift)
828{
829 /* Prepare mel_encode params */
830 /* if (c_q[i] == 0) { */
831 auto mel_need_encode = _mm512_cmpeq_epi32_mask(cq_vec, ZERO);
832 /* mel_encode(&mel, rho[i] != 0); */
833 auto mel_bit = _mm512_cmpneq_epi32_mask(rho_vec, ZERO);
834 /* } */
835
836 /* mel_encode(&mel, ojph_min(u_q[i], u_q[i + 1]) > 2); */
837 auto tmp = _mm512_permutexvar_epi32(right_shift, u_q_vec);
838 auto tmp1 = _mm512_min_epi32(u_q_vec, tmp);
839 auto mel_bit2 = (ui16)_mm512_cmpgt_epi32_mask(tmp1, _mm512_set1_epi32(2));
840
841 /* if (u_q[i] > 0 && u_q[i + 1] > 0) { } */
842 auto mel_need_encode2 = (ui16)_mm512_cmpgt_epi32_mask(u_q_vec, ZERO);
843 mel_need_encode2 =
844 mel_need_encode2 & (ui16)_mm512_cmpgt_epi32_mask(tmp, ZERO);
845
846 ui32 i_max = 16 - (ignore / 2);
847
848 for (ui32 i = 0; i < i_max; i += 2) {
849 auto mask = 1 << i;
850 if (0 != (mel_need_encode & mask)) {
851 mel_encode(melp, mel_bit & mask);
852 }
853
854 if (i + 1 < i_max) {
855 auto mask = 1 << (i + 1);
856 if (0 != (mel_need_encode & mask)) {
857 mel_encode(melp, mel_bit & mask);
858 }
859 }
860
861 if (0 != (mel_need_encode2 & mask)) {
862 mel_encode(melp, mel_bit2 & mask);
863 }
864 }
865}
866
867static void proc_mel_encode2(mel_struct *melp, __m512i &cq_vec,
868 __m512i &rho_vec, __m512i u_q_vec, ui32 ignore,
869 const __m512i right_shift)
870{
871 ojph_unused(u_q_vec);
872 ojph_unused(right_shift);
873
874 /* Prepare mel_encode params */
875 /* if (c_q[i] == 0) { */
876 auto mel_need_encode = _mm512_cmpeq_epi32_mask(cq_vec, ZERO);
877 /* mel_encode(&mel, rho[i] != 0); */
878 auto mel_bit = _mm512_cmpneq_epi32_mask(rho_vec, ZERO);
879 /* } */
880
881 ui32 i_max = 16 - (ignore / 2);
882
883 for (ui32 i = 0; i < i_max; ++i) {
884 auto mask = 1 << i;
885 if (0 != (mel_need_encode & mask)) {
886 mel_encode(melp, mel_bit & mask);
887 }
888 }
889}
890
891using fn_proc_mel_encode = void (*)(mel_struct *, __m512i &, __m512i &,
892 __m512i, ui32, const __m512i);
893
894static void proc_vlc_encode1(vlc_struct *vlcp, ui32 *tuple,
895 ui32 *u_q, ui32 ignore)
896{
897 ui32 i_max = 16 - (ignore / 2);
898
899 for (ui32 i = 0; i < i_max; i += 2) {
900 /* 7 bits */
901 ui32 val = tuple[i + 0] >> 4;
902 int size = tuple[i + 0] & 7;
903
904 if (i + 1 < i_max) {
905 /* 7 bits */
906 val |= (tuple[i + 1] >> 4) << size;
907 size += tuple[i + 1] & 7;
908 }
909
910 if (u_q[i] > 2 && u_q[i + 1] > 2) {
911 /* 3 bits */
912 val |= (ulvc_cwd_pre[u_q[i] - 2]) << size;
913 size += ulvc_cwd_pre_len[u_q[i] - 2];
914
915 /* 3 bits */
916 val |= (ulvc_cwd_pre[u_q[i + 1] - 2]) << size;
917 size += ulvc_cwd_pre_len[u_q[i + 1] - 2];
918
919 /* 5 bits */
920 val |= (ulvc_cwd_suf[u_q[i] - 2]) << size;
921 size += ulvc_cwd_suf_len[u_q[i] - 2];
922
923 /* 5 bits */
924 val |= (ulvc_cwd_suf[u_q[i + 1] - 2]) << size;
925 size += ulvc_cwd_suf_len[u_q[i + 1] - 2];
926
927 } else if (u_q[i] > 2 && u_q[i + 1] > 0) {
928 /* 3 bits */
929 val |= (ulvc_cwd_pre[u_q[i]]) << size;
930 size += ulvc_cwd_pre_len[u_q[i]];
931
932 /* 1 bit */
933 val |= (u_q[i + 1] - 1) << size;
934 size += 1;
935
936 /* 5 bits */
937 val |= (ulvc_cwd_suf[u_q[i]]) << size;
938 size += ulvc_cwd_suf_len[u_q[i]];
939
940 } else {
941 /* 3 bits */
942 val |= (ulvc_cwd_pre[u_q[i]]) << size;
943 size += ulvc_cwd_pre_len[u_q[i]];
944
945 /* 3 bits */
946 val |= (ulvc_cwd_pre[u_q[i + 1]]) << size;
947 size += ulvc_cwd_pre_len[u_q[i + 1]];
948
949 /* 5 bits */
950 val |= (ulvc_cwd_suf[u_q[i]]) << size;
951 size += ulvc_cwd_suf_len[u_q[i]];
952
953 /* 5 bits */
954 val |= (ulvc_cwd_suf[u_q[i + 1]]) << size;
955 size += ulvc_cwd_suf_len[u_q[i + 1]];
956 }
957
958 vlc_encode(vlcp, val, size);
959 }
960}
961
962static void proc_vlc_encode2(vlc_struct *vlcp, ui32 *tuple,
963 ui32 *u_q, ui32 ignore)
964{
965 ui32 i_max = 16 - (ignore / 2);
966
967 for (ui32 i = 0; i < i_max; i += 2) {
968 /* 7 bits */
969 ui32 val = tuple[i + 0] >> 4;
970 int size = tuple[i + 0] & 7;
971
972 if (i + 1 < i_max) {
973 /* 7 bits */
974 val |= (tuple[i + 1] >> 4) << size;
975 size += tuple[i + 1] & 7;
976 }
977
978 /* 3 bits */
979 val |= ulvc_cwd_pre[u_q[i]] << size;
980 size += ulvc_cwd_pre_len[u_q[i]];
981
982 /* 3 bits */
983 val |= (ulvc_cwd_pre[u_q[i + 1]]) << size;
984 size += ulvc_cwd_pre_len[u_q[i + 1]];
985
986 /* 5 bits */
987 val |= (ulvc_cwd_suf[u_q[i + 0]]) << size;
988 size += ulvc_cwd_suf_len[u_q[i + 0]];
989
990 /* 5 bits */
991 val |= (ulvc_cwd_suf[u_q[i + 1]]) << size;
992 size += ulvc_cwd_suf_len[u_q[i + 1]];
993
994 vlc_encode(vlcp, val, size);
995 }
996}
997
998using fn_proc_vlc_encode = void (*)(vlc_struct *, ui32 *, ui32 *, ui32);
999
1000void ojph_encode_codeblock_avx512(ui32* buf, ui32 missing_msbs,
1001 ui32 num_passes, ui32 _width, ui32 height,
1002 ui32 stride, ui32* lengths,
1004 ojph::coded_lists *& coded)
1005{
1006 ojph_unused(num_passes); //currently not used
1007
1008 ui32 width = (_width + 31) & ~31u;
1009 ui32 ignore = width - _width;
1010 const int ms_size = (16384 * 16 + 14) / 15; //more than enough
1011 const int mel_vlc_size = 3072; //more than enough
1012 const int mel_size = 192;
1013 const int vlc_size = mel_vlc_size - mel_size;
1014
1015 ui8 ms_buf[ms_size];
1016 ui8 mel_vlc_buf[mel_vlc_size];
1017 ui8 *mel_buf = mel_vlc_buf;
1018 ui8 *vlc_buf = mel_vlc_buf + mel_size;
1019
1020 mel_struct mel;
1021 mel_init(&mel, mel_size, mel_buf);
1022 vlc_struct vlc;
1023 vlc_init(&vlc, vlc_size, vlc_buf);
1024 ms_struct ms;
1025 ms_init(&ms, ms_size, ms_buf);
1026
1027 ui32 p = 30 - missing_msbs;
1028
1029 //e_val: E values for a line (these are the highest set bit)
1030 //cx_val: is the context values
1031 //Each byte stores the info for the 2 sample. For E, it is maximum
1032 // of the two samples, while for cx, it is the OR of these two samples.
1033 //The maximum is between the pixel at the bottom left of one quad
1034 // and the bottom right of the earlier quad. The same is true for cx.
1035 //For a 1024 pixels, we need 512 bytes, the 2 extra,
1036 // one for the non-existing earlier quad, and one for beyond the
1037 // the end
1038 const __m512i right_shift = _mm512_set_epi32(
1039 0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1
1040 );
1041
1042 const __m512i left_shift = _mm512_set_epi32(
1043 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 15
1044 );
1045
1046 __m512i e_val_vec[33];
1047 for (ui32 i = 0; i < 32; ++i) {
1048 e_val_vec[i] = ZERO;
1049 }
1050 __m512i prev_e_val_vec = ZERO;
1051
1052 __m512i cx_val_vec[33];
1053 __m512i prev_cx_val_vec = ZERO;
1054
1055 __m512i prev_cq_vec = ZERO;
1056
1057 __m512i tmp;
1058 __m512i tmp1;
1059
1060 __m512i eq_vec[4];
1061 __m512i s_vec[4];
1062 __m512i src_vec[4];
1063 __m512i rho_vec;
1064 __m512i e_qmax_vec;
1065 __m512i kappa_vec;
1066
1067 ui32 n_loop = (width + 31) / 32;
1068
1069 ui32 *vlc_tbl = vlc_tbl0;
1070 fn_proc_cq proc_cq = proc_cq1;
1071 fn_proc_mel_encode proc_mel_encode = proc_mel_encode1;
1072 fn_proc_vlc_encode proc_vlc_encode = proc_vlc_encode1;
1073
1074 /* 2 lines per iteration */
1075 for (ui32 y = 0; y < height; y += 2)
1076 {
1077 e_val_vec[n_loop] = prev_e_val_vec;
1078 /* lcxp[0] = (ui8)((rho[0] & 8) >> 3); */
1079 tmp = _mm512_and_epi32(prev_cx_val_vec, _mm512_set1_epi32(8));
1080 tmp = _mm512_srli_epi32(tmp, 3);
1081 cx_val_vec[n_loop] = tmp;
1082
1083 prev_e_val_vec = ZERO;
1084 prev_cx_val_vec = ZERO;
1085
1086 ui32 *sp = buf + y * stride;
1087
1088 /* 32 bytes per iteration */
1089 for (ui32 x = 0; x < n_loop; ++x) {
1090
1091 // mask to stop loading unnecessary data
1092 si32 true_x = (si32)x << 5;
1093 ui32 mask32 = 0xFFFFFFFFu;
1094 si32 entries = true_x + 32 - (si32)_width;
1095 mask32 >>= ((entries >= 0) ? entries : 0);
1096 __mmask16 load_mask0 = _cvtu32_mask16(mask32);
1097 __mmask16 load_mask1 = _cvtu32_mask16(mask32 >> 16);
1098
1099 /* t = sp[i]; */
1100 src_vec[0] = _mm512_maskz_loadu_epi32(load_mask0, sp);
1101 src_vec[2] = _mm512_maskz_loadu_epi32(load_mask1, sp + 16);
1102
1103 if (y + 1 < height) {
1104 src_vec[1] = _mm512_maskz_loadu_epi32(load_mask0, sp + stride);
1105 src_vec[3] =
1106 _mm512_maskz_loadu_epi32(load_mask1, sp + 16 + stride);
1107 } else {
1108 src_vec[1] = ZERO;
1109 src_vec[3] = ZERO;
1110 }
1111 sp += 32;
1112
1113 /* src_vec layout:
1114 * src_vec[0]:[0, 0],[0, 1],[0, 2],[0, 3],[0, 4],[0, 5]...[0,15]
1115 * src_vec[1]:[1, 0],[1, 1],[1, 2],[1, 3],[1, 4],[1, 5]...[1,15]
1116 * src_vec[2]:[0,16],[0,17],[0,18],[0,19],[0,20],[0,21]...[0,31]
1117 * src_vec[3]:[1,16],[1,17],[1,18],[1,19],[1,20],[1,21]...[1,31]
1118 */
1119 proc_pixel(src_vec, p, eq_vec, s_vec, rho_vec, e_qmax_vec);
1120
1121 // max_e[(i + 1) % num] = ojph_max(lep[i + 1], lep[i + 2]) - 1;
1122 tmp = _mm512_permutexvar_epi32(right_shift, e_val_vec[x]);
1123 tmp = _mm512_mask_permutexvar_epi32(tmp, 0x8000, right_shift,
1124 e_val_vec[x + 1]);
1125 auto mask = _mm512_cmpgt_epi32_mask(e_val_vec[x], tmp);
1126 auto max_e_vec = _mm512_mask_mov_epi32(tmp, mask, e_val_vec[x]);
1127 max_e_vec = _mm512_sub_epi32(max_e_vec, ONE);
1128
1129 // kappa[i] = (rho[i] & (rho[i] - 1)) ? ojph_max(1, max_e[i]) : 1;
1130 tmp = _mm512_max_epi32(max_e_vec, ONE);
1131 tmp1 = _mm512_sub_epi32(rho_vec, ONE);
1132 tmp1 = _mm512_and_epi32(rho_vec, tmp1);
1133 mask = _mm512_cmpneq_epi32_mask(tmp1, ZERO);
1134 kappa_vec = _mm512_mask_mov_epi32(ONE, mask, tmp);
1135
1136 /* cq[1 - 16] = cq_vec
1137 * cq[0] = prev_cq_vec[0]
1138 */
1139 tmp = proc_cq(x, cx_val_vec, rho_vec, right_shift);
1140 auto cq_vec = _mm512_mask_permutexvar_epi32(prev_cq_vec, 0xFFFE,
1141 left_shift, tmp);
1142 prev_cq_vec = _mm512_mask_permutexvar_epi32(ZERO, 0x1, left_shift,
1143 tmp);
1144
1145 update_lep(x, prev_e_val_vec, eq_vec, e_val_vec, left_shift);
1146 update_lcxp(x, prev_cx_val_vec, rho_vec, cx_val_vec, left_shift);
1147
1148 /* Uq[i] = ojph_max(e_qmax[i], kappa[i]); */
1149 /* u_q[i] = Uq[i] - kappa[i]; */
1150 auto uq_vec = _mm512_max_epi32(kappa_vec, e_qmax_vec);
1151 auto u_q_vec = _mm512_sub_epi32(uq_vec, kappa_vec);
1152
1153 auto eps_vec = cal_eps_vec(eq_vec, u_q_vec, e_qmax_vec);
1154 __m512i tuple_vec = cal_tuple(cq_vec, rho_vec, eps_vec, vlc_tbl);
1155 ui32 _ignore = ((n_loop - 1) == x) ? ignore : 0;
1156
1157 proc_mel_encode(&mel, cq_vec, rho_vec, u_q_vec, _ignore,
1158 right_shift);
1159
1160 proc_ms_encode(&ms, tuple_vec, uq_vec, rho_vec, s_vec);
1161
1162 // vlc_encode(&vlc, tuple[i*2+0] >> 8, (tuple[i*2+0] >> 4) & 7);
1163 // vlc_encode(&vlc, tuple[i*2+1] >> 8, (tuple[i*2+1] >> 4) & 7);
1164 ui32 u_q[16];
1165 ui32 tuple[16];
1166 /* The tuple is scaled by 4 due to:
1167 * vlc_encode(&vlc, tuple0 >> 8, (tuple0 >> 4) & 7, true);
1168 * So in the vlc_encode, the tuple will only be scaled by 2.
1169 */
1170 tuple_vec = _mm512_srli_epi32(tuple_vec, 4);
1171 _mm512_store_epi32(tuple, tuple_vec);
1172 _mm512_store_epi32(u_q, u_q_vec);
1173 proc_vlc_encode(&vlc, tuple, u_q, _ignore);
1174 }
1175
1176 tmp = _mm512_permutexvar_epi32(right_shift, cx_val_vec[0]);
1177 tmp = _mm512_slli_epi32(tmp, 2);
1178 prev_cq_vec = _mm512_maskz_add_epi32(0x1, tmp, cx_val_vec[0]);
1179
1180 proc_cq = proc_cq2;
1181 vlc_tbl = vlc_tbl1;
1182 proc_mel_encode = proc_mel_encode2;
1183 proc_vlc_encode = proc_vlc_encode2;
1184 }
1185
1186 ms_terminate(&ms);
1187 terminate_mel_vlc(&mel, &vlc);
1188
1189 //copy to elastic
1190 lengths[0] = mel.pos + vlc.pos + ms.pos;
1191 elastic->get_buffer(mel.pos + vlc.pos + ms.pos, coded);
1192 memcpy(coded->buf, ms.buf, ms.pos);
1193 memcpy(coded->buf + ms.pos, mel.buf, mel.pos);
1194 memcpy(coded->buf + ms.pos + mel.pos, vlc.buf - vlc.pos + 1, vlc.pos);
1195
1196 // put in the interface locator word
1197 ui32 num_bytes = mel.pos + vlc.pos;
1198 coded->buf[lengths[0]-1] = (ui8)(num_bytes >> 4);
1199 coded->buf[lengths[0]-2] = coded->buf[lengths[0]-2] & 0xF0;
1200 coded->buf[lengths[0]-2] =
1201 (ui8)(coded->buf[lengths[0]-2] | (num_bytes & 0xF));
1202
1203 coded->avail_size -= lengths[0];
1204}
1205
1206} /* namespace local */
1207} /* namespace ojph */
1208
void get_buffer(ui32 needed_bytes, coded_lists *&p)
Definition: ojph_mem.cpp:95
static bool uvlc_init_tables()
Initializes uvlc_tbl0 and uvlc_tbl1 tables.
static bool vlc_init_tables()
Initializes vlc_tbl0 and vlc_tbl1 tables, from table0.h and table1.h.
ui16 vlc_tbl0[1024]
vlc_tbl0 contains decoding information for initial row of quads
ui16 vlc_tbl1[1024]
vlc_tbl1 contains decoding information for non-initial row of quads
static void ms_terminate(ms_struct *msp)
static void update_lep(ui32 x, __m512i &prev_e_val_vec, __m512i *eq_vec, __m512i *e_val_vec, const __m512i left_shift)
static int ulvc_cwd_suf[33]
static int ulvc_cwd_suf_len[33]
static void proc_mel_encode2(mel_struct *melp, __m512i &cq_vec, __m512i &rho_vec, __m512i u_q_vec, ui32 ignore, const __m512i right_shift)
static void proc_mel_encode1(mel_struct *melp, __m512i &cq_vec, __m512i &rho_vec, __m512i u_q_vec, ui32 ignore, const __m512i right_shift)
static void proc_vlc_encode2(vlc_struct *vlcp, ui32 *tuple, ui32 *u_q, ui32 ignore)
static void vlc_encode(vlc_struct *vlcp, int cwd, int cwd_len)
void(*)(mel_struct *, __m512i &, __m512i &, __m512i, ui32, const __m512i) fn_proc_mel_encode
static __m512i cal_eps_vec(__m512i *eq_vec, __m512i &u_q_vec, __m512i &e_qmax_vec)
static void terminate_mel_vlc(mel_struct *melp, vlc_struct *vlcp)
static void proc_pixel(__m512i *src_vec, ui32 p, __m512i *eq_vec, __m512i *s_vec, __m512i &rho_vec, __m512i &e_qmax_vec)
static void mel_init(dec_mel_st *melp, ui8 *bbuf, int lcup, int scup)
Initiates a dec_mel_st structure for MEL decoding and reads some bytes in order to get the read addre...
static void proc_ms_encode(ms_struct *msp, __m512i &tuple_vec, __m512i &uq_vec, __m512i &rho_vec, __m512i *s_vec)
void(*)(vlc_struct *, ui32 *, ui32 *, ui32) fn_proc_vlc_encode
static void rotate_matrix(__m512i *matrix)
static void ms_init(ms_struct *msp, ui32 buffer_size, ui8 *data)
static void ms_encode(ms_struct *msp, ui32 cwd, int cwd_len)
static int ulvc_cwd_pre_len[33]
static __m512i proc_cq2(ui32 x, __m512i *cx_val_vec, __m512i &rho_vec, const __m512i right_shift)
static int ulvc_cwd_pre[33]
static void mel_encode(mel_struct *melp, bool bit)
static void mel_emit_bit(mel_struct *melp, int v)
static void proc_vlc_encode1(vlc_struct *vlcp, ui32 *tuple, ui32 *u_q, ui32 ignore)
static __m512i cal_tuple(__m512i &cq_vec, __m512i &rho_vec, __m512i &eps_vec, ui32 *vlc_tbl)
static void update_lcxp(ui32 x, __m512i &prev_cx_val_vec, __m512i &rho_vec, __m512i *cx_val_vec, const __m512i left_shift)
__m512i(*)(ui32, __m512i *, __m512i &, const __m512i) fn_proc_cq
static __m512i proc_cq1(ui32 x, __m512i *cx_val_vec, __m512i &rho_vec, const __m512i right_shift)
static void vlc_init(vlc_struct *vlcp, ui32 buffer_size, ui8 *data)
void ojph_encode_codeblock_avx512(ui32 *buf, ui32 missing_msbs, ui32 num_passes, ui32 width, ui32 height, ui32 stride, ui32 *lengths, ojph::mem_elastic_allocator *elastic, ojph::coded_lists *&coded)
uint64_t ui64
Definition: ojph_defs.h:56
@ X86_CPU_EXT_LEVEL_AVX512
Definition: ojph_arch.h:106
uint16_t ui16
Definition: ojph_defs.h:52
static ui32 population_count(ui32 val)
Definition: ojph_arch.h:110
OJPH_EXPORT int get_cpu_ext_level()
Definition: ojph_arch.cpp:184
int32_t si32
Definition: ojph_defs.h:55
uint32_t ui32
Definition: ojph_defs.h:54
uint8_t ui8
Definition: ojph_defs.h:50
#define likely(x)
#define unlikely(x)
#define ojph_max(a, b)
Definition: ojph_defs.h:73
#define ojph_min(a, b)
Definition: ojph_defs.h:76
#define ojph_unused(x)
Definition: ojph_defs.h:78
#define OJPH_ERROR(t,...)
Definition: ojph_message.h:131