OpenJPH
Open-source implementation of JPEG2000 Part-15
ojph_block_decoder_ssse3.cpp
Go to the documentation of this file.
1//***************************************************************************/
2// This software is released under the 2-Clause BSD license, included
3// below.
4//
5// Copyright (c) 2022, Aous Naman
6// Copyright (c) 2022, Kakadu Software Pty Ltd, Australia
7// Copyright (c) 2022, The University of New South Wales, Australia
8//
9// Redistribution and use in source and binary forms, with or without
10// modification, are permitted provided that the following conditions are
11// met:
12//
13// 1. Redistributions of source code must retain the above copyright
14// notice, this list of conditions and the following disclaimer.
15//
16// 2. Redistributions in binary form must reproduce the above copyright
17// notice, this list of conditions and the following disclaimer in the
18// documentation and/or other materials provided with the distribution.
19//
20// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
21// IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
22// TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
23// PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
24// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
25// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
26// TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
27// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
28// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
29// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
30// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31//***************************************************************************/
32// This file is part of the OpenJPH software implementation.
33// File: ojph_block_decoder_ssse3.cpp
34// Author: Aous Naman
35// Date: 13 May 2022
36//***************************************************************************/
37
38//***************************************************************************/
43#include <string>
44#include <iostream>
45
46#include <cassert>
47#include <cstring>
48#include "ojph_block_common.h"
49#include "ojph_block_decoder.h"
50#include "ojph_arch.h"
51#include "ojph_message.h"
52
53#include <immintrin.h>
54
55namespace ojph {
56 namespace local {
57
58 //************************************************************************/
    /** MEL (adaptive run-length) decoder state.
     *  Holds the read position within the MEL segment, a 64-bit
     *  MSB-aligned bit buffer, the adaptive state index k, and a small
     *  queue of already-decoded run lengths (7 bits each, up to 8 runs).
     */
    struct dec_mel_st {
      dec_mel_st() : data(NULL), tmp(0), bits(0), size(0), unstuff(false),
                     k(0), num_runs(0), runs(0)
      {}
      // data decoding machinary
      ui8* data;    // pointer to the next unread byte of the MEL segment
      ui64 tmp;     // bit buffer; decoding consumes from the MSB
      int bits;     // number of valid bits currently in tmp
      int size;     // number of bytes remaining in the MEL segment
      bool unstuff; // true if the next byte needs a stuffing bit removed
      int k;        // MEL state index (selects the run exponent)

      // queue of decoded runs
      int num_runs; // number of runs currently stored in `runs`
      ui64 runs;    // packed queue of decoded runs, 7 bits per run
    };
81
82 //************************************************************************/
    /** Reads up to 32 bits from the MEL segment, removes stuffing bits,
     *  and appends the result to the MSB-aligned bit buffer melp->tmp.
     *  Once the segment is exhausted, 0xFF bytes are fed in (per the
     *  standard, trailing data is treated as all ones).
     *  NOTE(review): the 32-bit read via `*(ui32*)melp->data` assumes a
     *  little-endian target that tolerates unaligned/aliased loads —
     *  consistent with the x86/SSSE3 scope of this file.
     */
    static inline
    void mel_read(dec_mel_st *melp)
    {
      if (melp->bits > 32)  //there are enough bits in the tmp variable
        return;             // return without reading new data

      ui32 val = 0xFFFFFFFF;      // feed in 0xFF if buffer is exhausted
      if (melp->size > 4) {       // if there is data in the MEL segment
        val = *(ui32*)melp->data; // read 32 bits from MEL data
        melp->data += 4;          // advance pointer
        melp->size -= 4;          // reduce counter
      }
      else if (melp->size > 0)
      { // 4 or less
        int i = 0;
        while (melp->size > 1) {
          ui32 v = *melp->data++;    // read one byte at a time
          ui32 m = ~(0xFFu << i);    // mask of location
          val = (val & m) | (v << i);// put one byte in its correct location
          --melp->size;
          i += 8;
        }
        // size equal to 1
        ui32 v = *melp->data++; // the one before the last is different
        v |= 0xF;               // MEL and VLC segments can overlap
        ui32 m = ~(0xFFu << i);
        val = (val & m) | (v << i);
        --melp->size;
      }

      // next we unstuff them before adding them to the buffer
      int bits = 32 - melp->unstuff; // number of bits in val, subtract 1 if
                                     // the previously read byte requires
                                     // unstuffing

      // data is unstuffed and accumulated in t
      // bits has the number of bits in t
      ui32 t = val & 0xFF;
      bool unstuff = ((val & 0xFF) == 0xFF); // true if we need unstuffing
      bits -= unstuff; // there is one less bit in t if unstuffing is needed
      t = t << (8 - unstuff); // move up to make room for the next byte

      //this is a repeat of the above
      t |= (val>>8) & 0xFF;
      unstuff = (((val >> 8) & 0xFF) == 0xFF);
      bits -= unstuff;
      t = t << (8 - unstuff);

      t |= (val>>16) & 0xFF;
      unstuff = (((val >> 16) & 0xFF) == 0xFF);
      bits -= unstuff;
      t = t << (8 - unstuff);

      t |= (val>>24) & 0xFF;
      melp->unstuff = (((val >> 24) & 0xFF) == 0xFF);

      // move t to tmp, and push the result all the way up, so we read from
      // the MSB
      melp->tmp |= ((ui64)t) << (64 - bits - melp->bits);
      melp->bits += bits; //increment the number of bits in tmp
    }
155
156 //************************************************************************/
171 static inline
173 {
174 static const int mel_exp[13] = { //MEL exponents
175 0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 4, 5
176 };
177
178 if (melp->bits < 6) // if there are less than 6 bits in tmp
179 mel_read(melp); // then read from the MEL bitstream
180 // 6 bits is the largest decodable MEL cwd
181
182 //repeat so long that there is enough decodable bits in tmp,
183 // and the runs store is not full (num_runs < 8)
184 while (melp->bits >= 6 && melp->num_runs < 8)
185 {
186 int eval = mel_exp[melp->k]; // number of bits associated with state
187 int run = 0;
188 if (melp->tmp & (1ull<<63)) //The next bit to decode (stored in MSB)
189 { //one is found
190 run = 1 << eval;
191 run--; // consecutive runs of 0 events - 1
192 melp->k = melp->k + 1 < 12 ? melp->k + 1 : 12;//increment, max is 12
193 melp->tmp <<= 1; // consume one bit from tmp
194 melp->bits -= 1;
195 run = run << 1; // a stretch of zeros not terminating in one
196 }
197 else
198 { //0 is found
199 run = (int)(melp->tmp >> (63 - eval)) & ((1 << eval) - 1);
200 melp->k = melp->k - 1 > 0 ? melp->k - 1 : 0; //decrement, min is 0
201 melp->tmp <<= eval + 1; //consume eval + 1 bits (max is 6)
202 melp->bits -= eval + 1;
203 run = (run << 1) + 1; // a stretch of zeros terminating with one
204 }
205 eval = melp->num_runs * 7; // 7 bits per run
206 melp->runs &= ~((ui64)0x3F << eval); // 6 bits are sufficient
207 melp->runs |= ((ui64)run) << eval; // store the value in runs
208 melp->num_runs++; // increment count
209 }
210 }
211
212 //************************************************************************/
    /** Initializes a MEL decoder for one codeblock.
     *
     *  @param melp  MEL decoder state to initialize
     *  @param bbuf  pointer to the start of the coded data
     *  @param lcup  length of the cleanup pass segment
     *  @param scup  length of the MEL + VLC segments (MEL grows forward
     *               from bbuf + lcup - scup)
     *
     *  Besides setting the fields, this pre-reads 1..4 bytes so that the
     *  data pointer becomes 4-byte aligned before mel_read's bulk 32-bit
     *  reads.
     */
    static inline
    void mel_init(dec_mel_st *melp, ui8* bbuf, int lcup, int scup)
    {
      melp->data = bbuf + lcup - scup; // move the pointer to the start of MEL
      melp->bits = 0;                  // 0 bits in tmp
      melp->tmp = 0;                   //
      melp->unstuff = false;           // no unstuffing
      melp->size = scup - 1;           // size is the length of MEL+VLC-1
      melp->k = 0;                     // 0 for state
      melp->num_runs = 0;              // num_runs is 0
      melp->runs = 0;                  //

      //This code is borrowed; original is for a different architecture
      //These few lines take care of the case where data is not at a multiple
      // of 4 boundary. It reads 1,2,3 up to 4 bytes from the MEL segment
      int num = 4 - (int)(intptr_t(melp->data) & 0x3);
      for (int i = 0; i < num; ++i) { // this code is similar to mel_read
        assert(melp->unstuff == false || melp->data[0] <= 0x8F);
        ui64 d = (melp->size > 0) ? *melp->data : 0xFF;//if buffer is consumed
                                                       //set data to 0xFF
        if (melp->size == 1) d |= 0xF; //if this is MEL+VLC-1, set LSBs to 0xF
                                       // see the standard
        melp->data += melp->size-- > 0; //increment if the end is not reached
        int d_bits = 8 - melp->unstuff; //if unstuffing is needed, reduce by 1
        melp->tmp = (melp->tmp << d_bits) | d; //store bits in tmp
        melp->bits += d_bits;                  //increment tmp by number of bits
        melp->unstuff = ((d & 0xFF) == 0xFF);  //true of next byte needs
                                               //unstuffing
      }
      melp->tmp <<= (64 - melp->bits); //push all the way up so the first bit
                                       // is the MSB
    }
254
255 //************************************************************************/
261 static inline
263 {
264 if (melp->num_runs == 0) //if no runs, decode more bit from MEL segment
265 mel_decode(melp);
266
267 int t = melp->runs & 0x7F; //retrieve one run
268 melp->runs >>= 7; // remove the retrieved run
269 melp->num_runs--;
270 return t; // return run
271 }
272
273 //************************************************************************/
    /** State for a backward-growing (reverse-read) bitstream reader; used
     *  for both the VLC segment and the MagRef (MRP) segment, which are
     *  read from their last byte toward their first.
     */
    struct rev_struct {
      rev_struct() : data(NULL), tmp(0), bits(0), size(0), unstuff(false)
      {}
      //storage
      ui8* data;    // pointer to the next unread byte (moves backward)
      ui64 tmp;     // bit buffer; consumption happens from the LSB
      ui32 bits;    // number of valid bits currently in tmp
      int size;     // number of bytes remaining in the segment
      bool unstuff; // true if the next byte may carry a stuffing bit
    };
288
289 //************************************************************************/
309 static inline
311 {
312 //process 4 bytes at a time
313 if (vlcp->bits > 32) // if there are more than 32 bits in tmp, then
314 return; // reading 32 bits can overflow vlcp->tmp
315 ui32 val = 0;
316 //the next line (the if statement) needs to be tested first
317 if (vlcp->size > 3) // if there are more than 3 bytes left in VLC
318 {
319 // (vlcp->data - 3) move pointer back to read 32 bits at once
320 val = *(ui32*)(vlcp->data - 3); // then read 32 bits
321 vlcp->data -= 4; // move data pointer back by 4
322 vlcp->size -= 4; // reduce available byte by 4
323 }
324 else if (vlcp->size > 0)
325 { // 4 or less
326 int i = 24;
327 while (vlcp->size > 0) {
328 ui32 v = *vlcp->data--; // read one byte at a time
329 val |= (v << i); // put byte in its correct location
330 --vlcp->size;
331 i -= 8;
332 }
333 }
334
335 //accumulate in tmp, number of bits in tmp are stored in bits
336 ui32 tmp = val >> 24; //start with the MSB byte
337 ui32 bits;
338
339 // test unstuff (previous byte is >0x8F), and this byte is 0x7F
340 bits = 8 - ((vlcp->unstuff && (((val >> 24) & 0x7F) == 0x7F)) ? 1 : 0);
341 bool unstuff = (val >> 24) > 0x8F; //this is for the next byte
342
343 tmp |= ((val >> 16) & 0xFF) << bits; //process the next byte
344 bits += 8 - ((unstuff && (((val >> 16) & 0x7F) == 0x7F)) ? 1 : 0);
345 unstuff = ((val >> 16) & 0xFF) > 0x8F;
346
347 tmp |= ((val >> 8) & 0xFF) << bits;
348 bits += 8 - ((unstuff && (((val >> 8) & 0x7F) == 0x7F)) ? 1 : 0);
349 unstuff = ((val >> 8) & 0xFF) > 0x8F;
350
351 tmp |= (val & 0xFF) << bits;
352 bits += 8 - ((unstuff && ((val & 0x7F) == 0x7F)) ? 1 : 0);
353 unstuff = (val & 0xFF) > 0x8F;
354
355 // now move the read and unstuffed bits into vlcp->tmp
356 vlcp->tmp |= (ui64)tmp << vlcp->bits;
357 vlcp->bits += bits;
358 vlcp->unstuff = unstuff; // this for the next read
359 }
360
361 //************************************************************************/
    /** Initializes the backward VLC-segment reader.
     *
     *  @param vlcp reverse reader state to initialize
     *  @param data pointer to the start of the coded data
     *  @param lcup length of the cleanup pass segment
     *  @param scup length of the MEL + VLC segments (VLC is read backward
     *              starting at data + lcup - 2; the final half-byte at
     *              lcup-1 belongs to the interleaved length field)
     *
     *  Reads the first (half) byte, then pre-reads up to `num` bytes so
     *  the data pointer lands on a 4-byte boundary for rev_read's bulk
     *  32-bit reads, and finally primes the bit buffer via rev_read.
     */
    static inline
    void rev_init(rev_struct *vlcp, ui8* data, int lcup, int scup)
    {
      //first byte has only the upper 4 bits
      vlcp->data = data + lcup - 2;

      //size can not be larger than this, in fact it should be smaller
      vlcp->size = scup - 2;

      ui32 d = *vlcp->data--; // read one byte (this is a half byte)
      vlcp->tmp = d >> 4;     // both initialize and set
      vlcp->bits = 4 - ((vlcp->tmp & 7) == 7); //check standard
      vlcp->unstuff = (d | 0xF) > 0x8F; //this is useful for the next byte

      //This code is designed for an architecture that read address should
      // align to the read size (address multiple of 4 if read size is 4)
      //These few lines take care of the case where data is not at a multiple
      // of 4 boundary. It reads 1,2,3 up to 4 bytes from the VLC bitstream.
      // To read 32 bits, read from (vlcp->data - 3)
      int num = 1 + (int)(intptr_t(vlcp->data) & 0x3);
      int tnum = num < vlcp->size ? num : vlcp->size;
      for (int i = 0; i < tnum; ++i) {
        ui64 d;
        d = *vlcp->data--; // read one byte and move read pointer
        //check if the last byte was >0x8F (unstuff == true) and this is 0x7F
        ui32 d_bits = 8 - ((vlcp->unstuff && ((d & 0x7F) == 0x7F)) ? 1 : 0);
        vlcp->tmp |= d << vlcp->bits; // move data to vlcp->tmp
        vlcp->bits += d_bits;
        vlcp->unstuff = d > 0x8F; // for next byte
      }
      vlcp->size -= tnum;
      rev_read(vlcp); // read another 32 buts
    }
408
409 //************************************************************************/
416 static inline
418 {
419 if (vlcp->bits < 32) // if there are less then 32 bits, read more
420 {
421 rev_read(vlcp); // read 32 bits, but unstuffing might reduce this
422 if (vlcp->bits < 32)// if there is still space in vlcp->tmp for 32 bits
423 rev_read(vlcp); // read another 32
424 }
425 return (ui32)vlcp->tmp; // return the head (bottom-most) of vlcp->tmp
426 }
427
428 //************************************************************************/
434 static inline
436 {
437 assert(num_bits <= vlcp->bits); // vlcp->tmp must have more than num_bits
438 vlcp->tmp >>= num_bits; // remove bits
439 vlcp->bits -= num_bits; // decrement the number of bits
440 return (ui32)vlcp->tmp;
441 }
442
443 //************************************************************************/
454 static inline
456 {
457 //process 4 bytes at a time
458 if (mrp->bits > 32)
459 return;
460 ui32 val = 0;
461 if (mrp->size > 3) // If there are 3 byte or more
462 { // (mrp->data - 3) move pointer back to read 32 bits at once
463 val = *(ui32*)(mrp->data - 3); // read 32 bits
464 mrp->data -= 4; // move back pointer
465 mrp->size -= 4; // reduce count
466 }
467 else if (mrp->size > 0)
468 {
469 int i = 24;
470 while (mrp->size > 0) {
471 ui32 v = *mrp->data--; // read one byte at a time
472 val |= (v << i); // put byte in its correct location
473 --mrp->size;
474 i -= 8;
475 }
476 }
477
478 //accumulate in tmp, and keep count in bits
479 ui32 bits, tmp = val >> 24;
480
481 //test if the last byte > 0x8F (unstuff must be true) and this is 0x7F
482 bits = 8 - ((mrp->unstuff && (((val >> 24) & 0x7F) == 0x7F)) ? 1 : 0);
483 bool unstuff = (val >> 24) > 0x8F;
484
485 //process the next byte
486 tmp |= ((val >> 16) & 0xFF) << bits;
487 bits += 8 - ((unstuff && (((val >> 16) & 0x7F) == 0x7F)) ? 1 : 0);
488 unstuff = ((val >> 16) & 0xFF) > 0x8F;
489
490 tmp |= ((val >> 8) & 0xFF) << bits;
491 bits += 8 - ((unstuff && (((val >> 8) & 0x7F) == 0x7F)) ? 1 : 0);
492 unstuff = ((val >> 8) & 0xFF) > 0x8F;
493
494 tmp |= (val & 0xFF) << bits;
495 bits += 8 - ((unstuff && ((val & 0x7F) == 0x7F)) ? 1 : 0);
496 unstuff = (val & 0xFF) > 0x8F;
497
498 mrp->tmp |= (ui64)tmp << mrp->bits; // move data to mrp pointer
499 mrp->bits += bits;
500 mrp->unstuff = unstuff; // next byte
501 }
502
503 //************************************************************************/
    /** Initializes the backward MagRef (MRP) segment reader.
     *
     *  @param mrp  reverse reader state to initialize
     *  @param data pointer to the start of the coded data
     *  @param lcup length of the cleanup pass segment
     *  @param len2 length of the MRP segment; reading starts at its last
     *              byte (data + lcup + len2 - 1) and moves backward
     *
     *  `unstuff` starts true per the standard's termination rules for
     *  this segment. Pre-reads up to `num` bytes so the pointer lands on
     *  a 4-byte boundary for rev_read_mrp's bulk 32-bit reads.
     */
    static inline
    void rev_init_mrp(rev_struct *mrp, ui8* data, int lcup, int len2)
    {
      mrp->data = data + lcup + len2 - 1;
      mrp->size = len2;
      mrp->unstuff = true;
      mrp->bits = 0;
      mrp->tmp = 0;

      //This code is designed for an architecture that read address should
      // align to the read size (address multiple of 4 if read size is 4)
      //These few lines take care of the case where data is not at a multiple
      // of 4 boundary. It reads 1,2,3 up to 4 bytes from the MRP stream
      int num = 1 + (int)(intptr_t(mrp->data) & 0x3);
      for (int i = 0; i < num; ++i) {
        ui64 d;
        //read a byte, 0 if no more data
        d = (mrp->size-- > 0) ? *mrp->data-- : 0;
        //check if unstuffing is needed
        ui32 d_bits = 8 - ((mrp->unstuff && ((d & 0x7F) == 0x7F)) ? 1 : 0);
        mrp->tmp |= d << mrp->bits; // move data to vlcp->tmp
        mrp->bits += d_bits;
        mrp->unstuff = d > 0x8F; // for next byte
      }
      rev_read_mrp(mrp);
    }
544
545 //************************************************************************/
552 static inline
554 {
555 if (mrp->bits < 32) // if there are less than 32 bits in mrp->tmp
556 {
557 rev_read_mrp(mrp); // read 30-32 bits from mrp
558 if (mrp->bits < 32) // if there is a space of 32 bits
559 rev_read_mrp(mrp); // read more
560 }
561 return (ui32)mrp->tmp; // return the head of mrp->tmp
562 }
563
564 //************************************************************************/
570 inline ui32 rev_advance_mrp(rev_struct *mrp, ui32 num_bits)
571 {
572 assert(num_bits <= mrp->bits); // we must not consume more than mrp->bits
573 mrp->tmp >>= num_bits; // discard the lowest num_bits bits
574 mrp->bits -= num_bits;
575 return (ui32)mrp->tmp; // return data after consumption
576 }
577
578 //************************************************************************/
    /** State of the forward-growing bitstream reader (SSSE3 flavor);
     *  used for the MagSgn and SgnProp streams, which are read from
     *  their first byte toward their last, 16 bytes at a time.
     */
    struct frwd_struct {
      const ui8* data; // pointer to the next unread bytes
      ui8 tmp[48];     // buffer of already-unstuffed data
                       // (48 bytes so a 128-bit store at any byte offset
                       //  plus one spill byte stays in bounds)
      ui32 bits;       // number of valid bits currently in tmp
      ui32 unstuff;    // 1 if the next byte needs a stuffing bit removed
      int size;        // number of bytes remaining in the stream
    };
589
590 //************************************************************************/
608 template<int X>
609 static inline
611 {
612 assert(msp->bits <= 128);
613
614 __m128i offset, val, validity, all_xff;
615 val = _mm_loadu_si128((__m128i*)msp->data);
616 int bytes = msp->size >= 16 ? 16 : msp->size;
617 validity = _mm_set1_epi8((char)bytes);
618 msp->data += bytes;
619 msp->size -= bytes;
620 int bits = 128;
621 offset = _mm_set_epi64x(0x0F0E0D0C0B0A0908,0x0706050403020100);
622 validity = _mm_cmpgt_epi8(validity, offset);
623 all_xff = _mm_set1_epi8(-1);
624 if (X == 0xFF) // the compiler should remove this if statement
625 {
626 __m128i t = _mm_xor_si128(validity, all_xff); // complement
627 val = _mm_or_si128(t, val); // fill with 0xFF
628 }
629 else if (X == 0)
630 val = _mm_and_si128(validity, val); // fill with zeros
631 else
632 assert(0);
633
634 __m128i ff_bytes;
635 ff_bytes = _mm_cmpeq_epi8(val, all_xff);
636 ff_bytes = _mm_and_si128(ff_bytes, validity);
637 ui32 flags = (ui32)_mm_movemask_epi8(ff_bytes);
638 flags <<= 1; // unstuff following byte
639 ui32 next_unstuff = flags >> 16;
640 flags |= msp->unstuff;
641 flags &= 0xFFFF;
642 while (flags)
643 { // bit unstuffing occurs on average once every 256 bytes
644 // therefore it is not an issue if it is a bit slow
645 // here we process 16 bytes
646 --bits; // consuming one stuffing bit
647
648 ui32 loc = 31 - count_leading_zeros(flags);
649 flags ^= 1 << loc;
650
651 __m128i m, t, c;
652 t = _mm_set1_epi8((char)loc);
653 m = _mm_cmpgt_epi8(offset, t);
654
655 t = _mm_and_si128(m, val); // keep bits at locations larger than loc
656 c = _mm_srli_epi64(t, 1); // 1 bits left
657 t = _mm_srli_si128(t, 8); // 8 bytes left
658 t = _mm_slli_epi64(t, 63); // keep the MSB only
659 t = _mm_or_si128(t, c); // combine the above 3 steps
660
661 val = _mm_or_si128(t, _mm_andnot_si128(m, val));
662 }
663
664 // combine with earlier data
665 assert(msp->bits >= 0 && msp->bits <= 128);
666 int cur_bytes = msp->bits >> 3;
667 int cur_bits = msp->bits & 7;
668 __m128i b1, b2;
669 b1 = _mm_sll_epi64(val, _mm_set1_epi64x(cur_bits));
670 b2 = _mm_slli_si128(val, 8); // 8 bytes right
671 b2 = _mm_srl_epi64(b2, _mm_set1_epi64x(64-cur_bits));
672 b1 = _mm_or_si128(b1, b2);
673 b2 = _mm_loadu_si128((__m128i*)(msp->tmp + cur_bytes));
674 b2 = _mm_or_si128(b1, b2);
675 _mm_storeu_si128((__m128i*)(msp->tmp + cur_bytes), b2);
676
677 int consumed_bits = bits < 128 - cur_bits ? bits : 128 - cur_bits;
678 cur_bytes = (msp->bits + (ui32)consumed_bits + 7) >> 3; // round up
679 int upper = _mm_extract_epi16(val, 7);
680 upper >>= consumed_bits - 128 + 16;
681 msp->tmp[cur_bytes] = (ui8)upper; // copy byte
682
683 msp->bits += (ui32)bits;
684 msp->unstuff = next_unstuff; // next unstuff
685 assert(msp->unstuff == 0 || msp->unstuff == 1);
686 }
687
688 //************************************************************************/
697 template<int X>
698 static inline
699 void frwd_init(frwd_struct *msp, const ui8* data, int size)
700 {
701 msp->data = data;
702 _mm_storeu_si128((__m128i *)msp->tmp, _mm_setzero_si128());
703 _mm_storeu_si128((__m128i *)msp->tmp + 1, _mm_setzero_si128());
704 _mm_storeu_si128((__m128i *)msp->tmp + 2, _mm_setzero_si128());
705
706 msp->bits = 0;
707 msp->unstuff = 0;
708 msp->size = size;
709
710 frwd_read<X>(msp); // read 128 bits more
711 }
712
713 //************************************************************************/
    /** Consumes num_bits from the staging buffer msp->tmp by shifting the
     *  buffered 256 bits right; a whole-byte portion of the shift is done
     *  by offsetting the load pointer (the & 0x18 keeps it a multiple of
     *  8 bytes within the 48-byte buffer), and the remaining 0-63 bits by
     *  64-bit lane shifts with cross-lane carry.
     *
     *  @param num_bits bits to consume; 0 < num_bits <= msp->bits < 128
     */
    static inline
    void frwd_advance(frwd_struct *msp, ui32 num_bits)
    {
      assert(num_bits > 0 && num_bits <= msp->bits && num_bits < 128);
      msp->bits -= num_bits;

      __m128i *p = (__m128i*)(msp->tmp + ((num_bits >> 3) & 0x18));
      num_bits &= 63;

      __m128i v0, v1, c0, c1, t;
      v0 = _mm_loadu_si128(p);
      v1 = _mm_loadu_si128(p + 1);

      // shift right by num_bits
      c0 = _mm_srl_epi64(v0, _mm_set1_epi64x(num_bits));
      t = _mm_srli_si128(v0, 8);
      t = _mm_sll_epi64(t, _mm_set1_epi64x(64 - num_bits));
      c0 = _mm_or_si128(c0, t);
      t = _mm_slli_si128(v1, 8);
      t = _mm_sll_epi64(t, _mm_set1_epi64x(64 - num_bits));
      c0 = _mm_or_si128(c0, t);

      _mm_storeu_si128((__m128i*)msp->tmp, c0);

      c1 = _mm_srl_epi64(v1, _mm_set1_epi64x(num_bits));
      t = _mm_srli_si128(v1, 8);
      t = _mm_sll_epi64(t, _mm_set1_epi64x(64 - num_bits));
      c1 = _mm_or_si128(c1, t);

      _mm_storeu_si128((__m128i*)msp->tmp + 1, c1);
    }
750
751 //************************************************************************/
758 template<int X>
759 static inline
761 {
762 if (msp->bits <= 128)
763 {
764 frwd_read<X>(msp);
765 if (msp->bits <= 128) //need to test
766 frwd_read<X>(msp);
767 }
768 __m128i t = _mm_loadu_si128((__m128i*)msp->tmp);
769 return t;
770 }
771
772 //************************************************************************/
    /** Decodes the 4 samples of one quad into 32-bit lanes.
     *
     *  @tparam N       which quad (0 or 1) of inf_u_q to process; selects
     *                  the 32-bit lane to broadcast and the byte shuffle
     *                  used when accumulating vn
     *  @param inf_u_q  per-quad flags (e_k, e_1, rho packed per lane)
     *  @param U_q      per-quad U values
     *  @param magsgn   forward reader over the MagSgn stream
     *  @param p        bitplane parameter; samples are scaled by 2^(p-1)
     *  @param vn       in/out accumulator of decoded magnitude info for
     *                  neighboring-quad context (updated via OR)
     *  @return the decoded row of 4 samples (0 for insignificant lanes)
     */
    template <int N>
    static inline
    __m128i decode_one_quad32(const __m128i inf_u_q, __m128i U_q,
                              frwd_struct* magsgn, ui32 p, __m128i& vn)
    {
      __m128i w0;    // workers
      __m128i insig; // lanes hold FF's if samples are insignificant
      __m128i flags; // lanes hold e_k, e_1, and rho
      __m128i row;   // decoded row

      row = _mm_setzero_si128();
      w0 = _mm_shuffle_epi32(inf_u_q, _MM_SHUFFLE(N, N, N, N));
      // we keeps e_k, e_1, and rho in w2
      flags = _mm_and_si128(w0, _mm_set_epi32(0x8880, 0x4440, 0x2220, 0x1110));
      insig = _mm_cmpeq_epi32(flags, _mm_setzero_si128());
      if (_mm_movemask_epi8(insig) != 0xFFFF) //are all insignificant?
      {
        U_q = _mm_shuffle_epi32(U_q, _MM_SHUFFLE(N, N, N, N));
        flags = _mm_mullo_epi16(flags, _mm_set_epi16(1,1,2,2,4,4,8,8));
        __m128i ms_vec = frwd_fetch<0xFF>(magsgn);

        // U_q holds U_q for this quad
        // flags has e_k, e_1, and rho such that e_k is sitting in the
        // 0x8000, e_1 in 0x800, and rho in 0x80

        // next e_k and m_n
        __m128i m_n;
        w0 = _mm_srli_epi32(flags, 15); // e_k
        m_n = _mm_sub_epi32(U_q, w0);
        m_n = _mm_andnot_si128(insig, m_n);

        // find cumulative sums
        // to find at which bit in ms_vec the sample starts
        __m128i inc_sum = m_n; // inclusive scan
        inc_sum = _mm_add_epi32(inc_sum, _mm_bslli_si128(inc_sum, 4));
        inc_sum = _mm_add_epi32(inc_sum, _mm_bslli_si128(inc_sum, 8));
        int total_mn = _mm_extract_epi16(inc_sum, 6);
        __m128i ex_sum = _mm_bslli_si128(inc_sum, 4); // exclusive scan

        // find the starting byte and starting bit
        __m128i byte_idx = _mm_srli_epi32(ex_sum, 3);
        __m128i bit_idx = _mm_and_si128(ex_sum, _mm_set1_epi32(7));
        byte_idx = _mm_shuffle_epi8(byte_idx,
          _mm_set_epi32(0x0C0C0C0C, 0x08080808, 0x04040404, 0x00000000));
        byte_idx = _mm_add_epi32(byte_idx, _mm_set1_epi32(0x03020100));
        __m128i d0 = _mm_shuffle_epi8(ms_vec, byte_idx);
        byte_idx = _mm_add_epi32(byte_idx, _mm_set1_epi32(0x01010101));
        __m128i d1 = _mm_shuffle_epi8(ms_vec, byte_idx);

        // shift samples values to correct location
        // (the multiply by (1 << (8 - bit_idx)) + 1 realizes a per-lane
        //  variable shift, which SSSE3 lacks)
        bit_idx = _mm_or_si128(bit_idx, _mm_slli_epi32(bit_idx, 16));
        __m128i bit_shift = _mm_shuffle_epi8(
          _mm_set_epi8(1, 3, 7, 15, 31, 63, 127, -1,
                       1, 3, 7, 15, 31, 63, 127, -1), bit_idx);
        bit_shift = _mm_add_epi16(bit_shift, _mm_set1_epi16(0x0101));
        d0 = _mm_mullo_epi16(d0, bit_shift);
        d0 = _mm_srli_epi16(d0, 8); // we should have 8 bits in the LSB
        d1 = _mm_mullo_epi16(d1, bit_shift);
        d1 = _mm_and_si128(d1, _mm_set1_epi32((si32)0xFF00FF00)); // 8 in MSB
        d0 = _mm_or_si128(d0, d1);

        // find location of e_k and mask
        __m128i shift;
        __m128i ones = _mm_set1_epi32(1);
        __m128i twos = _mm_set1_epi32(2);
        __m128i U_q_m1 = _mm_sub_epi32(U_q, ones);
        U_q_m1 = _mm_and_si128(U_q_m1, _mm_set_epi32(0,0,0,0x1F));
        w0 = _mm_sub_epi32(twos, w0);
        shift = _mm_sll_epi32(w0, U_q_m1); // U_q_m1 must be no more than 31
        ms_vec = _mm_and_si128(d0, _mm_sub_epi32(shift, ones));

        // next e_1
        w0 = _mm_and_si128(flags, _mm_set1_epi32(0x800));
        w0 = _mm_cmpeq_epi32(w0, _mm_setzero_si128());
        w0 = _mm_andnot_si128(w0, shift);      // e_1 in correct position
        ms_vec = _mm_or_si128(ms_vec, w0);     // e_1
        w0 = _mm_slli_epi32(ms_vec, 31);       // sign
        ms_vec = _mm_or_si128(ms_vec, ones);   // bin center
        __m128i tvn = ms_vec;
        ms_vec = _mm_add_epi32(ms_vec, twos);  // + 2
        ms_vec = _mm_slli_epi32(ms_vec, (si32)p - 1);
        ms_vec = _mm_or_si128(ms_vec, w0);     // sign
        row = _mm_andnot_si128(insig, ms_vec); // significant only

        ms_vec = _mm_andnot_si128(insig, tvn); // significant only
        if (N == 0) // the compiler should remove one
          tvn = _mm_shuffle_epi8(ms_vec,
            _mm_set_epi32(-1, -1, 0x0F0E0D0C, 0x07060504));
        else if (N == 1)
          tvn = _mm_shuffle_epi8(ms_vec,
            _mm_set_epi32(-1, 0x0F0E0D0C, 0x07060504, -1));
        else
          assert(0);
        vn = _mm_or_si128(vn, tvn);

        if (total_mn)
          frwd_advance(magsgn, (ui32)total_mn);
      }
      return row;
    }
884
885 //************************************************************************/
    /** Decodes the 8 samples of two quads into 16-bit lanes (used when
     *  the bitplane budget fits in 16 bits).
     *
     *  @param inf_u_q  per-quad flags (e_k, e_1, rho) for both quads
     *  @param U_q      per-quad U values for both quads
     *  @param magsgn   forward reader over the MagSgn stream
     *  @param p        bitplane parameter; samples are scaled by 2^(p-1)
     *  @param vn       in/out accumulator of decoded magnitude info for
     *                  neighboring-quad context (updated via OR)
     *  @return the decoded row of 8 samples (0 for insignificant lanes)
     */
    static inline
    __m128i decode_two_quad16(const __m128i inf_u_q, __m128i U_q,
                              frwd_struct* magsgn, ui32 p, __m128i& vn)
    {
      __m128i w0;    // workers
      __m128i insig; // lanes hold FF's if samples are insignificant
      __m128i flags; // lanes hold e_k, e_1, and rho
      __m128i row;   // decoded row

      row = _mm_setzero_si128();
      // broadcast quad0 flags to the low 4 16-bit lanes, quad1 to the high 4
      w0 = _mm_shuffle_epi8(inf_u_q,
        _mm_set_epi16(0x0504, 0x0504, 0x0504, 0x0504,
                      0x0100, 0x0100, 0x0100, 0x0100));
      // we keeps e_k, e_1, and rho in w2
      flags = _mm_and_si128(w0,
        _mm_set_epi16((si16)0x8880, 0x4440, 0x2220, 0x1110,
                      (si16)0x8880, 0x4440, 0x2220, 0x1110));
      insig = _mm_cmpeq_epi16(flags, _mm_setzero_si128());
      if (_mm_movemask_epi8(insig) != 0xFFFF) //are all insignificant?
      {
        U_q = _mm_shuffle_epi8(U_q,
          _mm_set_epi16(0x0504, 0x0504, 0x0504, 0x0504,
                        0x0100, 0x0100, 0x0100, 0x0100));
        flags = _mm_mullo_epi16(flags, _mm_set_epi16(1,2,4,8,1,2,4,8));
        __m128i ms_vec = frwd_fetch<0xFF>(magsgn);

        // U_q holds U_q for this quad
        // flags has e_k, e_1, and rho such that e_k is sitting in the
        // 0x8000, e_1 in 0x800, and rho in 0x80

        // next e_k and m_n
        __m128i m_n;
        w0 = _mm_srli_epi16(flags, 15); // e_k
        m_n = _mm_sub_epi16(U_q, w0);
        m_n = _mm_andnot_si128(insig, m_n);

        // find cumulative sums
        // to find at which bit in ms_vec the sample starts
        __m128i inc_sum = m_n; // inclusive scan
        inc_sum = _mm_add_epi16(inc_sum, _mm_bslli_si128(inc_sum, 2));
        inc_sum = _mm_add_epi16(inc_sum, _mm_bslli_si128(inc_sum, 4));
        inc_sum = _mm_add_epi16(inc_sum, _mm_bslli_si128(inc_sum, 8));
        int total_mn = _mm_extract_epi16(inc_sum, 7);
        __m128i ex_sum = _mm_bslli_si128(inc_sum, 2); // exclusive scan

        // find the starting byte and starting bit
        __m128i byte_idx = _mm_srli_epi16(ex_sum, 3);
        __m128i bit_idx = _mm_and_si128(ex_sum, _mm_set1_epi16(7));
        byte_idx = _mm_shuffle_epi8(byte_idx,
          _mm_set_epi16(0x0E0E, 0x0C0C, 0x0A0A, 0x0808,
                        0x0606, 0x0404, 0x0202, 0x0000));
        byte_idx = _mm_add_epi16(byte_idx, _mm_set1_epi16(0x0100));
        __m128i d0 = _mm_shuffle_epi8(ms_vec, byte_idx);
        byte_idx = _mm_add_epi16(byte_idx, _mm_set1_epi16(0x0101));
        __m128i d1 = _mm_shuffle_epi8(ms_vec, byte_idx);

        // shift samples values to correct location
        // (multiply realizes a per-lane variable shift; SSSE3 has none)
        __m128i bit_shift = _mm_shuffle_epi8(
          _mm_set_epi8(1, 3, 7, 15, 31, 63, 127, -1,
                       1, 3, 7, 15, 31, 63, 127, -1), bit_idx);
        bit_shift = _mm_add_epi16(bit_shift, _mm_set1_epi16(0x0101));
        d0 = _mm_mullo_epi16(d0, bit_shift);
        d0 = _mm_srli_epi16(d0, 8); // we should have 8 bits in the LSB
        d1 = _mm_mullo_epi16(d1, bit_shift);
        d1 = _mm_and_si128(d1, _mm_set1_epi16((si16)0xFF00)); // 8 in MSB
        d0 = _mm_or_si128(d0, d1);

        // find location of e_k and mask
        // (the two quads need different shift amounts, so the register is
        //  split into low/high halves and shifted separately)
        __m128i shift, t0, t1, Uq0, Uq1;
        __m128i ones = _mm_set1_epi16(1);
        __m128i twos = _mm_set1_epi16(2);
        __m128i U_q_m1 = _mm_sub_epi32(U_q, ones);
        Uq0 = _mm_and_si128(U_q_m1, _mm_set_epi32(0,0,0,0x1F));
        Uq1 = _mm_bsrli_si128(U_q_m1, 14);
        w0 = _mm_sub_epi16(twos, w0);
        t0 = _mm_and_si128(w0, _mm_set_epi64x(0, -1));
        t1 = _mm_and_si128(w0, _mm_set_epi64x(-1, 0));
        t0 = _mm_sll_epi16(t0, Uq0);
        t1 = _mm_sll_epi16(t1, Uq1);
        shift = _mm_or_si128(t0, t1);
        ms_vec = _mm_and_si128(d0, _mm_sub_epi16(shift, ones));

        // next e_1
        w0 = _mm_and_si128(flags, _mm_set1_epi16(0x800));
        w0 = _mm_cmpeq_epi16(w0, _mm_setzero_si128());
        w0 = _mm_andnot_si128(w0, shift);      // e_1 in correct position
        ms_vec = _mm_or_si128(ms_vec, w0);     // e_1
        w0 = _mm_slli_epi16(ms_vec, 15);       // sign
        ms_vec = _mm_or_si128(ms_vec, ones);   // bin center
        __m128i tvn = ms_vec;
        ms_vec = _mm_add_epi16(ms_vec, twos);  // + 2
        ms_vec = _mm_slli_epi16(ms_vec, (si32)p - 1);
        ms_vec = _mm_or_si128(ms_vec, w0);     // sign
        row = _mm_andnot_si128(insig, ms_vec); // significant only

        ms_vec = _mm_andnot_si128(insig, tvn); // significant only
        w0 = _mm_shuffle_epi8(ms_vec,
          _mm_set_epi16(-1, -1, -1, -1, -1, -1, 0x0706, 0x0302));
        vn = _mm_or_si128(vn, w0);
        w0 = _mm_shuffle_epi8(ms_vec,
          _mm_set_epi16(-1, -1, -1, -1, -1, 0x0F0E, 0x0B0A, -1));
        vn = _mm_or_si128(vn, w0);

        if (total_mn)
          frwd_advance(magsgn, (ui32)total_mn);
      }
      return row;
    }
1003
1004
1005 //************************************************************************/
1022 bool ojph_decode_codeblock_ssse3(ui8* coded_data, ui32* decoded_data,
1023 ui32 missing_msbs, ui32 num_passes,
1024 ui32 lengths1, ui32 lengths2,
1025 ui32 width, ui32 height, ui32 stride,
1026 bool stripe_causal)
1027 {
// Decodes one HTJ2K codeblock: the Cleanup pass (MEL + VLC + MagSgn
// segments), then optionally the Significance Propagation and Magnitude
// Refinement passes. Returns false when the codestream is malformed or
// when 32 bits are insufficient for the requested precision.
// NOTE(review): the three static flags below are one-shot warning
// latches shared across all calls (and threads); a race on them would
// only affect whether a warning prints more than once -- TODO confirm
// this is acceptable in multi-threaded decoding.
1028 static bool insufficient_precision = false;
1029 static bool modify_code = false;
1030 static bool truncate_spp_mrp = false;
1031
1032 if (num_passes > 1 && lengths2 == 0)
1033 {
1034 OJPH_WARN(0x00010001, "A malformed codeblock that has more than "
1035 "one coding pass, but zero length for "
1036 "2nd and potential 3rd pass.\n");
1037 num_passes = 1;
1038 }
1039
1040 if (num_passes > 3)
1041 {
1042 OJPH_WARN(0x00010002, "We do not support more than 3 coding passes; "
1043 "This codeblocks has %d passes.\n",
1044 num_passes);
1045 return false;
1046 }
1047
1048 if (missing_msbs > 30) // p < 0
1049 {
1050 if (insufficient_precision == false)
1051 {
1052 insufficient_precision = true;
1053 OJPH_WARN(0x00010003, "32 bits are not enough to decode this "
1054 "codeblock. This message will not be "
1055 "displayed again.\n");
1056 }
1057 return false;
1058 }
1059 else if (missing_msbs == 30) // p == 0
1060 { // not enough precision to decode and set the bin center to 1
1061 if (modify_code == false) {
1062 modify_code = true;
1063 OJPH_WARN(0x00010004, "Not enough precision to decode the cleanup "
1064 "pass. The code can be modified to support "
1065 "this case. This message will not be "
1066 "displayed again.\n");
1067 }
1068 return false; // 32 bits are not enough to decode this
1069 }
1070 else if (missing_msbs == 29) // if p is 1, then num_passes must be 1
1071 {
1072 if (num_passes > 1) {
1073 num_passes = 1;
1074 if (truncate_spp_mrp == false) {
1075 truncate_spp_mrp = true;
1076 OJPH_WARN(0x00010005, "Not enough precision to decode the SgnProp "
1077 "nor MagRef passes; both will be skipped. "
1078 "This message will not be displayed "
1079 "again.\n");
1080 }
1081 }
1082 }
1083 ui32 p = 30 - missing_msbs; // The least significant bitplane for CUP
1084 // There is a way to handle the case of p == 0, but a different path
1085 // is required
1086
1087 if (lengths1 < 2)
1088 {
1089 OJPH_WARN(0x00010006, "Wrong codeblock length.\n");
1090 return false;
1091 }
1092
1093 // read scup and fix the bytes there
1094 int lcup, scup;
1095 lcup = (int)lengths1; // length of CUP
1096 //scup is the length of MEL + VLC
1097 scup = (((int)coded_data[lcup-1]) << 4) + (coded_data[lcup-2] & 0xF);
// scup is encoded in the last 1.5 bytes of the CUP segment; it can never
// be smaller than 2 nor exceed lcup, and 4079 is its maximum encodable
// value.
1098 if (scup < 2 || scup > lcup || scup > 4079) //something is wrong
1099 return false;
1100
1101 // The temporary storage scratch holds two types of data in an
1102 // interleaved fashion. The interleaving allows us to use one
1103 // memory pointer.
1104 // We have one entry for a decoded VLC code, and one entry for UVLC.
1105 // Entries are 16 bits each, corresponding to one quad,
1106 // but since we want to use XMM registers of the SSE family
1107 // of SIMD; we allocated 16 bytes or more per quad row; that is,
1108 // the width is no smaller than 16 bytes (or 8 entries), and the
1109 // height is 512 quads
1110 // Each VLC entry contains, in the following order, starting
1111 // from MSB
1112 // e_k (4bits), e_1 (4bits), rho (4bits), useless for step 2 (4bits)
1113 // Each entry in UVLC contains u_q
1114 // One extra row to handle the case of SPP propagating downwards
1115 // when codeblock width is 4
1116 ui16 scratch[8 * 513] = {0}; // 8+ kB
1117
1118 // We need an extra two entries (one inf and one u_q) beyond
1119 // the last column.
1120 // If the block width is 4 (2 quads), then we use sstr of 8
1121 // (enough for 4 quads). If width is 8 (4 quads) we use
1122 // sstr is 16 (enough for 8 quads). For a width of 16 (8
1123 // quads), we use 24 (enough for 12 quads).
1124 ui32 sstr = ((width + 2u) + 7u) & ~7u; // multiples of 8
1125
1126 assert((stride & 0x3) == 0);
1127
1128 ui32 mmsbp2 = missing_msbs + 2;
1129
1130 // The cleanup pass is decoded in two steps; in step one,
1131 // the VLC and MEL segments are decoded, generating a record that
1132 // has 2 bytes per quad. The 2 bytes contain, u, rho, e^1 & e^k.
1133 // This information should be sufficient for the next step.
1134 // In step 2, we decode the MagSgn segment.
1135
1136 // step 1 decoding VLC and MEL segments
1137 {
1138 // init structures
1139 dec_mel_st mel;
1140 mel_init(&mel, coded_data, lcup, scup);
1141 rev_struct vlc;
1142 rev_init(&vlc, coded_data, lcup, scup);
1143
1144 int run = mel_get_run(&mel); // decode runs of events from MEL bitstrm
1145 // data represented as runs of 0 events
1146 // See mel_decode description
1147
1148 ui32 vlc_val;
1149 ui32 c_q = 0;
1150 ui16 *sp = scratch;
1151 //initial quad row
1152 for (ui32 x = 0; x < width; sp += 4)
1153 {
1154 // decode VLC
1156
1157 // first quad
1158 vlc_val = rev_fetch(&vlc);
1159
1160 //decode VLC using the context c_q and the head of VLC bitstream
1161 ui16 t0 = vlc_tbl0[ c_q + (vlc_val & 0x7F) ];
1162
1163 // if context is zero, use one MEL event
1164 if (c_q == 0) //zero context
1165 {
1166 run -= 2; //subtract 2, since events number is multiplied by 2
1167
1168 // Is the run terminated in 1? if so, use decoded VLC code,
1169 // otherwise, discard decoded data, since we will decode again
1170 // using a different context
1171 t0 = (run == -1) ? t0 : 0;
1172
1173 // is run -1 or -2? this means a run has been consumed
1174 if (run < 0)
1175 run = mel_get_run(&mel); // get another run
1176 }
1177 //run -= (c_q == 0) ? 2 : 0;
1178 //t0 = (c_q != 0 || run == -1) ? t0 : 0;
1179 //if (run < 0)
1180 // run = mel_get_run(&mel); // get another run
1181 sp[0] = t0;
1182 x += 2;
1183
1184 // prepare context for the next quad; eqn. 1 in ITU T.814
1185 c_q = ((t0 & 0x10U) << 3) | ((t0 & 0xE0U) << 2);
1186
1187 //remove data from vlc stream (0 bits are removed if vlc is not used)
1188 vlc_val = rev_advance(&vlc, t0 & 0x7);
1189
1190 //second quad
1191 ui16 t1 = 0;
1192
1193 //decode VLC using the context c_q and the head of VLC bitstream
1194 t1 = vlc_tbl0[c_q + (vlc_val & 0x7F)];
1195
1196 // if context is zero, use one MEL event
1197 if (c_q == 0 && x < width) //zero context
1198 {
1199 run -= 2; //subtract 2, since events number is multiplied by 2
1200
1201 // if event is 0, discard decoded t1
1202 t1 = (run == -1) ? t1 : 0;
1203
1204 if (run < 0) // have we consumed all events in a run
1205 run = mel_get_run(&mel); // if yes, then get another run
1206 }
1207 t1 = x < width ? t1 : 0;
1208 //run -= (c_q == 0 && x < width) ? 2 : 0;
1209 //t1 = (c_q != 0 || run == -1) ? t1 : 0;
1210 //if (run < 0)
1211 // run = mel_get_run(&mel); // get another run
1212 sp[2] = t1;
1213 x += 2;
1214
1215 //prepare context for the next quad, eqn. 1 in ITU T.814
1216 c_q = ((t1 & 0x10U) << 3) | ((t1 & 0xE0U) << 2);
1217
1218 //remove data from vlc stream, if qinf is not used, cwdlen is 0
1219 vlc_val = rev_advance(&vlc, t1 & 0x7);
1220
1221 // decode u
1223 // uvlc_mode is made up of u_offset bits from the quad pair
1224 ui32 uvlc_mode = ((t0 & 0x8U) << 3) | ((t1 & 0x8U) << 4);
1225 if (uvlc_mode == 0xc0)// if both u_offset are set, get an event from
1226 { // the MEL run of events
1227 run -= 2; //subtract 2, since events number is multiplied by 2
1228
1229 uvlc_mode += (run == -1) ? 0x40 : 0; // increment uvlc_mode by
1230 // 0x40 when the event is 1
1231
1232 if (run < 0)//if run is consumed (run is -1 or -2), get another run
1233 run = mel_get_run(&mel);
1234 }
1235 //run -= (uvlc_mode == 0xc0) ? 2 : 0;
1236 //uvlc_mode += (uvlc_mode == 0xc0 && run == -1) ? 0x40 : 0;
1237 //if (run < 0)
1238 // run = mel_get_run(&mel); // get another run
1239
1240 //decode uvlc_mode to get u for both quads
1241 ui32 uvlc_entry = uvlc_tbl0[uvlc_mode + (vlc_val & 0x3F)];
1242 //remove total prefix length
1243 vlc_val = rev_advance(&vlc, uvlc_entry & 0x7);
1244 uvlc_entry >>= 3;
1245 //extract suffixes for quad 0 and 1
1246 ui32 len = uvlc_entry & 0xF; //suffix length for 2 quads
1247 ui32 tmp = vlc_val & ((1 << len) - 1); //suffix value for 2 quads
1248 vlc_val = rev_advance(&vlc, len);
1249 uvlc_entry >>= 4;
1250 // quad 0 length
1251 len = uvlc_entry & 0x7; // quad 0 suffix length
1252 uvlc_entry >>= 3;
1253 ui16 u_q = (ui16)(1 + (uvlc_entry&7) + (tmp&~(0xFFU<<len))); //kap. 1
1254 sp[1] = u_q;
1255 u_q = (ui16)(1 + (uvlc_entry >> 3) + (tmp >> len)); //kappa == 1
1256 sp[3] = u_q;
1257 }
1258 sp[0] = sp[1] = 0;
1259
1260 //non initial quad rows
1261 for (ui32 y = 2; y < height; y += 2)
1262 {
1263 c_q = 0; // context
1264 ui16 *sp = scratch + (y >> 1) * sstr; // this row of quads
1265
1266 for (ui32 x = 0; x < width; sp += 4)
1267 {
1268 // decode VLC
1270
1271 // sigma_q (n, ne, nf)
1272 c_q |= ((sp[0 - (si32)sstr] & 0xA0U) << 2);
1273 c_q |= ((sp[2 - (si32)sstr] & 0x20U) << 4);
1274
1275 // first quad
1276 vlc_val = rev_fetch(&vlc);
1277
1278 //decode VLC using the context c_q and the head of VLC bitstream
1279 ui16 t0 = vlc_tbl1[ c_q + (vlc_val & 0x7F) ];
1280
1281 // if context is zero, use one MEL event
1282 if (c_q == 0) //zero context
1283 {
1284 run -= 2; //subtract 2, since events number is multiplied by 2
1285
1286 // Is the run terminated in 1? if so, use decoded VLC code,
1287 // otherwise, discard decoded data, since we will decode again
1288 // using a different context
1289 t0 = (run == -1) ? t0 : 0;
1290
1291 // is run -1 or -2? this means a run has been consumed
1292 if (run < 0)
1293 run = mel_get_run(&mel); // get another run
1294 }
1295 //run -= (c_q == 0) ? 2 : 0;
1296 //t0 = (c_q != 0 || run == -1) ? t0 : 0;
1297 //if (run < 0)
1298 // run = mel_get_run(&mel); // get another run
1299 sp[0] = t0;
1300 x += 2;
1301
1302 // prepare context for the next quad; eqn. 2 in ITU T.814
1303 // sigma_q (w, sw)
1304 c_q = ((t0 & 0x40U) << 2) | ((t0 & 0x80U) << 1);
1305 // sigma_q (nw)
1306 c_q |= sp[0 - (si32)sstr] & 0x80;
1307 // sigma_q (n, ne, nf)
1308 c_q |= ((sp[2 - (si32)sstr] & 0xA0U) << 2);
1309 c_q |= ((sp[4 - (si32)sstr] & 0x20U) << 4);
1310
1311 //remove data from vlc stream (0 bits are removed if vlc is unused)
1312 vlc_val = rev_advance(&vlc, t0 & 0x7);
1313
1314 //second quad
1315 ui16 t1 = 0;
1316
1317 //decode VLC using the context c_q and the head of VLC bitstream
1318 t1 = vlc_tbl1[ c_q + (vlc_val & 0x7F)];
1319
1320 // if context is zero, use one MEL event
1321 if (c_q == 0 && x < width) //zero context
1322 {
1323 run -= 2; //subtract 2, since events number is multiplied by 2
1324
1325 // if event is 0, discard decoded t1
1326 t1 = (run == -1) ? t1 : 0;
1327
1328 if (run < 0) // have we consumed all events in a run
1329 run = mel_get_run(&mel); // if yes, then get another run
1330 }
1331 t1 = x < width ? t1 : 0;
1332 //run -= (c_q == 0 && x < width) ? 2 : 0;
1333 //t1 = (c_q != 0 || run == -1) ? t1 : 0;
1334 //if (run < 0)
1335 // run = mel_get_run(&mel); // get another run
1336 sp[2] = t1;
1337 x += 2;
1338
1339 // partial c_q, will be completed when we process the next quad
1340 // sigma_q (w, sw)
1341 c_q = ((t1 & 0x40U) << 2) | ((t1 & 0x80U) << 1);
1342 // sigma_q (nw)
1343 c_q |= sp[2 - (si32)sstr] & 0x80;
1344
1345 //remove data from vlc stream, if qinf is not used, cwdlen is 0
1346 vlc_val = rev_advance(&vlc, t1 & 0x7);
1347
1348 // decode u
1350 // uvlc_mode is made up of u_offset bits from the quad pair
1351 ui32 uvlc_mode = ((t0 & 0x8U) << 3) | ((t1 & 0x8U) << 4);
1352 ui32 uvlc_entry = uvlc_tbl1[uvlc_mode + (vlc_val & 0x3F)];
1353 //remove total prefix length
1354 vlc_val = rev_advance(&vlc, uvlc_entry & 0x7);
1355 uvlc_entry >>= 3;
1356 //extract suffixes for quad 0 and 1
1357 ui32 len = uvlc_entry & 0xF; //suffix length for 2 quads
1358 ui32 tmp = vlc_val & ((1 << len) - 1); //suffix value for 2 quads
1359 vlc_val = rev_advance(&vlc, len);
1360 uvlc_entry >>= 4;
1361 // quad 0 length
1362 len = uvlc_entry & 0x7; // quad 0 suffix length
1363 uvlc_entry >>= 3;
1364 ui16 u_q = (ui16)((uvlc_entry & 7) + (tmp & ~(0xFU << len))); //u_q
1365 sp[1] = u_q;
1366 u_q = (ui16)((uvlc_entry >> 3) + (tmp >> len)); // u_q
1367 sp[3] = u_q;
1368 }
1369 sp[0] = sp[1] = 0;
1370 }
1371 }
1372
1373 // step2 we decode magsgn
1374 // mmsbp2 equals K_max + 1 (we decode up to K_max bits + 1 sign bit)
1375 // The 32 bit path decode 16 bits data, for which one would think
1376 // 16 bits are enough, because we want to put in the center of the
1377 // bin.
1378 // If you have mmsbp2 equals 16 bit, and reversible coding, and
1379 // no bitplanes are missing, then we can decode using the 16 bit
1380 // path, but we are not doing this here.
1381 if (mmsbp2 >= 16)
1382 {
1383 // We allocate a scratch row for storing v_n values.
1384 // We have 512 quads horizontally.
1385 // We may go beyond the last entry by up to 4 entries.
1386 // Here we allocate additional 8 entries.
1387 // There are two rows in this structure, the bottom
1388 // row is used to store processed entries.
1389 const int v_n_size = 512 + 8;
1390 ui32 v_n_scratch[2 * v_n_size] = {0}; // 4+ kB
1391
1392 frwd_struct magsgn;
1393 frwd_init<0xFF>(&magsgn, coded_data, lcup - scup);
1394
1395 {
1396 ui16 *sp = scratch;
1397 ui32 *vp = v_n_scratch;
1398 ui32 *dp = decoded_data;
1399 vp[0] = 2; // for easy calculation of emax
1400
1401 for (ui32 x = 0; x < width; x += 4, sp += 4, vp += 2, dp += 4)
1402 {
1403 //here we process two quads
1404 __m128i w0, w1; // workers
1405 __m128i inf_u_q, U_q;
1406 // determine U_q
1407 {
1408 inf_u_q = _mm_loadu_si128((__m128i*)sp);
1409 U_q = _mm_srli_epi32(inf_u_q, 16);
1410
1411 w0 = _mm_cmpgt_epi32(U_q, _mm_set1_epi32((int)mmsbp2));
1412 int i = _mm_movemask_epi8(w0);
1413 if (i & 0xFF) // only the lower two U_q
1414 return false;
1415 }
1416
1417 __m128i vn = _mm_set1_epi32(2);
1418 __m128i row0 = decode_one_quad32<0>(inf_u_q, U_q, &magsgn, p, vn);
1419 __m128i row1 = decode_one_quad32<1>(inf_u_q, U_q, &magsgn, p, vn);
1420 w0 = _mm_loadu_si128((__m128i*)vp);
1421 w0 = _mm_and_si128(w0, _mm_set_epi32(0,0,0,-1));
1422 w0 = _mm_or_si128(w0, vn);
1423 _mm_storeu_si128((__m128i*)vp, w0);
1424
1425 //interleave in ssse3 style
1426 w0 = _mm_unpacklo_epi32(row0, row1);
1427 w1 = _mm_unpackhi_epi32(row0, row1);
1428 row0 = _mm_unpacklo_epi32(w0, w1);
1429 row1 = _mm_unpackhi_epi32(w0, w1);
1430 _mm_store_si128((__m128i*)dp, row0);
1431 _mm_store_si128((__m128i*)(dp + stride), row1);
1432 }
1433 }
1434
1435 for (ui32 y = 2; y < height; y += 2)
1436 {
1437 {
1438 // perform 31 - count_leading_zeros(*vp) here
// The two nibble LUTs below implement a per-byte count-leading-zeros
// via _mm_shuffle_epi8; min_epu8 folding then reduces across bytes.
1439 ui32 *vp = v_n_scratch;
1440 const __m128i lut_lo = _mm_set_epi8(
1441 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 7, 31
1442 );
1443 const __m128i lut_hi = _mm_set_epi8(
1444 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 31
1445 );
1446 const __m128i nibble_mask = _mm_set1_epi8(0x0F);
1447 const __m128i byte_offset8 = _mm_set1_epi16(8);
1448 const __m128i byte_offset16 = _mm_set1_epi16(16);
1449 const __m128i cc = _mm_set1_epi32(31);
1450 for (ui32 x = 0; x <= width; x += 8, vp += 4)
1451 {
1452 __m128i v, t; // workers
1453 v = _mm_loadu_si128((__m128i*)vp);
1454
1455 t = _mm_and_si128(nibble_mask, v);
1456 v = _mm_and_si128(_mm_srli_epi16(v, 4), nibble_mask);
1457 t = _mm_shuffle_epi8(lut_lo, t);
1458 v = _mm_shuffle_epi8(lut_hi, v);
1459 v = _mm_min_epu8(v, t);
1460
1461 t = _mm_srli_epi16(v, 8);
1462 v = _mm_or_si128(v, byte_offset8);
1463 v = _mm_min_epu8(v, t);
1464
1465 t = _mm_srli_epi32(v, 16);
1466 v = _mm_or_si128(v, byte_offset16);
1467 v = _mm_min_epu8(v, t);
1468
1469 v = _mm_sub_epi16(cc, v);
1470 _mm_storeu_si128((__m128i*)(vp + v_n_size), v);
1471 }
1472 }
1473
1474 ui32 *vp = v_n_scratch;
1475 ui16 *sp = scratch + (y >> 1) * sstr;
1476 ui32 *dp = decoded_data + y * stride;
1477 vp[0] = 2; // for easy calculation of emax
1478
1479 for (ui32 x = 0; x < width; x += 4, sp += 4, vp += 2, dp += 4)
1480 {
1481 //process two quads
1482 __m128i w0, w1; // workers
1483 __m128i inf_u_q, U_q;
1484 // determine U_q
1485 {
1486 __m128i gamma, emax, kappa, u_q; // needed locally
1487
1488 inf_u_q = _mm_loadu_si128((__m128i*)sp);
1489 gamma = _mm_and_si128(inf_u_q, _mm_set1_epi32(0xF0));
1490 w0 = _mm_sub_epi32(gamma, _mm_set1_epi32(1));
1491 gamma = _mm_and_si128(gamma, w0);
1492 gamma = _mm_cmpeq_epi32(gamma, _mm_setzero_si128());
1493
1494 emax = _mm_loadu_si128((__m128i*)(vp + v_n_size));
1495 w0 = _mm_bsrli_si128(emax, 4);
1496 emax = _mm_max_epi16(w0, emax); // no max_epi32 in ssse3
1497 emax = _mm_andnot_si128(gamma, emax);
1498
1499 kappa = _mm_set1_epi32(1);
1500 kappa = _mm_max_epi16(emax, kappa); // no max_epi32 in ssse3
1501
1502 u_q = _mm_srli_epi32(inf_u_q, 16);
1503 U_q = _mm_add_epi32(u_q, kappa);
1504
1505 w0 = _mm_cmpgt_epi32(U_q, _mm_set1_epi32((int)mmsbp2));
1506 int i = _mm_movemask_epi8(w0);
1507 if (i & 0xFF) // only the lower two U_q
1508 return false;
1509 }
1510
1511 __m128i vn = _mm_set1_epi32(2);
1512 __m128i row0 = decode_one_quad32<0>(inf_u_q, U_q, &magsgn, p, vn);
1513 __m128i row1 = decode_one_quad32<1>(inf_u_q, U_q, &magsgn, p, vn);
1514 w0 = _mm_loadu_si128((__m128i*)vp);
1515 w0 = _mm_and_si128(w0, _mm_set_epi32(0,0,0,-1));
1516 w0 = _mm_or_si128(w0, vn);
1517 _mm_storeu_si128((__m128i*)vp, w0);
1518
1519 //interleave in ssse3 style
1520 w0 = _mm_unpacklo_epi32(row0, row1);
1521 w1 = _mm_unpackhi_epi32(row0, row1);
1522 row0 = _mm_unpacklo_epi32(w0, w1);
1523 row1 = _mm_unpackhi_epi32(w0, w1);
1524 _mm_store_si128((__m128i*)dp, row0);
1525 _mm_store_si128((__m128i*)(dp + stride), row1);
1526 }
1527 }
1528 }
1529 else
1530 {
1531 // reduce bitplane by 16 because we now have 16 bits instead of 32
1532 p -= 16;
1533
1534 // We allocate a scratch row for storing v_n values.
1535 // We have 512 quads horizontally.
1536 // We may go beyond the last entry by up to 8 entries.
1537 // Therefore we allocate additional 8 entries.
1538 // There are two rows in this structure, the bottom
1539 // row is used to store processed entries.
1540 const int v_n_size = 512 + 8;
1541 ui16 v_n_scratch[2 * v_n_size] = {0}; // 2+ kB
1542
1543 frwd_struct magsgn;
1544 frwd_init<0xFF>(&magsgn, coded_data, lcup - scup);
1545
1546 {
1547 ui16 *sp = scratch;
1548 ui16 *vp = v_n_scratch;
1549 ui32 *dp = decoded_data;
1550 vp[0] = 2; // for easy calculation of emax
1551
1552 for (ui32 x = 0; x < width; x += 4, sp += 4, vp += 2, dp += 4)
1553 {
1554 //here we process two quads
1555 __m128i w0, w1; // workers
1556 __m128i inf_u_q, U_q;
1557 // determine U_q
1558 {
1559 inf_u_q = _mm_loadu_si128((__m128i*)sp);
1560 U_q = _mm_srli_epi32(inf_u_q, 16);
1561
1562 w0 = _mm_cmpgt_epi32(U_q, _mm_set1_epi32((int)mmsbp2));
1563 int i = _mm_movemask_epi8(w0);
1564 if (i & 0xFF) // only the lower two U_q
1565 return false;
1566 }
1567
1568 __m128i vn = _mm_set1_epi16(2);
1569 __m128i row = decode_two_quad16(inf_u_q, U_q, &magsgn, p, vn);
1570 w0 = _mm_loadu_si128((__m128i*)vp);
1571 w0 = _mm_and_si128(w0, _mm_set_epi16(0,0,0,0,0,0,0,-1));
1572 w0 = _mm_or_si128(w0, vn);
1573 _mm_storeu_si128((__m128i*)vp, w0);
1574
1575 //interleave in ssse3 style
1576 w0 = _mm_shuffle_epi8(row,
1577 _mm_set_epi16(0x0D0C, -1, 0x0908, -1,
1578 0x0504, -1, 0x0100, -1));
1579 _mm_store_si128((__m128i*)dp, w0);
1580 w1 = _mm_shuffle_epi8(row,
1581 _mm_set_epi16(0x0F0E, -1, 0x0B0A, -1,
1582 0x0706, -1, 0x0302, -1));
1583 _mm_store_si128((__m128i*)(dp + stride), w1);
1584 }
1585 }
1586
1587 for (ui32 y = 2; y < height; y += 2)
1588 {
1589 {
1590 // perform 15 - count_leading_zeros(*vp) here
1591 ui16 *vp = v_n_scratch;
1592 const __m128i lut_lo = _mm_set_epi8(
1593 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 7, 15
1594 );
1595 const __m128i lut_hi = _mm_set_epi8(
1596 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 15
1597 );
1598 const __m128i nibble_mask = _mm_set1_epi8(0x0F);
1599 const __m128i byte_offset8 = _mm_set1_epi16(8);
1600 const __m128i cc = _mm_set1_epi16(15);
1601 for (ui32 x = 0; x <= width; x += 16, vp += 8)
1602 {
1603 __m128i v, t; // workers
1604 v = _mm_loadu_si128((__m128i*)vp);
1605
1606 t = _mm_and_si128(nibble_mask, v);
1607 v = _mm_and_si128(_mm_srli_epi16(v, 4), nibble_mask);
1608 t = _mm_shuffle_epi8(lut_lo, t);
1609 v = _mm_shuffle_epi8(lut_hi, v);
1610 v = _mm_min_epu8(v, t);
1611
1612 t = _mm_srli_epi16(v, 8);
1613 v = _mm_or_si128(v, byte_offset8);
1614 v = _mm_min_epu8(v, t);
1615
1616 v = _mm_sub_epi16(cc, v);
1617 _mm_storeu_si128((__m128i*)(vp + v_n_size), v);
1618 }
1619 }
1620
1621 ui16 *vp = v_n_scratch;
1622 ui16 *sp = scratch + (y >> 1) * sstr;
1623 ui32 *dp = decoded_data + y * stride;
1624 vp[0] = 2; // for easy calculation of emax
1625
1626 for (ui32 x = 0; x < width; x += 4, sp += 4, vp += 2, dp += 4)
1627 {
1628 //process two quads
1629 __m128i w0, w1; // workers
1630 __m128i inf_u_q, U_q;
1631 // determine U_q
1632 {
1633 __m128i gamma, emax, kappa, u_q; // needed locally
1634
1635 inf_u_q = _mm_loadu_si128((__m128i*)sp);
1636 gamma = _mm_and_si128(inf_u_q, _mm_set1_epi32(0xF0));
1637 w0 = _mm_sub_epi32(gamma, _mm_set1_epi32(1));
1638 gamma = _mm_and_si128(gamma, w0);
1639 gamma = _mm_cmpeq_epi32(gamma, _mm_setzero_si128());
1640
1641 emax = _mm_loadu_si128((__m128i*)(vp + v_n_size));
1642 w0 = _mm_bsrli_si128(emax, 2);
1643 emax = _mm_max_epi16(w0, emax); // no max_epi32 in ssse3
1644 emax = _mm_shuffle_epi8(emax,
1645 _mm_set_epi16(-1, 0x0706, -1, 0x0504,
1646 -1, 0x0302, -1, 0x0100));
1647 emax = _mm_andnot_si128(gamma, emax);
1648
1649 kappa = _mm_set1_epi32(1);
1650 kappa = _mm_max_epi16(emax, kappa); // no max_epi32 in ssse3
1651
1652 u_q = _mm_srli_epi32(inf_u_q, 16);
1653 U_q = _mm_add_epi32(u_q, kappa);
1654
1655 w0 = _mm_cmpgt_epi32(U_q, _mm_set1_epi32((int)mmsbp2));
1656 int i = _mm_movemask_epi8(w0);
1657 if (i & 0xFF) // only the lower two U_q
1658 return false;
1659 }
1660
1661 __m128i vn = _mm_set1_epi16(2);
1662 __m128i row = decode_two_quad16(inf_u_q, U_q, &magsgn, p, vn);
1663 w0 = _mm_loadu_si128((__m128i*)vp);
1664 w0 = _mm_and_si128(w0, _mm_set_epi16(0,0,0,0,0,0,0,-1));
1665 w0 = _mm_or_si128(w0, vn);
1666 _mm_storeu_si128((__m128i*)vp, w0);
1667
1668 w0 = _mm_shuffle_epi8(row,
1669 _mm_set_epi16(0x0D0C, -1, 0x0908, -1,
1670 0x0504, -1, 0x0100, -1));
1671 _mm_store_si128((__m128i*)dp, w0);
1672 w1 = _mm_shuffle_epi8(row,
1673 _mm_set_epi16(0x0F0E, -1, 0x0B0A, -1,
1674 0x0706, -1, 0x0302, -1));
1675 _mm_store_si128((__m128i*)(dp + stride), w1);
1676 }
1677 }
1678
1679 // increase bitplane back by 16 because we need to process 32 bits
1680 p += 16;
1681 }
1682
1683 if (num_passes > 1)
1684 {
1685 // We use scratch again, we can divide it into multiple regions
1686 // sigma holds all the significant samples, and it cannot
1687 // be modified after it is set. it will be used during the
1688 // Magnitude Refinement Pass
1689 ui16* const sigma = scratch;
1690
1691 ui32 mstr = (width + 3u) >> 2; // divide by 4, since each
1692 // ui16 contains 4 columns
1693 mstr = ((mstr + 2u) + 7u) & ~7u; // multiples of 8
1694
1695 // We re-arrange quad significance, where each 4 consecutive
1696 // bits represent one quad, into column significance, where,
1697 // each 4 consecutive bits represent one column of 4 rows
1698 {
1699 ui32 y;
1700
1701 const __m128i mask_3 = _mm_set1_epi32(0x30);
1702 const __m128i mask_C = _mm_set1_epi32(0xC0);
1703 const __m128i shuffle_mask = _mm_set_epi32(-1, -1, -1, 0x0C080400);
1704 for (y = 0; y < height; y += 4)
1705 {
1706 ui16* sp = scratch + (y >> 1) * sstr;
1707 ui16* dp = sigma + (y >> 2) * mstr;
1708 for (ui32 x = 0; x < width; x += 8, sp += 8, dp += 2)
1709 {
1710 __m128i s0, s1, u3, uC, t0, t1;
1711
1712 s0 = _mm_loadu_si128((__m128i*)(sp));
1713 u3 = _mm_and_si128(s0, mask_3);
1714 u3 = _mm_srli_epi32(u3, 4);
1715 uC = _mm_and_si128(s0, mask_C);
1716 uC = _mm_srli_epi32(uC, 2);
1717 t0 = _mm_or_si128(u3, uC);
1718
1719 s1 = _mm_loadu_si128((__m128i*)(sp + sstr));
1720 u3 = _mm_and_si128(s1, mask_3);
1721 u3 = _mm_srli_epi32(u3, 2);
1722 uC = _mm_and_si128(s1, mask_C);
1723 t1 = _mm_or_si128(u3, uC);
1724
1725 __m128i r = _mm_or_si128(t0, t1);
1726 r = _mm_shuffle_epi8(r, shuffle_mask);
1727
1728 // _mm_storeu_si32 is not defined, so we use this workaround
1729 _mm_store_ss((float*)dp, _mm_castsi128_ps(r));
1730 }
1731 dp[0] = 0; // set an extra entry on the right with 0
1732 }
1733 {
1734 // reset one row after the codeblock
1735 ui16* dp = sigma + (y >> 2) * mstr;
1736 __m128i zero = _mm_setzero_si128();
1737 for (ui32 x = 0; x < width; x += 32, dp += 8)
1738 _mm_store_si128((__m128i*)dp, zero);
1739 dp[0] = 0; // set an extra entry on the right with 0
1740 }
1741 }
1742
1743 // We perform Significance Propagation Pass here
1744 {
1745 // This stores significance information of the previous
1746 // 4 rows. Significance information in this array includes
1747 // all signicant samples in bitplane p - 1; that is,
1748 // significant samples for bitplane p (discovered during the
1749 // cleanup pass and stored in sigma) and samples that have recently
1750 // became significant (during the SPP) in bitplane p-1.
1751 // We store enough for the widest row, containing 1024 columns,
1752 // which is equivalent to 256 of ui16, since each stores 4 columns.
1753 // We add an extra 8 entries, just in case we need more
1754 ui16 prev_row_sig[256 + 8] = {0}; // 528 Bytes
1755
1756 frwd_struct sigprop;
1757 frwd_init<0>(&sigprop, coded_data + lengths1, (int)lengths2);
1758
1759 for (ui32 y = 0; y < height; y += 4)
1760 {
1761 ui32 pattern = 0xFFFFu; // a pattern marking needed samples
1762 if (height - y < 4) {
1763 pattern = 0x7777u;
1764 if (height - y < 3) {
1765 pattern = 0x3333u;
1766 if (height - y < 2)
1767 pattern = 0x1111u;
1768 }
1769 }
1770
1771 // prev holds sign. info. for the previous quad, together
1772 // with the rows on top of it and below it.
1773 ui32 prev = 0;
1774 ui16 *prev_sig = prev_row_sig;
1775 ui16 *cur_sig = sigma + (y >> 2) * mstr;
1776 ui32 *dpp = decoded_data + y * stride;
1777 for (ui32 x = 0; x < width; x += 4, dpp += 4, ++cur_sig, ++prev_sig)
1778 {
1779 // only rows and columns inside the stripe are included
1780 si32 s = (si32)x + 4 - (si32)width;
1781 s = ojph_max(s, 0);
1782 pattern = pattern >> (s * 4);
1783
1784 // We first find locations that need to be tested (potential
1785 // SPP members); these location will end up in mbr
1786 // In each iteration, we produce 16 bits because cwd can have
1787 // up to 16 bits of significance information, followed by the
1788 // corresponding 16 bits of sign information; therefore, it is
1789 // sufficient to fetch 32 bit data per loop.
1790
1791 // Although we are interested in 16 bits only, we load 32 bits.
1792 // For the 16 bits we are producing, we need the next 4 bits --
1793 // We need data for at least 5 columns out of 8.
1794 // Therefore loading 32 bits is easier than loading 16 bits
1795 // twice.
1796 ui32 ps = *(ui32*)prev_sig;
1797 ui32 ns = *(ui32*)(cur_sig + mstr);
1798 ui32 u = (ps & 0x88888888) >> 3; // the row on top
1799 if (!stripe_causal)
1800 u |= (ns & 0x11111111) << 3; // the row below
1801
1802 ui32 cs = *(ui32*)cur_sig;
1803 // vertical integration
1804 ui32 mbr = cs; // this sig. info.
1805 mbr |= (cs & 0x77777777) << 1; //above neighbors
1806 mbr |= (cs & 0xEEEEEEEE) >> 1; //below neighbors
1807 mbr |= u;
1808 // horizontal integration
1809 ui32 t = mbr;
1810 mbr |= t << 4; // neighbors on the left
1811 mbr |= t >> 4; // neighbors on the right
1812 mbr |= prev >> 12; // significance of previous group
1813
1814 // remove outside samples, and already significant samples
1815 mbr &= pattern;
1816 mbr &= ~cs;
1817
1818 // find samples that become significant during the SPP
1819 ui32 new_sig = mbr;
1820 if (new_sig)
1821 {
1822 __m128i cwd_vec = frwd_fetch<0>(&sigprop);
1823 ui32 cwd = (ui32)_mm_extract_epi16(cwd_vec, 0);
1824
1825 ui32 cnt = 0;
1826 ui32 col_mask = 0xFu;
1827 ui32 inv_sig = ~cs & pattern;
1828 for (int i = 0; i < 16; i += 4, col_mask <<= 4)
1829 {
1830 if ((col_mask & new_sig) == 0)
1831 continue;
1832
1833 //scan one column
1834 ui32 sample_mask = 0x1111u & col_mask;
1835 if (new_sig & sample_mask)
1836 {
1837 new_sig &= ~sample_mask;
1838 if (cwd & 1)
1839 {
1840 ui32 t = 0x33u << i;
1841 new_sig |= t & inv_sig;
1842 }
1843 cwd >>= 1; ++cnt;
1844 }
1845
1846 sample_mask <<= 1;
1847 if (new_sig & sample_mask)
1848 {
1849 new_sig &= ~sample_mask;
1850 if (cwd & 1)
1851 {
1852 ui32 t = 0x76u << i;
1853 new_sig |= t & inv_sig;
1854 }
1855 cwd >>= 1; ++cnt;
1856 }
1857
1858 sample_mask <<= 1;
1859 if (new_sig & sample_mask)
1860 {
1861 new_sig &= ~sample_mask;
1862 if (cwd & 1)
1863 {
1864 ui32 t = 0xECu << i;
1865 new_sig |= t & inv_sig;
1866 }
1867 cwd >>= 1; ++cnt;
1868 }
1869
1870 sample_mask <<= 1;
1871 if (new_sig & sample_mask)
1872 {
1873 new_sig &= ~sample_mask;
1874 if (cwd & 1)
1875 {
1876 ui32 t = 0xC8u << i;
1877 new_sig |= t & inv_sig;
1878 }
1879 cwd >>= 1; ++cnt;
1880 }
1881 }
1882
1883 if (new_sig)
1884 {
1885 cwd |= (ui32)_mm_extract_epi16(cwd_vec, 1) << (16 - cnt);
1886
1887 // Spread new_sig, such that each bit is in one byte with a
1888 // value of 0 if new_sig bit is 0, and 0xFF if new_sig is 1
1889 __m128i new_sig_vec = _mm_set1_epi16((si16)new_sig);
1890 new_sig_vec = _mm_shuffle_epi8(new_sig_vec,
1891 _mm_set_epi8(1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0));
1892 new_sig_vec = _mm_and_si128(new_sig_vec,
1893 _mm_set1_epi64x((si64)0x8040201008040201));
1894 new_sig_vec = _mm_cmpeq_epi8(new_sig_vec,
1895 _mm_set1_epi64x((si64)0x8040201008040201));
1896
1897 // find cumulative sums
1898 // to find which bit in cwd we should extract
1899 __m128i inc_sum = new_sig_vec; // inclusive scan
1900 inc_sum = _mm_abs_epi8(inc_sum); // cvrt to 0 or 1
1901 inc_sum = _mm_add_epi8(inc_sum, _mm_bslli_si128(inc_sum, 1));
1902 inc_sum = _mm_add_epi8(inc_sum, _mm_bslli_si128(inc_sum, 2));
1903 inc_sum = _mm_add_epi8(inc_sum, _mm_bslli_si128(inc_sum, 4));
1904 inc_sum = _mm_add_epi8(inc_sum, _mm_bslli_si128(inc_sum, 8));
1905 cnt += (ui32)_mm_extract_epi16(inc_sum, 7) >> 8;
1906 // exclusive scan
1907 __m128i ex_sum = _mm_bslli_si128(inc_sum, 1);
1908
1909 // Spread cwd, such that each bit is in one byte
1910 // with a value of 0 or 1.
1911 cwd_vec = _mm_set1_epi16((si16)cwd);
1912 cwd_vec = _mm_shuffle_epi8(cwd_vec,
1913 _mm_set_epi8(1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0));
1914 cwd_vec = _mm_and_si128(cwd_vec,
1915 _mm_set1_epi64x((si64)0x8040201008040201));
1916 cwd_vec = _mm_cmpeq_epi8(cwd_vec,
1917 _mm_set1_epi64x((si64)0x8040201008040201));
1918 cwd_vec = _mm_abs_epi8(cwd_vec);
1919
1920 // Obtain bit from cwd_vec corresponding to ex_sum
1921 // Basically, collect needed bits from cwd_vec
1922 __m128i v = _mm_shuffle_epi8(cwd_vec, ex_sum);
1923
1924 // load data and set spp coefficients
1925 __m128i m =
1926 _mm_set_epi8(-1,-1,-1,12,-1,-1,-1,8,-1,-1,-1,4,-1,-1,-1,0);
1927 __m128i val = _mm_set1_epi32(3 << (p - 2));
1928 ui32 *dp = dpp;
1929 for (int c = 0; c < 4; ++ c) {
1930 __m128i s0, s0_ns, s0_val;
1931 // load coefficients
1932 s0 = _mm_load_si128((__m128i*)dp);
1933
1934 // epi32 is -1 only for coefficient that
1935 // are changed during the SPP
1936 s0_ns = _mm_shuffle_epi8(new_sig_vec, m);
1937 s0_ns = _mm_cmpeq_epi32(s0_ns, _mm_set1_epi32(0xFF));
1938
1939 // obtain sign for coefficients in SPP
1940 s0_val = _mm_shuffle_epi8(v, m);
1941 s0_val = _mm_slli_epi32(s0_val, 31);
1942 s0_val = _mm_or_si128(s0_val, val);
1943 s0_val = _mm_and_si128(s0_val, s0_ns);
1944
1945 // update vector
1946 s0 = _mm_or_si128(s0, s0_val);
1947 // store coefficients
1948 _mm_store_si128((__m128i*)dp, s0);
1949 // prepare for next row
1950 dp += stride;
1951 m = _mm_add_epi32(m, _mm_set1_epi32(1));
1952 }
1953 }
1954 frwd_advance(&sigprop, cnt);
1955 }
1956
1957 new_sig |= cs;
1958 *prev_sig = (ui16)(new_sig);
1959
1960 // vertical integration for the new sig. info.
1961 t = new_sig;
1962 new_sig |= (t & 0x7777) << 1; //above neighbors
1963 new_sig |= (t & 0xEEEE) >> 1; //below neighbors
1964 // add sig. info. from the row on top and below
1965 prev = new_sig | u;
1966 // we need only the bits in 0xF000
1967 prev &= 0xF000;
1968 }
1969 }
1970 }
1971
1972 // We perform Magnitude Refinement Pass here
1973 if (num_passes > 2)
1974 {
1975 rev_struct magref;
1976 rev_init_mrp(&magref, coded_data, (int)lengths1, (int)lengths2);
1977
1978 for (ui32 y = 0; y < height; y += 4)
1979 {
1980 ui16 *cur_sig = sigma + (y >> 2) * mstr;
1981 ui32 *dpp = decoded_data + y * stride;
1982 for (ui32 i = 0; i < width; i += 4, dpp += 4)
1983 {
1984 //Process one entry from sigma array at a time
1985 // Each nibble (4 bits) in the sigma array represents 4 rows,
1986 ui32 cwd = rev_fetch_mrp(&magref); // get 32 bit data
1987 ui16 sig = *cur_sig++; // 16 bit that will be processed now
1988 int total_bits = 0;
1989 if (sig) // if any of the 32 bits are set
1990 {
1991 // We work on 4 rows, with 4 samples each, since
1992 // data is 32 bit (4 bytes)
1993
1994 // spread the 16 bits in sig to 0 or 1 bytes in sig_vec
1995 __m128i sig_vec = _mm_set1_epi16((si16)sig);
1996 sig_vec = _mm_shuffle_epi8(sig_vec,
1997 _mm_set_epi8(1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0));
1998 sig_vec = _mm_and_si128(sig_vec,
1999 _mm_set1_epi64x((si64)0x8040201008040201));
2000 sig_vec = _mm_cmpeq_epi8(sig_vec,
2001 _mm_set1_epi64x((si64)0x8040201008040201));
2002 sig_vec = _mm_abs_epi8(sig_vec);
2003
2004 // find cumulative sums
2005 // to find which bit in cwd we should extract
2006 __m128i inc_sum = sig_vec; // inclusive scan
2007 inc_sum = _mm_add_epi8(inc_sum, _mm_bslli_si128(inc_sum, 1));
2008 inc_sum = _mm_add_epi8(inc_sum, _mm_bslli_si128(inc_sum, 2));
2009 inc_sum = _mm_add_epi8(inc_sum, _mm_bslli_si128(inc_sum, 4));
2010 inc_sum = _mm_add_epi8(inc_sum, _mm_bslli_si128(inc_sum, 8));
2011 total_bits = _mm_extract_epi16(inc_sum, 7) >> 8;
2012 __m128i ex_sum = _mm_bslli_si128(inc_sum, 1); // exclusive scan
2013
2014 // Spread the 16 bits in cwd to inverted 0 or 1 bytes in
2015 // cwd_vec. Then, convert these to a form suitable
2016 // for coefficient modifications; in particular, a value
2017 // of 0 is presented as binary 11, and a value of 1 is
2018 // represented as binary 01
2019 __m128i cwd_vec = _mm_set1_epi16((si16)cwd);
2020 cwd_vec = _mm_shuffle_epi8(cwd_vec,
2021 _mm_set_epi8(1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0));
2022 cwd_vec = _mm_and_si128(cwd_vec,
2023 _mm_set1_epi64x((si64)0x8040201008040201));
2024 cwd_vec = _mm_cmpeq_epi8(cwd_vec,
2025 _mm_set1_epi64x((si64)0x8040201008040201));
2026 cwd_vec = _mm_add_epi8(cwd_vec, _mm_set1_epi8(1));
2027 cwd_vec = _mm_add_epi8(cwd_vec, cwd_vec);
2028 cwd_vec = _mm_or_si128(cwd_vec, _mm_set1_epi8(1));
2029
2030 // load data and insert the mrp bit
2031 __m128i m =
2032 _mm_set_epi8(-1,-1,-1,12,-1,-1,-1,8,-1,-1,-1,4,-1,-1,-1,0);
2033 ui32 *dp = dpp;
2034 for (int c = 0; c < 4; ++c) {
2035 __m128i s0, s0_sig, s0_idx, s0_val;
2036 // load coefficients
2037 s0 = _mm_load_si128((__m128i*)dp);
2038 // find significant samples in this row
2039 s0_sig = _mm_shuffle_epi8(sig_vec, m);
2040 s0_sig = _mm_cmpeq_epi8(s0_sig, _mm_setzero_si128());
2041 // get MRP bit index, and MRP pattern
2042 s0_idx = _mm_shuffle_epi8(ex_sum, m);
2043 s0_val = _mm_shuffle_epi8(cwd_vec, s0_idx);
2044 // keep data from significant samples only
2045 s0_val = _mm_andnot_si128(s0_sig, s0_val);
2046 // move mrp bits to correct position, and employ
2047 s0_val = _mm_slli_epi32(s0_val, (si32)p - 2);
2048 s0 = _mm_xor_si128(s0, s0_val);
2049 // store coefficients
2050 _mm_store_si128((__m128i*)dp, s0);
2051 // prepare for next row
2052 dp += stride;
2053 m = _mm_add_epi32(m, _mm_set1_epi32(1));
2054 }
2055 }
2056 // consume data according to the number of bits set
2057 rev_advance_mrp(&magref, (ui32)total_bits);
2058 }
2059 }
2060 }
2061 }
2062
2063 return true;
2064 }
2065 }
2066}
ui16 uvlc_tbl0[256+64]
uvlc_tbl0 contains decoding information for initial row of quads
ui16 uvlc_tbl1[256]
uvlc_tbl1 contains decoding information for non-initial row of quads
ui16 vlc_tbl0[1024]
vlc_tbl0 contains decoding information for initial row of quads
ui16 vlc_tbl1[1024]
vlc_tbl1 contains decoding information for non-initial row of quads
static ui32 rev_fetch(rev_struct *vlcp)
Retrieves 32 bits from the head of a rev_struct structure.
static void rev_init_mrp(rev_struct *mrp, ui8 *data, int lcup, int len2)
Initializes rev_struct structure for MRP segment, and reads a number of bytes such that the next 32 b...
static void mel_read(dec_mel_st *melp)
Reads and unstuffs the MEL bitstream.
static void frwd_advance(frwd_struct *msp, ui32 num_bits)
Consume num_bits bits from the bitstream of frwd_struct.
bool ojph_decode_codeblock_ssse3(ui8 *coded_data, ui32 *decoded_data, ui32 missing_msbs, ui32 num_passes, ui32 lengths1, ui32 lengths2, ui32 width, ui32 height, ui32 stride, bool stripe_causal)
Decodes one codeblock, processing the cleanup, significance propagation, and magnitude refinement pa...
static __m128i decode_two_quad16(const __m128i inf_u_q, __m128i U_q, frwd_struct *magsgn, ui32 p, __m128i &vn)
decodes two consecutive quads (one octet), using 16-bit data
static void rev_read_mrp(rev_struct *mrp)
Reads and unstuffs from rev_struct.
static ui32 rev_fetch_mrp(rev_struct *mrp)
Retrieves 32 bits from the head of a rev_struct structure.
static ui32 rev_advance_mrp(rev_struct *mrp, ui32 num_bits)
Consumes num_bits from a rev_struct structure.
static void rev_read(rev_struct *vlcp)
Reads and unstuffs data from a backwardly-growing segment.
static int mel_get_run(dec_mel_st *melp)
Retrieves one run from dec_mel_st; if there are no runs stored MEL segment is decoded.
static void rev_init(rev_struct *vlcp, ui8 *data, int lcup, int scup)
Initiates the rev_struct structure and reads a few bytes to move the read address to multiple of 4.
static void mel_init(dec_mel_st *melp, ui8 *bbuf, int lcup, int scup)
Initiates a dec_mel_st structure for MEL decoding and reads some bytes in order to get the read addre...
static ui32 rev_advance(rev_struct *vlcp, ui32 num_bits)
Consumes num_bits from a rev_struct structure.
static void frwd_read(frwd_struct *msp)
Reads and unstuffs 32 bits from forward-growing bitstream.
static ui32 frwd_fetch(frwd_struct *msp)
Fetches 32 bits from the frwd_struct bitstream.
static void frwd_init(frwd_struct *msp, const ui8 *data, int size)
Initializes frwd_struct struct and reads some bytes.
static __m128i decode_one_quad32(const __m128i inf_u_q, __m128i U_q, frwd_struct *magsgn, ui32 p, __m128i &vn)
decodes one quad, using 32 bit data
static void mel_decode(dec_mel_st *melp)
Decodes unstuffed MEL segment bits stored in tmp to runs.
int64_t si64
Definition: ojph_defs.h:57
uint64_t ui64
Definition: ojph_defs.h:56
uint16_t ui16
Definition: ojph_defs.h:52
static ui32 count_leading_zeros(ui32 val)
Definition: ojph_arch.h:130
int32_t si32
Definition: ojph_defs.h:55
int16_t si16
Definition: ojph_defs.h:53
uint32_t ui32
Definition: ojph_defs.h:54
uint8_t ui8
Definition: ojph_defs.h:50
#define ojph_max(a, b)
Definition: ojph_defs.h:73
#define OJPH_WARN(t,...)
Definition: ojph_message.h:128
MEL state structure for reading and decoding the MEL bitstream.
bool unstuff
true if the next bit needs to be unstuffed
int num_runs
number of decoded runs left in runs (maximum 8)
int size
number of bytes in MEL code
ui8 * data
the address of data (or bitstream)
int k
state of MEL decoder
int bits
number of bits stored in tmp
ui64 tmp
temporary buffer for read data
ui64 runs
runs of decoded MEL codewords (7 bits/run)
State structure for reading and unstuffing of forward-growing bitstreams; these are: MagSgn and SPP b...
const ui8 * data
pointer to bitstream
ui32 bits
number of bits stored in tmp
ui64 tmp
temporary buffer of read data
ui32 unstuff
1 if a bit needs to be unstuffed from next byte
A structure for reading and unstuffing a segment that grows backward, such as VLC and MRP.
ui32 bits
number of bits stored in tmp
int size
number of bytes left
ui8 * data
pointer to where to read data
ui64 tmp
temporary buffer of read data