OpenJPH
Open-source implementation of JPEG2000 Part-15
ojph_block_decoder_wasm.cpp
Go to the documentation of this file.
1//***************************************************************************/
2// This software is released under the 2-Clause BSD license, included
3// below.
4//
5// Copyright (c) 2022, Aous Naman
6// Copyright (c) 2022, Kakadu Software Pty Ltd, Australia
7// Copyright (c) 2022, The University of New South Wales, Australia
8//
9// Redistribution and use in source and binary forms, with or without
10// modification, are permitted provided that the following conditions are
11// met:
12//
13// 1. Redistributions of source code must retain the above copyright
14// notice, this list of conditions and the following disclaimer.
15//
16// 2. Redistributions in binary form must reproduce the above copyright
17// notice, this list of conditions and the following disclaimer in the
18// documentation and/or other materials provided with the distribution.
19//
20// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
21// IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
22// TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
23// PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
24// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
25// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
26// TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
27// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
28// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
29// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
30// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31//***************************************************************************/
32// This file is part of the OpenJPH software implementation.
33// File: ojph_block_decoder_wasm.cpp
34// Author: Aous Naman
35// Date: 13 May 2022
36//***************************************************************************/
37
38//***************************************************************************/
43#include <string>
44#include <iostream>
45
46#include <cassert>
47#include <cstring>
48#include "ojph_block_common.h"
49#include "ojph_block_decoder.h"
50#include "ojph_arch.h"
51#include "ojph_message.h"
52
53#include <wasm_simd128.h>
54
55namespace ojph {
56 namespace local {
57
58 //************************************************************************/
61 #define OJPH_REPEAT2(a) a,a
62 #define OJPH_REPEAT4(a) a,a,a,a
63 #define OJPH_REPEAT8(a) a,a,a,a,a,a,a,a
64 #define OJPH_REPEAT16(a) a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a
65
66 //************************************************************************/
    /** @brief MEL state structure for reading and decoding the MEL
     *         bitstream.
     *
     *  A number of events are decoded from the MEL bitstream ahead of
     *  time and stored in the run queue (\p runs / \p num_runs).
     */
    struct dec_mel_st {
      dec_mel_st() : data(NULL), tmp(0), bits(0), size(0), unstuff(false),
                     k(0), num_runs(0), runs(0)
      {}
      // data decoding machinery
      ui8* data;    // pointer to the next unread byte of the MEL segment
      ui64 tmp;     // temporary buffer of read (unstuffed) bits, MSB-aligned
      int bits;     // number of valid bits in tmp
      int size;     // number of bytes remaining in the MEL segment
      bool unstuff; // true if the next byte needs bit unstuffing
      int k;        // MEL decoder state (index into the exponent table)

      // queue of decoded runs
      int num_runs; // number of decoded runs left in runs (maximum 8)
      ui64 runs;    // queue of decoded runs, 7 bits per run
    };
89
90 //************************************************************************/
102 static inline
104 {
105 if (melp->bits > 32) //there are enough bits in the tmp variable
106 return; // return without reading new data
107
108 ui32 val = 0xFFFFFFFF; // feed in 0xFF if buffer is exhausted
109 if (melp->size > 4) { // if there is data in the MEL segment
110 val = *(ui32*)melp->data; // read 32 bits from MEL data
111 melp->data += 4; // advance pointer
112 melp->size -= 4; // reduce counter
113 }
114 else if (melp->size > 0)
115 { // 4 or less
116 int i = 0;
117 while (melp->size > 1) {
118 ui32 v = *melp->data++; // read one byte at a time
119 ui32 m = ~(0xFFu << i); // mask of location
120 val = (val & m) | (v << i);// put one byte in its correct location
121 --melp->size;
122 i += 8;
123 }
124 // size equal to 1
125 ui32 v = *melp->data++; // the one before the last is different
126 v |= 0xF; // MEL and VLC segments can overlap
127 ui32 m = ~(0xFFu << i);
128 val = (val & m) | (v << i);
129 --melp->size;
130 }
131
132 // next we unstuff them before adding them to the buffer
133 int bits = 32 - melp->unstuff; // number of bits in val, subtract 1 if
134 // the previously read byte requires
135 // unstuffing
136
137 // data is unstuffed and accumulated in t
138 // bits has the number of bits in t
139 ui32 t = val & 0xFF;
140 bool unstuff = ((val & 0xFF) == 0xFF); // true if we need unstuffing
141 bits -= unstuff; // there is one less bit in t if unstuffing is needed
142 t = t << (8 - unstuff); // move up to make room for the next byte
143
144 //this is a repeat of the above
145 t |= (val>>8) & 0xFF;
146 unstuff = (((val >> 8) & 0xFF) == 0xFF);
147 bits -= unstuff;
148 t = t << (8 - unstuff);
149
150 t |= (val>>16) & 0xFF;
151 unstuff = (((val >> 16) & 0xFF) == 0xFF);
152 bits -= unstuff;
153 t = t << (8 - unstuff);
154
155 t |= (val>>24) & 0xFF;
156 melp->unstuff = (((val >> 24) & 0xFF) == 0xFF);
157
158 // move t to tmp, and push the result all the way up, so we read from
159 // the MSB
160 melp->tmp |= ((ui64)t) << (64 - bits - melp->bits);
161 melp->bits += bits; //increment the number of bits in tmp
162 }
163
164 //************************************************************************/
179 static inline
181 {
182 static const int mel_exp[13] = { //MEL exponents
183 0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 4, 5
184 };
185
186 if (melp->bits < 6) // if there are less than 6 bits in tmp
187 mel_read(melp); // then read from the MEL bitstream
188 // 6 bits is the largest decodable MEL cwd
189
190 //repeat so long that there is enough decodable bits in tmp,
191 // and the runs store is not full (num_runs < 8)
192 while (melp->bits >= 6 && melp->num_runs < 8)
193 {
194 int eval = mel_exp[melp->k]; // number of bits associated with state
195 int run = 0;
196 if (melp->tmp & (1ull<<63)) //The next bit to decode (stored in MSB)
197 { //one is found
198 run = 1 << eval;
199 run--; // consecutive runs of 0 events - 1
200 melp->k = melp->k + 1 < 12 ? melp->k + 1 : 12;//increment, max is 12
201 melp->tmp <<= 1; // consume one bit from tmp
202 melp->bits -= 1;
203 run = run << 1; // a stretch of zeros not terminating in one
204 }
205 else
206 { //0 is found
207 run = (int)(melp->tmp >> (63 - eval)) & ((1 << eval) - 1);
208 melp->k = melp->k - 1 > 0 ? melp->k - 1 : 0; //decrement, min is 0
209 melp->tmp <<= eval + 1; //consume eval + 1 bits (max is 6)
210 melp->bits -= eval + 1;
211 run = (run << 1) + 1; // a stretch of zeros terminating with one
212 }
213 eval = melp->num_runs * 7; // 7 bits per run
214 melp->runs &= ~((ui64)0x3F << eval); // 6 bits are sufficient
215 melp->runs |= ((ui64)run) << eval; // store the value in runs
216 melp->num_runs++; // increment count
217 }
218 }
219
220 //************************************************************************/
    /** @brief Initializes a dec_mel_st structure for MEL decoding and
     *         reads up to 4 bytes so the read address reaches a multiple
     *         of 4.
     *
     *  @param [in] melp is a pointer to dec_mel_st structure
     *  @param [in] bbuf is a pointer to the coded data buffer
     *  @param [in] lcup is the length of MagSgn+MEL+VLC segments
     *  @param [in] scup is the length of MEL+VLC segments
     */
    static inline
    void mel_init(dec_mel_st *melp, ui8* bbuf, int lcup, int scup)
    {
      melp->data = bbuf + lcup - scup; // move the pointer to the start of MEL
      melp->bits = 0;                  // 0 bits in tmp
      melp->tmp = 0;                   //
      melp->unstuff = false;           // no unstuffing
      melp->size = scup - 1;           // size is the length of MEL+VLC-1
      melp->k = 0;                     // 0 for state
      melp->num_runs = 0;              // num_runs is 0
      melp->runs = 0;                  //

      // This code is borrowed; original is for a different architecture.
      // These few lines take care of the case where data is not at a
      // multiple of 4 boundary. It reads 1,2,3 up to 4 bytes from the MEL
      // segment.
      int num = 4 - (int)(intptr_t(melp->data) & 0x3);
      for (int i = 0; i < num; ++i) { // this code is similar to mel_read
        assert(melp->unstuff == false || melp->data[0] <= 0x8F);
        ui64 d = (melp->size > 0) ? *melp->data : 0xFF; // if buffer is
                                       // consumed, feed in 0xFF
        if (melp->size == 1) d |= 0xF; // if this is MEL+VLC-1, set LSBs to
                                       // 0xF; see the standard
        melp->data += melp->size-- > 0; // increment if the end is not reached
        int d_bits = 8 - melp->unstuff; // if unstuffing is needed, reduce by 1
        melp->tmp = (melp->tmp << d_bits) | d; // store bits in tmp
        melp->bits += d_bits;          // increment bit count by d_bits
        melp->unstuff = ((d & 0xFF) == 0xFF); // true if next byte needs
                                              // unstuffing
      }
      melp->tmp <<= (64 - melp->bits); // push all the way up so the first
                                       // bit is the MSB
    }
262
263 //************************************************************************/
269 static inline
271 {
272 if (melp->num_runs == 0) //if no runs, decode more bit from MEL segment
273 mel_decode(melp);
274
275 int t = melp->runs & 0x7F; //retrieve one run
276 melp->runs >>= 7; // remove the retrieved run
277 melp->num_runs--;
278 return t; // return run
279 }
280
281 //************************************************************************/
    /** @brief A structure for reading and unstuffing a segment that grows
     *         backward, such as the VLC segment or the MagRef (MRP)
     *         segment.
     */
    struct rev_struct {
      rev_struct() : data(NULL), tmp(0), bits(0), size(0), unstuff(false)
      {}
      // storage
      ui8* data;    // pointer to the next unread byte (moves backward)
      ui64 tmp;     // temporary buffer of read (unstuffed) data
      ui32 bits;    // number of valid bits in tmp
      int size;     // number of bytes left in the segment
      bool unstuff; // true if the previously read byte is greater than 0x8F;
                    // in that case the current byte is unstuffed if it
                    // is 0x7F
    };
296
297 //************************************************************************/
317 static inline
319 {
320 //process 4 bytes at a time
321 if (vlcp->bits > 32) // if there are more than 32 bits in tmp, then
322 return; // reading 32 bits can overflow vlcp->tmp
323 ui32 val = 0;
324 //the next line (the if statement) needs to be tested first
325 if (vlcp->size > 3) // if there are more than 3 bytes left in VLC
326 {
327 // (vlcp->data - 3) move pointer back to read 32 bits at once
328 val = *(ui32*)(vlcp->data - 3); // then read 32 bits
329 vlcp->data -= 4; // move data pointer back by 4
330 vlcp->size -= 4; // reduce available byte by 4
331 }
332 else if (vlcp->size > 0)
333 { // 4 or less
334 int i = 24;
335 while (vlcp->size > 0) {
336 ui32 v = *vlcp->data--; // read one byte at a time
337 val |= (v << i); // put byte in its correct location
338 --vlcp->size;
339 i -= 8;
340 }
341 }
342
343 //accumulate in tmp, number of bits in tmp are stored in bits
344 ui32 tmp = val >> 24; //start with the MSB byte
345 ui32 bits;
346
347 // test unstuff (previous byte is >0x8F), and this byte is 0x7F
348 bits = 8 - ((vlcp->unstuff && (((val >> 24) & 0x7F) == 0x7F)) ? 1 : 0);
349 bool unstuff = (val >> 24) > 0x8F; //this is for the next byte
350
351 tmp |= ((val >> 16) & 0xFF) << bits; //process the next byte
352 bits += 8 - ((unstuff && (((val >> 16) & 0x7F) == 0x7F)) ? 1 : 0);
353 unstuff = ((val >> 16) & 0xFF) > 0x8F;
354
355 tmp |= ((val >> 8) & 0xFF) << bits;
356 bits += 8 - ((unstuff && (((val >> 8) & 0x7F) == 0x7F)) ? 1 : 0);
357 unstuff = ((val >> 8) & 0xFF) > 0x8F;
358
359 tmp |= (val & 0xFF) << bits;
360 bits += 8 - ((unstuff && ((val & 0x7F) == 0x7F)) ? 1 : 0);
361 unstuff = (val & 0xFF) > 0x8F;
362
363 // now move the read and unstuffed bits into vlcp->tmp
364 vlcp->tmp |= (ui64)tmp << vlcp->bits;
365 vlcp->bits += bits;
366 vlcp->unstuff = unstuff; // this for the next read
367 }
368
369 //************************************************************************/
    /** @brief Initializes a rev_struct for reading the VLC segment, which
     *         grows backward, and aligns the read pointer so subsequent
     *         32-bit reads from (data - 3) are 4-byte aligned.
     *
     *  @param [in] vlcp is a pointer to rev_struct structure
     *  @param [in] data is a pointer to the coded data buffer
     *  @param [in] lcup is the length of MagSgn+MEL+VLC segments
     *  @param [in] scup is the length of MEL+VLC segments
     */
    static inline
    void rev_init(rev_struct *vlcp, ui8* data, int lcup, int scup)
    {
      // the first byte has only the upper 4 bits
      vlcp->data = data + lcup - 2;

      // size cannot be larger than this; in fact it should be smaller
      vlcp->size = scup - 2;

      ui32 d = *vlcp->data--; // read one byte (this is a half byte)
      vlcp->tmp = d >> 4;     // both initialize and set
      vlcp->bits = 4 - ((vlcp->tmp & 7) == 7); // check the standard
      vlcp->unstuff = (d | 0xF) > 0x8F; // this is useful for the next byte

      // This code is designed for an architecture where the read address
      // should align to the read size (address multiple of 4 if read size
      // is 4). These few lines take care of the case where data is not at
      // a multiple of 4 boundary. It reads 1,2,3 up to 4 bytes from the
      // VLC bitstream. To read 32 bits, read from (vlcp->data - 3).
      int num = 1 + (int)(intptr_t(vlcp->data) & 0x3);
      int tnum = num < vlcp->size ? num : vlcp->size;
      for (int i = 0; i < tnum; ++i) {
        ui64 d;
        d = *vlcp->data--; // read one byte and move read pointer
        // check if the last byte was >0x8F (unstuff == true) and this
        // byte is 0x7F
        ui32 d_bits = 8 - ((vlcp->unstuff && ((d & 0x7F) == 0x7F)) ? 1 : 0);
        vlcp->tmp |= d << vlcp->bits; // move data to vlcp->tmp
        vlcp->bits += d_bits;
        vlcp->unstuff = d > 0x8F; // for next byte
      }
      vlcp->size -= tnum;
      rev_read(vlcp); // read another 32 bits
    }
416
417 //************************************************************************/
424 static inline
426 {
427 if (vlcp->bits < 32) // if there are less then 32 bits, read more
428 {
429 rev_read(vlcp); // read 32 bits, but unstuffing might reduce this
430 if (vlcp->bits < 32)// if there is still space in vlcp->tmp for 32 bits
431 rev_read(vlcp); // read another 32
432 }
433 return (ui32)vlcp->tmp; // return the head (bottom-most) of vlcp->tmp
434 }
435
436 //************************************************************************/
442 static inline
444 {
445 assert(num_bits <= vlcp->bits); // vlcp->tmp must have more than num_bits
446 vlcp->tmp >>= num_bits; // remove bits
447 vlcp->bits -= num_bits; // decrement the number of bits
448 return (ui32)vlcp->tmp;
449 }
450
451 //************************************************************************/
462 static inline
464 {
465 //process 4 bytes at a time
466 if (mrp->bits > 32)
467 return;
468 ui32 val = 0;
469 if (mrp->size > 3) // If there are 3 byte or more
470 { // (mrp->data - 3) move pointer back to read 32 bits at once
471 val = *(ui32*)(mrp->data - 3); // read 32 bits
472 mrp->data -= 4; // move back pointer
473 mrp->size -= 4; // reduce count
474 }
475 else if (mrp->size > 0)
476 {
477 int i = 24;
478 while (mrp->size > 0) {
479 ui32 v = *mrp->data--; // read one byte at a time
480 val |= (v << i); // put byte in its correct location
481 --mrp->size;
482 i -= 8;
483 }
484 }
485
486 //accumulate in tmp, and keep count in bits
487 ui32 bits, tmp = val >> 24;
488
489 //test if the last byte > 0x8F (unstuff must be true) and this is 0x7F
490 bits = 8 - ((mrp->unstuff && (((val >> 24) & 0x7F) == 0x7F)) ? 1 : 0);
491 bool unstuff = (val >> 24) > 0x8F;
492
493 //process the next byte
494 tmp |= ((val >> 16) & 0xFF) << bits;
495 bits += 8 - ((unstuff && (((val >> 16) & 0x7F) == 0x7F)) ? 1 : 0);
496 unstuff = ((val >> 16) & 0xFF) > 0x8F;
497
498 tmp |= ((val >> 8) & 0xFF) << bits;
499 bits += 8 - ((unstuff && (((val >> 8) & 0x7F) == 0x7F)) ? 1 : 0);
500 unstuff = ((val >> 8) & 0xFF) > 0x8F;
501
502 tmp |= (val & 0xFF) << bits;
503 bits += 8 - ((unstuff && ((val & 0x7F) == 0x7F)) ? 1 : 0);
504 unstuff = (val & 0xFF) > 0x8F;
505
506 mrp->tmp |= (ui64)tmp << mrp->bits; // move data to mrp pointer
507 mrp->bits += bits;
508 mrp->unstuff = unstuff; // next byte
509 }
510
511 //************************************************************************/
    /** @brief Initializes a rev_struct for reading the MagRef (MRP)
     *         segment, which grows backward, and aligns the read pointer
     *         so subsequent 32-bit reads from (data - 3) are 4-byte
     *         aligned.
     *
     *  @param [in] mrp is a pointer to rev_struct structure
     *  @param [in] data is a pointer to the coded data buffer
     *  @param [in] lcup is the length of MagSgn+MEL+VLC segments
     *  @param [in] len2 is the length of the SPP+MRP segments
     */
    static inline
    void rev_init_mrp(rev_struct *mrp, ui8* data, int lcup, int len2)
    {
      mrp->data = data + lcup + len2 - 1; // the last byte of MRP
      mrp->size = len2;
      mrp->unstuff = true; // assume unstuffing for the first byte;
                           // NOTE(review): presumably mandated by the
                           // standard's MRP termination — confirm
      mrp->bits = 0;
      mrp->tmp = 0;

      // This code is designed for an architecture where the read address
      // should align to the read size (address multiple of 4 if read size
      // is 4). These few lines take care of the case where data is not at
      // a multiple of 4 boundary; they read 1,2,3 up to 4 bytes from the
      // MRP stream.
      int num = 1 + (int)(intptr_t(mrp->data) & 0x3);
      for (int i = 0; i < num; ++i) {
        ui64 d;
        // read a byte, 0 if no more data
        d = (mrp->size-- > 0) ? *mrp->data-- : 0;
        // check if unstuffing is needed
        ui32 d_bits = 8 - ((mrp->unstuff && ((d & 0x7F) == 0x7F)) ? 1 : 0);
        mrp->tmp |= d << mrp->bits; // move data into mrp->tmp
        mrp->bits += d_bits;
        mrp->unstuff = d > 0x8F; // for next byte
      }
      rev_read_mrp(mrp);
    }
552
553 //************************************************************************/
560 static inline
562 {
563 if (mrp->bits < 32) // if there are less than 32 bits in mrp->tmp
564 {
565 rev_read_mrp(mrp); // read 30-32 bits from mrp
566 if (mrp->bits < 32) // if there is a space of 32 bits
567 rev_read_mrp(mrp); // read more
568 }
569 return (ui32)mrp->tmp; // return the head of mrp->tmp
570 }
571
572 //************************************************************************/
    /** @brief Consumes num_bits from a rev_struct structure used for the
     *         MagRef (MRP) segment.
     *
     *  @param  [in] mrp is a pointer to rev_struct structure
     *  @param  [in] num_bits is the number of bits to be removed
     *  @return the bottom-most 32 bits of mrp->tmp after consumption
     */
    inline ui32 rev_advance_mrp(rev_struct *mrp, ui32 num_bits)
    {
      assert(num_bits <= mrp->bits); // we must not consume more than mrp->bits
      mrp->tmp >>= num_bits; // discard the lowest num_bits bits
      mrp->bits -= num_bits;
      return (ui32)mrp->tmp; // return data after consumption
    }
585
586 //************************************************************************/
    /** @brief State structure for a forward-growing bitstream
     *         (MagSgn or SPP), read 16 bytes at a time with WASM SIMD.
     */
    struct frwd_struct {
      const ui8* data; // pointer to the next unread bytes
      ui8 tmp[48];     // temporary buffer of aligned, unstuffed data;
                       // 48 bytes so a 16-byte store at any byte offset
                       // produced by frwd_read/frwd_advance stays in bounds
      ui32 bits;       // number of valid bits in tmp
      ui32 unstuff;    // 1 if a bit needs to be dropped from the next byte
      int size;        // number of bytes remaining in the segment
    };
597
598 //************************************************************************/
616 template<int X>
617 static inline
619 {
620 assert(msp->bits <= 128);
621
622 v128_t offset, val, validity, all_xff;
623 val = wasm_v128_load(msp->data);
624 int bytes = msp->size >= 16 ? 16 : msp->size;
625 validity = wasm_i8x16_splat((char)bytes);
626 msp->data += bytes;
627 msp->size -= bytes;
628 ui32 bits = 128;
629 offset = wasm_i64x2_const(0x0706050403020100,0x0F0E0D0C0B0A0908);
630 validity = wasm_i8x16_gt(validity, offset);
631 all_xff = wasm_i8x16_const(OJPH_REPEAT16(-1));
632 if (X == 0xFF) // the compiler should remove this if statement
633 {
634 v128_t t = wasm_v128_xor(validity, all_xff); // complement
635 val = wasm_v128_or(t, val); // fill with 0xFF
636 }
637 else if (X == 0)
638 val = wasm_v128_and(validity, val); // fill with zeros
639 else
640 assert(0);
641
642 v128_t ff_bytes;
643 ff_bytes = wasm_i8x16_eq(val, all_xff);
644 ff_bytes = wasm_v128_and(ff_bytes, validity);
645 ui32 flags = wasm_i8x16_bitmask(ff_bytes);
646 flags <<= 1; // unstuff following byte
647 ui32 next_unstuff = flags >> 16;
648 flags |= msp->unstuff;
649 flags &= 0xFFFF;
650 while (flags)
651 { // bit unstuffing occurs on average once every 256 bytes
652 // therefore it is not an issue if it is a bit slow
653 // here we process 16 bytes
654 --bits; // consuming one stuffing bit
655
656 ui32 loc = 31 - count_leading_zeros(flags);
657 flags ^= 1 << loc;
658
659 v128_t m, t, c;
660 t = wasm_i8x16_splat((char)loc);
661 m = wasm_i8x16_gt(offset, t);
662
663 t = wasm_v128_and(m, val); // keep bits at locations larger than loc
664 c = wasm_u64x2_shr(t, 1); // 1 bits left
665 t = wasm_i64x2_shuffle(t, wasm_i64x2_const(0, 0), 1, 2);
666 t = wasm_i64x2_shl(t, 63); // keep the MSB only
667 t = wasm_v128_or(t, c); // combine the above 3 steps
668
669 val = wasm_v128_or(t, wasm_v128_andnot(val, m));
670 }
671
672 // combine with earlier data
673 assert(msp->bits >= 0 && msp->bits <= 128);
674 int cur_bytes = msp->bits >> 3;
675 ui32 cur_bits = msp->bits & 7;
676 v128_t b1, b2;
677 b1 = wasm_i64x2_shl(val, cur_bits);
678 //next shift 8 bytes right
679 b2 = wasm_i64x2_shuffle(wasm_i64x2_const(0, 0), val, 1, 2);
680 b2 = wasm_u64x2_shr(b2, 64u - cur_bits);
681 b2 = (cur_bits > 0) ? b2 : wasm_i64x2_const(0, 0);
682 b1 = wasm_v128_or(b1, b2);
683 b2 = wasm_v128_load(msp->tmp + cur_bytes);
684 b2 = wasm_v128_or(b1, b2);
685 wasm_v128_store(msp->tmp + cur_bytes, b2);
686
687 ui32 consumed_bits = bits < 128u - cur_bits ? bits : 128u - cur_bits;
688 cur_bytes = (msp->bits + consumed_bits + 7) >> 3; // round up
689 int upper = wasm_u16x8_extract_lane(val, 7);
690 upper >>= consumed_bits + 16 - 128;
691 msp->tmp[cur_bytes] = (ui8)upper; // copy byte
692
693 msp->bits += bits;
694 msp->unstuff = next_unstuff; // next unstuff
695 assert(msp->unstuff == 0 || msp->unstuff == 1);
696 }
697
698 //************************************************************************/
    /** @brief Initializes a frwd_struct and reads the first 128 bits.
     *
     *  @tparam X the value used to fill lanes past the end of the segment
     *          (0xFF for MagSgn, 0 for SPP)
     *  @param [in] msp is a pointer to frwd_struct structure
     *  @param [in] data is a pointer to the start of the segment
     *  @param [in] size is the byte length of the segment
     */
    template<int X>
    static inline
    void frwd_init(frwd_struct *msp, const ui8* data, int size)
    {
      msp->data = data;
      // clear the 48-byte temporary buffer
      wasm_v128_store(msp->tmp, wasm_i64x2_const(0, 0));
      wasm_v128_store(msp->tmp + 16, wasm_i64x2_const(0, 0));
      wasm_v128_store(msp->tmp + 32, wasm_i64x2_const(0, 0));

      msp->bits = 0;
      msp->unstuff = 0;
      msp->size = size;

      frwd_read<X>(msp); // read 128 bits more
    }
722
723 //************************************************************************/
    /** @brief Consumes num_bits from a frwd_struct structure.
     *
     *  Shifts the 48-byte bit buffer right by num_bits: a whole multiple
     *  of 8 bytes is handled by offsetting the load pointer; the residual
     *  (0..63) bits are shifted across 64-bit lanes.
     *
     *  @param [in] msp is a pointer to frwd_struct structure
     *  @param [in] num_bits is the number of bits to be consumed
     */
    static inline
    void frwd_advance(frwd_struct *msp, ui32 num_bits)
    {
      assert(num_bits > 0 && num_bits <= msp->bits && num_bits < 128);
      msp->bits -= num_bits;

      // skip whole 8-byte chunks by offsetting the load address
      v128_t *p = (v128_t*)(msp->tmp + ((num_bits >> 3) & 0x18));
      num_bits &= 63; // remaining bit shift (0..63)

      v128_t v0, v1, c0, c1, t;
      v0 = wasm_v128_load(p);
      v1 = wasm_v128_load(p + 1);

      // shift right by num_bits
      c0 = wasm_u64x2_shr(v0, num_bits);
      t = wasm_i64x2_shuffle(v0, wasm_i64x2_const(0, 0), 1, 2);
      t = wasm_i64x2_shl(t, 64 - num_bits); // wasm takes shift counts
                                // mod 64, so 64 here is harmless; t is
                                // zeroed next when num_bits == 0
      t = (num_bits > 0) ? t : wasm_i64x2_const(0, 0);
      c0 = wasm_v128_or(c0, t);
      t = wasm_i64x2_shuffle(wasm_i64x2_const(0, 0), v1, 1, 2);
      t = wasm_i64x2_shl(t, 64 - num_bits);
      t = (num_bits > 0) ? t : wasm_i64x2_const(0, 0);
      c0 = wasm_v128_or(c0, t);

      wasm_v128_store(msp->tmp, c0);

      c1 = wasm_u64x2_shr(v1, num_bits);
      t = wasm_i64x2_shuffle(v1, wasm_i64x2_const(0, 0), 1, 2);
      t = wasm_i64x2_shl(t, 64 - num_bits);
      t = (num_bits > 0) ? t : wasm_i64x2_const(0, 0);
      c1 = wasm_v128_or(c1, t);

      wasm_v128_store(msp->tmp + 16, c1);
    }
763
764 //************************************************************************/
771 template<int X>
772 static inline
774 {
775 if (msp->bits <= 128)
776 {
777 frwd_read<X>(msp);
778 if (msp->bits <= 128) //need to test
779 frwd_read<X>(msp);
780 }
781 v128_t t = wasm_v128_load(msp->tmp);
782 return t;
783 }
784
785 //************************************************************************/
797 template <int N>
798 static inline
799 v128_t decode_one_quad32(const v128_t inf_u_q, v128_t U_q,
800 frwd_struct* magsgn, ui32 p, v128_t& vn)
801 {
802 v128_t w0; // workers
803 v128_t insig; // lanes hold FF's if samples are insignificant
804 v128_t flags; // lanes hold e_k, e_1, and rho
805 v128_t row; // decoded row
806
807 row = wasm_i64x2_const(0, 0);
808 w0 = wasm_i32x4_shuffle(inf_u_q, inf_u_q, N, N, N, N);
809 // we keeps e_k, e_1, and rho in w2
810 flags = wasm_v128_and(w0, wasm_i32x4_const(0x1110,0x2220,0x4440,0x8880));
811 insig = wasm_i32x4_eq(flags, wasm_i64x2_const(0, 0));
812 if (wasm_i8x16_bitmask(insig) != 0xFFFF) //are all insignificant?
813 {
814 U_q = wasm_i32x4_shuffle(U_q, U_q, N, N, N, N);
815 flags = wasm_i16x8_mul(flags, wasm_i16x8_const(8,8,4,4,2,2,1,1));
816 v128_t ms_vec = frwd_fetch<0xFF>(magsgn);
817
818 // U_q holds U_q for this quad
819 // flags has e_k, e_1, and rho such that e_k is sitting in the
820 // 0x8000, e_1 in 0x800, and rho in 0x80
821
822 // next e_k and m_n
823 v128_t m_n;
824 w0 = wasm_u32x4_shr(flags, 15); // e_k
825 m_n = wasm_i32x4_sub(U_q, w0);
826 m_n = wasm_v128_andnot(m_n, insig);
827
828 // find cumulative sums
829 // to find at which bit in ms_vec the sample starts
830 v128_t ex_sum, shfl, inc_sum = m_n; // inclusive scan
831 shfl = wasm_i32x4_shuffle(wasm_i64x2_const(0,0), inc_sum, 3, 4, 5, 6);
832 inc_sum = wasm_i32x4_add(inc_sum, shfl);
833 shfl = wasm_i64x2_shuffle(wasm_i64x2_const(0,0), inc_sum, 1, 2);
834 inc_sum = wasm_i32x4_add(inc_sum, shfl);
835 int total_mn = wasm_u16x8_extract_lane(inc_sum, 6);
836 ex_sum = wasm_i32x4_shuffle(wasm_i64x2_const(0,0), inc_sum, 3, 4, 5, 6);
837
838 // find the starting byte and starting bit
839 v128_t byte_idx = wasm_u32x4_shr(ex_sum, 3);
840 v128_t bit_idx =
841 wasm_v128_and(ex_sum, wasm_i32x4_const(OJPH_REPEAT4(7)));
842 byte_idx = wasm_i8x16_swizzle(byte_idx,
843 wasm_i32x4_const(0x00000000, 0x04040404, 0x08080808, 0x0C0C0C0C));
844 byte_idx =
845 wasm_i32x4_add(byte_idx, wasm_i32x4_const(OJPH_REPEAT4(0x03020100)));
846 v128_t d0 = wasm_i8x16_swizzle(ms_vec, byte_idx);
847 byte_idx =
848 wasm_i32x4_add(byte_idx, wasm_i32x4_const(OJPH_REPEAT4(0x01010101)));
849 v128_t d1 = wasm_i8x16_swizzle(ms_vec, byte_idx);
850
851 // shift samples values to correct location
852 bit_idx = wasm_v128_or(bit_idx, wasm_i32x4_shl(bit_idx, 16));
853 v128_t bit_shift = wasm_i8x16_swizzle(
854 wasm_i8x16_const(-1, 127, 63, 31, 15, 7, 3, 1,
855 -1, 127, 63, 31, 15, 7, 3, 1), bit_idx);
856 bit_shift =
857 wasm_i16x8_add(bit_shift, wasm_i16x8_const(OJPH_REPEAT8(0x0101)));
858 d0 = wasm_i16x8_mul(d0, bit_shift);
859 d0 = wasm_u16x8_shr(d0, 8); // we should have 8 bits in the LSB
860 d1 = wasm_i16x8_mul(d1, bit_shift);
861 d1 = // 8 in MSB
862 wasm_v128_and(d1, wasm_u32x4_const(OJPH_REPEAT4(0xFF00FF00)));
863 d0 = wasm_v128_or(d0, d1);
864
865 // find location of e_k and mask
866 v128_t shift;
867 v128_t ones = wasm_i32x4_const(OJPH_REPEAT4(1));
868 v128_t twos = wasm_i32x4_const(OJPH_REPEAT4(2));
869 ui32 U_q_m1 = wasm_u32x4_extract_lane(U_q, 0) - 1u;
870 w0 = wasm_i32x4_sub(twos, w0);
871 shift = wasm_i32x4_shl(w0, U_q_m1);
872 ms_vec = wasm_v128_and(d0, wasm_i32x4_sub(shift, ones));
873
874 // next e_1
875 w0 = wasm_v128_and(flags, wasm_i32x4_const(OJPH_REPEAT4(0x800)));
876 w0 = wasm_i32x4_eq(w0, wasm_i64x2_const(0, 0));
877 w0 = wasm_v128_andnot(shift, w0); // e_1 in correct position
878 ms_vec = wasm_v128_or(ms_vec, w0); // e_1
879 w0 = wasm_i32x4_shl(ms_vec, 31); // sign
880 ms_vec = wasm_v128_or(ms_vec, ones); // bin center
881 v128_t tvn = ms_vec;
882 ms_vec = wasm_i32x4_add(ms_vec, twos);// + 2
883 ms_vec = wasm_i32x4_shl(ms_vec, p - 1);
884 ms_vec = wasm_v128_or(ms_vec, w0); // sign
885 row = wasm_v128_andnot(ms_vec, insig); // significant only
886
887 ms_vec = wasm_v128_andnot(tvn, insig); // significant only
888 if (N == 0) // the compiler should remove one
889 tvn = wasm_i8x16_swizzle(ms_vec,
890 wasm_i32x4_const(0x07060504, 0x0F0E0D0C, -1, -1));
891 else if (N == 1)
892 tvn = wasm_i8x16_swizzle(ms_vec,
893 wasm_i32x4_const(-1, 0x07060504, 0x0F0E0D0C, -1));
894 else
895 assert(0);
896 vn = wasm_v128_or(vn, tvn);
897
898 if (total_mn)
899 frwd_advance(magsgn, (ui32)total_mn);
900 }
901 return row;
902 }
903
904 //************************************************************************/
    /** @brief Decodes two quads (8 samples) of 16-bit data at once.
     *
     *  Same algorithm as decode_one_quad32 but with 16-bit lanes, so two
     *  quads fit in one 128-bit register.
     *
     *  @param [in]     inf_u_q holds decoded VLC information (e_k, e_1,
     *                  rho) per quad
     *  @param [in]     U_q holds the U_q exponent bound per quad
     *  @param [in]     magsgn is the forward MagSgn bitstream reader
     *  @param [in]     p is the bitplane offset (30 - missing_msbs)
     *  @param [in,out] vn accumulates sample magnitudes used later for
     *                  significance propagation
     *  @return the 8 decoded samples (0 for insignificant lanes)
     */
    static inline
    v128_t decode_two_quad16(const v128_t inf_u_q, v128_t U_q,
                             frwd_struct* magsgn, ui32 p, v128_t& vn)
    {
      v128_t w0;    // workers
      v128_t insig; // lanes hold FF's if samples are insignificant
      v128_t flags; // lanes hold e_k, e_1, and rho
      v128_t row;   // decoded row

      row = wasm_i64x2_const(0, 0);
      w0 = wasm_i8x16_swizzle(inf_u_q,
        wasm_i16x8_const(0x0100, 0x0100, 0x0100, 0x0100,
                         0x0504, 0x0504, 0x0504, 0x0504));
      // we keep e_k, e_1, and rho in flags
      flags = wasm_v128_and(w0,
        wasm_u16x8_const(0x1110, 0x2220, 0x4440, 0x8880,
                         0x1110, 0x2220, 0x4440, 0x8880));
      insig = wasm_i16x8_eq(flags, wasm_i64x2_const(0, 0));
      if (wasm_i8x16_bitmask(insig) != 0xFFFF) // are all insignificant?
      {
        U_q = wasm_i8x16_swizzle(U_q,
          wasm_i16x8_const(0x0100, 0x0100, 0x0100, 0x0100,
                           0x0504, 0x0504, 0x0504, 0x0504));
        flags = wasm_i16x8_mul(flags, wasm_i16x8_const(8,4,2,1,8,4,2,1));
        v128_t ms_vec = frwd_fetch<0xFF>(magsgn);

        // U_q holds U_q for this quad
        // flags has e_k, e_1, and rho such that e_k is sitting in the
        // 0x8000, e_1 in 0x800, and rho in 0x80

        // next e_k and m_n
        v128_t m_n;
        w0 = wasm_u16x8_shr(flags, 15); // e_k
        m_n = wasm_i16x8_sub(U_q, w0);
        m_n = wasm_v128_andnot(m_n, insig);

        // find cumulative sums
        // to find at which bit in ms_vec the sample starts
        v128_t ex_sum, shfl, inc_sum = m_n; // inclusive scan
        shfl = wasm_i16x8_shuffle(wasm_i64x2_const(0,0),
                                  inc_sum, 7, 8, 9, 10, 11, 12, 13, 14);
        inc_sum = wasm_i16x8_add(inc_sum, shfl);
        shfl = wasm_i32x4_shuffle(wasm_i64x2_const(0,0), inc_sum, 3, 4, 5, 6);
        inc_sum = wasm_i16x8_add(inc_sum, shfl);
        shfl = wasm_i64x2_shuffle(wasm_i64x2_const(0,0), inc_sum, 1, 2);
        inc_sum = wasm_i16x8_add(inc_sum, shfl);
        int total_mn = wasm_u16x8_extract_lane(inc_sum, 7);
        ex_sum = wasm_i16x8_shuffle(wasm_i64x2_const(0,0),
                                    inc_sum, 7, 8, 9, 10, 11, 12, 13, 14);

        // find the starting byte and starting bit
        v128_t byte_idx = wasm_u16x8_shr(ex_sum, 3);
        v128_t bit_idx =
          wasm_v128_and(ex_sum, wasm_i16x8_const(OJPH_REPEAT8(7)));
        byte_idx = wasm_i8x16_swizzle(byte_idx,
          wasm_i16x8_const(0x0000, 0x0202, 0x0404, 0x0606,
                           0x0808, 0x0A0A, 0x0C0C, 0x0E0E));
        byte_idx =
          wasm_i16x8_add(byte_idx, wasm_i16x8_const(OJPH_REPEAT8(0x0100)));
        v128_t d0 = wasm_i8x16_swizzle(ms_vec, byte_idx);
        byte_idx =
          wasm_i16x8_add(byte_idx, wasm_i16x8_const(OJPH_REPEAT8(0x0101)));
        v128_t d1 = wasm_i8x16_swizzle(ms_vec, byte_idx);

        // shift sample values to their correct location; the shift is
        // implemented as a multiply by a swizzled power-of-two table
        v128_t bit_shift = wasm_i8x16_swizzle(
          wasm_i8x16_const(-1, 127, 63, 31, 15, 7, 3, 1,
                           -1, 127, 63, 31, 15, 7, 3, 1), bit_idx);
        bit_shift =
          wasm_i16x8_add(bit_shift, wasm_i16x8_const(OJPH_REPEAT8(0x0101)));
        d0 = wasm_i16x8_mul(d0, bit_shift);
        d0 = wasm_u16x8_shr(d0, 8); // we should have 8 bits in the LSB
        d1 = wasm_i16x8_mul(d1, bit_shift);
        d1 = // 8 in MSB
          wasm_v128_and(d1, wasm_i16x8_const(OJPH_REPEAT8((si16)0xFF00)));
        d0 = wasm_v128_or(d0, d1);

        // find location of e_k and mask; the two quads can have
        // different U_q, so each 64-bit half is shifted separately
        v128_t shift, t0, t1;
        v128_t ones = wasm_i16x8_const(OJPH_REPEAT8(1));
        v128_t twos = wasm_i16x8_const(OJPH_REPEAT8(2));
        v128_t U_q_m1 = wasm_i32x4_sub(U_q, ones);
        ui32 Uq0 = wasm_u16x8_extract_lane(U_q_m1, 0);
        ui32 Uq1 = wasm_u16x8_extract_lane(U_q_m1, 4);
        w0 = wasm_i16x8_sub(twos, w0);
        t0 = wasm_v128_and(w0, wasm_i64x2_const(-1, 0));
        t1 = wasm_v128_and(w0, wasm_i64x2_const(0, -1));
        t0 = wasm_i32x4_shl(t0, Uq0);
        t1 = wasm_i32x4_shl(t1, Uq1);
        shift = wasm_v128_or(t0, t1);
        ms_vec = wasm_v128_and(d0, wasm_i16x8_sub(shift, ones));

        // next e_1
        w0 = wasm_v128_and(flags, wasm_i16x8_const(OJPH_REPEAT8(0x800)));
        w0 = wasm_i16x8_eq(w0, wasm_i64x2_const(0, 0));
        w0 = wasm_v128_andnot(shift, w0); // e_1 in correct position
        ms_vec = wasm_v128_or(ms_vec, w0);    // e_1
        w0 = wasm_i16x8_shl(ms_vec, 15);      // sign
        ms_vec = wasm_v128_or(ms_vec, ones);  // bin center
        v128_t tvn = ms_vec;
        ms_vec = wasm_i16x8_add(ms_vec, twos);  // + 2
        ms_vec = wasm_i16x8_shl(ms_vec, p - 1); // position at bitplane p
        ms_vec = wasm_v128_or(ms_vec, w0);      // sign
        row = wasm_v128_andnot(ms_vec, insig);  // significant only

        // accumulate magnitudes of the two quads into vn
        ms_vec = wasm_v128_andnot(tvn, insig); // significant only
        w0 = wasm_i8x16_swizzle(ms_vec,
          wasm_i16x8_const(0x0302, 0x0706, -1, -1, -1, -1, -1, -1));
        vn = wasm_v128_or(vn, w0);
        w0 = wasm_i8x16_swizzle(ms_vec,
          wasm_i16x8_const(-1, 0x0B0A, 0x0F0E, -1, -1, -1, -1, -1));
        vn = wasm_v128_or(vn, w0);

        if (total_mn)
          frwd_advance(magsgn, (ui32)total_mn);
      }
      return row;
    }
1032
1033
1034 //************************************************************************/
1051 bool ojph_decode_codeblock_wasm(ui8* coded_data, ui32* decoded_data,
1052 ui32 missing_msbs, ui32 num_passes,
1053 ui32 lengths1, ui32 lengths2,
1054 ui32 width, ui32 height, ui32 stride,
1055 bool stripe_causal)
1056 {
1057 static bool insufficient_precision = false;
1058 static bool modify_code = false;
1059 static bool truncate_spp_mrp = false;
1060
1061 if (num_passes > 1 && lengths2 == 0)
1062 {
1063 OJPH_WARN(0x00010001, "A malformed codeblock that has more than "
1064 "one coding pass, but zero length for "
1065 "2nd and potential 3rd pass.\n");
1066 num_passes = 1;
1067 }
1068
1069 if (num_passes > 3)
1070 {
1071 OJPH_WARN(0x00010002, "We do not support more than 3 coding passes; "
1072 "This codeblocks has %d passes.\n",
1073 num_passes);
1074 return false;
1075 }
1076
1077 if (missing_msbs > 30) // p < 0
1078 {
1079 if (insufficient_precision == false)
1080 {
1081 insufficient_precision = true;
1082 OJPH_WARN(0x00010003, "32 bits are not enough to decode this "
1083 "codeblock. This message will not be "
1084 "displayed again.\n");
1085 }
1086 return false;
1087 }
1088 else if (missing_msbs == 30) // p == 0
1089 { // not enough precision to decode and set the bin center to 1
1090 if (modify_code == false) {
1091 modify_code = true;
1092 OJPH_WARN(0x00010004, "Not enough precision to decode the cleanup "
1093 "pass. The code can be modified to support "
1094 "this case. This message will not be "
1095 "displayed again.\n");
1096 }
1097 return false; // 32 bits are not enough to decode this
1098 }
1099 else if (missing_msbs == 29) // if p is 1, then num_passes must be 1
1100 {
1101 if (num_passes > 1) {
1102 num_passes = 1;
1103 if (truncate_spp_mrp == false) {
1104 truncate_spp_mrp = true;
1105 OJPH_WARN(0x00010005, "Not enough precision to decode the SgnProp "
1106 "nor MagRef passes; both will be skipped. "
1107 "This message will not be displayed "
1108 "again.\n");
1109 }
1110 }
1111 }
1112 ui32 p = 30 - missing_msbs; // The least significant bitplane for CUP
1113 // There is a way to handle the case of p == 0, but a different path
1114 // is required
1115
1116 if (lengths1 < 2)
1117 {
1118 OJPH_WARN(0x00010006, "Wrong codeblock length.\n");
1119 return false;
1120 }
1121
1122 // read scup and fix the bytes there
1123 int lcup, scup;
1124 lcup = (int)lengths1; // length of CUP
1125 //scup is the length of MEL + VLC
1126 scup = (((int)coded_data[lcup-1]) << 4) + (coded_data[lcup-2] & 0xF);
1127 if (scup < 2 || scup > lcup || scup > 4079) //something is wrong
1128 return false;
1129
1130 // The temporary storage scratch holds two types of data in an
1131 // interleaved fashion. The interleaving allows us to use one
1132 // memory pointer.
1133 // We have one entry for a decoded VLC code, and one entry for UVLC.
1134 // Entries are 16 bits each, corresponding to one quad,
1135 // but since we want to use XMM registers of the SSE family
1136 // of SIMD; we allocated 16 bytes or more per quad row; that is,
1137 // the width is no smaller than 16 bytes (or 8 entries), and the
1138 // height is 512 quads
1139 // Each VLC entry contains, in the following order, starting
1140 // from MSB
1141 // e_k (4bits), e_1 (4bits), rho (4bits), useless for step 2 (4bits)
1142 // Each entry in UVLC contains u_q
1143 // One extra row to handle the case of SPP propagating downwards
1144 // when codeblock width is 4
1145 ui16 scratch[8 * 513] = {0}; // 8+ kB
1146
1147 // We need an extra two entries (one inf and one u_q) beyond
1148 // the last column.
1149 // If the block width is 4 (2 quads), then we use sstr of 8
1150 // (enough for 4 quads). If width is 8 (4 quads) we use
1151 // sstr is 16 (enough for 8 quads). For a width of 16 (8
1152 // quads), we use 24 (enough for 12 quads).
1153 ui32 sstr = ((width + 2u) + 7u) & ~7u; // multiples of 8
1154
1155 assert((stride & 0x3) == 0);
1156
1157 ui32 mmsbp2 = missing_msbs + 2;
1158
1159 // The cleanup pass is decoded in two steps; in step one,
1160 // the VLC and MEL segments are decoded, generating a record that
1161 // has 2 bytes per quad. The 2 bytes contain, u, rho, e^1 & e^k.
1162 // This information should be sufficient for the next step.
1163 // In step 2, we decode the MagSgn segment.
1164
1165 // step 1 decoding VLC and MEL segments
1166 {
1167 // init structures
1168 dec_mel_st mel;
1169 mel_init(&mel, coded_data, lcup, scup);
1170 rev_struct vlc;
1171 rev_init(&vlc, coded_data, lcup, scup);
1172
1173 int run = mel_get_run(&mel); // decode runs of events from MEL bitstrm
1174 // data represented as runs of 0 events
1175 // See mel_decode description
1176
1177 ui32 vlc_val;
1178 ui32 c_q = 0;
1179 ui16 *sp = scratch;
1180 //initial quad row
1181 for (ui32 x = 0; x < width; sp += 4)
1182 {
1183 // decode VLC
1185
1186 // first quad
1187 vlc_val = rev_fetch(&vlc);
1188
1189 //decode VLC using the context c_q and the head of VLC bitstream
1190 ui16 t0 = vlc_tbl0[ c_q + (vlc_val & 0x7F) ];
1191
1192 // if context is zero, use one MEL event
1193 if (c_q == 0) //zero context
1194 {
1195 run -= 2; //subtract 2, since the number of events is multiplied by 2
1196
1197 // Is the run terminated in 1? if so, use decoded VLC code,
1198 // otherwise, discard decoded data, since we will decode it again
1199 // using a different context
1200 t0 = (run == -1) ? t0 : 0;
1201
1202 // is run -1 or -2? this means a run has been consumed
1203 if (run < 0)
1204 run = mel_get_run(&mel); // get another run
1205 }
1206 //run -= (c_q == 0) ? 2 : 0;
1207 //t0 = (c_q != 0 || run == -1) ? t0 : 0;
1208 //if (run < 0)
1209 // run = mel_get_run(&mel); // get another run
1210 sp[0] = t0;
1211 x += 2;
1212
1213 // prepare context for the next quad; eqn. 1 in ITU T.814
1214 c_q = ((t0 & 0x10U) << 3) | ((t0 & 0xE0U) << 2);
1215
1216 //remove data from vlc stream (0 bits are removed if vlc is not used)
1217 vlc_val = rev_advance(&vlc, t0 & 0x7);
1218
1219 //second quad
1220 ui16 t1 = 0;
1221
1222 //decode VLC using the context c_q and the head of VLC bitstream
1223 t1 = vlc_tbl0[c_q + (vlc_val & 0x7F)];
1224
1225 // if context is zero, use one MEL event
1226 if (c_q == 0 && x < width) //zero context
1227 {
1228 run -= 2; //subtract 2, since the number of events is multiplied by 2
1229
1230 // if event is 0, discard decoded t1
1231 t1 = (run == -1) ? t1 : 0;
1232
1233 if (run < 0) // have we consumed all events in a run
1234 run = mel_get_run(&mel); // if yes, then get another run
1235 }
1236 t1 = x < width ? t1 : 0;
1237 //run -= (c_q == 0 && x < width) ? 2 : 0;
1238 //t1 = (c_q != 0 || run == -1) ? t1 : 0;
1239 //if (run < 0)
1240 // run = mel_get_run(&mel); // get another run
1241 sp[2] = t1;
1242 x += 2;
1243
1244 //prepare context for the next quad, eqn. 1 in ITU T.814
1245 c_q = ((t1 & 0x10U) << 3) | ((t1 & 0xE0U) << 2);
1246
1247 //remove data from vlc stream, if qinf is not used, cwdlen is 0
1248 vlc_val = rev_advance(&vlc, t1 & 0x7);
1249
1250 // decode u
1252 // uvlc_mode is made up of u_offset bits from the quad pair
1253 ui32 uvlc_mode = ((t0 & 0x8U) << 3) | ((t1 & 0x8U) << 4);
1254 if (uvlc_mode == 0xc0)// if both u_offset are set, get an event from
1255 { // the MEL run of events
1256 run -= 2; //subtract 2, since the number of events is multiplied by 2
1257
1258 uvlc_mode += (run == -1) ? 0x40 : 0; // increment uvlc_mode by 0x40
1259 // if the run terminates in 1
1260
1261 if (run < 0)//if run is consumed (run is -1 or -2), get another run
1262 run = mel_get_run(&mel);
1263 }
1264 //run -= (uvlc_mode == 0xc0) ? 2 : 0;
1265 //uvlc_mode += (uvlc_mode == 0xc0 && run == -1) ? 0x40 : 0;
1266 //if (run < 0)
1267 // run = mel_get_run(&mel); // get another run
1268
1269 //decode uvlc_mode to get u for both quads
1270 ui32 uvlc_entry = uvlc_tbl0[uvlc_mode + (vlc_val & 0x3F)];
1271 //remove total prefix length
1272 vlc_val = rev_advance(&vlc, uvlc_entry & 0x7);
1273 uvlc_entry >>= 3;
1274 //extract suffixes for quad 0 and 1
1275 ui32 len = uvlc_entry & 0xF; //suffix length for 2 quads
1276 ui32 tmp = vlc_val & ((1 << len) - 1); //suffix value for 2 quads
1277 vlc_val = rev_advance(&vlc, len);
1278 uvlc_entry >>= 4;
1279 // quad 0 length
1280 len = uvlc_entry & 0x7; // quad 0 suffix length
1281 uvlc_entry >>= 3;
1282 ui16 u_q = (ui16)(1 + (uvlc_entry&7) + (tmp&~(0xFFU<<len))); //kap. 1
1283 sp[1] = u_q;
1284 u_q = (ui16)(1 + (uvlc_entry >> 3) + (tmp >> len)); //kappa == 1
1285 sp[3] = u_q;
1286 }
1287 sp[0] = sp[1] = 0;
1288
1289 //non initial quad rows
1290 for (ui32 y = 2; y < height; y += 2)
1291 {
1292 c_q = 0; // context
1293 ui16 *sp = scratch + (y >> 1) * sstr; // this row of quads
1294
1295 for (ui32 x = 0; x < width; sp += 4)
1296 {
1297 // decode VLC
1299
1300 // sigma_q (n, ne, nf)
1301 c_q |= ((sp[0 - (si32)sstr] & 0xA0U) << 2);
1302 c_q |= ((sp[2 - (si32)sstr] & 0x20U) << 4);
1303
1304 // first quad
1305 vlc_val = rev_fetch(&vlc);
1306
1307 //decode VLC using the context c_q and the head of VLC bitstream
1308 ui16 t0 = vlc_tbl1[ c_q + (vlc_val & 0x7F) ];
1309
1310 // if context is zero, use one MEL event
1311 if (c_q == 0) //zero context
1312 {
1313 run -= 2; //subtract 2, since events number is multiplied by 2
1314
1315 // Is the run terminated in 1? if so, use decoded VLC code,
1316 // otherwise, discard decoded data, since we will decode it again
1317 // using a different context
1318 t0 = (run == -1) ? t0 : 0;
1319
1320 // is run -1 or -2? this means a run has been consumed
1321 if (run < 0)
1322 run = mel_get_run(&mel); // get another run
1323 }
1324 //run -= (c_q == 0) ? 2 : 0;
1325 //t0 = (c_q != 0 || run == -1) ? t0 : 0;
1326 //if (run < 0)
1327 // run = mel_get_run(&mel); // get another run
1328 sp[0] = t0;
1329 x += 2;
1330
1331 // prepare context for the next quad; eqn. 2 in ITU T.814
1332 // sigma_q (w, sw)
1333 c_q = ((t0 & 0x40U) << 2) | ((t0 & 0x80U) << 1);
1334 // sigma_q (nw)
1335 c_q |= sp[0 - (si32)sstr] & 0x80;
1336 // sigma_q (n, ne, nf)
1337 c_q |= ((sp[2 - (si32)sstr] & 0xA0U) << 2);
1338 c_q |= ((sp[4 - (si32)sstr] & 0x20U) << 4);
1339
1340 //remove data from vlc stream (0 bits are removed if vlc is unused)
1341 vlc_val = rev_advance(&vlc, t0 & 0x7);
1342
1343 //second quad
1344 ui16 t1 = 0;
1345
1346 //decode VLC using the context c_q and the head of VLC bitstream
1347 t1 = vlc_tbl1[ c_q + (vlc_val & 0x7F)];
1348
1349 // if context is zero, use one MEL event
1350 if (c_q == 0 && x < width) //zero context
1351 {
1352 run -= 2; //subtract 2, since the number of events is multiplied by 2
1353
1354 // if event is 0, discard decoded t1
1355 t1 = (run == -1) ? t1 : 0;
1356
1357 if (run < 0) // have we consumed all events in a run
1358 run = mel_get_run(&mel); // if yes, then get another run
1359 }
1360 t1 = x < width ? t1 : 0;
1361 //run -= (c_q == 0 && x < width) ? 2 : 0;
1362 //t1 = (c_q != 0 || run == -1) ? t1 : 0;
1363 //if (run < 0)
1364 // run = mel_get_run(&mel); // get another run
1365 sp[2] = t1;
1366 x += 2;
1367
1368 // partial c_q, will be completed when we process the next quad
1369 // sigma_q (w, sw)
1370 c_q = ((t1 & 0x40U) << 2) | ((t1 & 0x80U) << 1);
1371 // sigma_q (nw)
1372 c_q |= sp[2 - (si32)sstr] & 0x80;
1373
1374 //remove data from vlc stream, if qinf is not used, cwdlen is 0
1375 vlc_val = rev_advance(&vlc, t1 & 0x7);
1376
1377 // decode u
1379 // uvlc_mode is made up of u_offset bits from the quad pair
1380 ui32 uvlc_mode = ((t0 & 0x8U) << 3) | ((t1 & 0x8U) << 4);
1381 ui32 uvlc_entry = uvlc_tbl1[uvlc_mode + (vlc_val & 0x3F)];
1382 //remove total prefix length
1383 vlc_val = rev_advance(&vlc, uvlc_entry & 0x7);
1384 uvlc_entry >>= 3;
1385 //extract suffixes for quad 0 and 1
1386 ui32 len = uvlc_entry & 0xF; //suffix length for 2 quads
1387 ui32 tmp = vlc_val & ((1 << len) - 1); //suffix value for 2 quads
1388 vlc_val = rev_advance(&vlc, len);
1389 uvlc_entry >>= 4;
1390 // quad 0 length
1391 len = uvlc_entry & 0x7; // quad 0 suffix length
1392 uvlc_entry >>= 3;
1393 ui16 u_q = (ui16)((uvlc_entry & 7) + (tmp & ~(0xFU << len))); //u_q
1394 sp[1] = u_q;
1395 u_q = (ui16)((uvlc_entry >> 3) + (tmp >> len)); // u_q
1396 sp[3] = u_q;
1397 }
1398 sp[0] = sp[1] = 0;
1399 }
1400 }
1401
1402 // step2 we decode magsgn
1403 // mmsbp2 equals K_max + 1 (we decode up to K_max bits + 1 sign bit)
1404 // The 32 bit path decodes 16 bit data, for which one would think
1405 // 16 bits are enough, because we want to put in the center of the
1406 // bin.
1407 // If you have mmsbp2 equals 16 bit, and reversible coding, and
1408 // no bitplanes are missing, then we can decode using the 16 bit
1409 // path, but we are not doing this here.
1410 if (mmsbp2 >= 16)
1411 {
1412 // We allocate a scratch row for storing v_n values.
1413 // We have 512 quads horizontally.
1414 // We may go beyond the last entry by up to 4 entries.
1415 // Here we allocate additional 8 entries.
1416 // There are two rows in this structure, the bottom
1417 // row is used to store processed entries.
1418 const int v_n_size = 512 + 8;
1419 ui32 v_n_scratch[2 * v_n_size] = {0}; // 4+ kB
1420
1421 frwd_struct magsgn;
1422 frwd_init<0xFF>(&magsgn, coded_data, lcup - scup);
1423
1424 {
1425 ui16 *sp = scratch;
1426 ui32 *vp = v_n_scratch;
1427 ui32 *dp = decoded_data;
1428 vp[0] = 2; // for easy calculation of emax
1429
1430 for (ui32 x = 0; x < width; x += 4, sp += 4, vp += 2, dp += 4)
1431 {
1432 //here we process two quads
1433 v128_t w0, w1; // workers
1434 v128_t inf_u_q, U_q;
1435 // determine U_q
1436 {
1437 inf_u_q = wasm_v128_load(sp);
1438 U_q = wasm_u32x4_shr(inf_u_q, 16);
1439
1440 w0 = wasm_i32x4_gt(U_q, wasm_u32x4_splat(mmsbp2));
1441 ui32 i = wasm_i8x16_bitmask(w0);
1442 if (i & 0xFF) // only the lower two U_q
1443 return false;
1444 }
1445
1446 v128_t vn = wasm_i32x4_const(OJPH_REPEAT4(2));
1447 v128_t row0 = decode_one_quad32<0>(inf_u_q, U_q, &magsgn, p, vn);
1448 v128_t row1 = decode_one_quad32<1>(inf_u_q, U_q, &magsgn, p, vn);
1449 w0 = wasm_v128_load(vp);
1450 w0 = wasm_v128_and(w0, wasm_i32x4_const(-1,0,0,0));
1451 w0 = wasm_v128_or(w0, vn);
1452 wasm_v128_store(vp, w0);
1453
1454 //interleave in ssse3 style
1455
1456 w0 = wasm_i32x4_shuffle(row0, row1, 0, 4, 1, 5);
1457 w1 = wasm_i32x4_shuffle(row0, row1, 2, 6, 3, 7);
1458 row0 = wasm_i32x4_shuffle(w0, w1, 0, 4, 1, 5);
1459 row1 = wasm_i32x4_shuffle(w0, w1, 2, 6, 3, 7);
1460 wasm_v128_store(dp, row0);
1461 wasm_v128_store(dp + stride, row1);
1462 }
1463 }
1464
1465 for (ui32 y = 2; y < height; y += 2)
1466 {
1467 {
1468 // perform 31 - count_leading_zeros(*vp) here
1469 ui32 *vp = v_n_scratch;
1470 const v128_t lut_lo = wasm_i8x16_const(
1471 31, 7, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4
1472 );
1473 const v128_t lut_hi = wasm_i8x16_const(
1474 31, 3, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0
1475 );
1476 const v128_t nibble_mask = wasm_i8x16_const(OJPH_REPEAT16(0x0F));
1477 const v128_t byte_offset8 = wasm_i16x8_const(OJPH_REPEAT8(8));
1478 const v128_t byte_offset16 = wasm_i16x8_const(OJPH_REPEAT8(16));
1479 const v128_t cc = wasm_i32x4_const(OJPH_REPEAT4(31));
1480 for (ui32 x = 0; x <= width; x += 8, vp += 4)
1481 {
1482 v128_t v, t; // workers
1483 v = wasm_v128_load(vp);
1484
1485 t = wasm_v128_and(nibble_mask, v);
1486 v = wasm_v128_and(wasm_u16x8_shr(v, 4), nibble_mask);
1487 t = wasm_i8x16_swizzle(lut_lo, t);
1488 v = wasm_i8x16_swizzle(lut_hi, v);
1489 v = wasm_u8x16_min(v, t);
1490
1491 t = wasm_u16x8_shr(v, 8);
1492 v = wasm_v128_or(v, byte_offset8);
1493 v = wasm_u8x16_min(v, t);
1494
1495 t = wasm_u32x4_shr(v, 16);
1496 v = wasm_v128_or(v, byte_offset16);
1497 v = wasm_u8x16_min(v, t);
1498
1499 v = wasm_i16x8_sub(cc, v);
1500 wasm_v128_store(vp + v_n_size, v);
1501 }
1502 }
1503
1504 ui32 *vp = v_n_scratch;
1505 ui16 *sp = scratch + (y >> 1) * sstr;
1506 ui32 *dp = decoded_data + y * stride;
1507 vp[0] = 2; // for easy calculation of emax
1508
1509 for (ui32 x = 0; x < width; x += 4, sp += 4, vp += 2, dp += 4)
1510 {
1511 //process two quads
1512 v128_t w0, w1; // workers
1513 v128_t inf_u_q, U_q;
1514 // determine U_q
1515 {
1516 v128_t gamma, emax, kappa, u_q; // needed locally
1517
1518 inf_u_q = wasm_v128_load(sp);
1519 gamma =
1520 wasm_v128_and(inf_u_q, wasm_i32x4_const(OJPH_REPEAT4(0xF0)));
1521 w0 = wasm_i32x4_sub(gamma, wasm_i32x4_const(OJPH_REPEAT4(1)));
1522 gamma = wasm_v128_and(gamma, w0);
1523 gamma = wasm_i32x4_eq(gamma, wasm_i64x2_const(0, 0));
1524
1525 emax = wasm_v128_load(vp + v_n_size);
1526 w0 = wasm_i32x4_shuffle(emax, wasm_i64x2_const(0,0), 1, 2, 3, 4);
1527 emax = wasm_i16x8_max(w0, emax); // no max_epi32 in ssse3
1528 emax = wasm_v128_andnot(emax, gamma);
1529
1530 kappa = wasm_i32x4_const(OJPH_REPEAT4(1));
1531 kappa = wasm_i16x8_max(emax, kappa); // no max_epi32 in ssse3
1532
1533 u_q = wasm_u32x4_shr(inf_u_q, 16);
1534 U_q = wasm_i32x4_add(u_q, kappa);
1535
1536 w0 = wasm_i32x4_gt(U_q, wasm_u32x4_splat(mmsbp2));
1537 ui32 i = wasm_i8x16_bitmask(w0);
1538 if (i & 0xFF) // only the lower two U_q
1539 return false;
1540 }
1541
1542 v128_t vn = wasm_i32x4_const(OJPH_REPEAT4(2));
1543 v128_t row0 = decode_one_quad32<0>(inf_u_q, U_q, &magsgn, p, vn);
1544 v128_t row1 = decode_one_quad32<1>(inf_u_q, U_q, &magsgn, p, vn);
1545 w0 = wasm_v128_load(vp);
1546 w0 = wasm_v128_and(w0, wasm_i32x4_const(-1,0,0,0));
1547 w0 = wasm_v128_or(w0, vn);
1548 wasm_v128_store(vp, w0);
1549
1550 //interleave in ssse3 style
1551 w0 = wasm_i32x4_shuffle(row0, row1, 0, 4, 1, 5);
1552 w1 = wasm_i32x4_shuffle(row0, row1, 2, 6, 3, 7);
1553 row0 = wasm_i32x4_shuffle(w0, w1, 0, 4, 1, 5);
1554 row1 = wasm_i32x4_shuffle(w0, w1, 2, 6, 3, 7);
1555 wasm_v128_store(dp, row0);
1556 wasm_v128_store(dp + stride, row1);
1557 }
1558 }
1559 }
1560 else
1561 {
1562 // reduce bitplane by 16 because we now have 16 bits instead of 32
1563 p -= 16;
1564
1565 // We allocate a scratch row for storing v_n values.
1566 // We have 512 quads horizontally.
1567 // We may go beyond the last entry by up to 8 entries.
1568 // Therefore we allocate additional 8 entries.
1569 // There are two rows in this structure, the bottom
1570 // row is used to store processed entries.
1571 const int v_n_size = 512 + 8;
1572 ui16 v_n_scratch[2 * v_n_size] = {0}; // 2+ kB
1573
1574 frwd_struct magsgn;
1575 frwd_init<0xFF>(&magsgn, coded_data, lcup - scup);
1576
1577 {
1578 ui16 *sp = scratch;
1579 ui16 *vp = v_n_scratch;
1580 ui32 *dp = decoded_data;
1581 vp[0] = 2; // for easy calculation of emax
1582
1583 for (ui32 x = 0; x < width; x += 4, sp += 4, vp += 2, dp += 4)
1584 {
1585 //here we process two quads
1586 v128_t w0, w1; // workers
1587 v128_t inf_u_q, U_q;
1588 // determine U_q
1589 {
1590 inf_u_q = wasm_v128_load(sp);
1591 U_q = wasm_u32x4_shr(inf_u_q, 16);
1592
1593 w0 = wasm_i32x4_gt(U_q, wasm_u32x4_splat(mmsbp2));
1594 ui32 i = wasm_i8x16_bitmask(w0);
1595 if (i & 0xFF) // only the lower two U_q
1596 return false;
1597 }
1598
1599 v128_t vn = wasm_i16x8_const(OJPH_REPEAT8(2));
1600 v128_t row = decode_two_quad16(inf_u_q, U_q, &magsgn, p, vn);
1601 w0 = wasm_v128_load(vp);
1602 w0 = wasm_v128_and(w0, wasm_i16x8_const(-1,0,0,0,0,0,0,0));
1603 w0 = wasm_v128_or(w0, vn);
1604 wasm_v128_store(vp, w0);
1605
1606 //interleave in ssse3 style
1607 w0 = wasm_i8x16_swizzle(row,
1608 wasm_i16x8_const(-1, 0x0100, -1, 0x0504,
1609 -1, 0x0908, -1, 0x0D0C));
1610 wasm_v128_store(dp, w0);
1611 w1 = wasm_i8x16_swizzle(row,
1612 wasm_i16x8_const(-1, 0x0302, -1, 0x0706,
1613 -1, 0x0B0A, -1, 0x0F0E));
1614 wasm_v128_store(dp + stride, w1);
1615 }
1616 }
1617
1618 for (ui32 y = 2; y < height; y += 2)
1619 {
1620 {
1621 // perform 15 - count_leading_zeros(*vp) here
1622 ui16 *vp = v_n_scratch;
1623 const v128_t lut_lo = wasm_i8x16_const(
1624 15, 7, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4
1625 );
1626 const v128_t lut_hi = wasm_i8x16_const(
1627 15, 3, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0
1628 );
1629 const v128_t nibble_mask = wasm_i8x16_const(OJPH_REPEAT16(0x0F));
1630 const v128_t byte_offset8 = wasm_i16x8_const(OJPH_REPEAT8(8));
1631 const v128_t cc = wasm_i16x8_const(OJPH_REPEAT8(15));
1632 for (ui32 x = 0; x <= width; x += 16, vp += 8)
1633 {
1634 v128_t v, t; // workers
1635 v = wasm_v128_load(vp);
1636
1637 t = wasm_v128_and(nibble_mask, v);
1638 v = wasm_v128_and(wasm_u16x8_shr(v, 4), nibble_mask);
1639 t = wasm_i8x16_swizzle(lut_lo, t);
1640 v = wasm_i8x16_swizzle(lut_hi, v);
1641 v = wasm_u8x16_min(v, t);
1642
1643 t = wasm_u16x8_shr(v, 8);
1644 v = wasm_v128_or(v, byte_offset8);
1645 v = wasm_u8x16_min(v, t);
1646
1647 v = wasm_i16x8_sub(cc, v);
1648 wasm_v128_store(vp + v_n_size, v);
1649 }
1650 }
1651
1652 ui16 *vp = v_n_scratch;
1653 ui16 *sp = scratch + (y >> 1) * sstr;
1654 ui32 *dp = decoded_data + y * stride;
1655 vp[0] = 2; // for easy calculation of emax
1656
1657 for (ui32 x = 0; x < width; x += 4, sp += 4, vp += 2, dp += 4)
1658 {
1659 //process two quads
1660 v128_t w0, w1; // workers
1661 v128_t inf_u_q, U_q;
1662 // determine U_q
1663 {
1664 v128_t gamma, emax, kappa, u_q; // needed locally
1665
1666 inf_u_q = wasm_v128_load(sp);
1667 gamma =
1668 wasm_v128_and(inf_u_q, wasm_i32x4_const(OJPH_REPEAT4(0xF0)));
1669 w0 = wasm_i32x4_sub(gamma, wasm_i32x4_const(OJPH_REPEAT4(1)));
1670 gamma = wasm_v128_and(gamma, w0);
1671 gamma = wasm_i32x4_eq(gamma, wasm_i64x2_const(0, 0));
1672
1673 emax = wasm_v128_load(vp + v_n_size);
1674 w0 = wasm_i16x8_shuffle(emax,
1675 wasm_i64x2_const(0, 0), 1, 2, 3, 4, 5, 6, 7, 8);
1676 emax = wasm_i16x8_max(w0, emax); // no max_epi32 in ssse3
1677 emax = wasm_i8x16_swizzle(emax,
1678 wasm_i16x8_const(0x0100, -1, 0x0302, -1,
1679 0x0504, -1, 0x0706, -1));
1680 emax = wasm_v128_andnot(emax, gamma);
1681
1682 kappa = wasm_i32x4_const(OJPH_REPEAT4(1));
1683 kappa = wasm_i16x8_max(emax, kappa); // no max_epi32 in ssse3
1684
1685 u_q = wasm_u32x4_shr(inf_u_q, 16);
1686 U_q = wasm_i32x4_add(u_q, kappa);
1687
1688 w0 = wasm_i32x4_gt(U_q, wasm_u32x4_splat(mmsbp2));
1689 ui32 i = wasm_i8x16_bitmask(w0);
1690 if (i & 0xFF) // only the lower two U_q
1691 return false;
1692 }
1693
1694 v128_t vn = wasm_i16x8_const(OJPH_REPEAT8(2));
1695 v128_t row = decode_two_quad16(inf_u_q, U_q, &magsgn, p, vn);
1696 w0 = wasm_v128_load(vp);
1697 w0 = wasm_v128_and(w0, wasm_i16x8_const(-1,0,0,0,0,0,0,0));
1698 w0 = wasm_v128_or(w0, vn);
1699 wasm_v128_store(vp, w0);
1700
1701 w0 = wasm_i8x16_swizzle(row,
1702 wasm_i16x8_const(-1, 0x0100, -1, 0x0504,
1703 -1, 0x0908, -1, 0x0D0C));
1704 wasm_v128_store(dp, w0);
1705 w1 = wasm_i8x16_swizzle(row,
1706 wasm_i16x8_const(-1, 0x0302, -1, 0x0706,
1707 -1, 0x0B0A, -1, 0x0F0E));
1708 wasm_v128_store(dp + stride, w1);
1709 }
1710 }
1711
1712 // increase bitplane back by 16 because we need to process 32 bits
1713 p += 16;
1714 }
1715
1716 if (num_passes > 1)
1717 {
1718 // We use scratch again, we can divide it into multiple regions
1719 // sigma holds all the significant samples, and it cannot
1720 // be modified after it is set. it will be used during the
1721 // Magnitude Refinement Pass
1722 ui16* const sigma = scratch;
1723
1724 ui32 mstr = (width + 3u) >> 2; // divide by 4, since each
1725 // ui16 contains 4 columns
1726 mstr = ((mstr + 2u) + 7u) & ~7u; // multiples of 8
1727
1728 // We re-arrange quad significance, where each 4 consecutive
1729 // bits represent one quad, into column significance, where,
1730 // each 4 consecutive bits represent one column of 4 rows
1731 {
1732 ui32 y;
1733
1734 const v128_t mask_3 = wasm_i32x4_const(OJPH_REPEAT4(0x30));
1735 const v128_t mask_C = wasm_i32x4_const(OJPH_REPEAT4(0xC0));
1736 const v128_t shuffle_mask = wasm_i32x4_const(0x0C080400,-1,-1,-1);
1737 for (y = 0; y < height; y += 4)
1738 {
1739 ui16* sp = scratch + (y >> 1) * sstr;
1740 ui16* dp = sigma + (y >> 2) * mstr;
1741 for (ui32 x = 0; x < width; x += 8, sp += 8, dp += 2)
1742 {
1743 v128_t s0, s1, u3, uC, t0, t1;
1744
1745 s0 = wasm_v128_load(sp);
1746 u3 = wasm_v128_and(s0, mask_3);
1747 u3 = wasm_u32x4_shr(u3, 4);
1748 uC = wasm_v128_and(s0, mask_C);
1749 uC = wasm_u32x4_shr(uC, 2);
1750 t0 = wasm_v128_or(u3, uC);
1751
1752 s1 = wasm_v128_load(sp + sstr);
1753 u3 = wasm_v128_and(s1, mask_3);
1754 u3 = wasm_u32x4_shr(u3, 2);
1755 uC = wasm_v128_and(s1, mask_C);
1756 t1 = wasm_v128_or(u3, uC);
1757
1758 v128_t r = wasm_v128_or(t0, t1);
1759 r = wasm_i8x16_swizzle(r, shuffle_mask);
1760
1761 wasm_v128_store32_lane(dp, r, 0);
1762 }
1763 dp[0] = 0; // set an extra entry on the right with 0
1764 }
1765 {
1766 // reset one row after the codeblock
1767 ui16* dp = sigma + (y >> 2) * mstr;
1768 v128_t zero = wasm_i64x2_const(0, 0);
1769 for (ui32 x = 0; x < width; x += 32, dp += 8)
1770 wasm_v128_store(dp, zero);
1771 dp[0] = 0; // set an extra entry on the right with 0
1772 }
1773 }
1774
1775 // We perform Significance Propagation Pass here
1776 {
1777 // This stores significance information of the previous
1778 // 4 rows. Significance information in this array includes
1779 // all significant samples in bitplane p - 1; that is,
1780 // significant samples for bitplane p (discovered during the
1781 // cleanup pass and stored in sigma) and samples that have recently
1782 // become significant (during the SPP) in bitplane p-1.
1783 // We store enough for the widest row, containing 1024 columns,
1784 // which is equivalent to 256 of ui16, since each stores 4 columns.
1785 // We add an extra 8 entries, just in case we need more
1786 ui16 prev_row_sig[256 + 8] = {0}; // 528 Bytes
1787
1788 frwd_struct sigprop;
1789 frwd_init<0>(&sigprop, coded_data + lengths1, (int)lengths2);
1790
1791 for (ui32 y = 0; y < height; y += 4)
1792 {
1793 ui32 pattern = 0xFFFFu; // a pattern of needed samples
1794 if (height - y < 4) {
1795 pattern = 0x7777u;
1796 if (height - y < 3) {
1797 pattern = 0x3333u;
1798 if (height - y < 2)
1799 pattern = 0x1111u;
1800 }
1801 }
1802
1803 // prev holds sign. info. for the previous quad, together
1804 // with the rows on top of it and below it.
1805 ui32 prev = 0;
1806 ui16 *prev_sig = prev_row_sig;
1807 ui16 *cur_sig = sigma + (y >> 2) * mstr;
1808 ui32 *dpp = decoded_data + y * stride;
1809 for (ui32 x = 0; x < width; x += 4, dpp += 4, ++cur_sig, ++prev_sig)
1810 {
1811 // only rows and columns inside the stripe are included
1812 si32 s = (si32)x + 4 - (si32)width;
1813 s = ojph_max(s, 0);
1814 pattern = pattern >> (s * 4);
1815
1816 // We first find locations that need to be tested (potential
1817 // SPP members); these location will end up in mbr
1818 // In each iteration, we produce 16 bits because cwd can have
1819 // up to 16 bits of significance information, followed by the
1820 // corresponding 16 bits of sign information; therefore, it is
1821 // sufficient to fetch 32 bit data per loop.
1822
1823 // Although we are interested in 16 bits only, we load 32 bits.
1824 // For the 16 bits we are producing, we need the next 4 bits --
1825 // We need data for at least 5 columns out of 8.
1826 // Therefore loading 32 bits is easier than loading 16 bits
1827 // twice.
1828 ui32 ps = *(ui32*)prev_sig;
1829 ui32 ns = *(ui32*)(cur_sig + mstr);
1830 ui32 u = (ps & 0x88888888) >> 3; // the row on top
1831 if (!stripe_causal)
1832 u |= (ns & 0x11111111) << 3; // the row below
1833
1834 ui32 cs = *(ui32*)cur_sig;
1835 // vertical integration
1836 ui32 mbr = cs; // this sig. info.
1837 mbr |= (cs & 0x77777777) << 1; //above neighbors
1838 mbr |= (cs & 0xEEEEEEEE) >> 1; //below neighbors
1839 mbr |= u;
1840 // horizontal integration
1841 ui32 t = mbr;
1842 mbr |= t << 4; // neighbors on the left
1843 mbr |= t >> 4; // neighbors on the right
1844 mbr |= prev >> 12; // significance of previous group
1845
1846 // remove outside samples, and already significant samples
1847 mbr &= pattern;
1848 mbr &= ~cs;
1849
1850 // find samples that become significant during the SPP
1851 ui32 new_sig = mbr;
1852 if (new_sig)
1853 {
1854 v128_t cwd_vec = frwd_fetch<0>(&sigprop);
1855 ui32 cwd = wasm_u32x4_extract_lane(cwd_vec, 0);
1856
1857 ui32 cnt = 0;
1858 ui32 col_mask = 0xFu;
1859 ui32 inv_sig = ~cs & pattern;
1860 for (int i = 0; i < 16; i += 4, col_mask <<= 4)
1861 {
1862 if ((col_mask & new_sig) == 0)
1863 continue;
1864
1865 //scan one column
1866 ui32 sample_mask = 0x1111u & col_mask;
1867 if (new_sig & sample_mask)
1868 {
1869 new_sig &= ~sample_mask;
1870 if (cwd & 1)
1871 {
1872 ui32 t = 0x33u << i;
1873 new_sig |= t & inv_sig;
1874 }
1875 cwd >>= 1; ++cnt;
1876 }
1877
1878 sample_mask <<= 1;
1879 if (new_sig & sample_mask)
1880 {
1881 new_sig &= ~sample_mask;
1882 if (cwd & 1)
1883 {
1884 ui32 t = 0x76u << i;
1885 new_sig |= t & inv_sig;
1886 }
1887 cwd >>= 1; ++cnt;
1888 }
1889
1890 sample_mask <<= 1;
1891 if (new_sig & sample_mask)
1892 {
1893 new_sig &= ~sample_mask;
1894 if (cwd & 1)
1895 {
1896 ui32 t = 0xECu << i;
1897 new_sig |= t & inv_sig;
1898 }
1899 cwd >>= 1; ++cnt;
1900 }
1901
1902 sample_mask <<= 1;
1903 if (new_sig & sample_mask)
1904 {
1905 new_sig &= ~sample_mask;
1906 if (cwd & 1)
1907 {
1908 ui32 t = 0xC8u << i;
1909 new_sig |= t & inv_sig;
1910 }
1911 cwd >>= 1; ++cnt;
1912 }
1913 }
1914
1915 if (new_sig)
1916 {
1917 // Spread new_sig, such that each bit is in one byte with a
1918 // value of 0 if new_sig bit is 0, and 0xFF if new_sig is 1
1919 v128_t new_sig_vec = wasm_i16x8_splat((si16)new_sig);
1920 new_sig_vec = wasm_i8x16_swizzle(new_sig_vec,
1921 wasm_i8x16_const(0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1));
1922 new_sig_vec = wasm_v128_and(new_sig_vec,
1923 wasm_u64x2_const(OJPH_REPEAT2(0x8040201008040201)));
1924 new_sig_vec = wasm_i8x16_eq(new_sig_vec,
1925 wasm_u64x2_const(OJPH_REPEAT2(0x8040201008040201)));
1926
1927 // find cumulative sums
1928 // to find which bit in cwd we should extract
1929 v128_t ex_sum, shfl, inc_sum = new_sig_vec; // inclusive scan
1930 inc_sum = wasm_i8x16_abs(inc_sum); // cvrt to 0 or 1
1931 shfl = wasm_i8x16_shuffle(wasm_i64x2_const(0,0), inc_sum,
1932 15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30);
1933 inc_sum = wasm_i8x16_add(inc_sum, shfl);
1934 shfl = wasm_i16x8_shuffle(wasm_i64x2_const(0,0), inc_sum,
1935 7, 8, 9, 10, 11, 12, 13, 14);
1936 inc_sum = wasm_i8x16_add(inc_sum, shfl);
1937 shfl = wasm_i32x4_shuffle(wasm_i64x2_const(0,0), inc_sum,
1938 3, 4, 5, 6);
1939 inc_sum = wasm_i8x16_add(inc_sum, shfl);
1940 shfl = wasm_i64x2_shuffle(wasm_i64x2_const(0,0), inc_sum,
1941 1, 2);
1942 inc_sum = wasm_i8x16_add(inc_sum, shfl);
1943 cnt += wasm_u8x16_extract_lane(inc_sum, 15);
1944 // exclusive scan
1945 ex_sum = wasm_i8x16_shuffle(wasm_i64x2_const(0,0), inc_sum,
1946 15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30);
1947
1948 // Spread cwd, such that each bit is in one byte
1949 // with a value of 0 or 1.
1950 cwd_vec = wasm_i16x8_splat((si16)cwd);
1951 cwd_vec = wasm_i8x16_swizzle(cwd_vec,
1952 wasm_i8x16_const(0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1));
1953 cwd_vec = wasm_v128_and(cwd_vec,
1954 wasm_u64x2_const(OJPH_REPEAT2(0x8040201008040201)));
1955 cwd_vec = wasm_i8x16_eq(cwd_vec,
1956 wasm_u64x2_const(OJPH_REPEAT2(0x8040201008040201)));
1957 cwd_vec = wasm_i8x16_abs(cwd_vec);
1958
1959 // Obtain bit from cwd_vec corresponding to ex_sum
1960 // Basically, collect needed bits from cwd_vec
1961 v128_t v = wasm_i8x16_swizzle(cwd_vec, ex_sum);
1962
1963 // load data and set spp coefficients
1964 v128_t m = wasm_i8x16_const(
1965 0,-1,-1,-1,4,-1,-1,-1,8,-1,-1,-1,12,-1,-1,-1);
1966 v128_t val = wasm_i32x4_splat(3 << (p - 2));
1967 ui32 *dp = dpp;
1968 for (int c = 0; c < 4; ++ c) {
1969 v128_t s0, s0_ns, s0_val;
1970 // load coefficients
1971 s0 = wasm_v128_load(dp);
1972
1973 // epi32 is -1 only for coefficient that
1974 // are changed during the SPP
1975 s0_ns = wasm_i8x16_swizzle(new_sig_vec, m);
1976 s0_ns = wasm_i32x4_eq(s0_ns,
1977 wasm_i32x4_const(OJPH_REPEAT4(0xFF)));
1978
1979 // obtain sign for coefficients in SPP
1980 s0_val = wasm_i8x16_swizzle(v, m);
1981 s0_val = wasm_i32x4_shl(s0_val, 31);
1982 s0_val = wasm_v128_or(s0_val, val);
1983 s0_val = wasm_v128_and(s0_val, s0_ns);
1984
1985 // update vector
1986 s0 = wasm_v128_or(s0, s0_val);
1987 // store coefficients
1988 wasm_v128_store(dp, s0);
1989 // prepare for next row
1990 dp += stride;
1991 m = wasm_i32x4_add(m, wasm_i32x4_const(OJPH_REPEAT4(1)));
1992 }
1993 }
1994 frwd_advance(&sigprop, cnt);
1995 }
1996
1997 new_sig |= cs;
1998 *prev_sig = (ui16)(new_sig);
1999
2000 // vertical integration for the new sig. info.
2001 t = new_sig;
2002 new_sig |= (t & 0x7777) << 1; //above neighbors
2003 new_sig |= (t & 0xEEEE) >> 1; //below neighbors
2004 // add sig. info. from the row on top and below
2005 prev = new_sig | u;
2006 // we need only the bits in 0xF000
2007 prev &= 0xF000;
2008 }
2009 }
2010 }
2011
2012 // We perform Magnitude Refinement Pass here
2013 if (num_passes > 2)
2014 {
2015 rev_struct magref;
2016 rev_init_mrp(&magref, coded_data, (int)lengths1, (int)lengths2);
2017
2018 for (ui32 y = 0; y < height; y += 4)
2019 {
2020 ui16 *cur_sig = sigma + (y >> 2) * mstr;
2021 ui32 *dpp = decoded_data + y * stride;
2022 for (ui32 i = 0; i < width; i += 4, dpp += 4)
2023 {
2024 //Process one entry from sigma array at a time
2025 // Each nibble (4 bits) in the sigma array represents 4 rows,
2026 ui32 cwd = rev_fetch_mrp(&magref); // get 32 bit data
2027 ui16 sig = *cur_sig++; // 16 bit that will be processed now
2028 int total_bits = 0;
2029 if (sig) // if any of the 32 bits are set
2030 {
2031 // We work on 4 rows, with 4 samples each, since
2032 // data is 32 bit (4 bytes)
2033
2034 // spread the 16 bits in sig to 0 or 1 bytes in sig_vec
2035 v128_t sig_vec = wasm_i16x8_splat((si16)sig);
2036 sig_vec = wasm_i8x16_swizzle(sig_vec,
2037 wasm_i8x16_const(0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1));
2038 sig_vec = wasm_v128_and(sig_vec,
2039 wasm_u64x2_const(OJPH_REPEAT2(0x8040201008040201)));
2040 sig_vec = wasm_i8x16_eq(sig_vec,
2041 wasm_u64x2_const(OJPH_REPEAT2(0x8040201008040201)));
2042 sig_vec = wasm_i8x16_abs(sig_vec);
2043
2044 // find cumulative sums
2045 // to find which bit in cwd we should extract
2046 v128_t ex_sum, shfl, inc_sum = sig_vec; // inclusive scan
2047 shfl = wasm_i8x16_shuffle(wasm_i64x2_const(0,0), inc_sum,
2048 15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30);
2049 inc_sum = wasm_i8x16_add(inc_sum, shfl);
2050 shfl = wasm_i16x8_shuffle(wasm_i64x2_const(0,0), inc_sum,
2051 7, 8, 9, 10, 11, 12, 13, 14);
2052 inc_sum = wasm_i8x16_add(inc_sum, shfl);
2053 shfl = wasm_i32x4_shuffle(wasm_i64x2_const(0,0), inc_sum,
2054 3, 4, 5, 6);
2055 inc_sum = wasm_i8x16_add(inc_sum, shfl);
2056 shfl = wasm_i64x2_shuffle(wasm_i64x2_const(0,0), inc_sum,
2057 1, 2);
2058 inc_sum = wasm_i8x16_add(inc_sum, shfl);
2059 total_bits = wasm_u8x16_extract_lane(inc_sum, 15);
2060 // exclusive scan
2061 ex_sum = wasm_i8x16_shuffle(wasm_i64x2_const(0,0), inc_sum,
2062 15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30);
2063
2064 // Spread the 16 bits in cwd to inverted 0 or 1 bytes in
2065 // cwd_vec. Then, convert these to a form suitable
2066 // for coefficient modifications; in particular, a value
2067 // of 0 is presented as binary 11, and a value of 1 is
2068 // represented as binary 01
2069 v128_t cwd_vec = wasm_i16x8_splat((si16)cwd);
2070 cwd_vec = wasm_i8x16_swizzle(cwd_vec,
2071 wasm_i8x16_const(0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1));
2072 cwd_vec = wasm_v128_and(cwd_vec,
2073 wasm_u64x2_const(OJPH_REPEAT2(0x8040201008040201)));
2074 cwd_vec = wasm_i8x16_eq(cwd_vec,
2075 wasm_u64x2_const(OJPH_REPEAT2(0x8040201008040201)));
2076 cwd_vec =
2077 wasm_i8x16_add(cwd_vec, wasm_i8x16_const(OJPH_REPEAT16(1)));
2078 cwd_vec = wasm_i8x16_add(cwd_vec, cwd_vec);
2079 cwd_vec =
2080 wasm_v128_or(cwd_vec, wasm_i8x16_const(OJPH_REPEAT16(1)));
2081
2082 // load data and insert the mrp bit
2083 v128_t m = wasm_i8x16_const(0,-1,-1,-1,4,-1,-1,-1,
2084 8,-1,-1,-1,12,-1,-1,-1);
2085 ui32 *dp = dpp;
2086 for (int c = 0; c < 4; ++c) {
2087 v128_t s0, s0_sig, s0_idx, s0_val;
2088 // load coefficients
2089 s0 = wasm_v128_load(dp);
2090 // find significant samples in this row
2091 s0_sig = wasm_i8x16_swizzle(sig_vec, m);
2092 s0_sig = wasm_i8x16_eq(s0_sig, wasm_i64x2_const(0, 0));
2093 // get MRP bit index, and MRP pattern
2094 s0_idx = wasm_i8x16_swizzle(ex_sum, m);
2095 s0_val = wasm_i8x16_swizzle(cwd_vec, s0_idx);
2096 // keep data from significant samples only
2097 s0_val = wasm_v128_andnot(s0_val, s0_sig);
2098 // move mrp bits to correct position, and employ
2099 s0_val = wasm_i32x4_shl(s0_val, p - 2);
2100 s0 = wasm_v128_xor(s0, s0_val);
2101 // store coefficients
2102 wasm_v128_store(dp, s0);
2103 // prepare for next row
2104 dp += stride;
2105 m = wasm_i32x4_add(m, wasm_i32x4_const(OJPH_REPEAT4(1)));
2106 }
2107 }
2108 // consume data according to the number of bits set
2109 rev_advance_mrp(&magref, (ui32)total_bits);
2110 }
2111 }
2112 }
2113 }
2114
2115 return true;
2116 }
2117 }
2118}
ui16 uvlc_tbl0[256+64]
uvlc_tbl0 contains decoding information for initial row of quads
ui16 uvlc_tbl1[256]
uvlc_tbl1 contains decoding information for non-initial row of quads
ui16 vlc_tbl0[1024]
vlc_tbl0 contains decoding information for initial row of quads
ui16 vlc_tbl1[1024]
vlc_tbl1 contains decoding information for non-initial row of quads
bool ojph_decode_codeblock_wasm(ui8 *coded_data, ui32 *decoded_data, ui32 missing_msbs, ui32 num_passes, ui32 lengths1, ui32 lengths2, ui32 width, ui32 height, ui32 stride, bool stripe_causal)
Decodes one codeblock, processing the cleanup, significance propagation, and magnitude refinement pa...
static ui32 rev_fetch(rev_struct *vlcp)
Retrieves 32 bits from the head of a rev_struct structure.
static void rev_init_mrp(rev_struct *mrp, ui8 *data, int lcup, int len2)
Initializes the rev_struct structure for the MRP segment, and reads a number of bytes such that the next 32 b...
static void mel_read(dec_mel_st *melp)
Reads and unstuffs the MEL bitstream.
static void frwd_advance(frwd_struct *msp, ui32 num_bits)
Consume num_bits bits from the bitstream of frwd_struct.
static __m128i decode_two_quad16(const __m128i inf_u_q, __m128i U_q, frwd_struct *magsgn, ui32 p, __m128i &vn)
decodes two consecutive quads (one octet), using 16 bit data
static void rev_read_mrp(rev_struct *mrp)
Reads and unstuffs from rev_struct.
static ui32 rev_fetch_mrp(rev_struct *mrp)
Retrieves 32 bits from the head of a rev_struct structure.
static ui32 rev_advance_mrp(rev_struct *mrp, ui32 num_bits)
Consumes num_bits from a rev_struct structure.
static void rev_read(rev_struct *vlcp)
Read and unstuff data from a backwardly-growing segment.
static int mel_get_run(dec_mel_st *melp)
Retrieves one run from dec_mel_st; if there are no runs stored MEL segment is decoded.
static void rev_init(rev_struct *vlcp, ui8 *data, int lcup, int scup)
Initiates the rev_struct structure and reads a few bytes to move the read address to multiple of 4.
static void mel_init(dec_mel_st *melp, ui8 *bbuf, int lcup, int scup)
Initiates a dec_mel_st structure for MEL decoding and reads some bytes in order to get the read addre...
static ui32 rev_advance(rev_struct *vlcp, ui32 num_bits)
Consumes num_bits from a rev_struct structure.
static void frwd_read(frwd_struct *msp)
Read and unstuffs 32 bits from forward-growing bitstream.
static ui32 frwd_fetch(frwd_struct *msp)
Fetches 32 bits from the frwd_struct bitstream.
static void frwd_init(frwd_struct *msp, const ui8 *data, int size)
Initialize frwd_struct struct and reads some bytes.
static __m128i decode_one_quad32(const __m128i inf_u_q, __m128i U_q, frwd_struct *magsgn, ui32 p, __m128i &vn)
decodes one quad, using 32 bit data
static void mel_decode(dec_mel_st *melp)
Decodes unstuffed MEL segment bits stored in tmp to runs.
uint64_t ui64
Definition: ojph_defs.h:56
uint16_t ui16
Definition: ojph_defs.h:52
static ui32 count_leading_zeros(ui32 val)
Definition: ojph_arch.h:130
int32_t si32
Definition: ojph_defs.h:55
int16_t si16
Definition: ojph_defs.h:53
uint32_t ui32
Definition: ojph_defs.h:54
uint8_t ui8
Definition: ojph_defs.h:50
#define OJPH_REPEAT2(a)
Macros that help with typing and space.
#define OJPH_REPEAT4(a)
#define OJPH_REPEAT16(a)
#define OJPH_REPEAT8(a)
#define ojph_max(a, b)
Definition: ojph_defs.h:73
#define OJPH_WARN(t,...)
Definition: ojph_message.h:128
MEL state structure for reading and decoding the MEL bitstream.
bool unstuff
true if the next bit needs to be unstuffed
int num_runs
number of decoded runs left in runs (maximum 8)
int size
number of bytes in MEL code
ui8 * data
the address of data (or bitstream)
int k
state of MEL decoder
int bits
number of bits stored in tmp
ui64 tmp
temporary buffer for read data
ui64 runs
runs of decoded MEL codewords (7 bits/run)
State structure for reading and unstuffing of forward-growing bitstreams; these are: MagSgn and SPP b...
const ui8 * data
pointer to bitstream
ui32 bits
number of bits stored in tmp
ui64 tmp
temporary buffer of read data
ui32 unstuff
1 if a bit needs to be unstuffed from next byte
A structure for reading and unstuffing a segment that grows backward, such as VLC and MRP.
ui32 bits
number of bits stored in tmp
int size
number of bytes left
ui8 * data
pointer to where to read data
ui64 tmp
temporary buffer of read data