#ifdef OJPH_COMPILER_MSVC
  #define likely(x) (x)
  #define unlikely(x) (x)
#else
  #define likely(x) __builtin_expect((x), 1)
  #define unlikely(x) __builtin_expect((x), 0)
#endif
// VLC table generation: each source entry gives, for a quad context c_q and
// significance pattern rho, the u_off flag, the EMB patterns (e_k, e_1), and
// the VLC codeword cwd with its length cwd_len.
struct vlc_src_table { int c_q, rho, u_off, e_k, e_1, cwd, cwd_len; };

vlc_src_table tbl0[] = {
#include "table0.h"
};
size_t tbl0_size = sizeof(tbl0) / sizeof(vlc_src_table);

si32 pattern_popcnt[16];
for (ui32 i = 0; i < 16; ++i)
  pattern_popcnt[i] = (si32)population_count(i);

vlc_src_table* src_tbl = tbl0;
auto *tgt_tbl = vlc_tbl0;
size_t tbl_size = tbl0_size;
for (int i = 0; i < 2048; ++i)
{
  int c_q = i >> 8, rho = (i >> 4) & 0xF, emb = i & 0xF;
  if (((emb & rho) != emb) || (rho == 0 && c_q == 0))
    continue;  // invalid combination for this table

  vlc_src_table *best_entry = NULL;
  if (emb)  // u value is signalled: pick the matching entry whose EMB
  {         // pattern e_k has the most bits set
    int best_e_k = -1;
    for (size_t j = 0; j < tbl_size; ++j)
    {
      if (src_tbl[j].c_q == c_q && src_tbl[j].rho == rho)
        if (src_tbl[j].u_off == 1)
          if ((emb & src_tbl[j].e_k) == src_tbl[j].e_1)
          {
            int ones_count = pattern_popcnt[src_tbl[j].e_k];
            if (ones_count >= best_e_k)
            {
              best_entry = src_tbl + j;
              best_e_k = ones_count;
            }
          }
    }
  }
  else      // u_off == 0: any entry with matching c_q and rho
  {
    for (size_t j = 0; j < tbl_size; ++j)
    {
      if (src_tbl[j].c_q == c_q && src_tbl[j].rho == rho)
        if (src_tbl[j].u_off == 0)
        {
          best_entry = src_tbl + j;
          break;
        }
    }
  }
  tgt_tbl[i] = (ui16)((best_entry->cwd<<8) + (best_entry->cwd_len<<4)
    + ...);  // low-order bits of the packed entry are not shown in the listing
}
vlc_src_table tbl1[] = {
#include "table1.h"
};
size_t tbl1_size = sizeof(tbl1) / sizeof(vlc_src_table);

src_tbl = tbl1;
tgt_tbl = vlc_tbl1;
tbl_size = tbl1_size;
for (int i = 0; i < 2048; ++i)
{
  int c_q = i >> 8, rho = (i >> 4) & 0xF, emb = i & 0xF;
  if (((emb & rho) != emb) || (rho == 0 && c_q == 0))
    continue;  // invalid combination for this table

  vlc_src_table *best_entry = NULL;
  if (emb)  // same selection rule as for tbl0 above
  {
    int best_e_k = -1;
    for (size_t j = 0; j < tbl_size; ++j)
    {
      if (src_tbl[j].c_q == c_q && src_tbl[j].rho == rho)
        if (src_tbl[j].u_off == 1)
          if ((emb & src_tbl[j].e_k) == src_tbl[j].e_1)
          {
            int ones_count = pattern_popcnt[src_tbl[j].e_k];
            if (ones_count >= best_e_k)
            {
              best_entry = src_tbl + j;
              best_e_k = ones_count;
            }
          }
    }
  }
  else
  {
    for (size_t j = 0; j < tbl_size; ++j)
    {
      if (src_tbl[j].c_q == c_q && src_tbl[j].rho == rho)
        if (src_tbl[j].u_off == 0)
        {
          best_entry = src_tbl + j;
          break;
        }
    }
  }
  tgt_tbl[i] = (ui16)((best_entry->cwd<<8) + (best_entry->cwd_len<<4)
    + ...);  // low-order bits of the packed entry are not shown in the listing
}
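The tables built above are consumed with an index that packs the quad context, significance pattern and EMB epsilon bits, and each entry keeps the VLC codeword above its length nibble. Below is a minimal stand-alone sketch of that packing on plain integers; the helper names are illustrative assumptions, and the entry's low nibble (elided in the listing above) is ignored here.

#include <cassert>
#include <cstdint>

// Hypothetical helpers mirroring the index/entry layout used above.
static uint32_t vlc_index(uint32_t c_q, uint32_t rho, uint32_t eps)
{
  // 3-bit context, 4-bit significance pattern, 4-bit epsilon pattern
  return (c_q << 8) + (rho << 4) + eps;         // 0 .. 2047
}

static uint32_t entry_cwd(uint16_t entry)     { return entry >> 8; }       // VLC codeword
static uint32_t entry_cwd_len(uint16_t entry) { return (entry >> 4) & 7; } // codeword length

int main()
{
  uint16_t entry = (uint16_t)((0x2Du << 8) + (6u << 4)); // example packed entry
  assert(vlc_index(7, 0xF, 0x5) == 2037u);
  assert(entry_cwd(entry) == 0x2D && entry_cwd_len(entry) == 6);
  return 0;
}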
// UVLC code construction (indices 5 .. 32); the loop body is not shown
for (int i = 5; i < 33; ++i)
// mel_emit_bit(): append one bit to the MEL output byte being assembled
melp->tmp = (melp->tmp << 1) + v;
// ...

// mel_encode(): MEL run lengths grow with the state k according to mel_exp
static const int mel_exp[13] = {0,0,0,1,1,1,2,2,2,3,3,4,5};
// ...
int t = mel_exp[melp->k];
// ...
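mel_exp drives the adaptive run-length behaviour of the MEL coder: in state k, up to 1 << mel_exp[k] zero events are absorbed before a single 1 bit is emitted. The sketch below is a stand-alone illustration of that state machine, assuming the usual HTJ2K MEL update rules; it collects bits in a vector instead of going through mel_emit_bit() and the byte handling above.

#include <algorithm>
#include <cstdio>
#include <vector>

// Minimal sketch of MEL-style adaptive run-length coding (illustrative only).
struct mel_sketch {
  int k = 0, run = 0;
  std::vector<int> bits;

  void encode(bool event)
  {
    static const int exp_tbl[13] = {0,0,0,1,1,1,2,2,2,3,3,4,5};
    int t = exp_tbl[k];            // current run-length exponent
    if (!event) {                  // extend the run of "0" events
      if (++run == (1 << t)) {     // a full run: emit a single 1 bit
        bits.push_back(1);
        run = 0;
        k = std::min(k + 1, 12);   // adapt toward longer runs
      }
    } else {                       // a "1" event terminates the run
      bits.push_back(0);           // 0 bit, then the run length in t bits
      for (int i = t - 1; i >= 0; --i)
        bits.push_back((run >> i) & 1);
      run = 0;
      k = std::max(k - 1, 0);      // adapt toward shorter runs
    }
  }
};

int main()
{
  mel_sketch m;
  bool events[] = { false, false, false, true, false, true };
  for (bool e : events) m.encode(e);
  for (int b : m.bits) std::printf("%d", b);
  std::printf("\n");
  return 0;
}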
// vlc_init(): the VLC stream is written backwards from the end of its buffer
vlcp->buf = data + buffer_size - 1;
// ...
vlcp->buf_size = buffer_size;
// ...
vlcp->last_greater_than_8F = true;
// vlc_encode(): add cwd_len bits to the bit reservoir, then emit bytes,
// applying bit-stuffing after any byte greater than 0x8F
vlcp->tmp |= (ui64)cwd << vlcp->used_bits;
vlcp->used_bits += cwd_len;

while (vlcp->used_bits >= 8) {
  int tmp;
  if (unlikely(vlcp->last_greater_than_8F)) {
    tmp = vlcp->tmp & 0x7F;
    if (likely(tmp != 0x7F)) {
      tmp = vlcp->tmp & 0xFF;
      *(vlcp->buf - vlcp->pos) = (ui8)tmp;
      vlcp->last_greater_than_8F = tmp > 0x8F;
      vlcp->tmp >>= 8;
      vlcp->used_bits -= 8;
    }
    else {
      // stuffed byte: only 7 bits are consumed
      *(vlcp->buf - vlcp->pos) = (ui8)tmp;
      vlcp->last_greater_than_8F = false;
      vlcp->tmp >>= 7;
      vlcp->used_bits -= 7;
    }
  }
  else {
    tmp = vlcp->tmp & 0xFF;
    *(vlcp->buf - vlcp->pos) = (ui8)tmp;
    vlcp->last_greater_than_8F = tmp > 0x8F;
    vlcp->tmp >>= 8;
    vlcp->used_bits -= 8;
  }
  vlcp->pos++;
}
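The loop above enforces the VLC bit-stuffing rule: after a byte greater than 0x8F, seven candidate bits equal to 0x7F force a 7-bit byte. The scalar emitter below mirrors that rule under the assumption that it behaves exactly as the loop shown; it is an illustration, not the library's vlc_struct API, and it appends bytes forward rather than backwards.

#include <cstdint>
#include <vector>

// Illustrative scalar mirror of the byte-emission rule in vlc_encode().
struct vlc_bits_sketch {
  uint64_t tmp = 0;
  int used_bits = 0;
  bool last_greater_than_8F = false;
  std::vector<uint8_t> out;          // the real coder grows backwards

  void push(uint32_t cwd, int cwd_len)
  {
    tmp |= (uint64_t)cwd << used_bits;
    used_bits += cwd_len;
    while (used_bits >= 8) {
      uint8_t b;
      if (last_greater_than_8F && (tmp & 0x7F) == 0x7F) {
        b = 0x7F;                    // emit only 7 bits (bit-stuffing)
        tmp >>= 7;  used_bits -= 7;
        last_greater_than_8F = false;
      } else {
        b = (uint8_t)(tmp & 0xFF);   // normal 8-bit byte
        tmp >>= 8;  used_bits -= 8;
        last_greater_than_8F = b > 0x8F;
      }
      out.push_back(b);
    }
  }
};

int main()
{
  vlc_bits_sketch v;
  v.push(0x97, 8);   // 0x97 > 0x8F, so the next byte is stuff-checked
  v.push(0x7F, 8);   // low 7 bits are 0x7F -> only 7 bits are consumed
  return (v.out[0] == 0x97 && v.out[1] == 0x7F) ? 0 : 1;
}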
// terminate_mel_vlc(): flush both coders; when possible, the last MEL byte
// and the last-written VLC byte are fused into a single byte
// (mel_mask is derived from the MEL flush state in lines not shown)
int vlc_mask = 0xFF >> (8 - vlcp->used_bits);
if ((mel_mask | vlc_mask) == 0)
  return;
if (melp->pos >= melp->buf_size)
  OJPH_ERROR(0x00020003, "mel encoder's buffer is full");
ui8 vlcp_tmp = (ui8)vlcp->tmp;
int fuse = melp->tmp | vlcp_tmp;
if ( ( ((fuse ^ melp->tmp) & mel_mask)
     | ((fuse ^ vlcp_tmp) & vlc_mask) ) == 0
    && (fuse != 0xFF) && vlcp->pos > 1)
{
  melp->buf[melp->pos++] = (ui8)fuse;  // one byte serves both segments
}
else
{
  if (vlcp->pos >= vlcp->buf_size)
    OJPH_ERROR(0x00020004, "vlc encoder's buffer is full");
  melp->buf[melp->pos++] = (ui8)melp->tmp;
  *(vlcp->buf - vlcp->pos) = (ui8)vlcp_tmp;
  vlcp->pos++;
}
// ms_init()
msp->buf_size = buffer_size;
// ...

// ms_encode(): pack cwd_len bits into the MagSgn stream with byte stuffing
while (cwd_len > 0)
{
  if (msp->pos >= msp->buf_size)
    OJPH_ERROR(0x00020005, "magnitude sign encoder's buffer is full");
  int t = ojph_min(msp->max_bits - msp->used_bits, cwd_len);
  msp->tmp |= ((ui32)(cwd & ((1U << t) - 1))) << msp->used_bits;
  msp->used_bits += t;
  cwd >>= t;
  cwd_len -= t;
  if (msp->used_bits >= msp->max_bits)
  {
    msp->buf[msp->pos++] = (ui8)msp->tmp;
    msp->max_bits = (msp->tmp == 0xFF) ? 7 : 8;  // stuff after an 0xFF byte
    msp->tmp = 0;
    msp->used_bits = 0;
  }
}
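ms_encode() applies byte stuffing to the MagSgn stream: once an 0xFF byte is written, the next byte carries only 7 payload bits, which keeps the byte that follows an 0xFF below 0x90. A stand-alone scalar sketch of the same packing rule follows (a hypothetical helper, not the msp API).

#include <algorithm>
#include <cstdint>
#include <utility>
#include <vector>

// Stand-alone sketch of the MagSgn byte-stuffing rule shown above:
// a byte equal to 0xFF limits the next byte to 7 payload bits.
static std::vector<uint8_t>
ms_pack_sketch(const std::vector<std::pair<uint32_t,int> >& words)
{
  std::vector<uint8_t> out;
  uint32_t tmp = 0;
  int used_bits = 0, max_bits = 8;
  for (auto w : words) {
    uint32_t cwd = w.first;
    int cwd_len  = w.second;
    while (cwd_len > 0) {
      int t = std::min(max_bits - used_bits, cwd_len);
      tmp |= (cwd & ((1u << t) - 1)) << used_bits;
      used_bits += t;  cwd >>= t;  cwd_len -= t;
      if (used_bits >= max_bits) {
        out.push_back((uint8_t)tmp);
        max_bits = (tmp == 0xFF) ? 7 : 8;   // stuffing: next byte gets 7 bits
        tmp = 0;  used_bits = 0;
      }
    }
  }
  return out;
}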
// ms_terminate(): flush any partial byte, padding the unused bits with ones
if (msp->used_bits)
{
  int t = msp->max_bits - msp->used_bits;
  msp->tmp |= (0xFF & ((1U << t) - 1)) << msp->used_bits;
  msp->used_bits += t;
  if (msp->tmp != 0xFF)
  {
    if (msp->pos >= msp->buf_size)
      OJPH_ERROR(0x00020006, "magnitude sign encoder's buffer is full");
    msp->buf[msp->pos++] = (ui8)msp->tmp;
  }
}
else if (msp->max_bits == 7)
{
  // ...
}
#define ZERO _mm512_setzero_epi32()
#define ONE  _mm512_set1_epi32(1)
// debug helper: print the 16 lanes of a 512-bit vector
static void print_epi32(const char *msg, __m512i &val)
{
  uint32_t A[16] = {0};
  _mm512_store_epi32(A, val);
  printf("%s", msg);
  for (int i = 0; i < 16; ++i) {
    printf(" %u", A[i]);
  }
  printf("\n");
}
// proc_pixel(): for 2 rows x 32 columns of samples, compute per-sample
// significance, exponent (eq) and MagSgn value (s), then rearrange them into
// quad order and accumulate the quad significance pattern rho and the
// running maximum exponent.
static void proc_pixel(__m512i *src_vec, ui32 p,
                       __m512i *eq_vec, __m512i *s_vec,
                       __m512i &rho_vec, __m512i &e_qmax_vec)
{
  __m512i val_vec[4], _eq_vec[4], _s_vec[4], _rho_vec[4];
  __mmask16 val_mask[4];

  for (ui32 i = 0; i < 4; ++i) {
    // drop the sign bit and the bit-planes below p; clear the LSB
    val_vec[i] = _mm512_add_epi32(src_vec[i], src_vec[i]);
    val_vec[i] = _mm512_srli_epi32(val_vec[i], p);
    val_vec[i] = _mm512_and_epi32(val_vec[i], _mm512_set1_epi32((int)~1u));

    // significance mask
    val_mask[i] = _mm512_cmpneq_epi32_mask(val_vec[i], ZERO);

    // exponent: bit length of (2 * mag - 1)
    val_vec[i] = _mm512_mask_sub_epi32(ZERO, val_mask[i], val_vec[i], ONE);
    _eq_vec[i] = _mm512_mask_lzcnt_epi32(ZERO, val_mask[i], val_vec[i]);
    _eq_vec[i] = _mm512_mask_sub_epi32(ZERO, val_mask[i],
                                       _mm512_set1_epi32(32), _eq_vec[i]);

    // MagSgn value: 2 * (mag - 1) + sign
    val_vec[i] = _mm512_mask_sub_epi32(ZERO, val_mask[i], val_vec[i], ONE);
    _s_vec[i] = _mm512_mask_srli_epi32(ZERO, val_mask[i], src_vec[i], 31);
    _s_vec[i] =
      _mm512_mask_add_epi32(ZERO, val_mask[i], _s_vec[i], val_vec[i]);
  }

  // significance as 0/1 per sample
  val_vec[0] = _mm512_mask_mov_epi32(ZERO, val_mask[0], ONE);
  val_vec[1] = _mm512_mask_mov_epi32(ZERO, val_mask[1], ONE);
  val_vec[2] = _mm512_mask_mov_epi32(ZERO, val_mask[2], ONE);
  val_vec[3] = _mm512_mask_mov_epi32(ZERO, val_mask[3], ONE);

  // gather even / odd columns of each source row into quad order
  const __m512i idx[2] = {
    _mm512_set_epi32(14, 12, 10, 8, 6, 4, 2, 0, 14, 12, 10, 8, 6, 4, 2, 0),
    _mm512_set_epi32(15, 13, 11, 9, 7, 5, 3, 1, 15, 13, 11, 9, 7, 5, 3, 1),
  };

  for (ui32 i = 0; i < 4; ++i) {
    ui32 e_idx = i >> 1;   // even or odd column within the quad
    ui32 o_idx = i & 0x1;  // first or second row of the quad

    eq_vec[i] = _mm512_permutexvar_epi32(idx[e_idx], _eq_vec[o_idx]);
    eq_vec[i] = _mm512_mask_permutexvar_epi32(eq_vec[i], 0xFF00,
                                              idx[e_idx], _eq_vec[o_idx + 2]);

    s_vec[i] = _mm512_permutexvar_epi32(idx[e_idx], _s_vec[o_idx]);
    s_vec[i] = _mm512_mask_permutexvar_epi32(s_vec[i], 0xFF00,
                                             idx[e_idx], _s_vec[o_idx + 2]);

    _rho_vec[i] = _mm512_permutexvar_epi32(idx[e_idx], val_vec[o_idx]);
    _rho_vec[i] = _mm512_mask_permutexvar_epi32(_rho_vec[i], 0xFF00,
                                                idx[e_idx], val_vec[o_idx + 2]);
    _rho_vec[i] = _mm512_slli_epi32(_rho_vec[i], i);

    e_qmax_vec = _mm512_max_epi32(e_qmax_vec, eq_vec[i]);
  }
  rho_vec = _mm512_or_epi32(_rho_vec[0], _rho_vec[1]);
  rho_vec = _mm512_or_epi32(rho_vec, _rho_vec[2]);
  rho_vec = _mm512_or_epi32(rho_vec, _rho_vec[3]);
}
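proc_pixel() derives, per sample, the quantized magnitude, its significance, the exponent and the MagSgn value, sixteen lanes at a time. The scalar mirror below spells out one lane with plain integers, assuming samples are stored sign-magnitude with the sign in bit 31 (as the shift-right by 31 above suggests).

#include <cstdint>
#include <immintrin.h>   // _lzcnt_u32

// Scalar mirror of one lane of proc_pixel(): given a sign-magnitude sample
// and the bit-plane shift p, derive significance, the exponent e_q and the
// MagSgn value that the SIMD code computes for sixteen lanes at once.
struct sample_info { bool sig; uint32_t e_q; uint32_t ms_val; };

static sample_info proc_one_sample(uint32_t src, uint32_t p)
{
  uint32_t val = src + src;          // drop the sign bit, magnitude << 1
  val >>= p;                         // keep bits at and above bit-plane p
  val &= ~1u;                        // clear the LSB
  sample_info r{ val != 0, 0, 0 };
  if (r.sig) {
    val -= 1;                                   // 2*mag - 1
    r.e_q = 32 - (uint32_t)_lzcnt_u32(val);     // exponent E_n
    val -= 1;                                   // 2*(mag - 1)
    r.ms_val = (src >> 31) + val;               // MagSgn: sign in the LSB
  }
  return r;
}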
// rotate_matrix(): regroup the 32-bit elements of four 512-bit rows
// (a 4 x 16 block) through an unpack/shuffle network
static void rotate_matrix(__m512i *matrix)
{
  __m512i _matrix[4];

  _matrix[0] = _mm512_unpacklo_epi32(matrix[0], matrix[1]);
  _matrix[1] = _mm512_unpackhi_epi32(matrix[0], matrix[1]);
  _matrix[2] = _mm512_unpacklo_epi32(matrix[2], matrix[3]);
  _matrix[3] = _mm512_unpackhi_epi32(matrix[2], matrix[3]);

  matrix[0] = _mm512_unpacklo_epi64(_matrix[0], _matrix[2]);
  matrix[1] = _mm512_unpackhi_epi64(_matrix[0], _matrix[2]);
  matrix[2] = _mm512_unpacklo_epi64(_matrix[1], _matrix[3]);
  matrix[3] = _mm512_unpackhi_epi64(_matrix[1], _matrix[3]);

  _matrix[0] = _mm512_shuffle_i32x4(matrix[0], matrix[1], 0x88);
  _matrix[1] = _mm512_shuffle_i32x4(matrix[2], matrix[3], 0x88);
  _matrix[2] = _mm512_shuffle_i32x4(matrix[0], matrix[1], 0xDD);
  _matrix[3] = _mm512_shuffle_i32x4(matrix[2], matrix[3], 0xDD);

  matrix[0] = _mm512_shuffle_i32x4(_matrix[0], _matrix[1], 0x88);
  matrix[1] = _mm512_shuffle_i32x4(_matrix[2], _matrix[3], 0x88);
  matrix[2] = _mm512_shuffle_i32x4(_matrix[0], _matrix[1], 0xDD);
  matrix[3] = _mm512_shuffle_i32x4(_matrix[2], _matrix[3], 0xDD);
}
// proc_ms_encode(): compute the number of MagSgn bits per sample (m_vec),
// mask the sample values accordingly, and feed pairs of codewords to
// ms_encode()
static void proc_ms_encode(ms_struct *msp, __m512i &tuple_vec,
                           __m512i &uq_vec, __m512i &rho_vec, __m512i *s_vec)
{
  __m512i m_vec[4];

  // sample 0 of each quad
  auto tmp = _mm512_and_epi32(tuple_vec, ONE);
  tmp = _mm512_sub_epi32(uq_vec, tmp);
  auto tmp1 = _mm512_and_epi32(rho_vec, ONE);
  auto mask = _mm512_cmpneq_epi32_mask(tmp1, ZERO);
  m_vec[0] = _mm512_mask_mov_epi32(ZERO, mask, tmp);

  // sample 1
  tmp = _mm512_and_epi32(tuple_vec, _mm512_set1_epi32(2));
  tmp = _mm512_srli_epi32(tmp, 1);
  tmp = _mm512_sub_epi32(uq_vec, tmp);
  tmp1 = _mm512_and_epi32(rho_vec, _mm512_set1_epi32(2));
  mask = _mm512_cmpneq_epi32_mask(tmp1, ZERO);
  m_vec[1] = _mm512_mask_mov_epi32(ZERO, mask, tmp);

  // sample 2
  tmp = _mm512_and_epi32(tuple_vec, _mm512_set1_epi32(4));
  tmp = _mm512_srli_epi32(tmp, 2);
  tmp = _mm512_sub_epi32(uq_vec, tmp);
  tmp1 = _mm512_and_epi32(rho_vec, _mm512_set1_epi32(4));
  mask = _mm512_cmpneq_epi32_mask(tmp1, ZERO);
  m_vec[2] = _mm512_mask_mov_epi32(ZERO, mask, tmp);

  // sample 3
  tmp = _mm512_and_epi32(tuple_vec, _mm512_set1_epi32(8));
  tmp = _mm512_srli_epi32(tmp, 3);
  tmp = _mm512_sub_epi32(uq_vec, tmp);
  tmp1 = _mm512_and_epi32(rho_vec, _mm512_set1_epi32(8));
  mask = _mm512_cmpneq_epi32_mask(tmp1, ZERO);
  m_vec[3] = _mm512_mask_mov_epi32(ZERO, mask, tmp);

  // ...

  ui32 cwd_len[16];
  ui32 cwd[16];
  for (ui32 i = 0; i < 4; ++i) {
    // keep only the m_vec[i] low bits of each MagSgn value
    _mm512_store_epi32(cwd_len, m_vec[i]);
    tmp = _mm512_sllv_epi32(ONE, m_vec[i]);
    tmp = _mm512_sub_epi32(tmp, ONE);
    tmp = _mm512_and_epi32(tmp, s_vec[i]);
    _mm512_store_epi32(cwd, tmp);

    for (ui32 j = 0; j < 8; ++j) {
      // fuse two adjacent quads' codewords before calling ms_encode()
      ui32 idx = j + j;
      ui64 _cwd = cwd[idx];
      ui32 _cwd_len = cwd_len[idx];
      _cwd |= ((ui64)cwd[idx + 1]) << _cwd_len;
      _cwd_len += cwd_len[idx + 1];
      // ...
    }
  }
}
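The m_vec computation above gives, per quad sample, the number of MagSgn bits to emit: zero for insignificant samples, otherwise U_q minus the matching low bit of the table entry. The same rule written on plain integers, as an illustrative sketch:

#include <cstdint>

// Scalar mirror of the m_vec computation above: m[n] is the number of
// MagSgn bits emitted for sample n of a quad.
static void magsgn_bit_counts(uint32_t tuple, uint32_t uq, uint32_t rho,
                              uint32_t m[4])
{
  for (int n = 0; n < 4; ++n) {
    if (rho & (1u << n))                 // sample n is significant
      m[n] = uq - ((tuple >> n) & 1);    // one bit may be implied by the entry
    else
      m[n] = 0;                          // insignificant: nothing is emitted
  }
}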
// cal_eps_vec(): build the per-quad epsilon pattern; bit n is set when
// sample n's exponent equals the quad maximum, and the whole pattern is
// zeroed for quads with u_q == 0
static __m512i cal_eps_vec(__m512i *eq_vec, __m512i &u_q_vec,
                           __m512i &e_qmax_vec)
{
  auto u_q_mask = _mm512_cmpgt_epi32_mask(u_q_vec, ZERO);

  auto mask = _mm512_cmpeq_epi32_mask(eq_vec[0], e_qmax_vec);
  auto tmp = _mm512_mask_mov_epi32(ZERO, mask, ONE);
  auto eps_vec = _mm512_mask_mov_epi32(ZERO, u_q_mask, tmp);

  mask = _mm512_cmpeq_epi32_mask(eq_vec[1], e_qmax_vec);
  tmp = _mm512_mask_mov_epi32(ZERO, mask, ONE);
  tmp = _mm512_slli_epi32(tmp, 1);
  eps_vec = _mm512_mask_or_epi32(ZERO, u_q_mask, eps_vec, tmp);

  mask = _mm512_cmpeq_epi32_mask(eq_vec[2], e_qmax_vec);
  tmp = _mm512_mask_mov_epi32(ZERO, mask, ONE);
  tmp = _mm512_slli_epi32(tmp, 2);
  eps_vec = _mm512_mask_or_epi32(ZERO, u_q_mask, eps_vec, tmp);

  mask = _mm512_cmpeq_epi32_mask(eq_vec[3], e_qmax_vec);
  tmp = _mm512_mask_mov_epi32(ZERO, mask, ONE);
  tmp = _mm512_slli_epi32(tmp, 3);
  return _mm512_mask_or_epi32(ZERO, u_q_mask, eps_vec, tmp);
}
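cal_eps_vec() forms the 4-bit epsilon pattern of each quad: bit n is set when sample n's exponent equals the quad maximum, and the whole pattern collapses to zero for quads whose u_q is zero. A scalar sketch of the same rule:

#include <cstdint>

// Scalar mirror of cal_eps_vec(): eps bit n marks samples whose exponent
// reaches the quad's maximum exponent, but only when u_q is signalled.
static uint32_t quad_eps(const uint32_t e_q[4], uint32_t e_qmax, uint32_t u_q)
{
  if (u_q == 0)
    return 0;
  uint32_t eps = 0;
  for (int n = 0; n < 4; ++n)
    if (e_q[n] == e_qmax)
      eps |= 1u << n;
  return eps;
}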
// update_lep(): update the stored exponents of the current line pair so they
// can act as the "line above" when the next pair of rows is coded
static void update_lep(ui32 x, __m512i &prev_e_val_vec, __m512i *eq_vec,
                       __m512i *e_val_vec, const __m512i left_shift)
{
  auto tmp = _mm512_mask_permutexvar_epi32(prev_e_val_vec, 0xFFFE,
                                           left_shift, eq_vec[3]);
  prev_e_val_vec = _mm512_mask_permutexvar_epi32(ZERO, 0x1, left_shift,
                                                 eq_vec[3]);
  e_val_vec[x] = _mm512_max_epi32(eq_vec[1], tmp);
}
// update_lcxp(): update the stored significance context of the current line
// pair from the quads' rho
static void update_lcxp(ui32 x, __m512i &prev_cx_val_vec, __m512i &rho_vec,
                        __m512i *cx_val_vec, const __m512i left_shift)
{
  auto tmp = _mm512_mask_permutexvar_epi32(prev_cx_val_vec, 0xFFFE,
                                           left_shift, rho_vec);
  prev_cx_val_vec = _mm512_mask_permutexvar_epi32(ZERO, 0x1, left_shift,
                                                  rho_vec);
  tmp = _mm512_and_epi32(tmp, _mm512_set1_epi32(8));
  tmp = _mm512_srli_epi32(tmp, 3);

  auto tmp1 = _mm512_and_epi32(rho_vec, _mm512_set1_epi32(2));
  tmp1 = _mm512_srli_epi32(tmp1, 1);
  cx_val_vec[x] = _mm512_or_epi32(tmp, tmp1);
}
// cal_tuple(): gather the packed VLC entries using the
// (c_q << 8) + (rho << 4) + eps index
static __m512i cal_tuple(__m512i &cq_vec, __m512i &rho_vec,
                         __m512i &eps_vec, ui32 *vlc_tbl)
{
  auto tmp = _mm512_slli_epi32(cq_vec, 8);
  auto tmp1 = _mm512_slli_epi32(rho_vec, 4);
  tmp = _mm512_add_epi32(tmp, tmp1);
  tmp = _mm512_add_epi32(tmp, eps_vec);
  return _mm512_i32gather_epi32(tmp, vlc_tbl, 4);
}
// proc_cq1(): quad context variant that does not use the previous line pair
static __m512i proc_cq1(ui32 x, __m512i *cx_val_vec, __m512i &rho_vec,
                        const __m512i right_shift)
{
  auto tmp = _mm512_srli_epi32(rho_vec, 1);
  auto tmp1 = _mm512_and_epi32(rho_vec, _mm512_set1_epi32(1));
  return _mm512_or_epi32(tmp, tmp1);
}
// proc_cq2(): quad context variant that also uses the previous line pair's
// stored significance (cx_val_vec)
static __m512i proc_cq2(ui32 x, __m512i *cx_val_vec, __m512i &rho_vec,
                        const __m512i right_shift)
{
  auto lcxp1_vec = _mm512_permutexvar_epi32(right_shift, cx_val_vec[x]);
  auto lcxp2_vec = _mm512_permutexvar_epi32(right_shift, cx_val_vec[x + 1]);
  auto tmp = _mm512_permutexvar_epi32(right_shift, lcxp1_vec);
  tmp = _mm512_mask_permutexvar_epi32(tmp, 0xC000, right_shift, lcxp2_vec);
  tmp = _mm512_slli_epi32(tmp, 2);
  auto tmp1 = _mm512_mask_mov_epi32(lcxp1_vec, 0x8000, lcxp2_vec);
  tmp = _mm512_add_epi32(tmp1, tmp);

  tmp1 = _mm512_and_epi32(rho_vec, _mm512_set1_epi32(4));
  tmp1 = _mm512_srli_epi32(tmp1, 1);
  tmp = _mm512_or_epi32(tmp, tmp1);

  tmp1 = _mm512_and_epi32(rho_vec, _mm512_set1_epi32(8));
  tmp1 = _mm512_srli_epi32(tmp1, 2);

  return _mm512_or_epi32(tmp, tmp1);
}
// proc_mel_encode1(): MEL-code quad significance events and, per quad pair,
// the "both u values exceed 2" event
static void proc_mel_encode1(mel_struct *melp, __m512i &cq_vec,
                             __m512i &rho_vec, __m512i u_q_vec, ui32 ignore,
                             const __m512i right_shift)
{
  // quads with a zero context need a MEL event; the event value is whether
  // the quad is significant (rho != 0)
  auto mel_need_encode = _mm512_cmpeq_epi32_mask(cq_vec, ZERO);
  auto mel_bit = _mm512_cmpneq_epi32_mask(rho_vec, ZERO);

  // per quad pair: event "min(u_q[i], u_q[i+1]) > 2", signalled only when
  // both u values are non-zero
  auto tmp = _mm512_permutexvar_epi32(right_shift, u_q_vec);
  auto tmp1 = _mm512_min_epi32(u_q_vec, tmp);
  auto mel_bit2 = (ui16)_mm512_cmpgt_epi32_mask(tmp1, _mm512_set1_epi32(2));
  // ...
  auto mel_need_encode2 = (ui16)_mm512_cmpgt_epi32_mask(u_q_vec, ZERO);
  mel_need_encode2 =
    mel_need_encode2 & (ui16)_mm512_cmpgt_epi32_mask(tmp, ZERO);

  ui32 i_max = 16 - (ignore / 2);
  for (ui32 i = 0; i < i_max; i += 2) {
    auto mask = 1 << i;
    if (0 != (mel_need_encode & mask)) {
      mel_encode(melp, 0 != (mel_bit & mask));
    }
    mask = 1 << (i + 1);
    if (0 != (mel_need_encode & mask)) {
      mel_encode(melp, 0 != (mel_bit & mask));
    }
    // ...
    if (0 != (mel_need_encode2 & mask)) {
      mel_encode(melp, 0 != (mel_bit2 & mask));
    }
  }
}
// proc_mel_encode2(): MEL-code only the quad significance events
static void proc_mel_encode2(mel_struct *melp, __m512i &cq_vec,
                             __m512i &rho_vec, __m512i u_q_vec, ui32 ignore,
                             const __m512i right_shift)
{
  auto mel_need_encode = _mm512_cmpeq_epi32_mask(cq_vec, ZERO);
  auto mel_bit = _mm512_cmpneq_epi32_mask(rho_vec, ZERO);

  ui32 i_max = 16 - (ignore / 2);
  for (ui32 i = 0; i < i_max; ++i) {
    auto mask = 1 << i;
    if (0 != (mel_need_encode & mask)) {
      mel_encode(melp, 0 != (mel_bit & mask));
    }
  }
}
typedef void (*fn_proc_mel_encode)(mel_struct *, __m512i &, __m512i &,
                                   __m512i, ui32, const __m512i);
// proc_vlc_encode1(): VLC/UVLC-code the tuples and u_q values of each
// quad pair
static void proc_vlc_encode1(vlc_struct *vlcp, ui32 *tuple, ui32 *u_q,
                             ui32 ignore)
{
  ui32 i_max = 16 - (ignore / 2);
  for (ui32 i = 0; i < i_max; i += 2) {
    // concatenate the two quads' VLC codewords
    ui32 val = tuple[i + 0] >> 4;
    int size = tuple[i + 0] & 7;
    val |= (tuple[i + 1] >> 4) << size;
    size += tuple[i + 1] & 7;

    // append the UVLC bits for u_q[i] and u_q[i + 1]
    if (u_q[i] > 2 && u_q[i + 1] > 2) {
      // ...
    }
    else if (u_q[i] > 2 && u_q[i + 1] > 0) {
      // ...
      val |= (u_q[i + 1] - 1) << size;
      // ...
    }
    // ... remaining u_q cases and the vlc_encode() call are not shown
  }
}
// proc_vlc_encode2(): VLC/UVLC coding of quad pairs, second variant
static void proc_vlc_encode2(vlc_struct *vlcp, ui32 *tuple, ui32 *u_q,
                             ui32 ignore)
{
  ui32 i_max = 16 - (ignore / 2);
  for (ui32 i = 0; i < i_max; i += 2) {
    ui32 val = tuple[i + 0] >> 4;
    int size = tuple[i + 0] & 7;
    val |= (tuple[i + 1] >> 4) << size;
    size += tuple[i + 1] & 7;
    // ... UVLC coding of u_q[i] and u_q[i + 1] and the vlc_encode() call
    //     are not shown
  }
}
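Both proc_vlc_encode variants first fuse the VLC codewords of a quad pair, then append UVLC bits for the two u_q values. The sketch below restates the visible pairing step on plain integers; the UVLC part, which uses the ulvc_cwd_pre/ulvc_cwd_suf tables, is not reproduced because it is not shown in the listing.

#include <cstdint>

// Scalar sketch of how a quad pair's VLC bits are concatenated before
// vlc_encode() is called; tuples are the (already right-shifted) table
// entries, i.e. (cwd << 4) | cwd_len.
static void fuse_quad_pair(uint32_t tuple0, uint32_t tuple1,
                           uint32_t &val, int &size)
{
  val  = tuple0 >> 4;                 // first quad's codeword
  size = (int)(tuple0 & 7);           // and its length
  val |= (tuple1 >> 4) << size;       // append the second quad's codeword
  size += (int)(tuple1 & 7);
  // the UVLC prefix/suffix bits for the two u_q values are appended next
  // (not shown in the listing)
}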
// ojph_encode_codeblock_avx512(): cleanup-pass encoder working on 32 columns
// (16 quads) at a time
ui32 width = (_width + 31) & ~31u;      // round the width up to 32 samples
ui32 ignore = width - _width;           // padded columns to ignore
const int ms_size = (16384 * 16 + 14) / 15;
const int mel_vlc_size = 3072;
const int mel_size = 192;
const int vlc_size = mel_vlc_size - mel_size;

ui8 ms_buf[ms_size];
ui8 mel_vlc_buf[mel_vlc_size];
ui8 *mel_buf = mel_vlc_buf;
ui8 *vlc_buf = mel_vlc_buf + mel_size;  // MEL and VLC share one buffer

// ... (coder state declarations, mel_init/vlc_init calls, and the selection
//      of the proc_cq / proc_mel_encode / proc_vlc_encode variants are
//      not shown)
ms_init(&ms, ms_size, ms_buf);

ui32 p = 30 - missing_msbs;             // downshift used by proc_pixel()

// lane-rotation index vectors: right_shift brings lane k+1 into lane k,
// left_shift brings lane k-1 into lane k
const __m512i right_shift = _mm512_set_epi32(
  0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1
);
const __m512i left_shift = _mm512_set_epi32(
  14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 15
);

// per-32-column state: exponents and significance of the previous line pair
__m512i e_val_vec[33];
for (ui32 i = 0; i < 32; ++i) {
  e_val_vec[i] = ZERO;
}
__m512i prev_e_val_vec = ZERO;

__m512i cx_val_vec[33];
__m512i prev_cx_val_vec = ZERO;

__m512i prev_cq_vec = ZERO;
__m512i tmp, tmp1;

// ...
ui32 n_loop = (width + 31) / 32;        // number of 32-column blocks
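The width is rounded up to whole 32-sample blocks and ignore records the padded columns, which later trims the last 32-column block to i_max quads. A worked instance of this arithmetic for a hypothetical 70-sample-wide code block:

#include <cassert>

// Worked example of the width/ignore/n_loop arithmetic above for a
// hypothetical code block that is 70 samples wide.
int main()
{
  unsigned _width = 70;
  unsigned width  = (_width + 31) & ~31u;   // 96: rounded up to 32 samples
  unsigned ignore = width - _width;         // 26 padded columns
  unsigned n_loop = (width + 31) / 32;      // 3 blocks of 32 samples (16 quads)
  unsigned i_max  = 16 - (ignore / 2);      // 3 quads remain in the last block
  assert(width == 96 && ignore == 26 && n_loop == 3 && i_max == 3);
  return 0;
}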
for (ui32 y = 0; y < height; y += 2)
{
  // carry the previous line-pair state into the sentinel entries
  e_val_vec[n_loop] = prev_e_val_vec;

  tmp = _mm512_and_epi32(prev_cx_val_vec, _mm512_set1_epi32(8));
  tmp = _mm512_srli_epi32(tmp, 3);
  cx_val_vec[n_loop] = tmp;

  prev_e_val_vec = ZERO;
  prev_cx_val_vec = ZERO;

  ui32 *sp = buf + y * stride;

  for (ui32 x = 0; x < n_loop; ++x) {
    si32 true_x = (si32)(x * 32);   // first column of this 32-sample block

    // mask away the columns that lie beyond the code-block width
    ui32 mask32 = 0xFFFFFFFFu;
    si32 entries = true_x + 32 - (si32)_width;
    mask32 >>= ((entries >= 0) ? entries : 0);
    __mmask16 load_mask0 = _cvtu32_mask16(mask32);
    __mmask16 load_mask1 = _cvtu32_mask16(mask32 >> 16);

    // load two rows of 32 samples each
    __m512i src_vec[4];
    src_vec[0] = _mm512_maskz_loadu_epi32(load_mask0, sp);
    src_vec[2] = _mm512_maskz_loadu_epi32(load_mask1, sp + 16);

    if (y + 1 < height) {
      src_vec[1] = _mm512_maskz_loadu_epi32(load_mask0, sp + stride);
      src_vec[3] =
        _mm512_maskz_loadu_epi32(load_mask1, sp + 16 + stride);
    }
    // ... (handling of the last row when y + 1 == height, and other
    //      per-block setup, is not shown)
    proc_pixel(src_vec, p, eq_vec, s_vec, rho_vec, e_qmax_vec);

    // max exponent of the line above, per quad
    tmp = _mm512_permutexvar_epi32(right_shift, e_val_vec[x]);
    tmp = _mm512_mask_permutexvar_epi32(tmp, 0x8000, right_shift,
                                        e_val_vec[x + 1]);
    auto mask = _mm512_cmpgt_epi32_mask(e_val_vec[x], tmp);
    auto max_e_vec = _mm512_mask_mov_epi32(tmp, mask, e_val_vec[x]);
    max_e_vec = _mm512_sub_epi32(max_e_vec, ONE);

    // kappa: 1 unless the quad has two or more significant samples
    tmp = _mm512_max_epi32(max_e_vec, ONE);
    tmp1 = _mm512_sub_epi32(rho_vec, ONE);
    tmp1 = _mm512_and_epi32(rho_vec, tmp1);
    mask = _mm512_cmpneq_epi32_mask(tmp1, ZERO);
    kappa_vec = _mm512_mask_mov_epi32(ONE, mask, tmp);

    // quad context from the previous row / left neighbor
    tmp = proc_cq(x, cx_val_vec, rho_vec, right_shift);
    auto cq_vec = _mm512_mask_permutexvar_epi32(prev_cq_vec, 0xFFFE,
                                                left_shift, tmp);
    prev_cq_vec = _mm512_mask_permutexvar_epi32(ZERO, 0x1, left_shift,
                                                tmp);

    update_lep(x, prev_e_val_vec, eq_vec, e_val_vec, left_shift);
    update_lcxp(x, prev_cx_val_vec, rho_vec, cx_val_vec, left_shift);

    // U_q and u_q, then epsilon and the VLC tuples
    auto uq_vec = _mm512_max_epi32(kappa_vec, e_qmax_vec);
    auto u_q_vec = _mm512_sub_epi32(uq_vec, kappa_vec);

    auto eps_vec = cal_eps_vec(eq_vec, u_q_vec, e_qmax_vec);
    __m512i tuple_vec = cal_tuple(cq_vec, rho_vec, eps_vec, vlc_tbl);
    ui32 _ignore = ((n_loop - 1) == x) ? ignore : 0;

    proc_mel_encode(&mel, cq_vec, rho_vec, u_q_vec, _ignore,
                    right_shift);
    // ... (proc_ms_encode() and related steps are not shown)

    tuple_vec = _mm512_srli_epi32(tuple_vec, 4);
    _mm512_store_epi32(tuple, tuple_vec);
    _mm512_store_epi32(u_q, u_q_vec);
    proc_vlc_encode(&vlc, tuple, u_q, _ignore);
  }

  // context seed for the first quad of the next line pair
  tmp = _mm512_permutexvar_epi32(right_shift, cx_val_vec[0]);
  tmp = _mm512_slli_epi32(tmp, 2);
  prev_cq_vec = _mm512_maskz_add_epi32(0x1, tmp, cx_val_vec[0]);
  // ...
}
// ... (terminate_mel_vlc() and ms_terminate() calls are not shown)

// assemble the code-block segment: MagSgn bytes first, then MEL, then the
// backwards-grown VLC bytes
lengths[0] = mel.pos + vlc.pos + ms.pos;
// ... (elastic->get_buffer() call not shown)
memcpy(coded->buf, ms.buf, ms.pos);
memcpy(coded->buf + ms.pos, mel.buf, mel.pos);
memcpy(coded->buf + ms.pos + mel.pos, vlc.buf - vlc.pos + 1, vlc.pos);

// write the combined MEL + VLC length into the last two bytes
ui32 num_bytes = mel.pos + vlc.pos;
coded->buf[lengths[0]-1] = (ui8)(num_bytes >> 4);
coded->buf[lengths[0]-2] = coded->buf[lengths[0]-2] & 0xF0;
coded->buf[lengths[0]-2] =
  (ui8)(coded->buf[lengths[0]-2] | (num_bytes & 0xF));
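The last two bytes of the segment carry the combined MEL + VLC length: its high bits go into the final byte and its low nibble into the byte before it. A small round-trip check of that packing, as a stand-alone sketch (not the library's API):

#include <cassert>
#include <cstdint>

// Round-trip sketch of the length packing written at the end of the buffer:
// num_bytes = mel.pos + vlc.pos is split into a high byte and a low nibble.
int main()
{
  uint32_t num_bytes = 0x1A7;              // example MEL+VLC length
  uint8_t  last   = (uint8_t)(num_bytes >> 4);
  uint8_t  second = 0x30;                  // upper nibble holds other data
  second = (uint8_t)((second & 0xF0) | (num_bytes & 0xF));

  uint32_t decoded = ((uint32_t)last << 4) | (second & 0x0F);
  assert(decoded == num_bytes);
  return 0;
}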
void get_buffer(ui32 needed_bytes, coded_lists *&p)
static bool uvlc_init_tables()
Initializes uvlc_tbl0 and uvlc_tbl1 tables.
static bool vlc_init_tables()
Initializes vlc_tbl0 and vlc_tbl1 tables, from table0.h and table1.h.
ui16 vlc_tbl0[1024]
vlc_tbl0 contains decoding information for initial row of quads
ui16 vlc_tbl1[1024]
vlc_tbl1 contains decoding information for non-initial row of quads
static void ms_terminate(ms_struct *msp)
static void update_lep(ui32 x, __m512i &prev_e_val_vec, __m512i *eq_vec, __m512i *e_val_vec, const __m512i left_shift)
static int ulvc_cwd_suf[33]
static int ulvc_cwd_suf_len[33]
static void proc_mel_encode2(mel_struct *melp, __m512i &cq_vec, __m512i &rho_vec, __m512i u_q_vec, ui32 ignore, const __m512i right_shift)
static void proc_mel_encode1(mel_struct *melp, __m512i &cq_vec, __m512i &rho_vec, __m512i u_q_vec, ui32 ignore, const __m512i right_shift)
static void proc_vlc_encode2(vlc_struct *vlcp, ui32 *tuple, ui32 *u_q, ui32 ignore)
static void vlc_encode(vlc_struct *vlcp, int cwd, int cwd_len)
void(*)(mel_struct *, __m512i &, __m512i &, __m512i, ui32, const __m512i) fn_proc_mel_encode
static __m512i cal_eps_vec(__m512i *eq_vec, __m512i &u_q_vec, __m512i &e_qmax_vec)
static void terminate_mel_vlc(mel_struct *melp, vlc_struct *vlcp)
static void proc_pixel(__m512i *src_vec, ui32 p, __m512i *eq_vec, __m512i *s_vec, __m512i &rho_vec, __m512i &e_qmax_vec)
static void mel_init(dec_mel_st *melp, ui8 *bbuf, int lcup, int scup)
Initiates a dec_mel_st structure for MEL decoding and reads some bytes in order to get the read addre...
static void proc_ms_encode(ms_struct *msp, __m512i &tuple_vec, __m512i &uq_vec, __m512i &rho_vec, __m512i *s_vec)
void(*)(vlc_struct *, ui32 *, ui32 *, ui32) fn_proc_vlc_encode
static void rotate_matrix(__m512i *matrix)
static void ms_init(ms_struct *msp, ui32 buffer_size, ui8 *data)
static void ms_encode(ms_struct *msp, ui32 cwd, int cwd_len)
static int ulvc_cwd_pre_len[33]
static __m512i proc_cq2(ui32 x, __m512i *cx_val_vec, __m512i &rho_vec, const __m512i right_shift)
static int ulvc_cwd_pre[33]
static void mel_encode(mel_struct *melp, bool bit)
static void mel_emit_bit(mel_struct *melp, int v)
static void proc_vlc_encode1(vlc_struct *vlcp, ui32 *tuple, ui32 *u_q, ui32 ignore)
static __m512i cal_tuple(__m512i &cq_vec, __m512i &rho_vec, __m512i &eps_vec, ui32 *vlc_tbl)
static void update_lcxp(ui32 x, __m512i &prev_cx_val_vec, __m512i &rho_vec, __m512i *cx_val_vec, const __m512i left_shift)
__m512i(*)(ui32, __m512i *, __m512i &, const __m512i) fn_proc_cq
static bool tables_initialized
static __m512i proc_cq1(ui32 x, __m512i *cx_val_vec, __m512i &rho_vec, const __m512i right_shift)
static void vlc_init(vlc_struct *vlcp, ui32 buffer_size, ui8 *data)
void ojph_encode_codeblock_avx512(ui32 *buf, ui32 missing_msbs, ui32 num_passes, ui32 width, ui32 height, ui32 stride, ui32 *lengths, ojph::mem_elastic_allocator *elastic, ojph::coded_lists *&coded)
X86_CPU_EXT_LEVEL_AVX512
static ui32 population_count(ui32 val)
OJPH_EXPORT int get_cpu_ext_level()
#define OJPH_ERROR(t,...)
bool last_greater_than_8F