unsigned int framebits,
unsigned char* Branchtab)
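    /* Each iteration consumes one pair of received soft symbols and advances the
     * trellis by one step: X holds the previous path metrics (four 16-byte vectors),
     * Y receives the updated metrics, Branchtab supplies the expected symbols per
     * state, and dec collects the traceback decision bits. */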
    unsigned int i; /* loop counter (assumed declaration) */
    for (i = 0; i < framebits + excess; i++) {
        unsigned short* dec_short = (unsigned short*)dec;
        __m128i a100, a101, a103, a104, a105, a107, a108, a109, a76, a78, a79, a82, a84,
            a85, a86, a88, a89, a90, d10, d11, d12, d9, m23, m24, m25, m26, m27, m28, m29,
            m30, s18, s19, s24, s25, t14, t15, t17, t18;
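        /* First half of the butterfly: load the old path metrics and form the branch
         * metric. XOR-ing the broadcast soft symbols with the branch table measures
         * the disagreement with the expected symbols; the rounded average of the two
         * results, shifted right by 2 (with a mask so bits do not cross byte lanes),
         * gives t14 in 0..63, and t15 = 63 - t14 is the metric of the opposite branch. */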
        s18 = ((__m128i*)X)[0];
        s19 = ((__m128i*)X)[2];
        a76 = _mm_set1_epi8(syms[2 * i]);
        a78 = ((__m128i*)Branchtab)[0];
        a79 = _mm_xor_si128(a76, a78);
        a82 = _mm_set1_epi8(syms[2 * i + 1]);
        a84 = ((__m128i*)Branchtab)[2];
        a85 = _mm_xor_si128(a82, a84);
        a86 = _mm_avg_epu8(a79, a85);
        a88 = _mm_srli_epi16(a86, 2);
        t14 = _mm_and_si128(a88, _mm_set1_epi8(63));
        t15 = _mm_subs_epu8(_mm_set1_epi8(63), t14);
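        /* Add-compare-select: extend each predecessor metric by the branch metric or
         * its complement, keep the smaller sum as the survivor, and record in d9/d10
         * which predecessor won. */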
        m23 = _mm_adds_epu8(s18, t14);
        m24 = _mm_adds_epu8(s19, t15);
        m25 = _mm_adds_epu8(s18, t15);
        m26 = _mm_adds_epu8(s19, t14);
        a89 = _mm_min_epu8(m24, m23);
        d9 = _mm_cmpeq_epi8(a89, m24);
        a90 = _mm_min_epu8(m26, m25);
        d10 = _mm_cmpeq_epi8(a90, m26);
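        /* Interleave the two decision masks and compress their sign bits into the
         * 16-bit traceback words, then interleave the surviving metrics into the
         * new-metric buffer Y. */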
        dec_short[4 * i] = _mm_movemask_epi8(_mm_unpacklo_epi8(d9, d10));
        dec_short[4 * i + 1] = _mm_movemask_epi8(_mm_unpackhi_epi8(d9, d10));
        ((__m128i*)Y)[0] = _mm_unpacklo_epi8(a89, a90);
        ((__m128i*)Y)[1] = _mm_unpackhi_epi8(a89, a90);
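        /* Second half of the butterfly: the same update for the remaining state
         * metrics, using the other half of the branch table. */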
        s24 = ((__m128i*)X)[1];
        s25 = ((__m128i*)X)[3];
        a100 = ((__m128i*)Branchtab)[1];
        a101 = _mm_xor_si128(a76, a100);
        a103 = ((__m128i*)Branchtab)[3];
        a104 = _mm_xor_si128(a82, a103);
        a105 = _mm_avg_epu8(a101, a104);
        a107 = _mm_srli_epi16(a105, 2);
        t17 = _mm_and_si128(a107, _mm_set1_epi8(63));
        t18 = _mm_subs_epu8(_mm_set1_epi8(63), t17);
        m27 = _mm_adds_epu8(s24, t17);
        m28 = _mm_adds_epu8(s25, t18);
        m29 = _mm_adds_epu8(s24, t18);
        m30 = _mm_adds_epu8(s25, t17);
        a108 = _mm_min_epu8(m28, m27);
        d11 = _mm_cmpeq_epi8(a108, m28);
        a109 = _mm_min_epu8(m30, m29);
        d12 = _mm_cmpeq_epi8(a109, m30);
        dec_short[4 * i + 2] = _mm_movemask_epi8(_mm_unpacklo_epi8(d11, d12));
        dec_short[4 * i + 3] = _mm_movemask_epi8(_mm_unpackhi_epi8(d11, d12));
        ((__m128i*)Y)[2] = _mm_unpacklo_epi8(a108, a109);
        ((__m128i*)Y)[3] = _mm_unpackhi_epi8(a108, a109);
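        /* Renormalize: fold the four new metric vectors together, reduce to the
         * smallest path metric, broadcast it to every lane, and subtract it from all
         * states so the unsigned byte metrics cannot saturate over successive steps. */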
        __m128i m5, m6, m7; /* renormalization temporaries (assumed declarations) */
        m5 = ((__m128i*)Y)[0];
        m5 = _mm_min_epu8(m5, ((__m128i*)Y)[1]);
        m5 = _mm_min_epu8(m5, ((__m128i*)Y)[2]);
        m5 = _mm_min_epu8(m5, ((__m128i*)Y)[3]);
        m7 = _mm_min_epu8(_mm_srli_si128(m5, 8), m5);
        m7 = _mm_min_epu8(_mm_srli_epi64(m7, 32), m7);
        m7 = _mm_min_epu8(_mm_srli_epi64(m7, 16), m7);
        m7 = _mm_min_epu8(_mm_srli_epi64(m7, 8), m7);
        m7 = _mm_unpacklo_epi8(m7, m7);
        m7 = _mm_shufflelo_epi16(m7, _MM_SHUFFLE(0, 0, 0, 0));
        m6 = _mm_unpacklo_epi64(m7, m7);
        ((__m128i*)Y)[0] = _mm_subs_epu8(((__m128i*)Y)[0], m6);
        ((__m128i*)Y)[1] = _mm_subs_epu8(((__m128i*)Y)[1], m6);
        ((__m128i*)Y)[2] = _mm_subs_epu8(((__m128i*)Y)[2], m6);
        ((__m128i*)Y)[3] = _mm_subs_epu8(((__m128i*)Y)[3], m6);
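/* The NEON variant below performs the same add-compare-select butterfly and
 * renormalization on uint8x16_t vectors; the main difference is the hand-rolled
 * substitute for _mm_movemask_epi8 when packing the decision bits. */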
unsigned int framebits,
unsigned char* Branchtab)
    unsigned int i; /* loop counter (assumed declaration) */
    for (i = 0; i < framebits + excess; i++) {
        unsigned int* dec_int = (unsigned int*)dec;
        uint8x16_t a100, a101, a103, a104, a105, a108, a109, a76, a78, a79, a82, a84, a85,
            a86, a89, a90, d10, d11, d12, d9, m23, m24, m25, m26, m27, m28, m29, m30, s18,
            s19, s24, s25, t14, t15, t17, t18;
        uint16x8_t high_bits;
        uint32x4_t paired16; /* assumed declaration: 16-bit accumulation step, viewed as u32 lanes */
        uint8x16_t paired32; /* assumed declaration: 32-bit accumulation step, viewed as bytes */
        uint8x8x2_t both;    /* assumed declaration: zipped survivor halves */
        uint8x8_t left, right;
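        /* First half of the butterfly, mirroring the SSE version: branch metric from
         * XOR with the branch table and a rounded average. No mask is needed after
         * the shift because vshrq_n_u8 already shifts each byte independently. */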
        s18 = ((uint8x16_t*)X)[0];
        s19 = ((uint8x16_t*)X)[2];
        a76 = vdupq_n_u8(syms[2 * i]);
        a78 = ((uint8x16_t*)Branchtab)[0];
        a79 = veorq_u8(a76, a78);
        a82 = vdupq_n_u8(syms[2 * i + 1]);
        a84 = ((uint8x16_t*)Branchtab)[2];
        a85 = veorq_u8(a82, a84);
        a86 = vrhaddq_u8(a79, a85);
        t14 = vshrq_n_u8(a86, 2);
        t15 = vqsubq_u8(vdupq_n_u8(63), t14);
        m23 = vqaddq_u8(s18, t14);
        m24 = vqaddq_u8(s19, t15);
        m25 = vqaddq_u8(s18, t15);
        m26 = vqaddq_u8(s19, t14);
        a89 = vminq_u8(m24, m23);
        d9 = vceqq_u8(a89, m24);
        a90 = vminq_u8(m26, m25);
        d10 = vceqq_u8(a90, m26);
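        /* NEON has no movemask instruction, so the decision bits are packed by hand:
         * shift each 0x00/0xFF compare byte down to a single bit, then use
         * shift-right-and-accumulate so that bytes 0, 4, 8 and 12 each gather the
         * bits of four neighbouring lanes, and assemble those bytes into one 32-bit
         * word. The d10 bits are OR-ed in one position higher, interleaving the two
         * decision streams just as the SSE movemask/unpack sequence does. */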
        high_bits = vreinterpretq_u16_u8(vshrq_n_u8(d9, 7));
        paired16 = vreinterpretq_u32_u16(vsraq_n_u16(high_bits, high_bits, 6));
        paired32 = vreinterpretq_u8_u32(vsraq_n_u32(paired16, paired16, 12));
        dec_int[2 * i] = ((unsigned int)vgetq_lane_u8(paired32, 0) << 0) |
                         ((unsigned int)vgetq_lane_u8(paired32, 4) << 8) |
                         ((unsigned int)vgetq_lane_u8(paired32, 8) << 16) |
                         ((unsigned int)vgetq_lane_u8(paired32, 12) << 24);
        high_bits = vreinterpretq_u16_u8(vshrq_n_u8(d10, 7));
        paired16 = vreinterpretq_u32_u16(vsraq_n_u16(high_bits, high_bits, 6));
        paired32 = vreinterpretq_u8_u32(vsraq_n_u32(paired16, paired16, 12));
        dec_int[2 * i] |= ((unsigned int)vgetq_lane_u8(paired32, 0) << 1) |
                          ((unsigned int)vgetq_lane_u8(paired32, 4) << 9) |
                          ((unsigned int)vgetq_lane_u8(paired32, 8) << 17) |
                          ((unsigned int)vgetq_lane_u8(paired32, 12) << 25);
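        /* Zip the two survivor vectors byte by byte and store the first half of the
         * new path metrics. */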
        left = vget_low_u8(a89);
        right = vget_low_u8(a90);
        both = vzip_u8(left, right);
        ((uint8x16_t*)Y)[0] = vcombine_u8(both.val[0], both.val[1]);
        left = vget_high_u8(a89);
        right = vget_high_u8(a90);
        both = vzip_u8(left, right);
        ((uint8x16_t*)Y)[1] = vcombine_u8(both.val[0], both.val[1]);
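        /* Second half of the butterfly: the same metric update and decision packing
         * for the remaining state metrics. */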
        s24 = ((uint8x16_t*)X)[1];
        s25 = ((uint8x16_t*)X)[3];
        a100 = ((uint8x16_t*)Branchtab)[1];
        a101 = veorq_u8(a76, a100);
        a103 = ((uint8x16_t*)Branchtab)[3];
        a104 = veorq_u8(a82, a103);
        a105 = vrhaddq_u8(a101, a104);
        t17 = vshrq_n_u8(a105, 2);
        t18 = vqsubq_u8(vdupq_n_u8(63), t17);
        m27 = vqaddq_u8(s24, t17);
        m28 = vqaddq_u8(s25, t18);
        m29 = vqaddq_u8(s24, t18);
        m30 = vqaddq_u8(s25, t17);
        a108 = vminq_u8(m28, m27);
        d11 = vceqq_u8(a108, m28);
        a109 = vminq_u8(m30, m29);
        d12 = vceqq_u8(a109, m30);
        high_bits = vreinterpretq_u16_u8(vshrq_n_u8(d11, 7));
        paired16 = vreinterpretq_u32_u16(vsraq_n_u16(high_bits, high_bits, 6));
        paired32 = vreinterpretq_u8_u32(vsraq_n_u32(paired16, paired16, 12));
        dec_int[2 * i + 1] = ((unsigned int)vgetq_lane_u8(paired32, 0) << 0) |
                             ((unsigned int)vgetq_lane_u8(paired32, 4) << 8) |
                             ((unsigned int)vgetq_lane_u8(paired32, 8) << 16) |
                             ((unsigned int)vgetq_lane_u8(paired32, 12) << 24);
        high_bits = vreinterpretq_u16_u8(vshrq_n_u8(d12, 7));
        paired16 = vreinterpretq_u32_u16(vsraq_n_u16(high_bits, high_bits, 6));
        paired32 = vreinterpretq_u8_u32(vsraq_n_u32(paired16, paired16, 12));
        dec_int[2 * i + 1] |= ((unsigned int)vgetq_lane_u8(paired32, 0) << 1) |
                              ((unsigned int)vgetq_lane_u8(paired32, 4) << 9) |
                              ((unsigned int)vgetq_lane_u8(paired32, 8) << 17) |
                              ((unsigned int)vgetq_lane_u8(paired32, 12) << 25);
        left = vget_low_u8(a108);
        right = vget_low_u8(a109);
        both = vzip_u8(left, right);
        ((uint8x16_t*)Y)[2] = vcombine_u8(both.val[0], both.val[1]);
        left = vget_high_u8(a108);
        right = vget_high_u8(a109);
        both = vzip_u8(left, right);
        ((uint8x16_t*)Y)[3] = vcombine_u8(both.val[0], both.val[1]);
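        /* Renormalize: fold the four metric vectors with vminq_u8, apply pairwise
         * minima until every lane of m7 holds the global minimum, broadcast it, and
         * subtract it from all states to keep the byte metrics from saturating. */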
        uint8x16_t m5, m6; /* renormalization temporaries (assumed declarations) */
        uint8x8_t m7;
        m5 = ((uint8x16_t*)Y)[0];
        m5 = vminq_u8(m5, ((uint8x16_t*)Y)[1]);
        m5 = vminq_u8(m5, ((uint8x16_t*)Y)[2]);
        m5 = vminq_u8(m5, ((uint8x16_t*)Y)[3]);
        m7 = vpmin_u8(vget_low_u8(m5), vget_high_u8(m5));
        m7 = vpmin_u8(m7, m7);
        m7 = vpmin_u8(m7, m7);
        m7 = vpmin_u8(m7, m7);
        m6 = vcombine_u8(m7, m7);
        ((uint8x16_t*)Y)[0] = vqsubq_u8(((uint8x16_t*)Y)[0], m6);
        ((uint8x16_t*)Y)[1] = vqsubq_u8(((uint8x16_t*)Y)[1], m6);
        ((uint8x16_t*)Y)[2] = vqsubq_u8(((uint8x16_t*)Y)[2], m6);
        ((uint8x16_t*)Y)[3] = vqsubq_u8(((uint8x16_t*)Y)[3], m6);