Diffstat (limited to 'drivers/webp/dsp')
29 files changed, 2700 insertions, 1524 deletions
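A recurring change in the hunks below is the replacement of direct *(uint32_t*) loads and stores with WebPMemToUint32()/WebPUint32ToMem(), declared in ../utils/utils.h (the newly added include). Their definitions are not part of this diff; a minimal sketch of such memcpy-based helpers, mirroring the local MemToUint32() helper that dec_sse2.c removes further down, would be:

    #include <stdint.h>
    #include <string.h>

    // memcpy() is the well-defined way of moving potentially unaligned
    // 32-bit memory; compilers reduce it to a plain load/store where legal.
    static inline uint32_t WebPMemToUint32(const uint8_t* const ptr) {
      uint32_t A;
      memcpy(&A, ptr, sizeof(A));
      return A;
    }

    static inline void WebPUint32ToMem(uint8_t* const ptr, uint32_t val) {
      memcpy(ptr, &val, sizeof(val));
    }

This avoids the undefined behavior of casting a potentially misaligned uint8_t* to uint32_t*, which is also why the diff introduces the WEBP_UBSAN_IGNORE_UNDEF machinery in dsp.h.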
diff --git a/drivers/webp/dsp/common_sse2.h b/drivers/webp/dsp/common_sse2.h new file mode 100644 index 0000000000..7cea13fb3c --- /dev/null +++ b/drivers/webp/dsp/common_sse2.h @@ -0,0 +1,109 @@ +// Copyright 2016 Google Inc. All Rights Reserved. +// +// Use of this source code is governed by a BSD-style license +// that can be found in the COPYING file in the root of the source +// tree. An additional intellectual property rights grant can be found +// in the file PATENTS. All contributing project authors may +// be found in the AUTHORS file in the root of the source tree. +// ----------------------------------------------------------------------------- +// +// SSE2 code common to several files. +// +// Author: Vincent Rabaud (vrabaud@google.com) + +#ifndef WEBP_DSP_COMMON_SSE2_H_ +#define WEBP_DSP_COMMON_SSE2_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +#if defined(WEBP_USE_SSE2) + +#include <emmintrin.h> + +//------------------------------------------------------------------------------ +// Quite useful macro for debugging. Left here for convenience. + +#if 0 +#include <stdio.h> +static WEBP_INLINE void PrintReg(const __m128i r, const char* const name, + int size) { + int n; + union { + __m128i r; + uint8_t i8[16]; + uint16_t i16[8]; + uint32_t i32[4]; + uint64_t i64[2]; + } tmp; + tmp.r = r; + fprintf(stderr, "%s\t: ", name); + if (size == 8) { + for (n = 0; n < 16; ++n) fprintf(stderr, "%.2x ", tmp.i8[n]); + } else if (size == 16) { + for (n = 0; n < 8; ++n) fprintf(stderr, "%.4x ", tmp.i16[n]); + } else if (size == 32) { + for (n = 0; n < 4; ++n) fprintf(stderr, "%.8x ", tmp.i32[n]); + } else { + for (n = 0; n < 2; ++n) fprintf(stderr, "%.16lx ", tmp.i64[n]); + } + fprintf(stderr, "\n"); +} +#endif + +//------------------------------------------------------------------------------ +// Math functions. + +// Return the sum of all the 8b in the register. +static WEBP_INLINE int VP8HorizontalAdd8b(const __m128i* const a) { + const __m128i zero = _mm_setzero_si128(); + const __m128i sad8x2 = _mm_sad_epu8(*a, zero); + // sum the two sads: sad8x2[0:1] + sad8x2[8:9] + const __m128i sum = _mm_add_epi32(sad8x2, _mm_shuffle_epi32(sad8x2, 2)); + return _mm_cvtsi128_si32(sum); +} + +// Transpose two 4x4 16b matrices horizontally stored in registers. +static WEBP_INLINE void VP8Transpose_2_4x4_16b( + const __m128i* const in0, const __m128i* const in1, + const __m128i* const in2, const __m128i* const in3, __m128i* const out0, + __m128i* const out1, __m128i* const out2, __m128i* const out3) { + // Transpose the two 4x4. 
+ // a00 a01 a02 a03 b00 b01 b02 b03 + // a10 a11 a12 a13 b10 b11 b12 b13 + // a20 a21 a22 a23 b20 b21 b22 b23 + // a30 a31 a32 a33 b30 b31 b32 b33 + const __m128i transpose0_0 = _mm_unpacklo_epi16(*in0, *in1); + const __m128i transpose0_1 = _mm_unpacklo_epi16(*in2, *in3); + const __m128i transpose0_2 = _mm_unpackhi_epi16(*in0, *in1); + const __m128i transpose0_3 = _mm_unpackhi_epi16(*in2, *in3); + // a00 a10 a01 a11 a02 a12 a03 a13 + // a20 a30 a21 a31 a22 a32 a23 a33 + // b00 b10 b01 b11 b02 b12 b03 b13 + // b20 b30 b21 b31 b22 b32 b23 b33 + const __m128i transpose1_0 = _mm_unpacklo_epi32(transpose0_0, transpose0_1); + const __m128i transpose1_1 = _mm_unpacklo_epi32(transpose0_2, transpose0_3); + const __m128i transpose1_2 = _mm_unpackhi_epi32(transpose0_0, transpose0_1); + const __m128i transpose1_3 = _mm_unpackhi_epi32(transpose0_2, transpose0_3); + // a00 a10 a20 a30 a01 a11 a21 a31 + // b00 b10 b20 b30 b01 b11 b21 b31 + // a02 a12 a22 a32 a03 a13 a23 a33 + // b02 b12 a22 b32 b03 b13 b23 b33 + *out0 = _mm_unpacklo_epi64(transpose1_0, transpose1_1); + *out1 = _mm_unpackhi_epi64(transpose1_0, transpose1_1); + *out2 = _mm_unpacklo_epi64(transpose1_2, transpose1_3); + *out3 = _mm_unpackhi_epi64(transpose1_2, transpose1_3); + // a00 a10 a20 a30 b00 b10 b20 b30 + // a01 a11 a21 a31 b01 b11 b21 b31 + // a02 a12 a22 a32 b02 b12 b22 b32 + // a03 a13 a23 a33 b03 b13 b23 b33 +} + +#endif // WEBP_USE_SSE2 + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // WEBP_DSP_COMMON_SSE2_H_ diff --git a/drivers/webp/dsp/cpu.c b/drivers/webp/dsp/cpu.c index 35c2af7f58..cbb08db90a 100644 --- a/drivers/webp/dsp/cpu.c +++ b/drivers/webp/dsp/cpu.c @@ -13,6 +13,11 @@ #include "./dsp.h" +#if defined(WEBP_HAVE_NEON_RTCD) +#include <stdio.h> +#include <string.h> +#endif + #if defined(WEBP_ANDROID_NEON) #include <cpu-features.h> #endif @@ -31,6 +36,18 @@ static WEBP_INLINE void GetCPUInfo(int cpu_info[4], int info_type) { : "=a"(cpu_info[0]), "=D"(cpu_info[1]), "=c"(cpu_info[2]), "=d"(cpu_info[3]) : "a"(info_type), "c"(0)); } +#elif defined(__x86_64__) && \ + (defined(__code_model_medium__) || defined(__code_model_large__)) && \ + defined(__PIC__) +static WEBP_INLINE void GetCPUInfo(int cpu_info[4], int info_type) { + __asm__ volatile ( + "xchg{q}\t{%%rbx}, %q1\n" + "cpuid\n" + "xchg{q}\t{%%rbx}, %q1\n" + : "=a"(cpu_info[0]), "=&r"(cpu_info[1]), "=c"(cpu_info[2]), + "=d"(cpu_info[3]) + : "a"(info_type), "c"(0)); +} #elif defined(__i386__) || defined(__x86_64__) static WEBP_INLINE void GetCPUInfo(int cpu_info[4], int info_type) { __asm__ volatile ( @@ -130,13 +147,33 @@ VP8CPUInfo VP8GetCPUInfo = AndroidCPUInfo; // define a dummy function to enable turning off NEON at runtime by setting // VP8DecGetCPUInfo = NULL static int armCPUInfo(CPUFeature feature) { - (void)feature; + if (feature != kNEON) return 0; +#if defined(__linux__) && defined(WEBP_HAVE_NEON_RTCD) + { + int has_neon = 0; + char line[200]; + FILE* const cpuinfo = fopen("/proc/cpuinfo", "r"); + if (cpuinfo == NULL) return 0; + while (fgets(line, sizeof(line), cpuinfo)) { + if (!strncmp(line, "Features", 8)) { + if (strstr(line, " neon ") != NULL) { + has_neon = 1; + break; + } + } + } + fclose(cpuinfo); + return has_neon; + } +#else return 1; +#endif } VP8CPUInfo VP8GetCPUInfo = armCPUInfo; -#elif defined(WEBP_USE_MIPS32) || defined(WEBP_USE_MIPS_DSP_R2) +#elif defined(WEBP_USE_MIPS32) || defined(WEBP_USE_MIPS_DSP_R2) || \ + defined(WEBP_USE_MSA) static int mipsCPUInfo(CPUFeature feature) { - if ((feature == kMIPS32) || (feature == kMIPSdspR2)) { + 
if ((feature == kMIPS32) || (feature == kMIPSdspR2) || (feature == kMSA)) { return 1; } else { return 0; diff --git a/drivers/webp/dsp/dec.c b/drivers/webp/dsp/dec.c index 77a00381c5..e92d693362 100644 --- a/drivers/webp/dsp/dec.c +++ b/drivers/webp/dsp/dec.c @@ -13,6 +13,7 @@ #include "./dsp.h" #include "../dec/vp8i.h" +#include "../utils/utils.h" //------------------------------------------------------------------------------ @@ -261,10 +262,10 @@ static void HE4(uint8_t* dst) { // horizontal const int C = dst[-1 + BPS]; const int D = dst[-1 + 2 * BPS]; const int E = dst[-1 + 3 * BPS]; - *(uint32_t*)(dst + 0 * BPS) = 0x01010101U * AVG3(A, B, C); - *(uint32_t*)(dst + 1 * BPS) = 0x01010101U * AVG3(B, C, D); - *(uint32_t*)(dst + 2 * BPS) = 0x01010101U * AVG3(C, D, E); - *(uint32_t*)(dst + 3 * BPS) = 0x01010101U * AVG3(D, E, E); + WebPUint32ToMem(dst + 0 * BPS, 0x01010101U * AVG3(A, B, C)); + WebPUint32ToMem(dst + 1 * BPS, 0x01010101U * AVG3(B, C, D)); + WebPUint32ToMem(dst + 2 * BPS, 0x01010101U * AVG3(C, D, E)); + WebPUint32ToMem(dst + 3 * BPS, 0x01010101U * AVG3(D, E, E)); } static void DC4(uint8_t* dst) { // DC @@ -654,6 +655,23 @@ static void HFilter8i(uint8_t* u, uint8_t* v, int stride, //------------------------------------------------------------------------------ +static void DitherCombine8x8(const uint8_t* dither, uint8_t* dst, + int dst_stride) { + int i, j; + for (j = 0; j < 8; ++j) { + for (i = 0; i < 8; ++i) { + const int delta0 = dither[i] - VP8_DITHER_AMP_CENTER; + const int delta1 = + (delta0 + VP8_DITHER_DESCALE_ROUNDER) >> VP8_DITHER_DESCALE; + dst[i] = clip_8b((int)dst[i] + delta1); + } + dst += dst_stride; + dither += 8; + } +} + +//------------------------------------------------------------------------------ + VP8DecIdct2 VP8Transform; VP8DecIdct VP8TransformAC3; VP8DecIdct VP8TransformUV; @@ -673,11 +691,15 @@ VP8SimpleFilterFunc VP8SimpleHFilter16; VP8SimpleFilterFunc VP8SimpleVFilter16i; VP8SimpleFilterFunc VP8SimpleHFilter16i; +void (*VP8DitherCombine8x8)(const uint8_t* dither, uint8_t* dst, + int dst_stride); + extern void VP8DspInitSSE2(void); extern void VP8DspInitSSE41(void); extern void VP8DspInitNEON(void); extern void VP8DspInitMIPS32(void); extern void VP8DspInitMIPSdspR2(void); +extern void VP8DspInitMSA(void); static volatile VP8CPUInfo dec_last_cpuinfo_used = (VP8CPUInfo)&dec_last_cpuinfo_used; @@ -734,6 +756,8 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8DspInit(void) { VP8PredChroma8[5] = DC8uvNoLeft; VP8PredChroma8[6] = DC8uvNoTopLeft; + VP8DitherCombine8x8 = DitherCombine8x8; + // If defined, use CPUInfo() to overwrite some pointers with faster versions. if (VP8GetCPUInfo != NULL) { #if defined(WEBP_USE_SSE2) @@ -761,6 +785,11 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8DspInit(void) { VP8DspInitMIPSdspR2(); } #endif +#if defined(WEBP_USE_MSA) + if (VP8GetCPUInfo(kMSA)) { + VP8DspInitMSA(); + } +#endif } dec_last_cpuinfo_used = VP8GetCPUInfo; } diff --git a/drivers/webp/dsp/dec_msa.c b/drivers/webp/dsp/dec_msa.c new file mode 100644 index 0000000000..f76055cab0 --- /dev/null +++ b/drivers/webp/dsp/dec_msa.c @@ -0,0 +1,172 @@ +// Copyright 2016 Google Inc. All Rights Reserved. +// +// Use of this source code is governed by a BSD-style license +// that can be found in the COPYING file in the root of the source +// tree. An additional intellectual property rights grant can be found +// in the file PATENTS. All contributing project authors may +// be found in the AUTHORS file in the root of the source tree. 
+// ----------------------------------------------------------------------------- +// +// MSA version of dsp functions +// +// Author(s): Prashant Patil (prashant.patil@imgtec.com) + + +#include "./dsp.h" + +#if defined(WEBP_USE_MSA) + +#include "./msa_macro.h" + +//------------------------------------------------------------------------------ +// Transforms + +#define IDCT_1D_W(in0, in1, in2, in3, out0, out1, out2, out3) { \ + v4i32 a1_m, b1_m, c1_m, d1_m; \ + v4i32 c_tmp1_m, c_tmp2_m, d_tmp1_m, d_tmp2_m; \ + const v4i32 cospi8sqrt2minus1 = __msa_fill_w(20091); \ + const v4i32 sinpi8sqrt2 = __msa_fill_w(35468); \ + \ + a1_m = in0 + in2; \ + b1_m = in0 - in2; \ + c_tmp1_m = (in1 * sinpi8sqrt2) >> 16; \ + c_tmp2_m = in3 + ((in3 * cospi8sqrt2minus1) >> 16); \ + c1_m = c_tmp1_m - c_tmp2_m; \ + d_tmp1_m = in1 + ((in1 * cospi8sqrt2minus1) >> 16); \ + d_tmp2_m = (in3 * sinpi8sqrt2) >> 16; \ + d1_m = d_tmp1_m + d_tmp2_m; \ + BUTTERFLY_4(a1_m, b1_m, c1_m, d1_m, out0, out1, out2, out3); \ +} +#define MULT1(a) ((((a) * 20091) >> 16) + (a)) +#define MULT2(a) (((a) * 35468) >> 16) + +static void TransformOne(const int16_t* in, uint8_t* dst) { + v8i16 input0, input1; + v4i32 in0, in1, in2, in3, hz0, hz1, hz2, hz3, vt0, vt1, vt2, vt3; + v4i32 res0, res1, res2, res3; + const v16i8 zero = { 0 }; + v16i8 dest0, dest1, dest2, dest3; + + LD_SH2(in, 8, input0, input1); + UNPCK_SH_SW(input0, in0, in1); + UNPCK_SH_SW(input1, in2, in3); + IDCT_1D_W(in0, in1, in2, in3, hz0, hz1, hz2, hz3); + TRANSPOSE4x4_SW_SW(hz0, hz1, hz2, hz3, hz0, hz1, hz2, hz3); + IDCT_1D_W(hz0, hz1, hz2, hz3, vt0, vt1, vt2, vt3); + SRARI_W4_SW(vt0, vt1, vt2, vt3, 3); + TRANSPOSE4x4_SW_SW(vt0, vt1, vt2, vt3, vt0, vt1, vt2, vt3); + LD_SB4(dst, BPS, dest0, dest1, dest2, dest3); + ILVR_B4_SW(zero, dest0, zero, dest1, zero, dest2, zero, dest3, + res0, res1, res2, res3); + ILVR_H4_SW(zero, res0, zero, res1, zero, res2, zero, res3, + res0, res1, res2, res3); + ADD4(res0, vt0, res1, vt1, res2, vt2, res3, vt3, res0, res1, res2, res3); + CLIP_SW4_0_255(res0, res1, res2, res3); + PCKEV_B2_SW(res0, res1, res2, res3, vt0, vt1); + res0 = (v4i32)__msa_pckev_b((v16i8)vt0, (v16i8)vt1); + ST4x4_UB(res0, res0, 3, 2, 1, 0, dst, BPS); +} + +static void TransformTwo(const int16_t* in, uint8_t* dst, int do_two) { + TransformOne(in, dst); + if (do_two) { + TransformOne(in + 16, dst + 4); + } +} + +static void TransformWHT(const int16_t* in, int16_t* out) { + v8i16 input0, input1; + const v8i16 mask0 = { 0, 1, 2, 3, 8, 9, 10, 11 }; + const v8i16 mask1 = { 4, 5, 6, 7, 12, 13, 14, 15 }; + const v8i16 mask2 = { 0, 4, 8, 12, 1, 5, 9, 13 }; + const v8i16 mask3 = { 3, 7, 11, 15, 2, 6, 10, 14 }; + v8i16 tmp0, tmp1, tmp2, tmp3; + v8i16 out0, out1; + + LD_SH2(in, 8, input0, input1); + input1 = SLDI_SH(input1, input1, 8); + tmp0 = input0 + input1; + tmp1 = input0 - input1; + VSHF_H2_SH(tmp0, tmp1, tmp0, tmp1, mask0, mask1, tmp2, tmp3); + out0 = tmp2 + tmp3; + out1 = tmp2 - tmp3; + VSHF_H2_SH(out0, out1, out0, out1, mask2, mask3, input0, input1); + tmp0 = input0 + input1; + tmp1 = input0 - input1; + VSHF_H2_SH(tmp0, tmp1, tmp0, tmp1, mask0, mask1, tmp2, tmp3); + tmp0 = tmp2 + tmp3; + tmp1 = tmp2 - tmp3; + ADDVI_H2_SH(tmp0, 3, tmp1, 3, out0, out1); + SRAI_H2_SH(out0, out1, 3); + out[0] = __msa_copy_s_h(out0, 0); + out[16] = __msa_copy_s_h(out0, 4); + out[32] = __msa_copy_s_h(out1, 0); + out[48] = __msa_copy_s_h(out1, 4); + out[64] = __msa_copy_s_h(out0, 1); + out[80] = __msa_copy_s_h(out0, 5); + out[96] = __msa_copy_s_h(out1, 1); + out[112] = __msa_copy_s_h(out1, 5); + 
out[128] = __msa_copy_s_h(out0, 2); + out[144] = __msa_copy_s_h(out0, 6); + out[160] = __msa_copy_s_h(out1, 2); + out[176] = __msa_copy_s_h(out1, 6); + out[192] = __msa_copy_s_h(out0, 3); + out[208] = __msa_copy_s_h(out0, 7); + out[224] = __msa_copy_s_h(out1, 3); + out[240] = __msa_copy_s_h(out1, 7); +} + +static void TransformDC(const int16_t* in, uint8_t* dst) { + const int DC = (in[0] + 4) >> 3; + const v8i16 tmp0 = __msa_fill_h(DC); + ADDBLK_ST4x4_UB(tmp0, tmp0, tmp0, tmp0, dst, BPS); +} + +static void TransformAC3(const int16_t* in, uint8_t* dst) { + const int a = in[0] + 4; + const int c4 = MULT2(in[4]); + const int d4 = MULT1(in[4]); + const int in2 = MULT2(in[1]); + const int in3 = MULT1(in[1]); + v4i32 tmp0 = { 0 }; + v4i32 out0 = __msa_fill_w(a + d4); + v4i32 out1 = __msa_fill_w(a + c4); + v4i32 out2 = __msa_fill_w(a - c4); + v4i32 out3 = __msa_fill_w(a - d4); + v4i32 res0, res1, res2, res3; + const v4i32 zero = { 0 }; + v16u8 dest0, dest1, dest2, dest3; + + INSERT_W4_SW(in3, in2, -in2, -in3, tmp0); + ADD4(out0, tmp0, out1, tmp0, out2, tmp0, out3, tmp0, + out0, out1, out2, out3); + SRAI_W4_SW(out0, out1, out2, out3, 3); + LD_UB4(dst, BPS, dest0, dest1, dest2, dest3); + ILVR_B4_SW(zero, dest0, zero, dest1, zero, dest2, zero, dest3, + res0, res1, res2, res3); + ILVR_H4_SW(zero, res0, zero, res1, zero, res2, zero, res3, + res0, res1, res2, res3); + ADD4(res0, out0, res1, out1, res2, out2, res3, out3, res0, res1, res2, res3); + CLIP_SW4_0_255(res0, res1, res2, res3); + PCKEV_B2_SW(res0, res1, res2, res3, out0, out1); + res0 = (v4i32)__msa_pckev_b((v16i8)out0, (v16i8)out1); + ST4x4_UB(res0, res0, 3, 2, 1, 0, dst, BPS); +} + +//------------------------------------------------------------------------------ +// Entry point + +extern void VP8DspInitMSA(void); + +WEBP_TSAN_IGNORE_FUNCTION void VP8DspInitMSA(void) { + VP8TransformWHT = TransformWHT; + VP8Transform = TransformTwo; + VP8TransformDC = TransformDC; + VP8TransformAC3 = TransformAC3; +} + +#else // !WEBP_USE_MSA + +WEBP_DSP_INIT_STUB(VP8DspInitMSA) + +#endif // WEBP_USE_MSA diff --git a/drivers/webp/dsp/dec_sse2.c b/drivers/webp/dsp/dec_sse2.c index d4838b9210..f0a8ddcaf3 100644 --- a/drivers/webp/dsp/dec_sse2.c +++ b/drivers/webp/dsp/dec_sse2.c @@ -21,7 +21,9 @@ // #define USE_TRANSFORM_AC3 #include <emmintrin.h> +#include "./common_sse2.h" #include "../dec/vp8i.h" +#include "../utils/utils.h" //------------------------------------------------------------------------------ // Transforms (Paragraph 14.4) @@ -102,34 +104,7 @@ static void Transform(const int16_t* in, uint8_t* dst, int do_two) { const __m128i tmp3 = _mm_sub_epi16(a, d); // Transpose the two 4x4. 
- // a00 a01 a02 a03 b00 b01 b02 b03 - // a10 a11 a12 a13 b10 b11 b12 b13 - // a20 a21 a22 a23 b20 b21 b22 b23 - // a30 a31 a32 a33 b30 b31 b32 b33 - const __m128i transpose0_0 = _mm_unpacklo_epi16(tmp0, tmp1); - const __m128i transpose0_1 = _mm_unpacklo_epi16(tmp2, tmp3); - const __m128i transpose0_2 = _mm_unpackhi_epi16(tmp0, tmp1); - const __m128i transpose0_3 = _mm_unpackhi_epi16(tmp2, tmp3); - // a00 a10 a01 a11 a02 a12 a03 a13 - // a20 a30 a21 a31 a22 a32 a23 a33 - // b00 b10 b01 b11 b02 b12 b03 b13 - // b20 b30 b21 b31 b22 b32 b23 b33 - const __m128i transpose1_0 = _mm_unpacklo_epi32(transpose0_0, transpose0_1); - const __m128i transpose1_1 = _mm_unpacklo_epi32(transpose0_2, transpose0_3); - const __m128i transpose1_2 = _mm_unpackhi_epi32(transpose0_0, transpose0_1); - const __m128i transpose1_3 = _mm_unpackhi_epi32(transpose0_2, transpose0_3); - // a00 a10 a20 a30 a01 a11 a21 a31 - // b00 b10 b20 b30 b01 b11 b21 b31 - // a02 a12 a22 a32 a03 a13 a23 a33 - // b02 b12 a22 b32 b03 b13 b23 b33 - T0 = _mm_unpacklo_epi64(transpose1_0, transpose1_1); - T1 = _mm_unpackhi_epi64(transpose1_0, transpose1_1); - T2 = _mm_unpacklo_epi64(transpose1_2, transpose1_3); - T3 = _mm_unpackhi_epi64(transpose1_2, transpose1_3); - // a00 a10 a20 a30 b00 b10 b20 b30 - // a01 a11 a21 a31 b01 b11 b21 b31 - // a02 a12 a22 a32 b02 b12 b22 b32 - // a03 a13 a23 a33 b03 b13 b23 b33 + VP8Transpose_2_4x4_16b(&tmp0, &tmp1, &tmp2, &tmp3, &T0, &T1, &T2, &T3); } // Horizontal pass and subsequent transpose. @@ -164,34 +139,8 @@ static void Transform(const int16_t* in, uint8_t* dst, int do_two) { const __m128i shifted3 = _mm_srai_epi16(tmp3, 3); // Transpose the two 4x4. - // a00 a01 a02 a03 b00 b01 b02 b03 - // a10 a11 a12 a13 b10 b11 b12 b13 - // a20 a21 a22 a23 b20 b21 b22 b23 - // a30 a31 a32 a33 b30 b31 b32 b33 - const __m128i transpose0_0 = _mm_unpacklo_epi16(shifted0, shifted1); - const __m128i transpose0_1 = _mm_unpacklo_epi16(shifted2, shifted3); - const __m128i transpose0_2 = _mm_unpackhi_epi16(shifted0, shifted1); - const __m128i transpose0_3 = _mm_unpackhi_epi16(shifted2, shifted3); - // a00 a10 a01 a11 a02 a12 a03 a13 - // a20 a30 a21 a31 a22 a32 a23 a33 - // b00 b10 b01 b11 b02 b12 b03 b13 - // b20 b30 b21 b31 b22 b32 b23 b33 - const __m128i transpose1_0 = _mm_unpacklo_epi32(transpose0_0, transpose0_1); - const __m128i transpose1_1 = _mm_unpacklo_epi32(transpose0_2, transpose0_3); - const __m128i transpose1_2 = _mm_unpackhi_epi32(transpose0_0, transpose0_1); - const __m128i transpose1_3 = _mm_unpackhi_epi32(transpose0_2, transpose0_3); - // a00 a10 a20 a30 a01 a11 a21 a31 - // b00 b10 b20 b30 b01 b11 b21 b31 - // a02 a12 a22 a32 a03 a13 a23 a33 - // b02 b12 a22 b32 b03 b13 b23 b33 - T0 = _mm_unpacklo_epi64(transpose1_0, transpose1_1); - T1 = _mm_unpackhi_epi64(transpose1_0, transpose1_1); - T2 = _mm_unpacklo_epi64(transpose1_2, transpose1_3); - T3 = _mm_unpackhi_epi64(transpose1_2, transpose1_3); - // a00 a10 a20 a30 b00 b10 b20 b30 - // a01 a11 a21 a31 b01 b11 b21 b31 - // a02 a12 a22 a32 b02 b12 b22 b32 - // a03 a13 a23 a33 b03 b13 b23 b33 + VP8Transpose_2_4x4_16b(&shifted0, &shifted1, &shifted2, &shifted3, &T0, &T1, + &T2, &T3); } // Add inverse transform to 'dst' and store. @@ -207,10 +156,10 @@ static void Transform(const int16_t* in, uint8_t* dst, int do_two) { dst3 = _mm_loadl_epi64((__m128i*)(dst + 3 * BPS)); } else { // Load four bytes/pixels per line. 
- dst0 = _mm_cvtsi32_si128(*(int*)(dst + 0 * BPS)); - dst1 = _mm_cvtsi32_si128(*(int*)(dst + 1 * BPS)); - dst2 = _mm_cvtsi32_si128(*(int*)(dst + 2 * BPS)); - dst3 = _mm_cvtsi32_si128(*(int*)(dst + 3 * BPS)); + dst0 = _mm_cvtsi32_si128(WebPMemToUint32(dst + 0 * BPS)); + dst1 = _mm_cvtsi32_si128(WebPMemToUint32(dst + 1 * BPS)); + dst2 = _mm_cvtsi32_si128(WebPMemToUint32(dst + 2 * BPS)); + dst3 = _mm_cvtsi32_si128(WebPMemToUint32(dst + 3 * BPS)); } // Convert to 16b. dst0 = _mm_unpacklo_epi8(dst0, zero); @@ -236,10 +185,10 @@ static void Transform(const int16_t* in, uint8_t* dst, int do_two) { _mm_storel_epi64((__m128i*)(dst + 3 * BPS), dst3); } else { // Store four bytes/pixels per line. - *(int*)(dst + 0 * BPS) = _mm_cvtsi128_si32(dst0); - *(int*)(dst + 1 * BPS) = _mm_cvtsi128_si32(dst1); - *(int*)(dst + 2 * BPS) = _mm_cvtsi128_si32(dst2); - *(int*)(dst + 3 * BPS) = _mm_cvtsi128_si32(dst3); + WebPUint32ToMem(dst + 0 * BPS, _mm_cvtsi128_si32(dst0)); + WebPUint32ToMem(dst + 1 * BPS, _mm_cvtsi128_si32(dst1)); + WebPUint32ToMem(dst + 2 * BPS, _mm_cvtsi128_si32(dst2)); + WebPUint32ToMem(dst + 3 * BPS, _mm_cvtsi128_si32(dst3)); } } } @@ -262,10 +211,10 @@ static void TransformAC3(const int16_t* in, uint8_t* dst) { const __m128i m3 = _mm_subs_epi16(B, d4); const __m128i zero = _mm_setzero_si128(); // Load the source pixels. - __m128i dst0 = _mm_cvtsi32_si128(*(int*)(dst + 0 * BPS)); - __m128i dst1 = _mm_cvtsi32_si128(*(int*)(dst + 1 * BPS)); - __m128i dst2 = _mm_cvtsi32_si128(*(int*)(dst + 2 * BPS)); - __m128i dst3 = _mm_cvtsi32_si128(*(int*)(dst + 3 * BPS)); + __m128i dst0 = _mm_cvtsi32_si128(WebPMemToUint32(dst + 0 * BPS)); + __m128i dst1 = _mm_cvtsi32_si128(WebPMemToUint32(dst + 1 * BPS)); + __m128i dst2 = _mm_cvtsi32_si128(WebPMemToUint32(dst + 2 * BPS)); + __m128i dst3 = _mm_cvtsi32_si128(WebPMemToUint32(dst + 3 * BPS)); // Convert to 16b. dst0 = _mm_unpacklo_epi8(dst0, zero); dst1 = _mm_unpacklo_epi8(dst1, zero); @@ -282,10 +231,10 @@ static void TransformAC3(const int16_t* in, uint8_t* dst) { dst2 = _mm_packus_epi16(dst2, dst2); dst3 = _mm_packus_epi16(dst3, dst3); // Store the results. - *(int*)(dst + 0 * BPS) = _mm_cvtsi128_si32(dst0); - *(int*)(dst + 1 * BPS) = _mm_cvtsi128_si32(dst1); - *(int*)(dst + 2 * BPS) = _mm_cvtsi128_si32(dst2); - *(int*)(dst + 3 * BPS) = _mm_cvtsi128_si32(dst3); + WebPUint32ToMem(dst + 0 * BPS, _mm_cvtsi128_si32(dst0)); + WebPUint32ToMem(dst + 1 * BPS, _mm_cvtsi128_si32(dst1)); + WebPUint32ToMem(dst + 2 * BPS, _mm_cvtsi128_si32(dst2)); + WebPUint32ToMem(dst + 3 * BPS, _mm_cvtsi128_si32(dst3)); } #undef MUL #endif // USE_TRANSFORM_AC3 @@ -517,24 +466,17 @@ static WEBP_INLINE void DoFilter6(__m128i* const p2, __m128i* const p1, } } -// memcpy() is the safe way of moving potentially unaligned 32b memory. -static WEBP_INLINE uint32_t MemToUint32(const uint8_t* const ptr) { - uint32_t A; - memcpy(&A, (const int*)ptr, sizeof(A)); - return A; -} - // reads 8 rows across a vertical edge. 
static WEBP_INLINE void Load8x4(const uint8_t* const b, int stride, __m128i* const p, __m128i* const q) { // A0 = 63 62 61 60 23 22 21 20 43 42 41 40 03 02 01 00 // A1 = 73 72 71 70 33 32 31 30 53 52 51 50 13 12 11 10 const __m128i A0 = _mm_set_epi32( - MemToUint32(&b[6 * stride]), MemToUint32(&b[2 * stride]), - MemToUint32(&b[4 * stride]), MemToUint32(&b[0 * stride])); + WebPMemToUint32(&b[6 * stride]), WebPMemToUint32(&b[2 * stride]), + WebPMemToUint32(&b[4 * stride]), WebPMemToUint32(&b[0 * stride])); const __m128i A1 = _mm_set_epi32( - MemToUint32(&b[7 * stride]), MemToUint32(&b[3 * stride]), - MemToUint32(&b[5 * stride]), MemToUint32(&b[1 * stride])); + WebPMemToUint32(&b[7 * stride]), WebPMemToUint32(&b[3 * stride]), + WebPMemToUint32(&b[5 * stride]), WebPMemToUint32(&b[1 * stride])); // B0 = 53 43 52 42 51 41 50 40 13 03 12 02 11 01 10 00 // B1 = 73 63 72 62 71 61 70 60 33 23 32 22 31 21 30 20 @@ -592,7 +534,7 @@ static WEBP_INLINE void Load16x4(const uint8_t* const r0, static WEBP_INLINE void Store4x4(__m128i* const x, uint8_t* dst, int stride) { int i; for (i = 0; i < 4; ++i, dst += stride) { - *((int32_t*)dst) = _mm_cvtsi128_si32(*x); + WebPUint32ToMem(dst, _mm_cvtsi128_si32(*x)); *x = _mm_srli_si128(*x, 4); } } @@ -963,7 +905,7 @@ static void VE4(uint8_t* dst) { // vertical const uint32_t vals = _mm_cvtsi128_si32(avg); int i; for (i = 0; i < 4; ++i) { - *(uint32_t*)(dst + i * BPS) = vals; + WebPUint32ToMem(dst + i * BPS, vals); } } @@ -977,10 +919,10 @@ static void LD4(uint8_t* dst) { // Down-Left const __m128i lsb = _mm_and_si128(_mm_xor_si128(ABCDEFGH, CDEFGHH0), one); const __m128i avg2 = _mm_subs_epu8(avg1, lsb); const __m128i abcdefg = _mm_avg_epu8(avg2, BCDEFGH0); - *(uint32_t*)(dst + 0 * BPS) = _mm_cvtsi128_si32( abcdefg ); - *(uint32_t*)(dst + 1 * BPS) = _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 1)); - *(uint32_t*)(dst + 2 * BPS) = _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 2)); - *(uint32_t*)(dst + 3 * BPS) = _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 3)); + WebPUint32ToMem(dst + 0 * BPS, _mm_cvtsi128_si32( abcdefg )); + WebPUint32ToMem(dst + 1 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 1))); + WebPUint32ToMem(dst + 2 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 2))); + WebPUint32ToMem(dst + 3 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 3))); } static void VR4(uint8_t* dst) { // Vertical-Right @@ -998,10 +940,10 @@ static void VR4(uint8_t* dst) { // Vertical-Right const __m128i lsb = _mm_and_si128(_mm_xor_si128(IXABCD, ABCD0), one); const __m128i avg2 = _mm_subs_epu8(avg1, lsb); const __m128i efgh = _mm_avg_epu8(avg2, XABCD); - *(uint32_t*)(dst + 0 * BPS) = _mm_cvtsi128_si32( abcd ); - *(uint32_t*)(dst + 1 * BPS) = _mm_cvtsi128_si32( efgh ); - *(uint32_t*)(dst + 2 * BPS) = _mm_cvtsi128_si32(_mm_slli_si128(abcd, 1)); - *(uint32_t*)(dst + 3 * BPS) = _mm_cvtsi128_si32(_mm_slli_si128(efgh, 1)); + WebPUint32ToMem(dst + 0 * BPS, _mm_cvtsi128_si32( abcd )); + WebPUint32ToMem(dst + 1 * BPS, _mm_cvtsi128_si32( efgh )); + WebPUint32ToMem(dst + 2 * BPS, _mm_cvtsi128_si32(_mm_slli_si128(abcd, 1))); + WebPUint32ToMem(dst + 3 * BPS, _mm_cvtsi128_si32(_mm_slli_si128(efgh, 1))); // these two are hard to implement in SSE2, so we keep the C-version: DST(0, 2) = AVG3(J, I, X); @@ -1023,10 +965,10 @@ static void VL4(uint8_t* dst) { // Vertical-Left const __m128i lsb2 = _mm_and_si128(abbc, lsb1); const __m128i avg4 = _mm_subs_epu8(avg3, lsb2); const uint32_t extra_out = _mm_cvtsi128_si32(_mm_srli_si128(avg4, 4)); - *(uint32_t*)(dst + 0 * BPS) = _mm_cvtsi128_si32( avg1 ); - 
*(uint32_t*)(dst + 1 * BPS) = _mm_cvtsi128_si32( avg4 ); - *(uint32_t*)(dst + 2 * BPS) = _mm_cvtsi128_si32(_mm_srli_si128(avg1, 1)); - *(uint32_t*)(dst + 3 * BPS) = _mm_cvtsi128_si32(_mm_srli_si128(avg4, 1)); + WebPUint32ToMem(dst + 0 * BPS, _mm_cvtsi128_si32( avg1 )); + WebPUint32ToMem(dst + 1 * BPS, _mm_cvtsi128_si32( avg4 )); + WebPUint32ToMem(dst + 2 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(avg1, 1))); + WebPUint32ToMem(dst + 3 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(avg4, 1))); // these two are hard to get and irregular DST(3, 2) = (extra_out >> 0) & 0xff; @@ -1050,10 +992,10 @@ static void RD4(uint8_t* dst) { // Down-right const __m128i lsb = _mm_and_si128(_mm_xor_si128(JIXABCD__, LKJIXABCD), one); const __m128i avg2 = _mm_subs_epu8(avg1, lsb); const __m128i abcdefg = _mm_avg_epu8(avg2, KJIXABCD_); - *(uint32_t*)(dst + 3 * BPS) = _mm_cvtsi128_si32( abcdefg ); - *(uint32_t*)(dst + 2 * BPS) = _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 1)); - *(uint32_t*)(dst + 1 * BPS) = _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 2)); - *(uint32_t*)(dst + 0 * BPS) = _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 3)); + WebPUint32ToMem(dst + 3 * BPS, _mm_cvtsi128_si32( abcdefg )); + WebPUint32ToMem(dst + 2 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 1))); + WebPUint32ToMem(dst + 1 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 2))); + WebPUint32ToMem(dst + 0 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 3))); } #undef DST @@ -1067,13 +1009,13 @@ static WEBP_INLINE void TrueMotion(uint8_t* dst, int size) { const __m128i zero = _mm_setzero_si128(); int y; if (size == 4) { - const __m128i top_values = _mm_cvtsi32_si128(MemToUint32(top)); + const __m128i top_values = _mm_cvtsi32_si128(WebPMemToUint32(top)); const __m128i top_base = _mm_unpacklo_epi8(top_values, zero); for (y = 0; y < 4; ++y, dst += BPS) { const int val = dst[-1] - top[-1]; const __m128i base = _mm_set1_epi16(val); const __m128i out = _mm_packus_epi16(_mm_add_epi16(base, top_base), zero); - *(int*)dst = _mm_cvtsi128_si32(out); + WebPUint32ToMem(dst, _mm_cvtsi128_si32(out)); } } else if (size == 8) { const __m128i top_values = _mm_loadl_epi64((const __m128i*)top); diff --git a/drivers/webp/dsp/dec_sse41.c b/drivers/webp/dsp/dec_sse41.c index dc1e70428d..8d6aed13e6 100644 --- a/drivers/webp/dsp/dec_sse41.c +++ b/drivers/webp/dsp/dec_sse41.c @@ -17,12 +17,13 @@ #include <smmintrin.h> #include "../dec/vp8i.h" +#include "../utils/utils.h" static void HE16(uint8_t* dst) { // horizontal int j; const __m128i kShuffle3 = _mm_set1_epi8(3); for (j = 16; j > 0; --j) { - const __m128i in = _mm_cvtsi32_si128(*(int*)(dst - 4)); + const __m128i in = _mm_cvtsi32_si128(WebPMemToUint32(dst - 4)); const __m128i values = _mm_shuffle_epi8(in, kShuffle3); _mm_storeu_si128((__m128i*)dst, values); dst += BPS; diff --git a/drivers/webp/dsp/dsp.h b/drivers/webp/dsp/dsp.h index 4613d9c3ff..2469f7d3ac 100644 --- a/drivers/webp/dsp/dsp.h +++ b/drivers/webp/dsp/dsp.h @@ -75,7 +75,8 @@ extern "C" { // The intrinsics currently cause compiler errors with arm-nacl-gcc and the // inline assembly would need to be modified for use with Native Client. 
#if (defined(__ARM_NEON__) || defined(WEBP_ANDROID_NEON) || \ - defined(__aarch64__)) && !defined(__native_client__) + defined(__aarch64__) || defined(WEBP_HAVE_NEON)) && \ + !defined(__native_client__) #define WEBP_USE_NEON #endif @@ -95,6 +96,10 @@ extern "C" { #endif #endif +#if defined(__mips_msa) && defined(__mips_isa_rev) && (__mips_isa_rev >= 5) +#define WEBP_USE_MSA +#endif + // This macro prevents thread_sanitizer from reporting known concurrent writes. #define WEBP_TSAN_IGNORE_FUNCTION #if defined(__has_feature) @@ -104,6 +109,27 @@ extern "C" { #endif #endif +#define WEBP_UBSAN_IGNORE_UNDEF +#define WEBP_UBSAN_IGNORE_UNSIGNED_OVERFLOW +#if !defined(WEBP_FORCE_ALIGNED) && defined(__clang__) && \ + defined(__has_attribute) +#if __has_attribute(no_sanitize) +// This macro prevents the undefined behavior sanitizer from reporting +// failures. This is only meant to silence unaligned loads on platforms that +// are known to support them. +#undef WEBP_UBSAN_IGNORE_UNDEF +#define WEBP_UBSAN_IGNORE_UNDEF \ + __attribute__((no_sanitize("undefined"))) + +// This macro prevents the undefined behavior sanitizer from reporting +// failures related to unsigned integer overflows. This is only meant to +// silence cases where this well defined behavior is expected. +#undef WEBP_UBSAN_IGNORE_UNSIGNED_OVERFLOW +#define WEBP_UBSAN_IGNORE_UNSIGNED_OVERFLOW \ + __attribute__((no_sanitize("unsigned-integer-overflow"))) +#endif +#endif + typedef enum { kSSE2, kSSE3, @@ -112,7 +138,8 @@ typedef enum { kAVX2, kNEON, kMIPS32, - kMIPSdspR2 + kMIPSdspR2, + kMSA } CPUFeature; // returns true if the CPU supports the feature. typedef int (*VP8CPUInfo)(CPUFeature feature); @@ -154,6 +181,8 @@ typedef int (*VP8Metric)(const uint8_t* pix, const uint8_t* ref); extern VP8Metric VP8SSE16x16, VP8SSE16x8, VP8SSE8x8, VP8SSE4x4; typedef int (*VP8WMetric)(const uint8_t* pix, const uint8_t* ref, const uint16_t* const weights); +// The weights for VP8TDisto4x4 and VP8TDisto16x16 contain a row-major +// 4 by 4 symmetric matrix. extern VP8WMetric VP8TDisto4x4, VP8TDisto16x16; typedef void (*VP8BlockCopy)(const uint8_t* src, uint8_t* dst); @@ -217,6 +246,35 @@ extern VP8GetResidualCostFunc VP8GetResidualCost; void VP8EncDspCostInit(void); //------------------------------------------------------------------------------ +// SSIM utils + +// struct for accumulating statistical moments +typedef struct { + double w; // sum(w_i) : sum of weights + double xm, ym; // sum(w_i * x_i), sum(w_i * y_i) + double xxm, xym, yym; // sum(w_i * x_i * x_i), etc. 
+} VP8DistoStats; + +#define VP8_SSIM_KERNEL 3 // total size of the kernel: 2 * VP8_SSIM_KERNEL + 1 +typedef void (*VP8SSIMAccumulateClippedFunc)(const uint8_t* src1, int stride1, + const uint8_t* src2, int stride2, + int xo, int yo, // center position + int W, int H, // plane dimension + VP8DistoStats* const stats); + +// This version is called with the guarantee that you can load 8 bytes and +// 8 rows at offset src1 and src2 +typedef void (*VP8SSIMAccumulateFunc)(const uint8_t* src1, int stride1, + const uint8_t* src2, int stride2, + VP8DistoStats* const stats); + +extern VP8SSIMAccumulateFunc VP8SSIMAccumulate; // unclipped / unchecked +extern VP8SSIMAccumulateClippedFunc VP8SSIMAccumulateClipped; // with clipping + +// must be called before using any of the above directly +void VP8SSIMDspInit(void); + +//------------------------------------------------------------------------------ // Decoding typedef void (*VP8DecIdct)(const int16_t* coeffs, uint8_t* dst); @@ -268,6 +326,15 @@ extern VP8LumaFilterFunc VP8HFilter16i; extern VP8ChromaFilterFunc VP8VFilter8i; // filtering u and v altogether extern VP8ChromaFilterFunc VP8HFilter8i; +// Dithering. Combines dithering values (centered around 128) with dst[], +// according to: dst[] = clip(dst[] + (((dither[]-128) + 8) >> 4) +#define VP8_DITHER_DESCALE 4 +#define VP8_DITHER_DESCALE_ROUNDER (1 << (VP8_DITHER_DESCALE - 1)) +#define VP8_DITHER_AMP_BITS 7 +#define VP8_DITHER_AMP_CENTER (1 << VP8_DITHER_AMP_BITS) +extern void (*VP8DitherCombine8x8)(const uint8_t* dither, uint8_t* dst, + int dst_stride); + // must be called before anything using the above void VP8DspInit(void); @@ -475,8 +542,10 @@ typedef enum { // Filter types. typedef void (*WebPFilterFunc)(const uint8_t* in, int width, int height, int stride, uint8_t* out); -typedef void (*WebPUnfilterFunc)(int width, int height, int stride, - int row, int num_rows, uint8_t* data); +// In-place un-filtering. +// Warning! 'prev_line' pointer can be equal to 'cur_line' or 'preds'. +typedef void (*WebPUnfilterFunc)(const uint8_t* prev_line, const uint8_t* preds, + uint8_t* cur_line, int width); // Filter the given data using the given predictor. // 'in' corresponds to a 2-dimensional pixel array of size (stride * height) diff --git a/drivers/webp/dsp/enc.c b/drivers/webp/dsp/enc.c index 95e63f89ab..f639f5570c 100644 --- a/drivers/webp/dsp/enc.c +++ b/drivers/webp/dsp/enc.c @@ -69,7 +69,7 @@ static void CollectHistogram(const uint8_t* ref, const uint8_t* pred, // Convert coefficients to bin. for (k = 0; k < 16; ++k) { - const int v = abs(out[k]) >> 3; // TODO(skal): add rounding? 
+ const int v = abs(out[k]) >> 3; const int clipped_value = clip_max(v, MAX_COEFF_THRESH); ++distribution[clipped_value]; } @@ -357,10 +357,10 @@ static void HE4(uint8_t* dst, const uint8_t* top) { // horizontal const int J = top[-3]; const int K = top[-4]; const int L = top[-5]; - *(uint32_t*)(dst + 0 * BPS) = 0x01010101U * AVG3(X, I, J); - *(uint32_t*)(dst + 1 * BPS) = 0x01010101U * AVG3(I, J, K); - *(uint32_t*)(dst + 2 * BPS) = 0x01010101U * AVG3(J, K, L); - *(uint32_t*)(dst + 3 * BPS) = 0x01010101U * AVG3(K, L, L); + WebPUint32ToMem(dst + 0 * BPS, 0x01010101U * AVG3(X, I, J)); + WebPUint32ToMem(dst + 1 * BPS, 0x01010101U * AVG3(I, J, K)); + WebPUint32ToMem(dst + 2 * BPS, 0x01010101U * AVG3(J, K, L)); + WebPUint32ToMem(dst + 3 * BPS, 0x01010101U * AVG3(K, L, L)); } static void DC4(uint8_t* dst, const uint8_t* top) { @@ -559,6 +559,7 @@ static int SSE4x4(const uint8_t* a, const uint8_t* b) { // Hadamard transform // Returns the weighted sum of the absolute value of transformed coefficients. +// w[] contains a row-major 4 by 4 symmetric matrix. static int TTransform(const uint8_t* in, const uint16_t* w) { int sum = 0; int tmp[16]; @@ -636,7 +637,7 @@ static int QuantizeBlock(int16_t in[16], int16_t out[16], int level = QUANTDIV(coeff, iQ, B); if (level > MAX_LEVEL) level = MAX_LEVEL; if (sign) level = -level; - in[j] = level * Q; + in[j] = level * (int)Q; out[n] = level; if (level) last = n; } else { @@ -670,7 +671,7 @@ static int QuantizeBlockWHT(int16_t in[16], int16_t out[16], int level = QUANTDIV(coeff, iQ, B); if (level > MAX_LEVEL) level = MAX_LEVEL; if (sign) level = -level; - in[j] = level * Q; + in[j] = level * (int)Q; out[n] = level; if (level) last = n; } else { @@ -702,6 +703,68 @@ static void Copy16x8(const uint8_t* src, uint8_t* dst) { } //------------------------------------------------------------------------------ + +static void SSIMAccumulateClipped(const uint8_t* src1, int stride1, + const uint8_t* src2, int stride2, + int xo, int yo, int W, int H, + VP8DistoStats* const stats) { + const int ymin = (yo - VP8_SSIM_KERNEL < 0) ? 0 : yo - VP8_SSIM_KERNEL; + const int ymax = (yo + VP8_SSIM_KERNEL > H - 1) ? H - 1 + : yo + VP8_SSIM_KERNEL; + const int xmin = (xo - VP8_SSIM_KERNEL < 0) ? 0 : xo - VP8_SSIM_KERNEL; + const int xmax = (xo + VP8_SSIM_KERNEL > W - 1) ? 
W - 1 + : xo + VP8_SSIM_KERNEL; + int x, y; + src1 += ymin * stride1; + src2 += ymin * stride2; + for (y = ymin; y <= ymax; ++y, src1 += stride1, src2 += stride2) { + for (x = xmin; x <= xmax; ++x) { + const int s1 = src1[x]; + const int s2 = src2[x]; + stats->w += 1; + stats->xm += s1; + stats->ym += s2; + stats->xxm += s1 * s1; + stats->xym += s1 * s2; + stats->yym += s2 * s2; + } + } +} + +static void SSIMAccumulate(const uint8_t* src1, int stride1, + const uint8_t* src2, int stride2, + VP8DistoStats* const stats) { + int x, y; + for (y = 0; y <= 2 * VP8_SSIM_KERNEL; ++y, src1 += stride1, src2 += stride2) { + for (x = 0; x <= 2 * VP8_SSIM_KERNEL; ++x) { + const int s1 = src1[x]; + const int s2 = src2[x]; + stats->w += 1; + stats->xm += s1; + stats->ym += s2; + stats->xxm += s1 * s1; + stats->xym += s1 * s2; + stats->yym += s2 * s2; + } + } +} + +VP8SSIMAccumulateFunc VP8SSIMAccumulate; +VP8SSIMAccumulateClippedFunc VP8SSIMAccumulateClipped; + +static volatile VP8CPUInfo ssim_last_cpuinfo_used = + (VP8CPUInfo)&ssim_last_cpuinfo_used; + +WEBP_TSAN_IGNORE_FUNCTION void VP8SSIMDspInit(void) { + if (ssim_last_cpuinfo_used == VP8GetCPUInfo) return; + + VP8SSIMAccumulate = SSIMAccumulate; + VP8SSIMAccumulateClipped = SSIMAccumulateClipped; + + ssim_last_cpuinfo_used = VP8GetCPUInfo; +} + +//------------------------------------------------------------------------------ // Initialization // Speed-critical function pointers. We have to initialize them to the default diff --git a/drivers/webp/dsp/enc_mips_dsp_r2.c b/drivers/webp/dsp/enc_mips_dsp_r2.c index 7c814fa04a..7ab96f6800 100644 --- a/drivers/webp/dsp/enc_mips_dsp_r2.c +++ b/drivers/webp/dsp/enc_mips_dsp_r2.c @@ -1393,8 +1393,6 @@ static void FTransformWHT(const int16_t* in, int16_t* out) { "absq_s.ph %[temp1], %[temp1] \n\t" \ "absq_s.ph %[temp2], %[temp2] \n\t" \ "absq_s.ph %[temp3], %[temp3] \n\t" \ - /* TODO(skal): add rounding ? 
shra_r.ph : shra.ph */ \ - /* for following 4 instructions */ \ "shra.ph %[temp0], %[temp0], 3 \n\t" \ "shra.ph %[temp1], %[temp1], 3 \n\t" \ "shra.ph %[temp2], %[temp2], 3 \n\t" \ diff --git a/drivers/webp/dsp/enc_neon.c b/drivers/webp/dsp/enc_neon.c index c2aef58e70..46f6bf9a33 100644 --- a/drivers/webp/dsp/enc_neon.c +++ b/drivers/webp/dsp/enc_neon.c @@ -560,21 +560,6 @@ static void FTransformWHT(const int16_t* src, int16_t* out) { // a 26ae, b 26ae // a 37bf, b 37bf // -static WEBP_INLINE uint8x8x4_t DistoTranspose4x4U8(uint8x8x4_t d4_in) { - const uint8x8x2_t d2_tmp0 = vtrn_u8(d4_in.val[0], d4_in.val[1]); - const uint8x8x2_t d2_tmp1 = vtrn_u8(d4_in.val[2], d4_in.val[3]); - const uint16x4x2_t d2_tmp2 = vtrn_u16(vreinterpret_u16_u8(d2_tmp0.val[0]), - vreinterpret_u16_u8(d2_tmp1.val[0])); - const uint16x4x2_t d2_tmp3 = vtrn_u16(vreinterpret_u16_u8(d2_tmp0.val[1]), - vreinterpret_u16_u8(d2_tmp1.val[1])); - - d4_in.val[0] = vreinterpret_u8_u16(d2_tmp2.val[0]); - d4_in.val[2] = vreinterpret_u8_u16(d2_tmp2.val[1]); - d4_in.val[1] = vreinterpret_u8_u16(d2_tmp3.val[0]); - d4_in.val[3] = vreinterpret_u8_u16(d2_tmp3.val[1]); - return d4_in; -} - static WEBP_INLINE int16x8x4_t DistoTranspose4x4S16(int16x8x4_t q4_in) { const int16x8x2_t q2_tmp0 = vtrnq_s16(q4_in.val[0], q4_in.val[1]); const int16x8x2_t q2_tmp1 = vtrnq_s16(q4_in.val[2], q4_in.val[3]); @@ -589,41 +574,40 @@ static WEBP_INLINE int16x8x4_t DistoTranspose4x4S16(int16x8x4_t q4_in) { return q4_in; } -static WEBP_INLINE int16x8x4_t DistoHorizontalPass(const uint8x8x4_t d4_in) { +static WEBP_INLINE int16x8x4_t DistoHorizontalPass(const int16x8x4_t q4_in) { // {a0, a1} = {in[0] + in[2], in[1] + in[3]} // {a3, a2} = {in[0] - in[2], in[1] - in[3]} - const int16x8_t q_a0 = vreinterpretq_s16_u16(vaddl_u8(d4_in.val[0], - d4_in.val[2])); - const int16x8_t q_a1 = vreinterpretq_s16_u16(vaddl_u8(d4_in.val[1], - d4_in.val[3])); - const int16x8_t q_a3 = vreinterpretq_s16_u16(vsubl_u8(d4_in.val[0], - d4_in.val[2])); - const int16x8_t q_a2 = vreinterpretq_s16_u16(vsubl_u8(d4_in.val[1], - d4_in.val[3])); + const int16x8_t q_a0 = vaddq_s16(q4_in.val[0], q4_in.val[2]); + const int16x8_t q_a1 = vaddq_s16(q4_in.val[1], q4_in.val[3]); + const int16x8_t q_a3 = vsubq_s16(q4_in.val[0], q4_in.val[2]); + const int16x8_t q_a2 = vsubq_s16(q4_in.val[1], q4_in.val[3]); int16x8x4_t q4_out; // tmp[0] = a0 + a1 // tmp[1] = a3 + a2 // tmp[2] = a3 - a2 // tmp[3] = a0 - a1 INIT_VECTOR4(q4_out, - vaddq_s16(q_a0, q_a1), vaddq_s16(q_a3, q_a2), - vsubq_s16(q_a3, q_a2), vsubq_s16(q_a0, q_a1)); + vabsq_s16(vaddq_s16(q_a0, q_a1)), + vabsq_s16(vaddq_s16(q_a3, q_a2)), + vabdq_s16(q_a3, q_a2), vabdq_s16(q_a0, q_a1)); return q4_out; } -static WEBP_INLINE int16x8x4_t DistoVerticalPass(int16x8x4_t q4_in) { - const int16x8_t q_a0 = vaddq_s16(q4_in.val[0], q4_in.val[2]); - const int16x8_t q_a1 = vaddq_s16(q4_in.val[1], q4_in.val[3]); - const int16x8_t q_a2 = vsubq_s16(q4_in.val[1], q4_in.val[3]); - const int16x8_t q_a3 = vsubq_s16(q4_in.val[0], q4_in.val[2]); +static WEBP_INLINE int16x8x4_t DistoVerticalPass(const uint8x8x4_t q4_in) { + const int16x8_t q_a0 = vreinterpretq_s16_u16(vaddl_u8(q4_in.val[0], + q4_in.val[2])); + const int16x8_t q_a1 = vreinterpretq_s16_u16(vaddl_u8(q4_in.val[1], + q4_in.val[3])); + const int16x8_t q_a2 = vreinterpretq_s16_u16(vsubl_u8(q4_in.val[1], + q4_in.val[3])); + const int16x8_t q_a3 = vreinterpretq_s16_u16(vsubl_u8(q4_in.val[0], + q4_in.val[2])); + int16x8x4_t q4_out; - q4_in.val[0] = vaddq_s16(q_a0, q_a1); - q4_in.val[1] = vaddq_s16(q_a3, q_a2); - 
q4_in.val[2] = vabdq_s16(q_a3, q_a2); - q4_in.val[3] = vabdq_s16(q_a0, q_a1); - q4_in.val[0] = vabsq_s16(q4_in.val[0]); - q4_in.val[1] = vabsq_s16(q4_in.val[1]); - return q4_in; + INIT_VECTOR4(q4_out, + vaddq_s16(q_a0, q_a1), vaddq_s16(q_a3, q_a2), + vsubq_s16(q_a3, q_a2), vsubq_s16(q_a0, q_a1)); + return q4_out; } static WEBP_INLINE int16x4x4_t DistoLoadW(const uint16_t* w) { @@ -667,6 +651,7 @@ static WEBP_INLINE int32x2_t DistoSum(const int16x8x4_t q4_in, // Hadamard transform // Returns the weighted sum of the absolute value of transformed coefficients. +// w[] contains a row-major 4 by 4 symmetric matrix. static int Disto4x4(const uint8_t* const a, const uint8_t* const b, const uint16_t* const w) { uint32x2_t d_in_ab_0123 = vdup_n_u32(0); @@ -691,18 +676,19 @@ static int Disto4x4(const uint8_t* const a, const uint8_t* const b, vreinterpret_u8_u32(d_in_ab_cdef)); { - // horizontal pass - const uint8x8x4_t d4_t = DistoTranspose4x4U8(d4_in); - const int16x8x4_t q4_h = DistoHorizontalPass(d4_t); + // Vertical pass first to avoid a transpose (vertical and horizontal passes + // are commutative because w/kWeightY is symmetric) and subsequent + // transpose. + const int16x8x4_t q4_v = DistoVerticalPass(d4_in); const int16x4x4_t d4_w = DistoLoadW(w); - // vertical pass - const int16x8x4_t q4_t = DistoTranspose4x4S16(q4_h); - const int16x8x4_t q4_v = DistoVerticalPass(q4_t); - int32x2_t d_sum = DistoSum(q4_v, d4_w); + // horizontal pass + const int16x8x4_t q4_t = DistoTranspose4x4S16(q4_v); + const int16x8x4_t q4_h = DistoHorizontalPass(q4_t); + int32x2_t d_sum = DistoSum(q4_h, d4_w); // abs(sum2 - sum1) >> 5 d_sum = vabs_s32(d_sum); - d_sum = vshr_n_s32(d_sum, 5); + d_sum = vshr_n_s32(d_sum, 5); return vget_lane_s32(d_sum, 0); } } diff --git a/drivers/webp/dsp/enc_sse2.c b/drivers/webp/dsp/enc_sse2.c index 63d9cecd85..4a2e3ce14f 100644 --- a/drivers/webp/dsp/enc_sse2.c +++ b/drivers/webp/dsp/enc_sse2.c @@ -17,48 +17,9 @@ #include <stdlib.h> // for abs() #include <emmintrin.h> +#include "./common_sse2.h" #include "../enc/cost.h" #include "../enc/vp8enci.h" -#include "../utils/utils.h" - -//------------------------------------------------------------------------------ -// Quite useful macro for debugging. Left here for convenience. - -#if 0 -#include <stdio.h> -static void PrintReg(const __m128i r, const char* const name, int size) { - int n; - union { - __m128i r; - uint8_t i8[16]; - uint16_t i16[8]; - uint32_t i32[4]; - uint64_t i64[2]; - } tmp; - tmp.r = r; - fprintf(stderr, "%s\t: ", name); - if (size == 8) { - for (n = 0; n < 16; ++n) fprintf(stderr, "%.2x ", tmp.i8[n]); - } else if (size == 16) { - for (n = 0; n < 8; ++n) fprintf(stderr, "%.4x ", tmp.i16[n]); - } else if (size == 32) { - for (n = 0; n < 4; ++n) fprintf(stderr, "%.8x ", tmp.i32[n]); - } else { - for (n = 0; n < 2; ++n) fprintf(stderr, "%.16lx ", tmp.i64[n]); - } - fprintf(stderr, "\n"); -} -#endif - -//------------------------------------------------------------------------------ -// util for unaligned loads. - -// memcpy() is the safe way of moving potentially unaligned 32b memory. -static WEBP_INLINE uint32_t MemToUint32(const uint8_t* const ptr) { - uint32_t A; - memcpy(&A, (const int*)ptr, sizeof(A)); - return A; -} //------------------------------------------------------------------------------ // Transforms (Paragraph 14.4) @@ -142,34 +103,7 @@ static void ITransform(const uint8_t* ref, const int16_t* in, uint8_t* dst, const __m128i tmp3 = _mm_sub_epi16(a, d); // Transpose the two 4x4. 
- // a00 a01 a02 a03 b00 b01 b02 b03 - // a10 a11 a12 a13 b10 b11 b12 b13 - // a20 a21 a22 a23 b20 b21 b22 b23 - // a30 a31 a32 a33 b30 b31 b32 b33 - const __m128i transpose0_0 = _mm_unpacklo_epi16(tmp0, tmp1); - const __m128i transpose0_1 = _mm_unpacklo_epi16(tmp2, tmp3); - const __m128i transpose0_2 = _mm_unpackhi_epi16(tmp0, tmp1); - const __m128i transpose0_3 = _mm_unpackhi_epi16(tmp2, tmp3); - // a00 a10 a01 a11 a02 a12 a03 a13 - // a20 a30 a21 a31 a22 a32 a23 a33 - // b00 b10 b01 b11 b02 b12 b03 b13 - // b20 b30 b21 b31 b22 b32 b23 b33 - const __m128i transpose1_0 = _mm_unpacklo_epi32(transpose0_0, transpose0_1); - const __m128i transpose1_1 = _mm_unpacklo_epi32(transpose0_2, transpose0_3); - const __m128i transpose1_2 = _mm_unpackhi_epi32(transpose0_0, transpose0_1); - const __m128i transpose1_3 = _mm_unpackhi_epi32(transpose0_2, transpose0_3); - // a00 a10 a20 a30 a01 a11 a21 a31 - // b00 b10 b20 b30 b01 b11 b21 b31 - // a02 a12 a22 a32 a03 a13 a23 a33 - // b02 b12 a22 b32 b03 b13 b23 b33 - T0 = _mm_unpacklo_epi64(transpose1_0, transpose1_1); - T1 = _mm_unpackhi_epi64(transpose1_0, transpose1_1); - T2 = _mm_unpacklo_epi64(transpose1_2, transpose1_3); - T3 = _mm_unpackhi_epi64(transpose1_2, transpose1_3); - // a00 a10 a20 a30 b00 b10 b20 b30 - // a01 a11 a21 a31 b01 b11 b21 b31 - // a02 a12 a22 a32 b02 b12 b22 b32 - // a03 a13 a23 a33 b03 b13 b23 b33 + VP8Transpose_2_4x4_16b(&tmp0, &tmp1, &tmp2, &tmp3, &T0, &T1, &T2, &T3); } // Horizontal pass and subsequent transpose. @@ -204,34 +138,8 @@ static void ITransform(const uint8_t* ref, const int16_t* in, uint8_t* dst, const __m128i shifted3 = _mm_srai_epi16(tmp3, 3); // Transpose the two 4x4. - // a00 a01 a02 a03 b00 b01 b02 b03 - // a10 a11 a12 a13 b10 b11 b12 b13 - // a20 a21 a22 a23 b20 b21 b22 b23 - // a30 a31 a32 a33 b30 b31 b32 b33 - const __m128i transpose0_0 = _mm_unpacklo_epi16(shifted0, shifted1); - const __m128i transpose0_1 = _mm_unpacklo_epi16(shifted2, shifted3); - const __m128i transpose0_2 = _mm_unpackhi_epi16(shifted0, shifted1); - const __m128i transpose0_3 = _mm_unpackhi_epi16(shifted2, shifted3); - // a00 a10 a01 a11 a02 a12 a03 a13 - // a20 a30 a21 a31 a22 a32 a23 a33 - // b00 b10 b01 b11 b02 b12 b03 b13 - // b20 b30 b21 b31 b22 b32 b23 b33 - const __m128i transpose1_0 = _mm_unpacklo_epi32(transpose0_0, transpose0_1); - const __m128i transpose1_1 = _mm_unpacklo_epi32(transpose0_2, transpose0_3); - const __m128i transpose1_2 = _mm_unpackhi_epi32(transpose0_0, transpose0_1); - const __m128i transpose1_3 = _mm_unpackhi_epi32(transpose0_2, transpose0_3); - // a00 a10 a20 a30 a01 a11 a21 a31 - // b00 b10 b20 b30 b01 b11 b21 b31 - // a02 a12 a22 a32 a03 a13 a23 a33 - // b02 b12 a22 b32 b03 b13 b23 b33 - T0 = _mm_unpacklo_epi64(transpose1_0, transpose1_1); - T1 = _mm_unpackhi_epi64(transpose1_0, transpose1_1); - T2 = _mm_unpacklo_epi64(transpose1_2, transpose1_3); - T3 = _mm_unpackhi_epi64(transpose1_2, transpose1_3); - // a00 a10 a20 a30 b00 b10 b20 b30 - // a01 a11 a21 a31 b01 b11 b21 b31 - // a02 a12 a22 a32 b02 b12 b22 b32 - // a03 a13 a23 a33 b03 b13 b23 b33 + VP8Transpose_2_4x4_16b(&shifted0, &shifted1, &shifted2, &shifted3, &T0, &T1, + &T2, &T3); } // Add inverse transform to 'ref' and store. @@ -247,10 +155,10 @@ static void ITransform(const uint8_t* ref, const int16_t* in, uint8_t* dst, ref3 = _mm_loadl_epi64((const __m128i*)&ref[3 * BPS]); } else { // Load four bytes/pixels per line. 
- ref0 = _mm_cvtsi32_si128(MemToUint32(&ref[0 * BPS])); - ref1 = _mm_cvtsi32_si128(MemToUint32(&ref[1 * BPS])); - ref2 = _mm_cvtsi32_si128(MemToUint32(&ref[2 * BPS])); - ref3 = _mm_cvtsi32_si128(MemToUint32(&ref[3 * BPS])); + ref0 = _mm_cvtsi32_si128(WebPMemToUint32(&ref[0 * BPS])); + ref1 = _mm_cvtsi32_si128(WebPMemToUint32(&ref[1 * BPS])); + ref2 = _mm_cvtsi32_si128(WebPMemToUint32(&ref[2 * BPS])); + ref3 = _mm_cvtsi32_si128(WebPMemToUint32(&ref[3 * BPS])); } // Convert to 16b. ref0 = _mm_unpacklo_epi8(ref0, zero); @@ -276,10 +184,10 @@ static void ITransform(const uint8_t* ref, const int16_t* in, uint8_t* dst, _mm_storel_epi64((__m128i*)&dst[3 * BPS], ref3); } else { // Store four bytes/pixels per line. - *((int32_t *)&dst[0 * BPS]) = _mm_cvtsi128_si32(ref0); - *((int32_t *)&dst[1 * BPS]) = _mm_cvtsi128_si32(ref1); - *((int32_t *)&dst[2 * BPS]) = _mm_cvtsi128_si32(ref2); - *((int32_t *)&dst[3 * BPS]) = _mm_cvtsi128_si32(ref3); + WebPUint32ToMem(&dst[0 * BPS], _mm_cvtsi128_si32(ref0)); + WebPUint32ToMem(&dst[1 * BPS], _mm_cvtsi128_si32(ref1)); + WebPUint32ToMem(&dst[2 * BPS], _mm_cvtsi128_si32(ref2)); + WebPUint32ToMem(&dst[3 * BPS], _mm_cvtsi128_si32(ref3)); } } } @@ -384,42 +292,42 @@ static void FTransformPass2(const __m128i* const v01, const __m128i* const v32, static void FTransform(const uint8_t* src, const uint8_t* ref, int16_t* out) { const __m128i zero = _mm_setzero_si128(); - - // Load src and convert to 16b. + // Load src. const __m128i src0 = _mm_loadl_epi64((const __m128i*)&src[0 * BPS]); const __m128i src1 = _mm_loadl_epi64((const __m128i*)&src[1 * BPS]); const __m128i src2 = _mm_loadl_epi64((const __m128i*)&src[2 * BPS]); const __m128i src3 = _mm_loadl_epi64((const __m128i*)&src[3 * BPS]); - const __m128i src_0 = _mm_unpacklo_epi8(src0, zero); - const __m128i src_1 = _mm_unpacklo_epi8(src1, zero); - const __m128i src_2 = _mm_unpacklo_epi8(src2, zero); - const __m128i src_3 = _mm_unpacklo_epi8(src3, zero); - // Load ref and convert to 16b. + // 00 01 02 03 * + // 10 11 12 13 * + // 20 21 22 23 * + // 30 31 32 33 * + // Shuffle. + const __m128i src_0 = _mm_unpacklo_epi16(src0, src1); + const __m128i src_1 = _mm_unpacklo_epi16(src2, src3); + // 00 01 10 11 02 03 12 13 * * ... + // 20 21 30 31 22 22 32 33 * * ... + + // Load ref. const __m128i ref0 = _mm_loadl_epi64((const __m128i*)&ref[0 * BPS]); const __m128i ref1 = _mm_loadl_epi64((const __m128i*)&ref[1 * BPS]); const __m128i ref2 = _mm_loadl_epi64((const __m128i*)&ref[2 * BPS]); const __m128i ref3 = _mm_loadl_epi64((const __m128i*)&ref[3 * BPS]); - const __m128i ref_0 = _mm_unpacklo_epi8(ref0, zero); - const __m128i ref_1 = _mm_unpacklo_epi8(ref1, zero); - const __m128i ref_2 = _mm_unpacklo_epi8(ref2, zero); - const __m128i ref_3 = _mm_unpacklo_epi8(ref3, zero); - // Compute difference. -> 00 01 02 03 00 00 00 00 - const __m128i diff0 = _mm_sub_epi16(src_0, ref_0); - const __m128i diff1 = _mm_sub_epi16(src_1, ref_1); - const __m128i diff2 = _mm_sub_epi16(src_2, ref_2); - const __m128i diff3 = _mm_sub_epi16(src_3, ref_3); - - // Unpack and shuffle - // 00 01 02 03 0 0 0 0 - // 10 11 12 13 0 0 0 0 - // 20 21 22 23 0 0 0 0 - // 30 31 32 33 0 0 0 0 - const __m128i shuf01 = _mm_unpacklo_epi32(diff0, diff1); - const __m128i shuf23 = _mm_unpacklo_epi32(diff2, diff3); + const __m128i ref_0 = _mm_unpacklo_epi16(ref0, ref1); + const __m128i ref_1 = _mm_unpacklo_epi16(ref2, ref3); + + // Convert both to 16 bit. 
+ const __m128i src_0_16b = _mm_unpacklo_epi8(src_0, zero); + const __m128i src_1_16b = _mm_unpacklo_epi8(src_1, zero); + const __m128i ref_0_16b = _mm_unpacklo_epi8(ref_0, zero); + const __m128i ref_1_16b = _mm_unpacklo_epi8(ref_1, zero); + + // Compute the difference. + const __m128i row01 = _mm_sub_epi16(src_0_16b, ref_0_16b); + const __m128i row23 = _mm_sub_epi16(src_1_16b, ref_1_16b); __m128i v01, v32; // First pass - FTransformPass1(&shuf01, &shuf23, &v01, &v32); + FTransformPass1(&row01, &row23, &v01, &v32); // Second pass FTransformPass2(&v01, &v32, out); @@ -474,8 +382,7 @@ static void FTransform2(const uint8_t* src, const uint8_t* ref, int16_t* out) { } static void FTransformWHTRow(const int16_t* const in, __m128i* const out) { - const __m128i kMult1 = _mm_set_epi16(0, 0, 0, 0, 1, 1, 1, 1); - const __m128i kMult2 = _mm_set_epi16(0, 0, 0, 0, -1, 1, -1, 1); + const __m128i kMult = _mm_set_epi16(-1, 1, -1, 1, 1, 1, 1, 1); const __m128i src0 = _mm_loadl_epi64((__m128i*)&in[0 * 16]); const __m128i src1 = _mm_loadl_epi64((__m128i*)&in[1 * 16]); const __m128i src2 = _mm_loadl_epi64((__m128i*)&in[2 * 16]); @@ -484,33 +391,38 @@ static void FTransformWHTRow(const int16_t* const in, __m128i* const out) { const __m128i A23 = _mm_unpacklo_epi16(src2, src3); // A2 A3 | ... const __m128i B0 = _mm_adds_epi16(A01, A23); // a0 | a1 | ... const __m128i B1 = _mm_subs_epi16(A01, A23); // a3 | a2 | ... - const __m128i C0 = _mm_unpacklo_epi32(B0, B1); // a0 | a1 | a3 | a2 - const __m128i C1 = _mm_unpacklo_epi32(B1, B0); // a3 | a2 | a0 | a1 - const __m128i D0 = _mm_madd_epi16(C0, kMult1); // out0, out1 - const __m128i D1 = _mm_madd_epi16(C1, kMult2); // out2, out3 - *out = _mm_unpacklo_epi64(D0, D1); + const __m128i C0 = _mm_unpacklo_epi32(B0, B1); // a0 | a1 | a3 | a2 | ... + const __m128i C1 = _mm_unpacklo_epi32(B1, B0); // a3 | a2 | a0 | a1 | ... + const __m128i D = _mm_unpacklo_epi64(C0, C1); // a0 a1 a3 a2 a3 a2 a0 a1 + *out = _mm_madd_epi16(D, kMult); } static void FTransformWHT(const int16_t* in, int16_t* out) { + // Input is 12b signed. __m128i row0, row1, row2, row3; + // Rows are 14b signed. FTransformWHTRow(in + 0 * 64, &row0); FTransformWHTRow(in + 1 * 64, &row1); FTransformWHTRow(in + 2 * 64, &row2); FTransformWHTRow(in + 3 * 64, &row3); { + // The a* are 15b signed. const __m128i a0 = _mm_add_epi32(row0, row2); const __m128i a1 = _mm_add_epi32(row1, row3); const __m128i a2 = _mm_sub_epi32(row1, row3); const __m128i a3 = _mm_sub_epi32(row0, row2); - const __m128i b0 = _mm_srai_epi32(_mm_add_epi32(a0, a1), 1); - const __m128i b1 = _mm_srai_epi32(_mm_add_epi32(a3, a2), 1); - const __m128i b2 = _mm_srai_epi32(_mm_sub_epi32(a3, a2), 1); - const __m128i b3 = _mm_srai_epi32(_mm_sub_epi32(a0, a1), 1); - const __m128i out0 = _mm_packs_epi32(b0, b1); - const __m128i out1 = _mm_packs_epi32(b2, b3); - _mm_storeu_si128((__m128i*)&out[0], out0); - _mm_storeu_si128((__m128i*)&out[8], out1); + const __m128i a0a3 = _mm_packs_epi32(a0, a3); + const __m128i a1a2 = _mm_packs_epi32(a1, a2); + + // The b* are 16b signed. 
+ const __m128i b0b1 = _mm_add_epi16(a0a3, a1a2); + const __m128i b3b2 = _mm_sub_epi16(a0a3, a1a2); + const __m128i tmp_b2b3 = _mm_unpackhi_epi64(b3b2, b3b2); + const __m128i b2b3 = _mm_unpacklo_epi64(tmp_b2b3, b3b2); + + _mm_storeu_si128((__m128i*)&out[0], _mm_srai_epi16(b0b1, 1)); + _mm_storeu_si128((__m128i*)&out[8], _mm_srai_epi16(b2b3, 1)); } } @@ -703,12 +615,10 @@ static WEBP_INLINE void TrueMotion(uint8_t* dst, const uint8_t* left, static WEBP_INLINE void DC8uv(uint8_t* dst, const uint8_t* left, const uint8_t* top) { - const __m128i zero = _mm_setzero_si128(); const __m128i top_values = _mm_loadl_epi64((const __m128i*)top); const __m128i left_values = _mm_loadl_epi64((const __m128i*)left); - const __m128i sum_top = _mm_sad_epu8(top_values, zero); - const __m128i sum_left = _mm_sad_epu8(left_values, zero); - const int DC = _mm_cvtsi128_si32(sum_top) + _mm_cvtsi128_si32(sum_left) + 8; + const __m128i combined = _mm_unpacklo_epi64(top_values, left_values); + const int DC = VP8HorizontalAdd8b(&combined) + 8; Put8x8uv(DC >> 4, dst); } @@ -746,27 +656,16 @@ static WEBP_INLINE void DC8uvMode(uint8_t* dst, const uint8_t* left, static WEBP_INLINE void DC16(uint8_t* dst, const uint8_t* left, const uint8_t* top) { - const __m128i zero = _mm_setzero_si128(); const __m128i top_row = _mm_load_si128((const __m128i*)top); const __m128i left_row = _mm_load_si128((const __m128i*)left); - const __m128i sad8x2 = _mm_sad_epu8(top_row, zero); - // sum the two sads: sad8x2[0:1] + sad8x2[8:9] - const __m128i sum_top = _mm_add_epi16(sad8x2, _mm_shuffle_epi32(sad8x2, 2)); - const __m128i sad8x2_left = _mm_sad_epu8(left_row, zero); - // sum the two sads: sad8x2[0:1] + sad8x2[8:9] - const __m128i sum_left = - _mm_add_epi16(sad8x2_left, _mm_shuffle_epi32(sad8x2_left, 2)); - const int DC = _mm_cvtsi128_si32(sum_top) + _mm_cvtsi128_si32(sum_left) + 16; + const int DC = + VP8HorizontalAdd8b(&top_row) + VP8HorizontalAdd8b(&left_row) + 16; Put16(DC >> 5, dst); } static WEBP_INLINE void DC16NoLeft(uint8_t* dst, const uint8_t* top) { - const __m128i zero = _mm_setzero_si128(); const __m128i top_row = _mm_load_si128((const __m128i*)top); - const __m128i sad8x2 = _mm_sad_epu8(top_row, zero); - // sum the two sads: sad8x2[0:1] + sad8x2[8:9] - const __m128i sum = _mm_add_epi16(sad8x2, _mm_shuffle_epi32(sad8x2, 2)); - const int DC = _mm_cvtsi128_si32(sum) + 8; + const int DC = VP8HorizontalAdd8b(&top_row) + 8; Put16(DC >> 4, dst); } @@ -821,7 +720,7 @@ static WEBP_INLINE void VE4(uint8_t* dst, const uint8_t* top) { // vertical const uint32_t vals = _mm_cvtsi128_si32(avg); int i; for (i = 0; i < 4; ++i) { - *(uint32_t*)(dst + i * BPS) = vals; + WebPUint32ToMem(dst + i * BPS, vals); } } @@ -831,10 +730,10 @@ static WEBP_INLINE void HE4(uint8_t* dst, const uint8_t* top) { // horizontal const int J = top[-3]; const int K = top[-4]; const int L = top[-5]; - *(uint32_t*)(dst + 0 * BPS) = 0x01010101U * AVG3(X, I, J); - *(uint32_t*)(dst + 1 * BPS) = 0x01010101U * AVG3(I, J, K); - *(uint32_t*)(dst + 2 * BPS) = 0x01010101U * AVG3(J, K, L); - *(uint32_t*)(dst + 3 * BPS) = 0x01010101U * AVG3(K, L, L); + WebPUint32ToMem(dst + 0 * BPS, 0x01010101U * AVG3(X, I, J)); + WebPUint32ToMem(dst + 1 * BPS, 0x01010101U * AVG3(I, J, K)); + WebPUint32ToMem(dst + 2 * BPS, 0x01010101U * AVG3(J, K, L)); + WebPUint32ToMem(dst + 3 * BPS, 0x01010101U * AVG3(K, L, L)); } static WEBP_INLINE void DC4(uint8_t* dst, const uint8_t* top) { @@ -854,10 +753,10 @@ static WEBP_INLINE void LD4(uint8_t* dst, const uint8_t* top) { // Down-Left const __m128i lsb = 
_mm_and_si128(_mm_xor_si128(ABCDEFGH, CDEFGHH0), one); const __m128i avg2 = _mm_subs_epu8(avg1, lsb); const __m128i abcdefg = _mm_avg_epu8(avg2, BCDEFGH0); - *(uint32_t*)(dst + 0 * BPS) = _mm_cvtsi128_si32( abcdefg ); - *(uint32_t*)(dst + 1 * BPS) = _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 1)); - *(uint32_t*)(dst + 2 * BPS) = _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 2)); - *(uint32_t*)(dst + 3 * BPS) = _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 3)); + WebPUint32ToMem(dst + 0 * BPS, _mm_cvtsi128_si32( abcdefg )); + WebPUint32ToMem(dst + 1 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 1))); + WebPUint32ToMem(dst + 2 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 2))); + WebPUint32ToMem(dst + 3 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 3))); } static WEBP_INLINE void VR4(uint8_t* dst, @@ -876,10 +775,10 @@ static WEBP_INLINE void VR4(uint8_t* dst, const __m128i lsb = _mm_and_si128(_mm_xor_si128(IXABCD, ABCD0), one); const __m128i avg2 = _mm_subs_epu8(avg1, lsb); const __m128i efgh = _mm_avg_epu8(avg2, XABCD); - *(uint32_t*)(dst + 0 * BPS) = _mm_cvtsi128_si32( abcd ); - *(uint32_t*)(dst + 1 * BPS) = _mm_cvtsi128_si32( efgh ); - *(uint32_t*)(dst + 2 * BPS) = _mm_cvtsi128_si32(_mm_slli_si128(abcd, 1)); - *(uint32_t*)(dst + 3 * BPS) = _mm_cvtsi128_si32(_mm_slli_si128(efgh, 1)); + WebPUint32ToMem(dst + 0 * BPS, _mm_cvtsi128_si32( abcd )); + WebPUint32ToMem(dst + 1 * BPS, _mm_cvtsi128_si32( efgh )); + WebPUint32ToMem(dst + 2 * BPS, _mm_cvtsi128_si32(_mm_slli_si128(abcd, 1))); + WebPUint32ToMem(dst + 3 * BPS, _mm_cvtsi128_si32(_mm_slli_si128(efgh, 1))); // these two are hard to implement in SSE2, so we keep the C-version: DST(0, 2) = AVG3(J, I, X); @@ -902,10 +801,10 @@ static WEBP_INLINE void VL4(uint8_t* dst, const __m128i lsb2 = _mm_and_si128(abbc, lsb1); const __m128i avg4 = _mm_subs_epu8(avg3, lsb2); const uint32_t extra_out = _mm_cvtsi128_si32(_mm_srli_si128(avg4, 4)); - *(uint32_t*)(dst + 0 * BPS) = _mm_cvtsi128_si32( avg1 ); - *(uint32_t*)(dst + 1 * BPS) = _mm_cvtsi128_si32( avg4 ); - *(uint32_t*)(dst + 2 * BPS) = _mm_cvtsi128_si32(_mm_srli_si128(avg1, 1)); - *(uint32_t*)(dst + 3 * BPS) = _mm_cvtsi128_si32(_mm_srli_si128(avg4, 1)); + WebPUint32ToMem(dst + 0 * BPS, _mm_cvtsi128_si32( avg1 )); + WebPUint32ToMem(dst + 1 * BPS, _mm_cvtsi128_si32( avg4 )); + WebPUint32ToMem(dst + 2 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(avg1, 1))); + WebPUint32ToMem(dst + 3 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(avg4, 1))); // these two are hard to get and irregular DST(3, 2) = (extra_out >> 0) & 0xff; @@ -922,10 +821,10 @@ static WEBP_INLINE void RD4(uint8_t* dst, const uint8_t* top) { // Down-right const __m128i lsb = _mm_and_si128(_mm_xor_si128(JIXABCD__, LKJIXABCD), one); const __m128i avg2 = _mm_subs_epu8(avg1, lsb); const __m128i abcdefg = _mm_avg_epu8(avg2, KJIXABCD_); - *(uint32_t*)(dst + 3 * BPS) = _mm_cvtsi128_si32( abcdefg ); - *(uint32_t*)(dst + 2 * BPS) = _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 1)); - *(uint32_t*)(dst + 1 * BPS) = _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 2)); - *(uint32_t*)(dst + 0 * BPS) = _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 3)); + WebPUint32ToMem(dst + 3 * BPS, _mm_cvtsi128_si32( abcdefg )); + WebPUint32ToMem(dst + 2 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 1))); + WebPUint32ToMem(dst + 1 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 2))); + WebPUint32ToMem(dst + 0 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 3))); } static WEBP_INLINE void HU4(uint8_t* dst, const uint8_t* top) { @@ -968,14 +867,14 @@ static WEBP_INLINE void HD4(uint8_t* dst, 
const uint8_t* top) { static WEBP_INLINE void TM4(uint8_t* dst, const uint8_t* top) { const __m128i zero = _mm_setzero_si128(); - const __m128i top_values = _mm_cvtsi32_si128(MemToUint32(top)); + const __m128i top_values = _mm_cvtsi32_si128(WebPMemToUint32(top)); const __m128i top_base = _mm_unpacklo_epi8(top_values, zero); int y; for (y = 0; y < 4; ++y, dst += BPS) { const int val = top[-2 - y] - top[-1]; const __m128i base = _mm_set1_epi16(val); const __m128i out = _mm_packus_epi16(_mm_add_epi16(base, top_base), zero); - *(int*)dst = _mm_cvtsi128_si32(out); + WebPUint32ToMem(dst, _mm_cvtsi128_si32(out)); } } @@ -1153,15 +1052,15 @@ static int SSE4x4(const uint8_t* a, const uint8_t* b) { // reconstructed samples. // Hadamard transform -// Returns the difference between the weighted sum of the absolute value of -// transformed coefficients. +// Returns the weighted sum of the absolute value of transformed coefficients. +// w[] contains a row-major 4 by 4 symmetric matrix. static int TTransform(const uint8_t* inA, const uint8_t* inB, const uint16_t* const w) { int32_t sum[4]; __m128i tmp_0, tmp_1, tmp_2, tmp_3; const __m128i zero = _mm_setzero_si128(); - // Load, combine and transpose inputs. + // Load and combine inputs. { const __m128i inA_0 = _mm_loadl_epi64((const __m128i*)&inA[BPS * 0]); const __m128i inA_1 = _mm_loadl_epi64((const __m128i*)&inA[BPS * 1]); @@ -1173,37 +1072,22 @@ static int TTransform(const uint8_t* inA, const uint8_t* inB, const __m128i inB_3 = _mm_loadl_epi64((const __m128i*)&inB[BPS * 3]); // Combine inA and inB (we'll do two transforms in parallel). - const __m128i inAB_0 = _mm_unpacklo_epi8(inA_0, inB_0); - const __m128i inAB_1 = _mm_unpacklo_epi8(inA_1, inB_1); - const __m128i inAB_2 = _mm_unpacklo_epi8(inA_2, inB_2); - const __m128i inAB_3 = _mm_unpacklo_epi8(inA_3, inB_3); - // a00 b00 a01 b01 a02 b03 a03 b03 0 0 0 0 0 0 0 0 - // a10 b10 a11 b11 a12 b12 a13 b13 0 0 0 0 0 0 0 0 - // a20 b20 a21 b21 a22 b22 a23 b23 0 0 0 0 0 0 0 0 - // a30 b30 a31 b31 a32 b32 a33 b33 0 0 0 0 0 0 0 0 - - // Transpose the two 4x4, discarding the filling zeroes. - const __m128i transpose0_0 = _mm_unpacklo_epi8(inAB_0, inAB_2); - const __m128i transpose0_1 = _mm_unpacklo_epi8(inAB_1, inAB_3); - // a00 a20 b00 b20 a01 a21 b01 b21 a02 a22 b02 b22 a03 a23 b03 b23 - // a10 a30 b10 b30 a11 a31 b11 b31 a12 a32 b12 b32 a13 a33 b13 b33 - const __m128i transpose1_0 = _mm_unpacklo_epi8(transpose0_0, transpose0_1); - const __m128i transpose1_1 = _mm_unpackhi_epi8(transpose0_0, transpose0_1); - // a00 a10 a20 a30 b00 b10 b20 b30 a01 a11 a21 a31 b01 b11 b21 b31 - // a02 a12 a22 a32 b02 b12 b22 b32 a03 a13 a23 a33 b03 b13 b23 b33 - - // Convert to 16b. 
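The *(uint32_t*) loads and stores replaced by WebPMemToUint32()/WebPUint32ToMem() in the intra-prediction hunks above avoid type-punned, possibly unaligned 32-bit accesses on the prediction rows. The helpers themselves live in dsp.h and are not part of this diff; a memcpy-based definition along these lines is the usual strict-aliasing-safe way to express them (sketch only, names hypothetical, not the shipped code):

#include <stdint.h>
#include <string.h>

static inline uint32_t MemToUint32(const uint8_t* const ptr) {
  uint32_t v;
  memcpy(&v, ptr, sizeof(v));      // compilers lower this to a single load
  return v;
}

static inline void Uint32ToMem(uint8_t* const ptr, uint32_t val) {
  memcpy(ptr, &val, sizeof(val));  // single store, no alignment assumption
}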
- tmp_0 = _mm_unpacklo_epi8(transpose1_0, zero); - tmp_1 = _mm_unpackhi_epi8(transpose1_0, zero); - tmp_2 = _mm_unpacklo_epi8(transpose1_1, zero); - tmp_3 = _mm_unpackhi_epi8(transpose1_1, zero); - // a00 a10 a20 a30 b00 b10 b20 b30 - // a01 a11 a21 a31 b01 b11 b21 b31 - // a02 a12 a22 a32 b02 b12 b22 b32 - // a03 a13 a23 a33 b03 b13 b23 b33 + const __m128i inAB_0 = _mm_unpacklo_epi32(inA_0, inB_0); + const __m128i inAB_1 = _mm_unpacklo_epi32(inA_1, inB_1); + const __m128i inAB_2 = _mm_unpacklo_epi32(inA_2, inB_2); + const __m128i inAB_3 = _mm_unpacklo_epi32(inA_3, inB_3); + tmp_0 = _mm_unpacklo_epi8(inAB_0, zero); + tmp_1 = _mm_unpacklo_epi8(inAB_1, zero); + tmp_2 = _mm_unpacklo_epi8(inAB_2, zero); + tmp_3 = _mm_unpacklo_epi8(inAB_3, zero); + // a00 a01 a02 a03 b00 b01 b02 b03 + // a10 a11 a12 a13 b10 b11 b12 b13 + // a20 a21 a22 a23 b20 b21 b22 b23 + // a30 a31 a32 a33 b30 b31 b32 b33 } - // Horizontal pass and subsequent transpose. + // Vertical pass first to avoid a transpose (vertical and horizontal passes + // are commutative because w/kWeightY is symmetric) and subsequent transpose. { // Calculate a and b (two 4x4 at once). const __m128i a0 = _mm_add_epi16(tmp_0, tmp_2); @@ -1220,37 +1104,12 @@ static int TTransform(const uint8_t* inA, const uint8_t* inB, // a30 a31 a32 a33 b30 b31 b32 b33 // Transpose the two 4x4. - const __m128i transpose0_0 = _mm_unpacklo_epi16(b0, b1); - const __m128i transpose0_1 = _mm_unpacklo_epi16(b2, b3); - const __m128i transpose0_2 = _mm_unpackhi_epi16(b0, b1); - const __m128i transpose0_3 = _mm_unpackhi_epi16(b2, b3); - // a00 a10 a01 a11 a02 a12 a03 a13 - // a20 a30 a21 a31 a22 a32 a23 a33 - // b00 b10 b01 b11 b02 b12 b03 b13 - // b20 b30 b21 b31 b22 b32 b23 b33 - const __m128i transpose1_0 = _mm_unpacklo_epi32(transpose0_0, transpose0_1); - const __m128i transpose1_1 = _mm_unpacklo_epi32(transpose0_2, transpose0_3); - const __m128i transpose1_2 = _mm_unpackhi_epi32(transpose0_0, transpose0_1); - const __m128i transpose1_3 = _mm_unpackhi_epi32(transpose0_2, transpose0_3); - // a00 a10 a20 a30 a01 a11 a21 a31 - // b00 b10 b20 b30 b01 b11 b21 b31 - // a02 a12 a22 a32 a03 a13 a23 a33 - // b02 b12 a22 b32 b03 b13 b23 b33 - tmp_0 = _mm_unpacklo_epi64(transpose1_0, transpose1_1); - tmp_1 = _mm_unpackhi_epi64(transpose1_0, transpose1_1); - tmp_2 = _mm_unpacklo_epi64(transpose1_2, transpose1_3); - tmp_3 = _mm_unpackhi_epi64(transpose1_2, transpose1_3); - // a00 a10 a20 a30 b00 b10 b20 b30 - // a01 a11 a21 a31 b01 b11 b21 b31 - // a02 a12 a22 a32 b02 b12 b22 b32 - // a03 a13 a23 a33 b03 b13 b23 b33 + VP8Transpose_2_4x4_16b(&b0, &b1, &b2, &b3, &tmp_0, &tmp_1, &tmp_2, &tmp_3); } - // Vertical pass and difference of weighted sums. + // Horizontal pass and difference of weighted sums. { // Load all inputs. - // TODO(cduvivier): Make variable declarations and allocations aligned so - // we can use _mm_load_si128 instead of _mm_loadu_si128. const __m128i w_0 = _mm_loadu_si128((const __m128i*)&w[0]); const __m128i w_8 = _mm_loadu_si128((const __m128i*)&w[8]); @@ -1328,8 +1187,6 @@ static WEBP_INLINE int DoQuantizeBlock(int16_t in[16], int16_t out[16], __m128i packed_out; // Load all inputs. - // TODO(cduvivier): Make variable declarations and allocations aligned so that - // we can use _mm_load_si128 instead of _mm_loadu_si128. 
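Both TTransform() rewrites (SSE2 above, SSE4.1 below) rely on the observation spelled out in the new comment: with a symmetric weight matrix w[], running the vertical butterflies before the horizontal ones only transposes the coefficient matrix, so the weighted sum of absolute values is unchanged and one transpose can be dropped. A single-block scalar sketch of the quantity being computed (illustrative only; the SIMD code processes two blocks at once):

#include <stdlib.h>  /* abs() */

static int TTransformOneBlock(const uint8_t* in, int stride,
                              const uint16_t* const w) {
  int tmp[16], sum = 0, i;
  for (i = 0; i < 4; ++i, in += stride) {      // horizontal butterflies
    const int a0 = in[0] + in[2], a1 = in[1] + in[3];
    const int a2 = in[1] - in[3], a3 = in[0] - in[2];
    tmp[4 * i + 0] = a0 + a1;
    tmp[4 * i + 1] = a3 + a2;
    tmp[4 * i + 2] = a3 - a2;
    tmp[4 * i + 3] = a0 - a1;
  }
  for (i = 0; i < 4; ++i) {                    // vertical butterflies + weights
    const int a0 = tmp[i + 0] + tmp[i + 8],  a1 = tmp[i + 4] + tmp[i + 12];
    const int a2 = tmp[i + 4] - tmp[i + 12], a3 = tmp[i + 0] - tmp[i + 8];
    sum += w[i +  0] * abs(a0 + a1);
    sum += w[i +  4] * abs(a3 + a2);
    sum += w[i +  8] * abs(a3 - a2);
    sum += w[i + 12] * abs(a0 - a1);
  }
  return sum;  // Disto4x4() then combines the two blocks' weighted sums.
}

Swapping the two loops only exchanges the roles of rows and columns in the coefficient matrix, so a symmetric w[] yields the same sum either way; that is exactly what lets the SIMD versions start with the vertical pass.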
__m128i in0 = _mm_loadu_si128((__m128i*)&in[0]); __m128i in8 = _mm_loadu_si128((__m128i*)&in[8]); const __m128i iq0 = _mm_loadu_si128((const __m128i*)&mtx->iq_[0]); diff --git a/drivers/webp/dsp/enc_sse41.c b/drivers/webp/dsp/enc_sse41.c index 27f4189833..a1783901a6 100644 --- a/drivers/webp/dsp/enc_sse41.c +++ b/drivers/webp/dsp/enc_sse41.c @@ -17,6 +17,7 @@ #include <smmintrin.h> #include <stdlib.h> // for abs() +#include "./common_sse2.h" #include "../enc/vp8enci.h" //------------------------------------------------------------------------------ @@ -67,55 +68,45 @@ static void CollectHistogram(const uint8_t* ref, const uint8_t* pred, // reconstructed samples. // Hadamard transform -// Returns the difference between the weighted sum of the absolute value of -// transformed coefficients. +// Returns the weighted sum of the absolute value of transformed coefficients. +// w[] contains a row-major 4 by 4 symmetric matrix. static int TTransform(const uint8_t* inA, const uint8_t* inB, const uint16_t* const w) { + int32_t sum[4]; __m128i tmp_0, tmp_1, tmp_2, tmp_3; - // Load, combine and transpose inputs. + // Load and combine inputs. { - const __m128i inA_0 = _mm_loadl_epi64((const __m128i*)&inA[BPS * 0]); - const __m128i inA_1 = _mm_loadl_epi64((const __m128i*)&inA[BPS * 1]); - const __m128i inA_2 = _mm_loadl_epi64((const __m128i*)&inA[BPS * 2]); + const __m128i inA_0 = _mm_loadu_si128((const __m128i*)&inA[BPS * 0]); + const __m128i inA_1 = _mm_loadu_si128((const __m128i*)&inA[BPS * 1]); + const __m128i inA_2 = _mm_loadu_si128((const __m128i*)&inA[BPS * 2]); + // In SSE4.1, with gcc 4.8 at least (maybe other versions), + // _mm_loadu_si128 is faster than _mm_loadl_epi64. But for the last lump + // of inA and inB, _mm_loadl_epi64 is still used not to have an out of + // bound read. const __m128i inA_3 = _mm_loadl_epi64((const __m128i*)&inA[BPS * 3]); - const __m128i inB_0 = _mm_loadl_epi64((const __m128i*)&inB[BPS * 0]); - const __m128i inB_1 = _mm_loadl_epi64((const __m128i*)&inB[BPS * 1]); - const __m128i inB_2 = _mm_loadl_epi64((const __m128i*)&inB[BPS * 2]); + const __m128i inB_0 = _mm_loadu_si128((const __m128i*)&inB[BPS * 0]); + const __m128i inB_1 = _mm_loadu_si128((const __m128i*)&inB[BPS * 1]); + const __m128i inB_2 = _mm_loadu_si128((const __m128i*)&inB[BPS * 2]); const __m128i inB_3 = _mm_loadl_epi64((const __m128i*)&inB[BPS * 3]); // Combine inA and inB (we'll do two transforms in parallel). - const __m128i inAB_0 = _mm_unpacklo_epi8(inA_0, inB_0); - const __m128i inAB_1 = _mm_unpacklo_epi8(inA_1, inB_1); - const __m128i inAB_2 = _mm_unpacklo_epi8(inA_2, inB_2); - const __m128i inAB_3 = _mm_unpacklo_epi8(inA_3, inB_3); - // a00 b00 a01 b01 a02 b03 a03 b03 0 0 0 0 0 0 0 0 - // a10 b10 a11 b11 a12 b12 a13 b13 0 0 0 0 0 0 0 0 - // a20 b20 a21 b21 a22 b22 a23 b23 0 0 0 0 0 0 0 0 - // a30 b30 a31 b31 a32 b32 a33 b33 0 0 0 0 0 0 0 0 - - // Transpose the two 4x4, discarding the filling zeroes. - const __m128i transpose0_0 = _mm_unpacklo_epi8(inAB_0, inAB_2); - const __m128i transpose0_1 = _mm_unpacklo_epi8(inAB_1, inAB_3); - // a00 a20 b00 b20 a01 a21 b01 b21 a02 a22 b02 b22 a03 a23 b03 b23 - // a10 a30 b10 b30 a11 a31 b11 b31 a12 a32 b12 b32 a13 a33 b13 b33 - const __m128i transpose1_0 = _mm_unpacklo_epi8(transpose0_0, transpose0_1); - const __m128i transpose1_1 = _mm_unpackhi_epi8(transpose0_0, transpose0_1); - // a00 a10 a20 a30 b00 b10 b20 b30 a01 a11 a21 a31 b01 b11 b21 b31 - // a02 a12 a22 a32 b02 b12 b22 b32 a03 a13 a23 a33 b03 b13 b23 b33 - - // Convert to 16b. 
- tmp_0 = _mm_cvtepu8_epi16(transpose1_0); - tmp_1 = _mm_cvtepu8_epi16(_mm_srli_si128(transpose1_0, 8)); - tmp_2 = _mm_cvtepu8_epi16(transpose1_1); - tmp_3 = _mm_cvtepu8_epi16(_mm_srli_si128(transpose1_1, 8)); - // a00 a10 a20 a30 b00 b10 b20 b30 - // a01 a11 a21 a31 b01 b11 b21 b31 - // a02 a12 a22 a32 b02 b12 b22 b32 - // a03 a13 a23 a33 b03 b13 b23 b33 + const __m128i inAB_0 = _mm_unpacklo_epi32(inA_0, inB_0); + const __m128i inAB_1 = _mm_unpacklo_epi32(inA_1, inB_1); + const __m128i inAB_2 = _mm_unpacklo_epi32(inA_2, inB_2); + const __m128i inAB_3 = _mm_unpacklo_epi32(inA_3, inB_3); + tmp_0 = _mm_cvtepu8_epi16(inAB_0); + tmp_1 = _mm_cvtepu8_epi16(inAB_1); + tmp_2 = _mm_cvtepu8_epi16(inAB_2); + tmp_3 = _mm_cvtepu8_epi16(inAB_3); + // a00 a01 a02 a03 b00 b01 b02 b03 + // a10 a11 a12 a13 b10 b11 b12 b13 + // a20 a21 a22 a23 b20 b21 b22 b23 + // a30 a31 a32 a33 b30 b31 b32 b33 } - // Horizontal pass and subsequent transpose. + // Vertical pass first to avoid a transpose (vertical and horizontal passes + // are commutative because w/kWeightY is symmetric) and subsequent transpose. { // Calculate a and b (two 4x4 at once). const __m128i a0 = _mm_add_epi16(tmp_0, tmp_2); @@ -132,33 +123,10 @@ static int TTransform(const uint8_t* inA, const uint8_t* inB, // a30 a31 a32 a33 b30 b31 b32 b33 // Transpose the two 4x4. - const __m128i transpose0_0 = _mm_unpacklo_epi16(b0, b1); - const __m128i transpose0_1 = _mm_unpacklo_epi16(b2, b3); - const __m128i transpose0_2 = _mm_unpackhi_epi16(b0, b1); - const __m128i transpose0_3 = _mm_unpackhi_epi16(b2, b3); - // a00 a10 a01 a11 a02 a12 a03 a13 - // a20 a30 a21 a31 a22 a32 a23 a33 - // b00 b10 b01 b11 b02 b12 b03 b13 - // b20 b30 b21 b31 b22 b32 b23 b33 - const __m128i transpose1_0 = _mm_unpacklo_epi32(transpose0_0, transpose0_1); - const __m128i transpose1_1 = _mm_unpacklo_epi32(transpose0_2, transpose0_3); - const __m128i transpose1_2 = _mm_unpackhi_epi32(transpose0_0, transpose0_1); - const __m128i transpose1_3 = _mm_unpackhi_epi32(transpose0_2, transpose0_3); - // a00 a10 a20 a30 a01 a11 a21 a31 - // b00 b10 b20 b30 b01 b11 b21 b31 - // a02 a12 a22 a32 a03 a13 a23 a33 - // b02 b12 a22 b32 b03 b13 b23 b33 - tmp_0 = _mm_unpacklo_epi64(transpose1_0, transpose1_1); - tmp_1 = _mm_unpackhi_epi64(transpose1_0, transpose1_1); - tmp_2 = _mm_unpacklo_epi64(transpose1_2, transpose1_3); - tmp_3 = _mm_unpackhi_epi64(transpose1_2, transpose1_3); - // a00 a10 a20 a30 b00 b10 b20 b30 - // a01 a11 a21 a31 b01 b11 b21 b31 - // a02 a12 a22 a32 b02 b12 b22 b32 - // a03 a13 a23 a33 b03 b13 b23 b33 + VP8Transpose_2_4x4_16b(&b0, &b1, &b2, &b3, &tmp_0, &tmp_1, &tmp_2, &tmp_3); } - // Vertical pass and difference of weighted sums. + // Horizontal pass and difference of weighted sums. { // Load all inputs. const __m128i w_0 = _mm_loadu_si128((const __m128i*)&w[0]); @@ -195,11 +163,9 @@ static int TTransform(const uint8_t* inA, const uint8_t* inB, // difference of weighted sums A_b2 = _mm_sub_epi32(A_b0, B_b0); - // cascading summation of the differences - B_b0 = _mm_hadd_epi32(A_b2, A_b2); - B_b2 = _mm_hadd_epi32(B_b0, B_b0); - return _mm_cvtsi128_si32(B_b2); + _mm_storeu_si128((__m128i*)&sum[0], A_b2); } + return sum[0] + sum[1] + sum[2] + sum[3]; } static int Disto4x4(const uint8_t* const a, const uint8_t* const b, @@ -240,8 +206,6 @@ static WEBP_INLINE int DoQuantizeBlock(int16_t in[16], int16_t out[16], __m128i packed_out; // Load all inputs. 
- // TODO(cduvivier): Make variable declarations and allocations aligned so that - // we can use _mm_load_si128 instead of _mm_loadu_si128. __m128i in0 = _mm_loadu_si128((__m128i*)&in[0]); __m128i in8 = _mm_loadu_si128((__m128i*)&in[8]); const __m128i iq0 = _mm_loadu_si128((const __m128i*)&mtx->iq_[0]); diff --git a/drivers/webp/dsp/filters.c b/drivers/webp/dsp/filters.c index 5c30f2e457..9f04faf0cb 100644 --- a/drivers/webp/dsp/filters.c +++ b/drivers/webp/dsp/filters.c @@ -184,19 +184,40 @@ static void GradientFilter(const uint8_t* data, int width, int height, //------------------------------------------------------------------------------ -static void VerticalUnfilter(int width, int height, int stride, int row, - int num_rows, uint8_t* data) { - DoVerticalFilter(data, width, height, stride, row, num_rows, 1, data); +static void HorizontalUnfilter(const uint8_t* prev, const uint8_t* in, + uint8_t* out, int width) { + uint8_t pred = (prev == NULL) ? 0 : prev[0]; + int i; + for (i = 0; i < width; ++i) { + out[i] = pred + in[i]; + pred = out[i]; + } } -static void HorizontalUnfilter(int width, int height, int stride, int row, - int num_rows, uint8_t* data) { - DoHorizontalFilter(data, width, height, stride, row, num_rows, 1, data); +static void VerticalUnfilter(const uint8_t* prev, const uint8_t* in, + uint8_t* out, int width) { + if (prev == NULL) { + HorizontalUnfilter(NULL, in, out, width); + } else { + int i; + for (i = 0; i < width; ++i) out[i] = prev[i] + in[i]; + } } -static void GradientUnfilter(int width, int height, int stride, int row, - int num_rows, uint8_t* data) { - DoGradientFilter(data, width, height, stride, row, num_rows, 1, data); +static void GradientUnfilter(const uint8_t* prev, const uint8_t* in, + uint8_t* out, int width) { + if (prev == NULL) { + HorizontalUnfilter(NULL, in, out, width); + } else { + uint8_t top = prev[0], top_left = top, left = top; + int i; + for (i = 0; i < width; ++i) { + top = prev[i]; // need to read this first, in case prev==out + left = in[i] + GradientPredictor(left, top, top_left); + top_left = top; + out[i] = left; + } + } } //------------------------------------------------------------------------------ diff --git a/drivers/webp/dsp/filters_mips_dsp_r2.c b/drivers/webp/dsp/filters_mips_dsp_r2.c index 8134af511b..1d82e3c2e1 100644 --- a/drivers/webp/dsp/filters_mips_dsp_r2.c +++ b/drivers/webp/dsp/filters_mips_dsp_r2.c @@ -33,10 +33,6 @@ assert(row >= 0 && num_rows > 0 && row + num_rows <= height); \ (void)height; // Silence unused warning. 
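The plain-C filters.c hunk above switches the unfilter entry points to a per-row signature: (prev, in, out, width), where prev points at the already reconstructed previous row and is NULL for the topmost row. A hypothetical caller (not part of the patch) would drive them one row at a time; reconstructing in place works because the row functions shown above read each input byte before writing the corresponding output byte:

#include <stddef.h>
#include <stdint.h>

/* WebPUnfilters[] and the WEBP_FILTER_* indices come from dsp.h. */
static void UnfilterRows(int filter /* e.g. WEBP_FILTER_HORIZONTAL */,
                         uint8_t* data, int width, int height, int stride) {
  const uint8_t* prev = NULL;
  int y;
  for (y = 0; y < height; ++y) {
    uint8_t* const row = data + (size_t)y * stride;
    WebPUnfilters[filter](prev, row, row, width);  // in == out: in place
    prev = row;                                    // next row's 'prev'
  }
}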
-// if INVERSE -// preds == &dst[-1] == &src[-1] -// else -// preds == &src[-1] != &dst[-1] #define DO_PREDICT_LINE(SRC, DST, LENGTH, INVERSE) do { \ const uint8_t* psrc = (uint8_t*)(SRC); \ uint8_t* pdst = (uint8_t*)(DST); \ @@ -45,27 +41,28 @@ __asm__ volatile ( \ ".set push \n\t" \ ".set noreorder \n\t" \ - "srl %[temp0], %[length], 0x2 \n\t" \ + "srl %[temp0], %[length], 2 \n\t" \ "beqz %[temp0], 4f \n\t" \ - " andi %[temp6], %[length], 0x3 \n\t" \ + " andi %[temp6], %[length], 3 \n\t" \ ".if " #INVERSE " \n\t" \ - "lbu %[temp1], -1(%[src]) \n\t" \ "1: \n\t" \ + "lbu %[temp1], -1(%[dst]) \n\t" \ "lbu %[temp2], 0(%[src]) \n\t" \ "lbu %[temp3], 1(%[src]) \n\t" \ "lbu %[temp4], 2(%[src]) \n\t" \ "lbu %[temp5], 3(%[src]) \n\t" \ + "addu %[temp1], %[temp1], %[temp2] \n\t" \ + "addu %[temp2], %[temp1], %[temp3] \n\t" \ + "addu %[temp3], %[temp2], %[temp4] \n\t" \ + "addu %[temp4], %[temp3], %[temp5] \n\t" \ + "sb %[temp1], 0(%[dst]) \n\t" \ + "sb %[temp2], 1(%[dst]) \n\t" \ + "sb %[temp3], 2(%[dst]) \n\t" \ + "sb %[temp4], 3(%[dst]) \n\t" \ "addiu %[src], %[src], 4 \n\t" \ "addiu %[temp0], %[temp0], -1 \n\t" \ - "addu %[temp2], %[temp2], %[temp1] \n\t" \ - "addu %[temp3], %[temp3], %[temp2] \n\t" \ - "addu %[temp4], %[temp4], %[temp3] \n\t" \ - "addu %[temp1], %[temp5], %[temp4] \n\t" \ - "sb %[temp2], -4(%[src]) \n\t" \ - "sb %[temp3], -3(%[src]) \n\t" \ - "sb %[temp4], -2(%[src]) \n\t" \ "bnez %[temp0], 1b \n\t" \ - " sb %[temp1], -1(%[src]) \n\t" \ + " addiu %[dst], %[dst], 4 \n\t" \ ".else \n\t" \ "1: \n\t" \ "ulw %[temp1], -1(%[src]) \n\t" \ @@ -81,16 +78,16 @@ "beqz %[temp6], 3f \n\t" \ " nop \n\t" \ "2: \n\t" \ - "lbu %[temp1], -1(%[src]) \n\t" \ "lbu %[temp2], 0(%[src]) \n\t" \ - "addiu %[src], %[src], 1 \n\t" \ ".if " #INVERSE " \n\t" \ + "lbu %[temp1], -1(%[dst]) \n\t" \ "addu %[temp3], %[temp1], %[temp2] \n\t" \ - "sb %[temp3], -1(%[src]) \n\t" \ ".else \n\t" \ + "lbu %[temp1], -1(%[src]) \n\t" \ "subu %[temp3], %[temp1], %[temp2] \n\t" \ - "sb %[temp3], 0(%[dst]) \n\t" \ ".endif \n\t" \ + "addiu %[src], %[src], 1 \n\t" \ + "sb %[temp3], 0(%[dst]) \n\t" \ "addiu %[temp6], %[temp6], -1 \n\t" \ "bnez %[temp6], 2b \n\t" \ " addiu %[dst], %[dst], 1 \n\t" \ @@ -105,12 +102,8 @@ } while (0) static WEBP_INLINE void PredictLine(const uint8_t* src, uint8_t* dst, - int length, int inverse) { - if (inverse) { - DO_PREDICT_LINE(src, dst, length, 1); - } else { - DO_PREDICT_LINE(src, dst, length, 0); - } + int length) { + DO_PREDICT_LINE(src, dst, length, 0); } #define DO_PREDICT_LINE_VERTICAL(SRC, PRED, DST, LENGTH, INVERSE) do { \ @@ -172,16 +165,12 @@ static WEBP_INLINE void PredictLine(const uint8_t* src, uint8_t* dst, ); \ } while (0) -#define PREDICT_LINE_ONE_PASS(SRC, PRED, DST, INVERSE) do { \ +#define PREDICT_LINE_ONE_PASS(SRC, PRED, DST) do { \ int temp1, temp2, temp3; \ __asm__ volatile ( \ "lbu %[temp1], 0(%[src]) \n\t" \ "lbu %[temp2], 0(%[pred]) \n\t" \ - ".if " #INVERSE " \n\t" \ - "addu %[temp3], %[temp1], %[temp2] \n\t" \ - ".else \n\t" \ "subu %[temp3], %[temp1], %[temp2] \n\t" \ - ".endif \n\t" \ "sb %[temp3], 0(%[dst]) \n\t" \ : [temp1]"=&r"(temp1), [temp2]"=&r"(temp2), [temp3]"=&r"(temp3) \ : [pred]"r"((PRED)), [dst]"r"((DST)), [src]"r"((SRC)) \ @@ -192,10 +181,10 @@ static WEBP_INLINE void PredictLine(const uint8_t* src, uint8_t* dst, //------------------------------------------------------------------------------ // Horizontal filter. 
-#define FILTER_LINE_BY_LINE(INVERSE) do { \ +#define FILTER_LINE_BY_LINE do { \ while (row < last_row) { \ - PREDICT_LINE_ONE_PASS(in, preds - stride, out, INVERSE); \ - DO_PREDICT_LINE(in + 1, out + 1, width - 1, INVERSE); \ + PREDICT_LINE_ONE_PASS(in, preds - stride, out); \ + DO_PREDICT_LINE(in + 1, out + 1, width - 1, 0); \ ++row; \ preds += stride; \ in += stride; \ @@ -206,19 +195,19 @@ static WEBP_INLINE void PredictLine(const uint8_t* src, uint8_t* dst, static WEBP_INLINE void DoHorizontalFilter(const uint8_t* in, int width, int height, int stride, int row, int num_rows, - int inverse, uint8_t* out) { + uint8_t* out) { const uint8_t* preds; const size_t start_offset = row * stride; const int last_row = row + num_rows; SANITY_CHECK(in, out); in += start_offset; out += start_offset; - preds = inverse ? out : in; + preds = in; if (row == 0) { // Leftmost pixel is the same as input for topmost scanline. out[0] = in[0]; - PredictLine(in + 1, out + 1, width - 1, inverse); + PredictLine(in + 1, out + 1, width - 1); row = 1; preds += stride; in += stride; @@ -226,31 +215,21 @@ static WEBP_INLINE void DoHorizontalFilter(const uint8_t* in, } // Filter line-by-line. - if (inverse) { - FILTER_LINE_BY_LINE(1); - } else { - FILTER_LINE_BY_LINE(0); - } + FILTER_LINE_BY_LINE; } - #undef FILTER_LINE_BY_LINE static void HorizontalFilter(const uint8_t* data, int width, int height, int stride, uint8_t* filtered_data) { - DoHorizontalFilter(data, width, height, stride, 0, height, 0, filtered_data); -} - -static void HorizontalUnfilter(int width, int height, int stride, int row, - int num_rows, uint8_t* data) { - DoHorizontalFilter(data, width, height, stride, row, num_rows, 1, data); + DoHorizontalFilter(data, width, height, stride, 0, height, filtered_data); } //------------------------------------------------------------------------------ // Vertical filter. -#define FILTER_LINE_BY_LINE(INVERSE) do { \ +#define FILTER_LINE_BY_LINE do { \ while (row < last_row) { \ - DO_PREDICT_LINE_VERTICAL(in, preds, out, width, INVERSE); \ + DO_PREDICT_LINE_VERTICAL(in, preds, out, width, 0); \ ++row; \ preds += stride; \ in += stride; \ @@ -260,21 +239,20 @@ static void HorizontalUnfilter(int width, int height, int stride, int row, static WEBP_INLINE void DoVerticalFilter(const uint8_t* in, int width, int height, int stride, - int row, int num_rows, - int inverse, uint8_t* out) { + int row, int num_rows, uint8_t* out) { const uint8_t* preds; const size_t start_offset = row * stride; const int last_row = row + num_rows; SANITY_CHECK(in, out); in += start_offset; out += start_offset; - preds = inverse ? out : in; + preds = in; if (row == 0) { // Very first top-left pixel is copied. out[0] = in[0]; // Rest of top scan-line is left-predicted. - PredictLine(in + 1, out + 1, width - 1, inverse); + PredictLine(in + 1, out + 1, width - 1); row = 1; in += stride; out += stride; @@ -284,24 +262,13 @@ static WEBP_INLINE void DoVerticalFilter(const uint8_t* in, } // Filter line-by-line. 
- if (inverse) { - FILTER_LINE_BY_LINE(1); - } else { - FILTER_LINE_BY_LINE(0); - } + FILTER_LINE_BY_LINE; } - #undef FILTER_LINE_BY_LINE -#undef DO_PREDICT_LINE_VERTICAL static void VerticalFilter(const uint8_t* data, int width, int height, int stride, uint8_t* filtered_data) { - DoVerticalFilter(data, width, height, stride, 0, height, 0, filtered_data); -} - -static void VerticalUnfilter(int width, int height, int stride, int row, - int num_rows, uint8_t* data) { - DoVerticalFilter(data, width, height, stride, row, num_rows, 1, data); + DoVerticalFilter(data, width, height, stride, 0, height, filtered_data); } //------------------------------------------------------------------------------ @@ -321,10 +288,10 @@ static WEBP_INLINE int GradientPredictor(uint8_t a, uint8_t b, uint8_t c) { return temp0; } -#define FILTER_LINE_BY_LINE(INVERSE, PREDS, OPERATION) do { \ +#define FILTER_LINE_BY_LINE(PREDS, OPERATION) do { \ while (row < last_row) { \ int w; \ - PREDICT_LINE_ONE_PASS(in, PREDS - stride, out, INVERSE); \ + PREDICT_LINE_ONE_PASS(in, PREDS - stride, out); \ for (w = 1; w < width; ++w) { \ const int pred = GradientPredictor(PREDS[w - 1], \ PREDS[w - stride], \ @@ -339,20 +306,19 @@ static WEBP_INLINE int GradientPredictor(uint8_t a, uint8_t b, uint8_t c) { static WEBP_INLINE void DoGradientFilter(const uint8_t* in, int width, int height, int stride, - int row, int num_rows, - int inverse, uint8_t* out) { + int row, int num_rows, uint8_t* out) { const uint8_t* preds; const size_t start_offset = row * stride; const int last_row = row + num_rows; SANITY_CHECK(in, out); in += start_offset; out += start_offset; - preds = inverse ? out : in; + preds = in; // left prediction for top scan-line if (row == 0) { out[0] = in[0]; - PredictLine(in + 1, out + 1, width - 1, inverse); + PredictLine(in + 1, out + 1, width - 1); row = 1; preds += stride; in += stride; @@ -360,25 +326,49 @@ static WEBP_INLINE void DoGradientFilter(const uint8_t* in, } // Filter line-by-line. - if (inverse) { - FILTER_LINE_BY_LINE(1, out, +); - } else { - FILTER_LINE_BY_LINE(0, in, -); - } + FILTER_LINE_BY_LINE(in, -); } - #undef FILTER_LINE_BY_LINE static void GradientFilter(const uint8_t* data, int width, int height, int stride, uint8_t* filtered_data) { - DoGradientFilter(data, width, height, stride, 0, height, 0, filtered_data); + DoGradientFilter(data, width, height, stride, 0, height, filtered_data); } -static void GradientUnfilter(int width, int height, int stride, int row, - int num_rows, uint8_t* data) { - DoGradientFilter(data, width, height, stride, row, num_rows, 1, data); +//------------------------------------------------------------------------------ + +static void HorizontalUnfilter(const uint8_t* prev, const uint8_t* in, + uint8_t* out, int width) { + out[0] = in[0] + (prev == NULL ? 
0 : prev[0]); + DO_PREDICT_LINE(in + 1, out + 1, width - 1, 1); } +static void VerticalUnfilter(const uint8_t* prev, const uint8_t* in, + uint8_t* out, int width) { + if (prev == NULL) { + HorizontalUnfilter(NULL, in, out, width); + } else { + DO_PREDICT_LINE_VERTICAL(in, prev, out, width, 1); + } +} + +static void GradientUnfilter(const uint8_t* prev, const uint8_t* in, + uint8_t* out, int width) { + if (prev == NULL) { + HorizontalUnfilter(NULL, in, out, width); + } else { + uint8_t top = prev[0], top_left = top, left = top; + int i; + for (i = 0; i < width; ++i) { + top = prev[i]; // need to read this first, in case prev==dst + left = in[i] + GradientPredictor(left, top, top_left); + top_left = top; + out[i] = left; + } + } +} + +#undef DO_PREDICT_LINE_VERTICAL #undef PREDICT_LINE_ONE_PASS #undef DO_PREDICT_LINE #undef SANITY_CHECK @@ -389,13 +379,13 @@ static void GradientUnfilter(int width, int height, int stride, int row, extern void VP8FiltersInitMIPSdspR2(void); WEBP_TSAN_IGNORE_FUNCTION void VP8FiltersInitMIPSdspR2(void) { - WebPFilters[WEBP_FILTER_HORIZONTAL] = HorizontalFilter; - WebPFilters[WEBP_FILTER_VERTICAL] = VerticalFilter; - WebPFilters[WEBP_FILTER_GRADIENT] = GradientFilter; - WebPUnfilters[WEBP_FILTER_HORIZONTAL] = HorizontalUnfilter; WebPUnfilters[WEBP_FILTER_VERTICAL] = VerticalUnfilter; WebPUnfilters[WEBP_FILTER_GRADIENT] = GradientUnfilter; + + WebPFilters[WEBP_FILTER_HORIZONTAL] = HorizontalFilter; + WebPFilters[WEBP_FILTER_VERTICAL] = VerticalFilter; + WebPFilters[WEBP_FILTER_GRADIENT] = GradientFilter; } #else // !WEBP_USE_MIPS_DSP_R2 diff --git a/drivers/webp/dsp/filters_sse2.c b/drivers/webp/dsp/filters_sse2.c index bf93342eb7..67f77999e6 100644 --- a/drivers/webp/dsp/filters_sse2.c +++ b/drivers/webp/dsp/filters_sse2.c @@ -33,82 +33,39 @@ (void)height; // Silence unused warning. 
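Further down in this file, the SSE2 HorizontalUnfilter() replaces the serial recurrence out[i] = out[i - 1] + in[i] with a shift-and-add scan that reconstructs eight output bytes in three vector additions. The same doubling scan written out on a plain byte array (illustrative only, helper name hypothetical):

#include <stdint.h>

static void PrefixSum8(uint8_t v[8], uint8_t carry_in) {
  int step, k;
  v[0] = (uint8_t)(v[0] + carry_in);          // A1 = A0 + last
  for (step = 1; step < 8; step *= 2) {       // byte shifts by 1, 2, 4
    for (k = 7; k >= step; --k) {
      v[k] = (uint8_t)(v[k] + v[k - step]);   // A3 = A1 + (A1 << 1), etc.
    }
  }
  // v[k] now holds carry_in + v_orig[0] + ... + v_orig[k] (mod 256),
  // i.e. the k-th reconstructed byte of the 8-byte chunk.
}

The highest byte is then carried into the next chunk, just as the vector code does with _mm_srli_epi64(A7, 56).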
static void PredictLineTop(const uint8_t* src, const uint8_t* pred, - uint8_t* dst, int length, int inverse) { + uint8_t* dst, int length) { int i; const int max_pos = length & ~31; assert(length >= 0); - if (inverse) { - for (i = 0; i < max_pos; i += 32) { - const __m128i A0 = _mm_loadu_si128((const __m128i*)&src[i + 0]); - const __m128i A1 = _mm_loadu_si128((const __m128i*)&src[i + 16]); - const __m128i B0 = _mm_loadu_si128((const __m128i*)&pred[i + 0]); - const __m128i B1 = _mm_loadu_si128((const __m128i*)&pred[i + 16]); - const __m128i C0 = _mm_add_epi8(A0, B0); - const __m128i C1 = _mm_add_epi8(A1, B1); - _mm_storeu_si128((__m128i*)&dst[i + 0], C0); - _mm_storeu_si128((__m128i*)&dst[i + 16], C1); - } - for (; i < length; ++i) dst[i] = src[i] + pred[i]; - } else { - for (i = 0; i < max_pos; i += 32) { - const __m128i A0 = _mm_loadu_si128((const __m128i*)&src[i + 0]); - const __m128i A1 = _mm_loadu_si128((const __m128i*)&src[i + 16]); - const __m128i B0 = _mm_loadu_si128((const __m128i*)&pred[i + 0]); - const __m128i B1 = _mm_loadu_si128((const __m128i*)&pred[i + 16]); - const __m128i C0 = _mm_sub_epi8(A0, B0); - const __m128i C1 = _mm_sub_epi8(A1, B1); - _mm_storeu_si128((__m128i*)&dst[i + 0], C0); - _mm_storeu_si128((__m128i*)&dst[i + 16], C1); - } - for (; i < length; ++i) dst[i] = src[i] - pred[i]; + for (i = 0; i < max_pos; i += 32) { + const __m128i A0 = _mm_loadu_si128((const __m128i*)&src[i + 0]); + const __m128i A1 = _mm_loadu_si128((const __m128i*)&src[i + 16]); + const __m128i B0 = _mm_loadu_si128((const __m128i*)&pred[i + 0]); + const __m128i B1 = _mm_loadu_si128((const __m128i*)&pred[i + 16]); + const __m128i C0 = _mm_sub_epi8(A0, B0); + const __m128i C1 = _mm_sub_epi8(A1, B1); + _mm_storeu_si128((__m128i*)&dst[i + 0], C0); + _mm_storeu_si128((__m128i*)&dst[i + 16], C1); } + for (; i < length; ++i) dst[i] = src[i] - pred[i]; } // Special case for left-based prediction (when preds==dst-1 or preds==src-1). 
-static void PredictLineLeft(const uint8_t* src, uint8_t* dst, int length, - int inverse) { +static void PredictLineLeft(const uint8_t* src, uint8_t* dst, int length) { int i; - if (length <= 0) return; - if (inverse) { - const int max_pos = length & ~7; - __m128i last = _mm_set_epi32(0, 0, 0, dst[-1]); - for (i = 0; i < max_pos; i += 8) { - const __m128i A0 = _mm_loadl_epi64((const __m128i*)(src + i)); - const __m128i A1 = _mm_add_epi8(A0, last); - const __m128i A2 = _mm_slli_si128(A1, 1); - const __m128i A3 = _mm_add_epi8(A1, A2); - const __m128i A4 = _mm_slli_si128(A3, 2); - const __m128i A5 = _mm_add_epi8(A3, A4); - const __m128i A6 = _mm_slli_si128(A5, 4); - const __m128i A7 = _mm_add_epi8(A5, A6); - _mm_storel_epi64((__m128i*)(dst + i), A7); - last = _mm_srli_epi64(A7, 56); - } - for (; i < length; ++i) dst[i] = src[i] + dst[i - 1]; - } else { - const int max_pos = length & ~31; - for (i = 0; i < max_pos; i += 32) { - const __m128i A0 = _mm_loadu_si128((const __m128i*)(src + i + 0 )); - const __m128i B0 = _mm_loadu_si128((const __m128i*)(src + i + 0 - 1)); - const __m128i A1 = _mm_loadu_si128((const __m128i*)(src + i + 16 )); - const __m128i B1 = _mm_loadu_si128((const __m128i*)(src + i + 16 - 1)); - const __m128i C0 = _mm_sub_epi8(A0, B0); - const __m128i C1 = _mm_sub_epi8(A1, B1); - _mm_storeu_si128((__m128i*)(dst + i + 0), C0); - _mm_storeu_si128((__m128i*)(dst + i + 16), C1); - } - for (; i < length; ++i) dst[i] = src[i] - src[i - 1]; - } -} - -static void PredictLineC(const uint8_t* src, const uint8_t* pred, - uint8_t* dst, int length, int inverse) { - int i; - if (inverse) { - for (i = 0; i < length; ++i) dst[i] = src[i] + pred[i]; - } else { - for (i = 0; i < length; ++i) dst[i] = src[i] - pred[i]; + const int max_pos = length & ~31; + assert(length >= 0); + for (i = 0; i < max_pos; i += 32) { + const __m128i A0 = _mm_loadu_si128((const __m128i*)(src + i + 0 )); + const __m128i B0 = _mm_loadu_si128((const __m128i*)(src + i + 0 - 1)); + const __m128i A1 = _mm_loadu_si128((const __m128i*)(src + i + 16 )); + const __m128i B1 = _mm_loadu_si128((const __m128i*)(src + i + 16 - 1)); + const __m128i C0 = _mm_sub_epi8(A0, B0); + const __m128i C1 = _mm_sub_epi8(A1, B1); + _mm_storeu_si128((__m128i*)(dst + i + 0), C0); + _mm_storeu_si128((__m128i*)(dst + i + 16), C1); } + for (; i < length; ++i) dst[i] = src[i] - src[i - 1]; } //------------------------------------------------------------------------------ @@ -117,21 +74,18 @@ static void PredictLineC(const uint8_t* src, const uint8_t* pred, static WEBP_INLINE void DoHorizontalFilter(const uint8_t* in, int width, int height, int stride, int row, int num_rows, - int inverse, uint8_t* out) { - const uint8_t* preds; + uint8_t* out) { const size_t start_offset = row * stride; const int last_row = row + num_rows; SANITY_CHECK(in, out); in += start_offset; out += start_offset; - preds = inverse ? out : in; if (row == 0) { // Leftmost pixel is the same as input for topmost scanline. out[0] = in[0]; - PredictLineLeft(in + 1, out + 1, width - 1, inverse); + PredictLineLeft(in + 1, out + 1, width - 1); row = 1; - preds += stride; in += stride; out += stride; } @@ -139,10 +93,9 @@ static WEBP_INLINE void DoHorizontalFilter(const uint8_t* in, // Filter line-by-line. while (row < last_row) { // Leftmost pixel is predicted from above. 
- PredictLineC(in, preds - stride, out, 1, inverse); - PredictLineLeft(in + 1, out + 1, width - 1, inverse); + out[0] = in[0] - in[-stride]; + PredictLineLeft(in + 1, out + 1, width - 1); ++row; - preds += stride; in += stride; out += stride; } @@ -153,34 +106,27 @@ static WEBP_INLINE void DoHorizontalFilter(const uint8_t* in, static WEBP_INLINE void DoVerticalFilter(const uint8_t* in, int width, int height, int stride, - int row, int num_rows, - int inverse, uint8_t* out) { - const uint8_t* preds; + int row, int num_rows, uint8_t* out) { const size_t start_offset = row * stride; const int last_row = row + num_rows; SANITY_CHECK(in, out); in += start_offset; out += start_offset; - preds = inverse ? out : in; if (row == 0) { // Very first top-left pixel is copied. out[0] = in[0]; // Rest of top scan-line is left-predicted. - PredictLineLeft(in + 1, out + 1, width - 1, inverse); + PredictLineLeft(in + 1, out + 1, width - 1); row = 1; in += stride; out += stride; - } else { - // We are starting from in-between. Make sure 'preds' points to prev row. - preds -= stride; } // Filter line-by-line. while (row < last_row) { - PredictLineTop(in, preds, out, width, inverse); + PredictLineTop(in, in - stride, out, width); ++row; - preds += stride; in += stride; out += stride; } @@ -219,49 +165,10 @@ static void GradientPredictDirect(const uint8_t* const row, } } -static void GradientPredictInverse(const uint8_t* const in, - const uint8_t* const top, - uint8_t* const row, int length) { - if (length > 0) { - int i; - const int max_pos = length & ~7; - const __m128i zero = _mm_setzero_si128(); - __m128i A = _mm_set_epi32(0, 0, 0, row[-1]); // left sample - for (i = 0; i < max_pos; i += 8) { - const __m128i tmp0 = _mm_loadl_epi64((const __m128i*)&top[i]); - const __m128i tmp1 = _mm_loadl_epi64((const __m128i*)&top[i - 1]); - const __m128i B = _mm_unpacklo_epi8(tmp0, zero); - const __m128i C = _mm_unpacklo_epi8(tmp1, zero); - const __m128i tmp2 = _mm_loadl_epi64((const __m128i*)&in[i]); - const __m128i D = _mm_unpacklo_epi8(tmp2, zero); // base input - const __m128i E = _mm_sub_epi16(B, C); // unclipped gradient basis B - C - __m128i out = zero; // accumulator for output - __m128i mask_hi = _mm_set_epi32(0, 0, 0, 0xff); - int k = 8; - while (1) { - const __m128i tmp3 = _mm_add_epi16(A, E); // delta = A + B - C - const __m128i tmp4 = _mm_min_epi16(tmp3, mask_hi); - const __m128i tmp5 = _mm_max_epi16(tmp4, zero); // clipped delta - const __m128i tmp6 = _mm_add_epi16(tmp5, D); // add to in[] values - A = _mm_and_si128(tmp6, mask_hi); // 1-complement clip - out = _mm_or_si128(out, A); // accumulate output - if (--k == 0) break; - A = _mm_slli_si128(A, 2); // rotate left sample - mask_hi = _mm_slli_si128(mask_hi, 2); // rotate mask - } - A = _mm_srli_si128(A, 14); // prepare left sample for next iteration - _mm_storel_epi64((__m128i*)&row[i], _mm_packus_epi16(out, zero)); - } - for (; i < length; ++i) { - row[i] = in[i] + GradientPredictorC(row[i - 1], top[i], top[i - 1]); - } - } -} - static WEBP_INLINE void DoGradientFilter(const uint8_t* in, int width, int height, int stride, int row, int num_rows, - int inverse, uint8_t* out) { + uint8_t* out) { const size_t start_offset = row * stride; const int last_row = row + num_rows; SANITY_CHECK(in, out); @@ -271,7 +178,7 @@ static WEBP_INLINE void DoGradientFilter(const uint8_t* in, // left prediction for top scan-line if (row == 0) { out[0] = in[0]; - PredictLineLeft(in + 1, out + 1, width - 1, inverse); + PredictLineLeft(in + 1, out + 1, width - 1); row = 1; in += 
stride; out += stride; @@ -279,13 +186,8 @@ static WEBP_INLINE void DoGradientFilter(const uint8_t* in, // Filter line-by-line. while (row < last_row) { - if (inverse) { - PredictLineC(in, out - stride, out, 1, inverse); // predict from above - GradientPredictInverse(in + 1, out + 1 - stride, out + 1, width - 1); - } else { - PredictLineC(in, in - stride, out, 1, inverse); - GradientPredictDirect(in + 1, in + 1 - stride, out + 1, width - 1); - } + out[0] = in[0] - in[-stride]; + GradientPredictDirect(in + 1, in + 1 - stride, out + 1, width - 1); ++row; in += stride; out += stride; @@ -298,36 +200,112 @@ static WEBP_INLINE void DoGradientFilter(const uint8_t* in, static void HorizontalFilter(const uint8_t* data, int width, int height, int stride, uint8_t* filtered_data) { - DoHorizontalFilter(data, width, height, stride, 0, height, 0, filtered_data); + DoHorizontalFilter(data, width, height, stride, 0, height, filtered_data); } static void VerticalFilter(const uint8_t* data, int width, int height, int stride, uint8_t* filtered_data) { - DoVerticalFilter(data, width, height, stride, 0, height, 0, filtered_data); + DoVerticalFilter(data, width, height, stride, 0, height, filtered_data); } - static void GradientFilter(const uint8_t* data, int width, int height, int stride, uint8_t* filtered_data) { - DoGradientFilter(data, width, height, stride, 0, height, 0, filtered_data); + DoGradientFilter(data, width, height, stride, 0, height, filtered_data); } - //------------------------------------------------------------------------------ +// Inverse transforms -static void VerticalUnfilter(int width, int height, int stride, int row, - int num_rows, uint8_t* data) { - DoVerticalFilter(data, width, height, stride, row, num_rows, 1, data); +static void HorizontalUnfilter(const uint8_t* prev, const uint8_t* in, + uint8_t* out, int width) { + int i; + __m128i last; + out[0] = in[0] + (prev == NULL ? 
0 : prev[0]); + if (width <= 1) return; + last = _mm_set_epi32(0, 0, 0, out[0]); + for (i = 1; i + 8 <= width; i += 8) { + const __m128i A0 = _mm_loadl_epi64((const __m128i*)(in + i)); + const __m128i A1 = _mm_add_epi8(A0, last); + const __m128i A2 = _mm_slli_si128(A1, 1); + const __m128i A3 = _mm_add_epi8(A1, A2); + const __m128i A4 = _mm_slli_si128(A3, 2); + const __m128i A5 = _mm_add_epi8(A3, A4); + const __m128i A6 = _mm_slli_si128(A5, 4); + const __m128i A7 = _mm_add_epi8(A5, A6); + _mm_storel_epi64((__m128i*)(out + i), A7); + last = _mm_srli_epi64(A7, 56); + } + for (; i < width; ++i) out[i] = in[i] + out[i - 1]; } -static void HorizontalUnfilter(int width, int height, int stride, int row, - int num_rows, uint8_t* data) { - DoHorizontalFilter(data, width, height, stride, row, num_rows, 1, data); +static void VerticalUnfilter(const uint8_t* prev, const uint8_t* in, + uint8_t* out, int width) { + if (prev == NULL) { + HorizontalUnfilter(NULL, in, out, width); + } else { + int i; + const int max_pos = width & ~31; + assert(width >= 0); + for (i = 0; i < max_pos; i += 32) { + const __m128i A0 = _mm_loadu_si128((const __m128i*)&in[i + 0]); + const __m128i A1 = _mm_loadu_si128((const __m128i*)&in[i + 16]); + const __m128i B0 = _mm_loadu_si128((const __m128i*)&prev[i + 0]); + const __m128i B1 = _mm_loadu_si128((const __m128i*)&prev[i + 16]); + const __m128i C0 = _mm_add_epi8(A0, B0); + const __m128i C1 = _mm_add_epi8(A1, B1); + _mm_storeu_si128((__m128i*)&out[i + 0], C0); + _mm_storeu_si128((__m128i*)&out[i + 16], C1); + } + for (; i < width; ++i) out[i] = in[i] + prev[i]; + } } -static void GradientUnfilter(int width, int height, int stride, int row, - int num_rows, uint8_t* data) { - DoGradientFilter(data, width, height, stride, row, num_rows, 1, data); +static void GradientPredictInverse(const uint8_t* const in, + const uint8_t* const top, + uint8_t* const row, int length) { + if (length > 0) { + int i; + const int max_pos = length & ~7; + const __m128i zero = _mm_setzero_si128(); + __m128i A = _mm_set_epi32(0, 0, 0, row[-1]); // left sample + for (i = 0; i < max_pos; i += 8) { + const __m128i tmp0 = _mm_loadl_epi64((const __m128i*)&top[i]); + const __m128i tmp1 = _mm_loadl_epi64((const __m128i*)&top[i - 1]); + const __m128i B = _mm_unpacklo_epi8(tmp0, zero); + const __m128i C = _mm_unpacklo_epi8(tmp1, zero); + const __m128i D = _mm_loadl_epi64((const __m128i*)&in[i]); // base input + const __m128i E = _mm_sub_epi16(B, C); // unclipped gradient basis B - C + __m128i out = zero; // accumulator for output + __m128i mask_hi = _mm_set_epi32(0, 0, 0, 0xff); + int k = 8; + while (1) { + const __m128i tmp3 = _mm_add_epi16(A, E); // delta = A + B - C + const __m128i tmp4 = _mm_packus_epi16(tmp3, zero); // saturate delta + const __m128i tmp5 = _mm_add_epi8(tmp4, D); // add to in[] + A = _mm_and_si128(tmp5, mask_hi); // 1-complement clip + out = _mm_or_si128(out, A); // accumulate output + if (--k == 0) break; + A = _mm_slli_si128(A, 1); // rotate left sample + mask_hi = _mm_slli_si128(mask_hi, 1); // rotate mask + A = _mm_unpacklo_epi8(A, zero); // convert 8b->16b + } + A = _mm_srli_si128(A, 7); // prepare left sample for next iteration + _mm_storel_epi64((__m128i*)&row[i], out); + } + for (; i < length; ++i) { + row[i] = in[i] + GradientPredictorC(row[i - 1], top[i], top[i - 1]); + } + } +} + +static void GradientUnfilter(const uint8_t* prev, const uint8_t* in, + uint8_t* out, int width) { + if (prev == NULL) { + HorizontalUnfilter(NULL, in, out, width); + } else { + out[0] = in[0] + prev[0]; // 
predict from above + GradientPredictInverse(in + 1, prev + 1, out + 1, width - 1); + } } //------------------------------------------------------------------------------ diff --git a/drivers/webp/dsp/lossless.c b/drivers/webp/dsp/lossless.c index 5702eb3b17..af913efccb 100644 --- a/drivers/webp/dsp/lossless.c +++ b/drivers/webp/dsp/lossless.c @@ -28,9 +28,7 @@ // In-place sum of each component with mod 256. static WEBP_INLINE void AddPixelsEq(uint32_t* a, uint32_t b) { - const uint32_t alpha_and_green = (*a & 0xff00ff00u) + (b & 0xff00ff00u); - const uint32_t red_and_blue = (*a & 0x00ff00ffu) + (b & 0x00ff00ffu); - *a = (alpha_and_green & 0xff00ff00u) | (red_and_blue & 0x00ff00ffu); + *a = VP8LAddPixels(*a, b); } static WEBP_INLINE uint32_t Average2(uint32_t a0, uint32_t a1) { @@ -490,7 +488,7 @@ static void CopyOrSwap(const uint32_t* src, int num_pixels, uint8_t* dst, #if !defined(WORDS_BIGENDIAN) #if !defined(WEBP_REFERENCE_IMPLEMENTATION) - *(uint32_t*)dst = BSwap32(argb); + WebPUint32ToMem(dst, BSwap32(argb)); #else // WEBP_REFERENCE_IMPLEMENTATION dst[0] = (argb >> 24) & 0xff; dst[1] = (argb >> 16) & 0xff; diff --git a/drivers/webp/dsp/lossless.h b/drivers/webp/dsp/lossless.h index 149c6a01d3..7709b4fe85 100644 --- a/drivers/webp/dsp/lossless.h +++ b/drivers/webp/dsp/lossless.h @@ -29,9 +29,6 @@ extern "C" { #include "../enc/delta_palettization.h" #endif // WEBP_EXPERIMENTAL_FEATURES -// Not a trivial literal symbol. -#define VP8L_NON_TRIVIAL_SYM (0xffffffff) - //------------------------------------------------------------------------------ // Decoding @@ -161,7 +158,8 @@ void VP8LCollectColorBlueTransforms_C(const uint32_t* argb, int stride, void VP8LResidualImage(int width, int height, int bits, int low_effort, uint32_t* const argb, uint32_t* const argb_scratch, - uint32_t* const image); + uint32_t* const image, int near_lossless, int exact, + int used_subtract_green); void VP8LColorSpaceTransform(int width, int height, int bits, int quality, uint32_t* const argb, uint32_t* image); @@ -175,8 +173,27 @@ static WEBP_INLINE uint32_t VP8LSubSampleSize(uint32_t size, return (size + (1 << sampling_bits) - 1) >> sampling_bits; } +// Converts near lossless quality into max number of bits shaved off. +static WEBP_INLINE int VP8LNearLosslessBits(int near_lossless_quality) { + // 100 -> 0 + // 80..99 -> 1 + // 60..79 -> 2 + // 40..59 -> 3 + // 20..39 -> 4 + // 0..19 -> 5 + return 5 - near_lossless_quality / 20; +} + // ----------------------------------------------------------------------------- // Faster logarithm for integers. Small values use a look-up table. + +// The threshold till approximate version of log_2 can be used. +// Practically, we can get rid of the call to log() as the two values match to +// very high degree (the ratio of these two is 0.99999x). +// Keeping a high threshold for now. 
+#define APPROX_LOG_WITH_CORRECTION_MAX 65536 +#define APPROX_LOG_MAX 4096 +#define LOG_2_RECIPROCAL 1.44269504088896338700465094007086 #define LOG_LOOKUP_IDX_MAX 256 extern const float kLog2Table[LOG_LOOKUP_IDX_MAX]; extern const float kSLog2Table[LOG_LOOKUP_IDX_MAX]; @@ -199,42 +216,55 @@ static WEBP_INLINE float VP8LFastSLog2(uint32_t v) { typedef double (*VP8LCostFunc)(const uint32_t* population, int length); typedef double (*VP8LCostCombinedFunc)(const uint32_t* X, const uint32_t* Y, int length); +typedef float (*VP8LCombinedShannonEntropyFunc)(const int X[256], + const int Y[256]); extern VP8LCostFunc VP8LExtraCost; extern VP8LCostCombinedFunc VP8LExtraCostCombined; +extern VP8LCombinedShannonEntropyFunc VP8LCombinedShannonEntropy; typedef struct { // small struct to hold counters int counts[2]; // index: 0=zero steak, 1=non-zero streak int streaks[2][2]; // [zero/non-zero][streak<3 / streak>=3] } VP8LStreaks; -typedef VP8LStreaks (*VP8LCostCountFunc)(const uint32_t* population, - int length); typedef VP8LStreaks (*VP8LCostCombinedCountFunc)(const uint32_t* X, const uint32_t* Y, int length); -extern VP8LCostCountFunc VP8LHuffmanCostCount; extern VP8LCostCombinedCountFunc VP8LHuffmanCostCombinedCount; -// Get the symbol entropy for the distribution 'population'. -// Set 'trivial_sym', if there's only one symbol present in the distribution. -double VP8LPopulationCost(const uint32_t* const population, int length, - uint32_t* const trivial_sym); - -// Get the combined symbol entropy for the distributions 'X' and 'Y'. -double VP8LGetCombinedEntropy(const uint32_t* const X, - const uint32_t* const Y, int length); - -double VP8LBitsEntropy(const uint32_t* const array, int n, - uint32_t* const trivial_symbol); - -// Estimate how many bits the combined entropy of literals and distance -// approximately maps to. -double VP8LHistogramEstimateBits(const VP8LHistogram* const p); - -// This function estimates the cost in bits excluding the bits needed to -// represent the entropy code itself. -double VP8LHistogramEstimateBitsBulk(const VP8LHistogram* const p); +typedef struct { // small struct to hold bit entropy results + double entropy; // entropy + uint32_t sum; // sum of the population + int nonzeros; // number of non-zero elements in the population + uint32_t max_val; // maximum value in the population + uint32_t nonzero_code; // index of the last non-zero in the population +} VP8LBitEntropy; + +void VP8LBitEntropyInit(VP8LBitEntropy* const entropy); + +// Get the combined symbol bit entropy and Huffman cost stats for the +// distributions 'X' and 'Y'. Those results can then be refined according to +// codec specific heuristics. +void VP8LGetCombinedEntropyUnrefined(const uint32_t* const X, + const uint32_t* const Y, int length, + VP8LBitEntropy* const bit_entropy, + VP8LStreaks* const stats); +// Get the entropy for the distribution 'X'. +void VP8LGetEntropyUnrefined(const uint32_t* const X, int length, + VP8LBitEntropy* const bit_entropy, + VP8LStreaks* const stats); + +void VP8LBitsEntropyUnrefined(const uint32_t* const array, int n, + VP8LBitEntropy* const entropy); + +typedef void (*GetEntropyUnrefinedHelperFunc)(uint32_t val, int i, + uint32_t* const val_prev, + int* const i_prev, + VP8LBitEntropy* const bit_entropy, + VP8LStreaks* const stats); +// Internal function used by VP8LGet*EntropyUnrefined. 
+extern GetEntropyUnrefinedHelperFunc VP8LGetEntropyUnrefinedHelper; typedef void (*VP8LHistogramAddFunc)(const VP8LHistogram* const a, const VP8LHistogram* const b, @@ -244,6 +274,11 @@ extern VP8LHistogramAddFunc VP8LHistogramAdd; // ----------------------------------------------------------------------------- // PrefixEncode() +typedef int (*VP8LVectorMismatchFunc)(const uint32_t* const array1, + const uint32_t* const array2, int length); +// Returns the first index where array1 and array2 are different. +extern VP8LVectorMismatchFunc VP8LVectorMismatch; + static WEBP_INLINE int VP8LBitsLog2Ceiling(uint32_t n) { const int log_floor = BitsLog2Floor(n); if (n == (n & ~(n - 1))) // zero or a power of two. @@ -306,7 +341,14 @@ static WEBP_INLINE void VP8LPrefixEncode(int distance, int* const code, } } -// In-place difference of each component with mod 256. +// Sum of each component, mod 256. +static WEBP_INLINE uint32_t VP8LAddPixels(uint32_t a, uint32_t b) { + const uint32_t alpha_and_green = (a & 0xff00ff00u) + (b & 0xff00ff00u); + const uint32_t red_and_blue = (a & 0x00ff00ffu) + (b & 0x00ff00ffu); + return (alpha_and_green & 0xff00ff00u) | (red_and_blue & 0x00ff00ffu); +} + +// Difference of each component, mod 256. static WEBP_INLINE uint32_t VP8LSubPixels(uint32_t a, uint32_t b) { const uint32_t alpha_and_green = 0x00ff00ffu + (a & 0xff00ff00u) - (b & 0xff00ff00u); diff --git a/drivers/webp/dsp/lossless_enc.c b/drivers/webp/dsp/lossless_enc.c index b3036f5384..256f6f5f8b 100644 --- a/drivers/webp/dsp/lossless_enc.c +++ b/drivers/webp/dsp/lossless_enc.c @@ -24,6 +24,9 @@ #define MAX_DIFF_COST (1e30f) +static const int kPredLowEffort = 11; +static const uint32_t kMaskAlpha = 0xff000000; + // lookup table for small values of log2(int) const float kLog2Table[LOG_LOOKUP_IDX_MAX] = { 0.0000000000000000f, 0.0000000000000000f, @@ -326,13 +329,6 @@ const uint8_t kPrefixEncodeExtraBitsValue[PREFIX_LOOKUP_IDX_MAX] = { 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126 }; -// The threshold till approximate version of log_2 can be used. -// Practically, we can get rid of the call to log() as the two values match to -// very high degree (the ratio of these two is 0.99999x). -// Keeping a high threshold for now. -#define APPROX_LOG_WITH_CORRECTION_MAX 65536 -#define APPROX_LOG_MAX 4096 -#define LOG_2_RECIPROCAL 1.44269504088896338700465094007086 static float FastSLog2Slow(uint32_t v) { assert(v >= LOG_LOOKUP_IDX_MAX); if (v < APPROX_LOG_WITH_CORRECTION_MAX) { @@ -386,6 +382,7 @@ static float FastLog2Slow(uint32_t v) { // Mostly used to reduce code size + readability static WEBP_INLINE int GetMin(int a, int b) { return (a > b) ? b : a; } +static WEBP_INLINE int GetMax(int a, int b) { return (a < b) ? b : a; } //------------------------------------------------------------------------------ // Methods to calculate Entropy (Shannon). 
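The next hunk tightens CombinedShannonEntropy() so the merged count X[i] + Y[i] is only formed when it is actually needed. What the function computes is unchanged: the unnormalized Shannon entropy of X plus that of the merged histogram X + Y. A reference formulation without the fast slog2 lookup tables (illustrative only):

#include <math.h>

static double CombinedShannonEntropyRef(const int X[256], const int Y[256]) {
  double cost = 0., sum_x = 0., sum_xy = 0.;
  int i;
  for (i = 0; i < 256; ++i) {
    const double x = X[i];
    const double xy = X[i] + Y[i];
    if (x > 0.) cost -= x * log2(x);      // -sum X[i] * log2(X[i])
    if (xy > 0.) cost -= xy * log2(xy);   // -sum (X[i]+Y[i]) * log2(X[i]+Y[i])
    sum_x += x;
    sum_xy += xy;
  }
  if (sum_x > 0.) cost += sum_x * log2(sum_x);
  if (sum_xy > 0.) cost += sum_xy * log2(sum_xy);
  return cost;
}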
@@ -410,15 +407,15 @@ static float CombinedShannonEntropy(const int X[256], const int Y[256]) { int sumX = 0, sumXY = 0; for (i = 0; i < 256; ++i) { const int x = X[i]; - const int xy = x + Y[i]; if (x != 0) { + const int xy = x + Y[i]; sumX += x; retval -= VP8LFastSLog2(x); sumXY += xy; retval -= VP8LFastSLog2(xy); - } else if (xy != 0) { - sumXY += xy; - retval -= VP8LFastSLog2(xy); + } else if (Y[i] != 0) { + sumXY += Y[i]; + retval -= VP8LFastSLog2(Y[i]); } } retval += VP8LFastSLog2(sumX) + VP8LFastSLog2(sumXY); @@ -432,225 +429,327 @@ static float PredictionCostSpatialHistogram(const int accumulated[4][256], for (i = 0; i < 4; ++i) { const double kExpValue = 0.94; retval += PredictionCostSpatial(tile[i], 1, kExpValue); - retval += CombinedShannonEntropy(tile[i], accumulated[i]); + retval += VP8LCombinedShannonEntropy(tile[i], accumulated[i]); } return (float)retval; } -static WEBP_INLINE double BitsEntropyRefine(int nonzeros, int sum, int max_val, - double retval) { - double mix; - if (nonzeros < 5) { - if (nonzeros <= 1) { - return 0; - } - // Two symbols, they will be 0 and 1 in a Huffman code. - // Let's mix in a bit of entropy to favor good clustering when - // distributions of these are combined. - if (nonzeros == 2) { - return 0.99 * sum + 0.01 * retval; - } - // No matter what the entropy says, we cannot be better than min_limit - // with Huffman coding. I am mixing a bit of entropy into the - // min_limit since it produces much better (~0.5 %) compression results - // perhaps because of better entropy clustering. - if (nonzeros == 3) { - mix = 0.95; - } else { - mix = 0.7; // nonzeros == 4. - } - } else { - mix = 0.627; - } - - { - double min_limit = 2 * sum - max_val; - min_limit = mix * min_limit + (1.0 - mix) * retval; - return (retval < min_limit) ? min_limit : retval; - } +void VP8LBitEntropyInit(VP8LBitEntropy* const entropy) { + entropy->entropy = 0.; + entropy->sum = 0; + entropy->nonzeros = 0; + entropy->max_val = 0; + entropy->nonzero_code = VP8L_NON_TRIVIAL_SYM; } -// Returns the entropy for the symbols in the input array. -// Also sets trivial_symbol to the code value, if the array has only one code -// value. Otherwise, set it to VP8L_NON_TRIVIAL_SYM. -double VP8LBitsEntropy(const uint32_t* const array, int n, - uint32_t* const trivial_symbol) { - double retval = 0.; - uint32_t sum = 0; - uint32_t nonzero_code = VP8L_NON_TRIVIAL_SYM; - int nonzeros = 0; - uint32_t max_val = 0; +void VP8LBitsEntropyUnrefined(const uint32_t* const array, int n, + VP8LBitEntropy* const entropy) { int i; + + VP8LBitEntropyInit(entropy); + for (i = 0; i < n; ++i) { if (array[i] != 0) { - sum += array[i]; - nonzero_code = i; - ++nonzeros; - retval -= VP8LFastSLog2(array[i]); - if (max_val < array[i]) { - max_val = array[i]; + entropy->sum += array[i]; + entropy->nonzero_code = i; + ++entropy->nonzeros; + entropy->entropy -= VP8LFastSLog2(array[i]); + if (entropy->max_val < array[i]) { + entropy->max_val = array[i]; } } } - retval += VP8LFastSLog2(sum); - if (trivial_symbol != NULL) { - *trivial_symbol = (nonzeros == 1) ? nonzero_code : VP8L_NON_TRIVIAL_SYM; - } - return BitsEntropyRefine(nonzeros, sum, max_val, retval); + entropy->entropy += VP8LFastSLog2(entropy->sum); } -static double InitialHuffmanCost(void) { - // Small bias because Huffman code length is typically not stored in - // full length. 
- static const int kHuffmanCodeOfHuffmanCodeSize = CODE_LENGTH_CODES * 3; - static const double kSmallBias = 9.1; - return kHuffmanCodeOfHuffmanCodeSize - kSmallBias; -} - -// Finalize the Huffman cost based on streak numbers and length type (<3 or >=3) -static double FinalHuffmanCost(const VP8LStreaks* const stats) { - double retval = InitialHuffmanCost(); - retval += stats->counts[0] * 1.5625 + 0.234375 * stats->streaks[0][1]; - retval += stats->counts[1] * 2.578125 + 0.703125 * stats->streaks[1][1]; - retval += 1.796875 * stats->streaks[0][0]; - retval += 3.28125 * stats->streaks[1][0]; - return retval; -} +static WEBP_INLINE void GetEntropyUnrefinedHelper( + uint32_t val, int i, uint32_t* const val_prev, int* const i_prev, + VP8LBitEntropy* const bit_entropy, VP8LStreaks* const stats) { + const int streak = i - *i_prev; + + // Gather info for the bit entropy. + if (*val_prev != 0) { + bit_entropy->sum += (*val_prev) * streak; + bit_entropy->nonzeros += streak; + bit_entropy->nonzero_code = *i_prev; + bit_entropy->entropy -= VP8LFastSLog2(*val_prev) * streak; + if (bit_entropy->max_val < *val_prev) { + bit_entropy->max_val = *val_prev; + } + } -// Trampolines -static double HuffmanCost(const uint32_t* const population, int length) { - const VP8LStreaks stats = VP8LHuffmanCostCount(population, length); - return FinalHuffmanCost(&stats); -} + // Gather info for the Huffman cost. + stats->counts[*val_prev != 0] += (streak > 3); + stats->streaks[*val_prev != 0][(streak > 3)] += streak; -// Aggregated costs -double VP8LPopulationCost(const uint32_t* const population, int length, - uint32_t* const trivial_sym) { - return - VP8LBitsEntropy(population, length, trivial_sym) + - HuffmanCost(population, length); + *val_prev = val; + *i_prev = i; } -double VP8LGetCombinedEntropy(const uint32_t* const X, - const uint32_t* const Y, int length) { - double bits_entropy_combined; - double huffman_cost_combined; +void VP8LGetEntropyUnrefined(const uint32_t* const X, int length, + VP8LBitEntropy* const bit_entropy, + VP8LStreaks* const stats) { int i; + int i_prev = 0; + uint32_t x_prev = X[0]; - // Bit entropy variables. - double retval = 0.; - int sum = 0; - int nonzeros = 0; - uint32_t max_val = 0; - int i_prev; - uint32_t xy; - - // Huffman cost variables. - int streak = 0; - uint32_t xy_prev; - VP8LStreaks stats; - memset(&stats, 0, sizeof(stats)); - - // Treat the first value for the huffman cost: this is keeping the original - // behavior, even though there is no first streak. - // TODO(vrabaud): study proper behavior - xy = X[0] + Y[0]; - ++stats.streaks[xy != 0][0]; - xy_prev = xy; - i_prev = 0; + memset(stats, 0, sizeof(*stats)); + VP8LBitEntropyInit(bit_entropy); for (i = 1; i < length; ++i) { - xy = X[i] + Y[i]; + const uint32_t x = X[i]; + if (x != x_prev) { + VP8LGetEntropyUnrefinedHelper(x, i, &x_prev, &i_prev, bit_entropy, stats); + } + } + VP8LGetEntropyUnrefinedHelper(0, i, &x_prev, &i_prev, bit_entropy, stats); - // Process data by streaks for both bit entropy and huffman cost. - if (xy != xy_prev) { - streak = i - i_prev; - - // Gather info for the bit entropy. 
- if (xy_prev != 0) { - sum += xy_prev * streak; - nonzeros += streak; - retval -= VP8LFastSLog2(xy_prev) * streak; - if (max_val < xy_prev) { - max_val = xy_prev; - } - } + bit_entropy->entropy += VP8LFastSLog2(bit_entropy->sum); +} + +void VP8LGetCombinedEntropyUnrefined(const uint32_t* const X, + const uint32_t* const Y, int length, + VP8LBitEntropy* const bit_entropy, + VP8LStreaks* const stats) { + int i = 1; + int i_prev = 0; + uint32_t xy_prev = X[0] + Y[0]; - // Gather info for the huffman cost. - stats.counts[xy != 0] += (streak > 3); - stats.streaks[xy != 0][(streak > 3)] += streak; + memset(stats, 0, sizeof(*stats)); + VP8LBitEntropyInit(bit_entropy); - xy_prev = xy; - i_prev = i; + for (i = 1; i < length; ++i) { + const uint32_t xy = X[i] + Y[i]; + if (xy != xy_prev) { + VP8LGetEntropyUnrefinedHelper(xy, i, &xy_prev, &i_prev, bit_entropy, + stats); } } + VP8LGetEntropyUnrefinedHelper(0, i, &xy_prev, &i_prev, bit_entropy, stats); - // Finish off the last streak for bit entropy. - if (xy != 0) { - streak = i - i_prev; - sum += xy * streak; - nonzeros += streak; - retval -= VP8LFastSLog2(xy) * streak; - if (max_val < xy) { - max_val = xy; - } + bit_entropy->entropy += VP8LFastSLog2(bit_entropy->sum); +} + +static WEBP_INLINE void UpdateHisto(int histo_argb[4][256], uint32_t argb) { + ++histo_argb[0][argb >> 24]; + ++histo_argb[1][(argb >> 16) & 0xff]; + ++histo_argb[2][(argb >> 8) & 0xff]; + ++histo_argb[3][argb & 0xff]; +} + +//------------------------------------------------------------------------------ + +static WEBP_INLINE uint32_t Predict(VP8LPredictorFunc pred_func, + int x, int y, + const uint32_t* current_row, + const uint32_t* upper_row) { + if (y == 0) { + return (x == 0) ? ARGB_BLACK : current_row[x - 1]; // Left. + } else if (x == 0) { + return upper_row[x]; // Top. + } else { + return pred_func(current_row[x - 1], upper_row + x); } - // Huffman cost is not updated with the last streak to keep original behavior. - // TODO(vrabaud): study proper behavior +} - retval += VP8LFastSLog2(sum); - bits_entropy_combined = BitsEntropyRefine(nonzeros, sum, max_val, retval); +static int MaxDiffBetweenPixels(uint32_t p1, uint32_t p2) { + const int diff_a = abs((int)(p1 >> 24) - (int)(p2 >> 24)); + const int diff_r = abs((int)((p1 >> 16) & 0xff) - (int)((p2 >> 16) & 0xff)); + const int diff_g = abs((int)((p1 >> 8) & 0xff) - (int)((p2 >> 8) & 0xff)); + const int diff_b = abs((int)(p1 & 0xff) - (int)(p2 & 0xff)); + return GetMax(GetMax(diff_a, diff_r), GetMax(diff_g, diff_b)); +} - huffman_cost_combined = FinalHuffmanCost(&stats); +static int MaxDiffAroundPixel(uint32_t current, uint32_t up, uint32_t down, + uint32_t left, uint32_t right) { + const int diff_up = MaxDiffBetweenPixels(current, up); + const int diff_down = MaxDiffBetweenPixels(current, down); + const int diff_left = MaxDiffBetweenPixels(current, left); + const int diff_right = MaxDiffBetweenPixels(current, right); + return GetMax(GetMax(diff_up, diff_down), GetMax(diff_left, diff_right)); +} - return bits_entropy_combined + huffman_cost_combined; +static uint32_t AddGreenToBlueAndRed(uint32_t argb) { + const uint32_t green = (argb >> 8) & 0xff; + uint32_t red_blue = argb & 0x00ff00ffu; + red_blue += (green << 16) | green; + red_blue &= 0x00ff00ffu; + return (argb & 0xff00ff00u) | red_blue; } -// Estimates the Entropy + Huffman + other block overhead size cost. 
-double VP8LHistogramEstimateBits(const VP8LHistogram* const p) { - return - VP8LPopulationCost( - p->literal_, VP8LHistogramNumCodes(p->palette_code_bits_), NULL) - + VP8LPopulationCost(p->red_, NUM_LITERAL_CODES, NULL) - + VP8LPopulationCost(p->blue_, NUM_LITERAL_CODES, NULL) - + VP8LPopulationCost(p->alpha_, NUM_LITERAL_CODES, NULL) - + VP8LPopulationCost(p->distance_, NUM_DISTANCE_CODES, NULL) - + VP8LExtraCost(p->literal_ + NUM_LITERAL_CODES, NUM_LENGTH_CODES) - + VP8LExtraCost(p->distance_, NUM_DISTANCE_CODES); +static void MaxDiffsForRow(int width, int stride, const uint32_t* const argb, + uint8_t* const max_diffs, int used_subtract_green) { + uint32_t current, up, down, left, right; + int x; + if (width <= 2) return; + current = argb[0]; + right = argb[1]; + if (used_subtract_green) { + current = AddGreenToBlueAndRed(current); + right = AddGreenToBlueAndRed(right); + } + // max_diffs[0] and max_diffs[width - 1] are never used. + for (x = 1; x < width - 1; ++x) { + up = argb[-stride + x]; + down = argb[stride + x]; + left = current; + current = right; + right = argb[x + 1]; + if (used_subtract_green) { + up = AddGreenToBlueAndRed(up); + down = AddGreenToBlueAndRed(down); + right = AddGreenToBlueAndRed(right); + } + max_diffs[x] = MaxDiffAroundPixel(current, up, down, left, right); + } } -double VP8LHistogramEstimateBitsBulk(const VP8LHistogram* const p) { - return - VP8LBitsEntropy(p->literal_, VP8LHistogramNumCodes(p->palette_code_bits_), - NULL) - + VP8LBitsEntropy(p->red_, NUM_LITERAL_CODES, NULL) - + VP8LBitsEntropy(p->blue_, NUM_LITERAL_CODES, NULL) - + VP8LBitsEntropy(p->alpha_, NUM_LITERAL_CODES, NULL) - + VP8LBitsEntropy(p->distance_, NUM_DISTANCE_CODES, NULL) - + VP8LExtraCost(p->literal_ + NUM_LITERAL_CODES, NUM_LENGTH_CODES) - + VP8LExtraCost(p->distance_, NUM_DISTANCE_CODES); +// Quantize the difference between the actual component value and its prediction +// to a multiple of quantization, working modulo 256, taking care not to cross +// a boundary (inclusive upper limit). +static uint8_t NearLosslessComponent(uint8_t value, uint8_t predict, + uint8_t boundary, int quantization) { + const int residual = (value - predict) & 0xff; + const int boundary_residual = (boundary - predict) & 0xff; + const int lower = residual & ~(quantization - 1); + const int upper = lower + quantization; + // Resolve ties towards a value closer to the prediction (i.e. towards lower + // if value comes after prediction and towards upper otherwise). + const int bias = ((boundary - value) & 0xff) < boundary_residual; + if (residual - lower < upper - residual + bias) { + // lower is closer to residual than upper. + if (residual > boundary_residual && lower <= boundary_residual) { + // Halve quantization step to avoid crossing boundary. This midpoint is + // on the same side of boundary as residual because midpoint >= residual + // (since lower is closer than upper) and residual is above the boundary. + return lower + (quantization >> 1); + } + return lower; + } else { + // upper is closer to residual than lower. + if (residual <= boundary_residual && upper > boundary_residual) { + // Halve quantization step to avoid crossing boundary. This midpoint is + // on the same side of boundary as residual because midpoint <= residual + // (since upper is closer than lower) and residual is below the boundary. 
+ return lower + (quantization >> 1); + } + return upper & 0xff; + } } -static WEBP_INLINE void UpdateHisto(int histo_argb[4][256], uint32_t argb) { - ++histo_argb[0][argb >> 24]; - ++histo_argb[1][(argb >> 16) & 0xff]; - ++histo_argb[2][(argb >> 8) & 0xff]; - ++histo_argb[3][argb & 0xff]; +// Quantize every component of the difference between the actual pixel value and +// its prediction to a multiple of a quantization (a power of 2, not larger than +// max_quantization which is a power of 2, smaller than max_diff). Take care if +// value and predict have undergone subtract green, which means that red and +// blue are represented as offsets from green. +static uint32_t NearLossless(uint32_t value, uint32_t predict, + int max_quantization, int max_diff, + int used_subtract_green) { + int quantization; + uint8_t new_green = 0; + uint8_t green_diff = 0; + uint8_t a, r, g, b; + if (max_diff <= 2) { + return VP8LSubPixels(value, predict); + } + quantization = max_quantization; + while (quantization >= max_diff) { + quantization >>= 1; + } + if ((value >> 24) == 0 || (value >> 24) == 0xff) { + // Preserve transparency of fully transparent or fully opaque pixels. + a = ((value >> 24) - (predict >> 24)) & 0xff; + } else { + a = NearLosslessComponent(value >> 24, predict >> 24, 0xff, quantization); + } + g = NearLosslessComponent((value >> 8) & 0xff, (predict >> 8) & 0xff, 0xff, + quantization); + if (used_subtract_green) { + // The green offset will be added to red and blue components during decoding + // to obtain the actual red and blue values. + new_green = ((predict >> 8) + g) & 0xff; + // The amount by which green has been adjusted during quantization. It is + // subtracted from red and blue for compensation, to avoid accumulating two + // quantization errors in them. + green_diff = (new_green - (value >> 8)) & 0xff; + } + r = NearLosslessComponent(((value >> 16) - green_diff) & 0xff, + (predict >> 16) & 0xff, 0xff - new_green, + quantization); + b = NearLosslessComponent((value - green_diff) & 0xff, predict & 0xff, + 0xff - new_green, quantization); + return ((uint32_t)a << 24) | ((uint32_t)r << 16) | ((uint32_t)g << 8) | b; } -//------------------------------------------------------------------------------ +// Returns the difference between the pixel and its prediction. In case of a +// lossy encoding, updates the source image to avoid propagating the deviation +// further to pixels which depend on the current pixel for their predictions. +static WEBP_INLINE uint32_t GetResidual(int width, int height, + uint32_t* const upper_row, + uint32_t* const current_row, + const uint8_t* const max_diffs, + int mode, VP8LPredictorFunc pred_func, + int x, int y, int max_quantization, + int exact, int used_subtract_green) { + const uint32_t predict = Predict(pred_func, x, y, current_row, upper_row); + uint32_t residual; + if (max_quantization == 1 || mode == 0 || y == 0 || y == height - 1 || + x == 0 || x == width - 1) { + residual = VP8LSubPixels(current_row[x], predict); + } else { + residual = NearLossless(current_row[x], predict, max_quantization, + max_diffs[x], used_subtract_green); + // Update the source image. + current_row[x] = VP8LAddPixels(predict, residual); + // x is never 0 here so we do not need to update upper_row like below. + } + if (!exact && (current_row[x] & kMaskAlpha) == 0) { + // If alpha is 0, cleanup RGB. We can choose the RGB values of the residual + // for best compression. The prediction of alpha itself can be non-zero and + // must be kept though. 
We choose RGB of the residual to be 0. + residual &= kMaskAlpha; + // Update the source image. + current_row[x] = predict & ~kMaskAlpha; + // The prediction for the rightmost pixel in a row uses the leftmost pixel + // in that row as its top-right context pixel. Hence if we change the + // leftmost pixel of current_row, the corresponding change must be applied + // to upper_row as well where top-right context is being read from. + if (x == 0 && y != 0) upper_row[width] = current_row[0]; + } + return residual; +} // Returns best predictor and updates the accumulated histogram. +// If max_quantization > 1, assumes that near lossless processing will be +// applied, quantizing residuals to multiples of quantization levels up to +// max_quantization (the actual quantization level depends on smoothness near +// the given pixel). static int GetBestPredictorForTile(int width, int height, int tile_x, int tile_y, int bits, int accumulated[4][256], - const uint32_t* const argb_scratch) { + uint32_t* const argb_scratch, + const uint32_t* const argb, + int max_quantization, + int exact, int used_subtract_green) { const int kNumPredModes = 14; - const int col_start = tile_x << bits; - const int row_start = tile_y << bits; + const int start_x = tile_x << bits; + const int start_y = tile_y << bits; const int tile_size = 1 << bits; - const int max_y = GetMin(tile_size, height - row_start); - const int max_x = GetMin(tile_size, width - col_start); + const int max_y = GetMin(tile_size, height - start_y); + const int max_x = GetMin(tile_size, width - start_x); + // Whether there exist columns just outside the tile. + const int have_left = (start_x > 0); + const int have_right = (max_x < width - start_x); + // Position and size of the strip covering the tile and adjacent columns if + // they exist. + const int context_start_x = start_x - have_left; + const int context_width = max_x + have_left + have_right; + // The width of upper_row and current_row is one pixel larger than image width + // to allow the top right pixel to point to the leftmost pixel of the next row + // when at the right edge. + uint32_t* upper_row = argb_scratch; + uint32_t* current_row = upper_row + width + 1; + uint8_t* const max_diffs = (uint8_t*)(current_row + width + 1); float best_diff = MAX_DIFF_COST; int best_mode = 0; int mode; @@ -659,30 +758,46 @@ static int GetBestPredictorForTile(int width, int height, // Need pointers to be able to swap arrays. int (*histo_argb)[256] = histo_stack_1; int (*best_histo)[256] = histo_stack_2; - int i, j; + for (mode = 0; mode < kNumPredModes; ++mode) { - const uint32_t* current_row = argb_scratch; const VP8LPredictorFunc pred_func = VP8LPredictors[mode]; float cur_diff; - int y; + int relative_y; memset(histo_argb, 0, sizeof(histo_stack_1)); - for (y = 0; y < max_y; ++y) { - int x; - const int row = row_start + y; - const uint32_t* const upper_row = current_row; - current_row = upper_row + width; - for (x = 0; x < max_x; ++x) { - const int col = col_start + x; - uint32_t predict; - if (row == 0) { - predict = (col == 0) ? ARGB_BLACK : current_row[col - 1]; // Left. - } else if (col == 0) { - predict = upper_row[col]; // Top. - } else { - predict = pred_func(current_row[col - 1], upper_row + col); - } - UpdateHisto(histo_argb, VP8LSubPixels(current_row[col], predict)); + if (start_y > 0) { + // Read the row above the tile which will become the first upper_row. 
+ // Include a pixel to the left if it exists; include a pixel to the right + // in all cases (wrapping to the leftmost pixel of the next row if it does + // not exist). + memcpy(current_row + context_start_x, + argb + (start_y - 1) * width + context_start_x, + sizeof(*argb) * (max_x + have_left + 1)); + } + for (relative_y = 0; relative_y < max_y; ++relative_y) { + const int y = start_y + relative_y; + int relative_x; + uint32_t* tmp = upper_row; + upper_row = current_row; + current_row = tmp; + // Read current_row. Include a pixel to the left if it exists; include a + // pixel to the right in all cases except at the bottom right corner of + // the image (wrapping to the leftmost pixel of the next row if it does + // not exist in the current row). + memcpy(current_row + context_start_x, + argb + y * width + context_start_x, + sizeof(*argb) * (max_x + have_left + (y + 1 < height))); + if (max_quantization > 1 && y >= 1 && y + 1 < height) { + MaxDiffsForRow(context_width, width, argb + y * width + context_start_x, + max_diffs + context_start_x, used_subtract_green); + } + + for (relative_x = 0; relative_x < max_x; ++relative_x) { + const int x = start_x + relative_x; + UpdateHisto(histo_argb, + GetResidual(width, height, upper_row, current_row, + max_diffs, mode, pred_func, x, y, + max_quantization, exact, used_subtract_green)); } } cur_diff = PredictionCostSpatialHistogram( @@ -705,80 +820,103 @@ static int GetBestPredictorForTile(int width, int height, return best_mode; } +// Converts pixels of the image to residuals with respect to predictions. +// If max_quantization > 1, applies near lossless processing, quantizing +// residuals to multiples of quantization levels up to max_quantization +// (the actual quantization level depends on smoothness near the given pixel). static void CopyImageWithPrediction(int width, int height, int bits, uint32_t* const modes, uint32_t* const argb_scratch, - uint32_t* const argb) { + uint32_t* const argb, + int low_effort, int max_quantization, + int exact, int used_subtract_green) { const int tiles_per_row = VP8LSubSampleSize(width, bits); const int mask = (1 << bits) - 1; - // The row size is one pixel longer to allow the top right pixel to point to - // the leftmost pixel of the next row when at the right edge. - uint32_t* current_row = argb_scratch; - uint32_t* upper_row = argb_scratch + width + 1; + // The width of upper_row and current_row is one pixel larger than image width + // to allow the top right pixel to point to the leftmost pixel of the next row + // when at the right edge. + uint32_t* upper_row = argb_scratch; + uint32_t* current_row = upper_row + width + 1; + uint8_t* current_max_diffs = (uint8_t*)(current_row + width + 1); + uint8_t* lower_max_diffs = current_max_diffs + width; int y; - VP8LPredictorFunc pred_func = 0; + int mode = 0; + VP8LPredictorFunc pred_func = NULL; for (y = 0; y < height; ++y) { int x; - uint32_t* tmp = upper_row; + uint32_t* const tmp32 = upper_row; upper_row = current_row; - current_row = tmp; - memcpy(current_row, argb + y * width, sizeof(*current_row) * width); - current_row[width] = (y + 1 < height) ? 
argb[(y + 1) * width] : ARGB_BLACK; - for (x = 0; x < width; ++x) { - uint32_t predict; - if ((x & mask) == 0) { - const int mode = - (modes[(y >> bits) * tiles_per_row + (x >> bits)] >> 8) & 0xff; - pred_func = VP8LPredictors[mode]; + current_row = tmp32; + memcpy(current_row, argb + y * width, + sizeof(*argb) * (width + (y + 1 < height))); + + if (low_effort) { + for (x = 0; x < width; ++x) { + const uint32_t predict = Predict(VP8LPredictors[kPredLowEffort], x, y, + current_row, upper_row); + argb[y * width + x] = VP8LSubPixels(current_row[x], predict); + } + } else { + if (max_quantization > 1) { + // Compute max_diffs for the lower row now, because that needs the + // contents of argb for the current row, which we will overwrite with + // residuals before proceeding with the next row. + uint8_t* const tmp8 = current_max_diffs; + current_max_diffs = lower_max_diffs; + lower_max_diffs = tmp8; + if (y + 2 < height) { + MaxDiffsForRow(width, width, argb + (y + 1) * width, lower_max_diffs, + used_subtract_green); + } } - if (y == 0) { - predict = (x == 0) ? ARGB_BLACK : current_row[x - 1]; // Left. - } else if (x == 0) { - predict = upper_row[x]; // Top. - } else { - predict = pred_func(current_row[x - 1], upper_row + x); + for (x = 0; x < width; ++x) { + if ((x & mask) == 0) { + mode = (modes[(y >> bits) * tiles_per_row + (x >> bits)] >> 8) & 0xff; + pred_func = VP8LPredictors[mode]; + } + argb[y * width + x] = GetResidual( + width, height, upper_row, current_row, current_max_diffs, mode, + pred_func, x, y, max_quantization, exact, used_subtract_green); } - argb[y * width + x] = VP8LSubPixels(current_row[x], predict); } } } +// Finds the best predictor for each tile, and converts the image to residuals +// with respect to predictions. If near_lossless_quality < 100, applies +// near lossless processing, shaving off more bits of residuals for lower +// qualities. void VP8LResidualImage(int width, int height, int bits, int low_effort, uint32_t* const argb, uint32_t* const argb_scratch, - uint32_t* const image) { - const int max_tile_size = 1 << bits; + uint32_t* const image, int near_lossless_quality, + int exact, int used_subtract_green) { const int tiles_per_row = VP8LSubSampleSize(width, bits); const int tiles_per_col = VP8LSubSampleSize(height, bits); - const int kPredLowEffort = 11; - uint32_t* const upper_row = argb_scratch; - uint32_t* const current_tile_rows = argb_scratch + width; int tile_y; int histo[4][256]; - if (!low_effort) memset(histo, 0, sizeof(histo)); - for (tile_y = 0; tile_y < tiles_per_col; ++tile_y) { - const int tile_y_offset = tile_y * max_tile_size; - const int this_tile_height = - (tile_y < tiles_per_col - 1) ? max_tile_size : height - tile_y_offset; - int tile_x; - if (tile_y > 0) { - memcpy(upper_row, current_tile_rows + (max_tile_size - 1) * width, - width * sizeof(*upper_row)); + const int max_quantization = 1 << VP8LNearLosslessBits(near_lossless_quality); + if (low_effort) { + int i; + for (i = 0; i < tiles_per_row * tiles_per_col; ++i) { + image[i] = ARGB_BLACK | (kPredLowEffort << 8); } - memcpy(current_tile_rows, &argb[tile_y_offset * width], - this_tile_height * width * sizeof(*current_tile_rows)); - for (tile_x = 0; tile_x < tiles_per_row; ++tile_x) { - const int pred = - low_effort ? 
kPredLowEffort : - GetBestPredictorForTile(width, height, - tile_x, tile_y, bits, - (int (*)[256])histo, - argb_scratch); - image[tile_y * tiles_per_row + tile_x] = 0xff000000u | (pred << 8); + } else { + memset(histo, 0, sizeof(histo)); + for (tile_y = 0; tile_y < tiles_per_col; ++tile_y) { + int tile_x; + for (tile_x = 0; tile_x < tiles_per_row; ++tile_x) { + const int pred = GetBestPredictorForTile(width, height, tile_x, tile_y, + bits, histo, argb_scratch, argb, max_quantization, exact, + used_subtract_green); + image[tile_y * tiles_per_row + tile_x] = ARGB_BLACK | (pred << 8); + } } } - CopyImageWithPrediction(width, height, bits, image, argb_scratch, argb); + CopyImageWithPrediction(width, height, bits, image, argb_scratch, argb, + low_effort, max_quantization, exact, + used_subtract_green); } void VP8LSubtractGreenFromBlueAndRed_C(uint32_t* argb_data, int num_pixels) { @@ -860,7 +998,7 @@ static float PredictionCostCrossColor(const int accumulated[256], // Favor low entropy, locally and globally. // Favor small absolute values for PredictionCostSpatial static const double kExpValue = 2.4; - return CombinedShannonEntropy(counts, accumulated) + + return VP8LCombinedShannonEntropy(counts, accumulated) + PredictionCostSpatial(counts, 3, kExpValue); } @@ -1124,6 +1262,17 @@ void VP8LColorSpaceTransform(int width, int height, int bits, int quality, } //------------------------------------------------------------------------------ + +static int VectorMismatch(const uint32_t* const array1, + const uint32_t* const array2, int length) { + int match_len = 0; + + while (match_len < length && array1[match_len] == array2[match_len]) { + ++match_len; + } + return match_len; +} + // Bundles multiple (1, 2, 4 or 8) pixels into a single pixel. void VP8LBundleColorMap(const uint8_t* const row, int width, int xbits, uint32_t* const dst) { @@ -1165,27 +1314,6 @@ static double ExtraCostCombined(const uint32_t* X, const uint32_t* Y, return cost; } -// Returns the various RLE counts -static VP8LStreaks HuffmanCostCount(const uint32_t* population, int length) { - int i; - int streak = 0; - VP8LStreaks stats; - memset(&stats, 0, sizeof(stats)); - for (i = 0; i < length - 1; ++i) { - ++streak; - if (population[i] == population[i + 1]) { - continue; - } - stats.counts[population[i] != 0] += (streak > 3); - stats.streaks[population[i] != 0][(streak > 3)] += streak; - streak = 0; - } - ++streak; - stats.counts[population[i] != 0] += (streak > 3); - stats.streaks[population[i] != 0][(streak > 3)] += streak; - return stats; -} - //------------------------------------------------------------------------------ static void HistogramAdd(const VP8LHistogram* const a, @@ -1235,11 +1363,14 @@ VP8LFastLog2SlowFunc VP8LFastSLog2Slow; VP8LCostFunc VP8LExtraCost; VP8LCostCombinedFunc VP8LExtraCostCombined; +VP8LCombinedShannonEntropyFunc VP8LCombinedShannonEntropy; -VP8LCostCountFunc VP8LHuffmanCostCount; +GetEntropyUnrefinedHelperFunc VP8LGetEntropyUnrefinedHelper; VP8LHistogramAddFunc VP8LHistogramAdd; +VP8LVectorMismatchFunc VP8LVectorMismatch; + extern void VP8LEncDspInitSSE2(void); extern void VP8LEncDspInitSSE41(void); extern void VP8LEncDspInitNEON(void); @@ -1266,11 +1397,14 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8LEncDspInit(void) { VP8LExtraCost = ExtraCost; VP8LExtraCostCombined = ExtraCostCombined; + VP8LCombinedShannonEntropy = CombinedShannonEntropy; - VP8LHuffmanCostCount = HuffmanCostCount; + VP8LGetEntropyUnrefinedHelper = GetEntropyUnrefinedHelper; VP8LHistogramAdd = HistogramAdd; + VP8LVectorMismatch = 
VectorMismatch; + // If defined, use CPUInfo() to overwrite some pointers with faster versions. if (VP8GetCPUInfo != NULL) { #if defined(WEBP_USE_SSE2) diff --git a/drivers/webp/dsp/lossless_enc_mips32.c b/drivers/webp/dsp/lossless_enc_mips32.c index 0468a5aac2..49c666d4fd 100644 --- a/drivers/webp/dsp/lossless_enc_mips32.c +++ b/drivers/webp/dsp/lossless_enc_mips32.c @@ -22,10 +22,6 @@ #include <stdlib.h> #include <string.h> -#define APPROX_LOG_WITH_CORRECTION_MAX 65536 -#define APPROX_LOG_MAX 4096 -#define LOG_2_RECIPROCAL 1.44269504088896338700465094007086 - static float FastSLog2Slow(uint32_t v) { assert(v >= LOG_LOOKUP_IDX_MAX); if (v < APPROX_LOG_WITH_CORRECTION_MAX) { @@ -217,51 +213,31 @@ static double ExtraCostCombined(const uint32_t* const X, ); // Returns the various RLE counts -static VP8LStreaks HuffmanCostCount(const uint32_t* population, int length) { - int i; - int streak = 0; - VP8LStreaks stats; - int* const pstreaks = &stats.streaks[0][0]; - int* const pcnts = &stats.counts[0]; +static WEBP_INLINE void GetEntropyUnrefinedHelper( + uint32_t val, int i, uint32_t* const val_prev, int* const i_prev, + VP8LBitEntropy* const bit_entropy, VP8LStreaks* const stats) { + int* const pstreaks = &stats->streaks[0][0]; + int* const pcnts = &stats->counts[0]; int temp0, temp1, temp2, temp3; - memset(&stats, 0, sizeof(stats)); - for (i = 0; i < length - 1; ++i) { - ++streak; - if (population[i] == population[i + 1]) { - continue; + const int streak = i - *i_prev; + + // Gather info for the bit entropy. + if (*val_prev != 0) { + bit_entropy->sum += (*val_prev) * streak; + bit_entropy->nonzeros += streak; + bit_entropy->nonzero_code = *i_prev; + bit_entropy->entropy -= VP8LFastSLog2(*val_prev) * streak; + if (bit_entropy->max_val < *val_prev) { + bit_entropy->max_val = *val_prev; } - temp0 = (population[i] != 0); - HUFFMAN_COST_PASS - streak = 0; } - ++streak; - temp0 = (population[i] != 0); - HUFFMAN_COST_PASS - return stats; -} + // Gather info for the Huffman cost. 
+ temp0 = (*val_prev != 0); + HUFFMAN_COST_PASS -static VP8LStreaks HuffmanCostCombinedCount(const uint32_t* X, - const uint32_t* Y, int length) { - int i; - int streak = 0; - uint32_t xy_prev = 0xffffffff; - VP8LStreaks stats; - int* const pstreaks = &stats.streaks[0][0]; - int* const pcnts = &stats.counts[0]; - int temp0, temp1, temp2, temp3; - memset(&stats, 0, sizeof(stats)); - for (i = 0; i < length; ++i) { - const uint32_t xy = X[i] + Y[i]; - ++streak; - if (xy != xy_prev) { - temp0 = (xy != 0); - HUFFMAN_COST_PASS - streak = 0; - xy_prev = xy; - } - } - return stats; + *val_prev = val; + *i_prev = i; } #define ASM_START \ @@ -399,14 +375,7 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8LEncDspInitMIPS32(void) { VP8LFastLog2Slow = FastLog2Slow; VP8LExtraCost = ExtraCost; VP8LExtraCostCombined = ExtraCostCombined; - VP8LHuffmanCostCount = HuffmanCostCount; -// TODO(mips team): rewrite VP8LGetCombinedEntropy (which used to use -// HuffmanCostCombinedCount) with MIPS optimizations -#if 0 - VP8LHuffmanCostCombinedCount = HuffmanCostCombinedCount; -#else - (void)HuffmanCostCombinedCount; -#endif + VP8LGetEntropyUnrefinedHelper = GetEntropyUnrefinedHelper; VP8LHistogramAdd = HistogramAdd; } diff --git a/drivers/webp/dsp/lossless_enc_sse2.c b/drivers/webp/dsp/lossless_enc_sse2.c index 1374b3ef64..7c894e7ca4 100644 --- a/drivers/webp/dsp/lossless_enc_sse2.c +++ b/drivers/webp/dsp/lossless_enc_sse2.c @@ -251,6 +251,131 @@ static void HistogramAdd(const VP8LHistogram* const a, } //------------------------------------------------------------------------------ +// Entropy + +// Checks whether the X or Y contribution is worth computing and adding. +// Used in loop unrolling. +#define ANALYZE_X_OR_Y(x_or_y, j) \ + do { \ + if (x_or_y[i + j] != 0) retval -= VP8LFastSLog2(x_or_y[i + j]); \ + } while (0) + +// Checks whether the X + Y contribution is worth computing and adding. +// Used in loop unrolling. +#define ANALYZE_XY(j) \ + do { \ + if (tmp[j] != 0) { \ + retval -= VP8LFastSLog2(tmp[j]); \ + ANALYZE_X_OR_Y(X, j); \ + } \ + } while (0) + +static float CombinedShannonEntropy(const int X[256], const int Y[256]) { + int i; + double retval = 0.; + int sumX, sumXY; + int32_t tmp[4]; + __m128i zero = _mm_setzero_si128(); + // Sums up X + Y, 4 ints at a time (and will merge it at the end for sumXY). + __m128i sumXY_128 = zero; + __m128i sumX_128 = zero; + + for (i = 0; i < 256; i += 4) { + const __m128i x = _mm_loadu_si128((const __m128i*)(X + i)); + const __m128i y = _mm_loadu_si128((const __m128i*)(Y + i)); + + // Check if any X is non-zero: this actually provides a speedup as X is + // usually sparse. + if (_mm_movemask_epi8(_mm_cmpeq_epi32(x, zero)) != 0xFFFF) { + const __m128i xy_128 = _mm_add_epi32(x, y); + sumXY_128 = _mm_add_epi32(sumXY_128, xy_128); + + sumX_128 = _mm_add_epi32(sumX_128, x); + + // Analyze the different X + Y. + _mm_storeu_si128((__m128i*)tmp, xy_128); + + ANALYZE_XY(0); + ANALYZE_XY(1); + ANALYZE_XY(2); + ANALYZE_XY(3); + } else { + // X is fully 0, so only deal with Y. + sumXY_128 = _mm_add_epi32(sumXY_128, y); + + ANALYZE_X_OR_Y(Y, 0); + ANALYZE_X_OR_Y(Y, 1); + ANALYZE_X_OR_Y(Y, 2); + ANALYZE_X_OR_Y(Y, 3); + } + } + + // Sum up sumX_128 to get sumX. + _mm_storeu_si128((__m128i*)tmp, sumX_128); + sumX = tmp[3] + tmp[2] + tmp[1] + tmp[0]; + + // Sum up sumXY_128 to get sumXY. 
+ _mm_storeu_si128((__m128i*)tmp, sumXY_128); + sumXY = tmp[3] + tmp[2] + tmp[1] + tmp[0]; + + retval += VP8LFastSLog2(sumX) + VP8LFastSLog2(sumXY); + return (float)retval; +} +#undef ANALYZE_X_OR_Y +#undef ANALYZE_XY + +//------------------------------------------------------------------------------ + +static int VectorMismatch(const uint32_t* const array1, + const uint32_t* const array2, int length) { + int match_len; + + if (length >= 12) { + __m128i A0 = _mm_loadu_si128((const __m128i*)&array1[0]); + __m128i A1 = _mm_loadu_si128((const __m128i*)&array2[0]); + match_len = 0; + do { + // Loop unrolling and early load both provide a speedup of 10% for the + // current function. Also, max_limit can be MAX_LENGTH=4096 at most. + const __m128i cmpA = _mm_cmpeq_epi32(A0, A1); + const __m128i B0 = + _mm_loadu_si128((const __m128i*)&array1[match_len + 4]); + const __m128i B1 = + _mm_loadu_si128((const __m128i*)&array2[match_len + 4]); + if (_mm_movemask_epi8(cmpA) != 0xffff) break; + match_len += 4; + + { + const __m128i cmpB = _mm_cmpeq_epi32(B0, B1); + A0 = _mm_loadu_si128((const __m128i*)&array1[match_len + 4]); + A1 = _mm_loadu_si128((const __m128i*)&array2[match_len + 4]); + if (_mm_movemask_epi8(cmpB) != 0xffff) break; + match_len += 4; + } + } while (match_len + 12 < length); + } else { + match_len = 0; + // Unroll the potential first two loops. + if (length >= 4 && + _mm_movemask_epi8(_mm_cmpeq_epi32( + _mm_loadu_si128((const __m128i*)&array1[0]), + _mm_loadu_si128((const __m128i*)&array2[0]))) == 0xffff) { + match_len = 4; + if (length >= 8 && + _mm_movemask_epi8(_mm_cmpeq_epi32( + _mm_loadu_si128((const __m128i*)&array1[4]), + _mm_loadu_si128((const __m128i*)&array2[4]))) == 0xffff) + match_len = 8; + } + } + + while (match_len < length && array1[match_len] == array2[match_len]) { + ++match_len; + } + return match_len; +} + +//------------------------------------------------------------------------------ // Entry point extern void VP8LEncDspInitSSE2(void); @@ -261,6 +386,8 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8LEncDspInitSSE2(void) { VP8LCollectColorBlueTransforms = CollectColorBlueTransforms; VP8LCollectColorRedTransforms = CollectColorRedTransforms; VP8LHistogramAdd = HistogramAdd; + VP8LCombinedShannonEntropy = CombinedShannonEntropy; + VP8LVectorMismatch = VectorMismatch; } #else // !WEBP_USE_SSE2 diff --git a/drivers/webp/dsp/msa_macro.h b/drivers/webp/dsp/msa_macro.h new file mode 100644 index 0000000000..5c707f476a --- /dev/null +++ b/drivers/webp/dsp/msa_macro.h @@ -0,0 +1,555 @@ +// Copyright 2016 Google Inc. All Rights Reserved. +// +// Use of this source code is governed by a BSD-style license +// that can be found in the COPYING file in the root of the source +// tree. An additional intellectual property rights grant can be found +// in the file PATENTS. All contributing project authors may +// be found in the AUTHORS file in the root of the source tree. 
+// ----------------------------------------------------------------------------- +// +// MSA common macros +// +// Author(s): Prashant Patil (prashant.patil@imgtec.com) + +#ifndef WEBP_DSP_MSA_MACRO_H_ +#define WEBP_DSP_MSA_MACRO_H_ + +#include <stdint.h> +#include <msa.h> + +#if defined(__clang__) + #define CLANG_BUILD +#endif + +#ifdef CLANG_BUILD + #define ADDVI_H(a, b) __msa_addvi_h((v8i16)a, b) + #define SRAI_H(a, b) __msa_srai_h((v8i16)a, b) + #define SRAI_W(a, b) __msa_srai_w((v4i32)a, b) +#else + #define ADDVI_H(a, b) (a + b) + #define SRAI_H(a, b) (a >> b) + #define SRAI_W(a, b) (a >> b) +#endif + +#define LD_B(RTYPE, psrc) *((RTYPE*)(psrc)) +#define LD_UB(...) LD_B(v16u8, __VA_ARGS__) +#define LD_SB(...) LD_B(v16i8, __VA_ARGS__) + +#define LD_H(RTYPE, psrc) *((RTYPE*)(psrc)) +#define LD_UH(...) LD_H(v8u16, __VA_ARGS__) +#define LD_SH(...) LD_H(v8i16, __VA_ARGS__) + +#define LD_W(RTYPE, psrc) *((RTYPE*)(psrc)) +#define LD_UW(...) LD_W(v4u32, __VA_ARGS__) +#define LD_SW(...) LD_W(v4i32, __VA_ARGS__) + +#define ST_B(RTYPE, in, pdst) *((RTYPE*)(pdst)) = in +#define ST_UB(...) ST_B(v16u8, __VA_ARGS__) +#define ST_SB(...) ST_B(v16i8, __VA_ARGS__) + +#define ST_H(RTYPE, in, pdst) *((RTYPE*)(pdst)) = in +#define ST_UH(...) ST_H(v8u16, __VA_ARGS__) +#define ST_SH(...) ST_H(v8i16, __VA_ARGS__) + +#define ST_W(RTYPE, in, pdst) *((RTYPE*)(pdst)) = in +#define ST_UW(...) ST_W(v4u32, __VA_ARGS__) +#define ST_SW(...) ST_W(v4i32, __VA_ARGS__) + +#define MSA_LOAD_FUNC(TYPE, INSTR, FUNC_NAME) \ + static inline TYPE FUNC_NAME(const void* const psrc) { \ + const uint8_t* const psrc_m = (const uint8_t*)psrc; \ + TYPE val_m; \ + asm volatile ( \ + "" #INSTR " %[val_m], %[psrc_m] \n\t" \ + : [val_m] "=r" (val_m) \ + : [psrc_m] "m" (*psrc_m)); \ + return val_m; \ + } + +#define MSA_LOAD(psrc, FUNC_NAME) FUNC_NAME(psrc) + +#define MSA_STORE_FUNC(TYPE, INSTR, FUNC_NAME) \ + static inline void FUNC_NAME(TYPE val, void* const pdst) { \ + uint8_t* const pdst_m = (uint8_t*)pdst; \ + TYPE val_m = val; \ + asm volatile ( \ + " " #INSTR " %[val_m], %[pdst_m] \n\t" \ + : [pdst_m] "=m" (*pdst_m) \ + : [val_m] "r" (val_m)); \ + } + +#define MSA_STORE(val, pdst, FUNC_NAME) FUNC_NAME(val, pdst) + +#if (__mips_isa_rev >= 6) + MSA_LOAD_FUNC(uint16_t, lh, msa_lh); + #define LH(psrc) MSA_LOAD(psrc, msa_lh) + MSA_LOAD_FUNC(uint32_t, lw, msa_lw); + #define LW(psrc) MSA_LOAD(psrc, msa_lw) + #if (__mips == 64) + MSA_LOAD_FUNC(uint64_t, ld, msa_ld); + #define LD(psrc) MSA_LOAD(psrc, msa_ld) + #else // !(__mips == 64) + #define LD(psrc) ((((uint64_t)MSA_LOAD(psrc + 4, msa_lw)) << 32) | \ + MSA_LOAD(psrc, msa_lw)) + #endif // (__mips == 64) + + MSA_STORE_FUNC(uint16_t, sh, msa_sh); + #define SH(val, pdst) MSA_STORE(val, pdst, msa_sh) + MSA_STORE_FUNC(uint32_t, sw, msa_sw); + #define SW(val, pdst) MSA_STORE(val, pdst, msa_sw) + MSA_STORE_FUNC(uint64_t, sd, msa_sd); + #define SD(val, pdst) MSA_STORE(val, pdst, msa_sd) +#else // !(__mips_isa_rev >= 6) + MSA_LOAD_FUNC(uint16_t, ulh, msa_ulh); + #define LH(psrc) MSA_LOAD(psrc, msa_ulh) + MSA_LOAD_FUNC(uint32_t, ulw, msa_ulw); + #define LW(psrc) MSA_LOAD(psrc, msa_ulw) + #if (__mips == 64) + MSA_LOAD_FUNC(uint64_t, uld, msa_uld); + #define LD(psrc) MSA_LOAD(psrc, msa_uld) + #else // !(__mips == 64) + #define LD(psrc) ((((uint64_t)MSA_LOAD(psrc + 4, msa_ulw)) << 32) | \ + MSA_LOAD(psrc, msa_ulw)) + #endif // (__mips == 64) + + MSA_STORE_FUNC(uint16_t, ush, msa_ush); + #define SH(val, pdst) MSA_STORE(val, pdst, msa_ush) + MSA_STORE_FUNC(uint32_t, usw, msa_usw); + #define SW(val, pdst) 
MSA_STORE(val, pdst, msa_usw) + #define SD(val, pdst) { \ + uint8_t* const pdst_sd_m = (uint8_t*)(pdst); \ + const uint32_t val0_m = (uint32_t)(val & 0x00000000FFFFFFFF); \ + const uint32_t val1_m = (uint32_t)((val >> 32) & 0x00000000FFFFFFFF); \ + SW(val0_m, pdst_sd_m); \ + SW(val1_m, pdst_sd_m + 4); \ + } +#endif // (__mips_isa_rev >= 6) + +/* Description : Load 4 words with stride + * Arguments : Inputs - psrc, stride + * Outputs - out0, out1, out2, out3 + * Details : Load word in 'out0' from (psrc) + * Load word in 'out1' from (psrc + stride) + * Load word in 'out2' from (psrc + 2 * stride) + * Load word in 'out3' from (psrc + 3 * stride) + */ +#define LW4(psrc, stride, out0, out1, out2, out3) { \ + const uint8_t* ptmp = (const uint8_t*)psrc; \ + out0 = LW(ptmp); \ + ptmp += stride; \ + out1 = LW(ptmp); \ + ptmp += stride; \ + out2 = LW(ptmp); \ + ptmp += stride; \ + out3 = LW(ptmp); \ +} + +/* Description : Store 4 words with stride + * Arguments : Inputs - in0, in1, in2, in3, pdst, stride + * Details : Store word from 'in0' to (pdst) + * Store word from 'in1' to (pdst + stride) + * Store word from 'in2' to (pdst + 2 * stride) + * Store word from 'in3' to (pdst + 3 * stride) + */ +#define SW4(in0, in1, in2, in3, pdst, stride) { \ + uint8_t* ptmp = (uint8_t*)pdst; \ + SW(in0, ptmp); \ + ptmp += stride; \ + SW(in1, ptmp); \ + ptmp += stride; \ + SW(in2, ptmp); \ + ptmp += stride; \ + SW(in3, ptmp); \ +} + +/* Description : Load vectors with 16 byte elements with stride + * Arguments : Inputs - psrc, stride + * Outputs - out0, out1 + * Return Type - as per RTYPE + * Details : Load 16 byte elements in 'out0' from (psrc) + * Load 16 byte elements in 'out1' from (psrc + stride) + */ +#define LD_B2(RTYPE, psrc, stride, out0, out1) { \ + out0 = LD_B(RTYPE, psrc); \ + out1 = LD_B(RTYPE, psrc + stride); \ +} +#define LD_UB2(...) LD_B2(v16u8, __VA_ARGS__) +#define LD_SB2(...) LD_B2(v16i8, __VA_ARGS__) + +#define LD_B4(RTYPE, psrc, stride, out0, out1, out2, out3) { \ + LD_B2(RTYPE, psrc, stride, out0, out1); \ + LD_B2(RTYPE, psrc + 2 * stride , stride, out2, out3); \ +} +#define LD_UB4(...) LD_B4(v16u8, __VA_ARGS__) +#define LD_SB4(...) LD_B4(v16i8, __VA_ARGS__) + +/* Description : Load vectors with 8 halfword elements with stride + * Arguments : Inputs - psrc, stride + * Outputs - out0, out1 + * Details : Load 8 halfword elements in 'out0' from (psrc) + * Load 8 halfword elements in 'out1' from (psrc + stride) + */ +#define LD_H2(RTYPE, psrc, stride, out0, out1) { \ + out0 = LD_H(RTYPE, psrc); \ + out1 = LD_H(RTYPE, psrc + stride); \ +} +#define LD_UH2(...) LD_H2(v8u16, __VA_ARGS__) +#define LD_SH2(...) 
LD_H2(v8i16, __VA_ARGS__) + +/* Description : Store 4x4 byte block to destination memory from input vector + * Arguments : Inputs - in0, in1, pdst, stride + * Details : 'Idx0' word element from input vector 'in0' is copied to the + * GP register and stored to (pdst) + * 'Idx1' word element from input vector 'in0' is copied to the + * GP register and stored to (pdst + stride) + * 'Idx2' word element from input vector 'in0' is copied to the + * GP register and stored to (pdst + 2 * stride) + * 'Idx3' word element from input vector 'in0' is copied to the + * GP register and stored to (pdst + 3 * stride) + */ +#define ST4x4_UB(in0, in1, idx0, idx1, idx2, idx3, pdst, stride) { \ + uint8_t* const pblk_4x4_m = (uint8_t*)pdst; \ + const uint32_t out0_m = __msa_copy_s_w((v4i32)in0, idx0); \ + const uint32_t out1_m = __msa_copy_s_w((v4i32)in0, idx1); \ + const uint32_t out2_m = __msa_copy_s_w((v4i32)in1, idx2); \ + const uint32_t out3_m = __msa_copy_s_w((v4i32)in1, idx3); \ + SW4(out0_m, out1_m, out2_m, out3_m, pblk_4x4_m, stride); \ +} + +/* Description : Immediate number of elements to slide + * Arguments : Inputs - in0, in1, slide_val + * Outputs - out + * Return Type - as per RTYPE + * Details : Byte elements from 'in1' vector are slid into 'in0' by + * value specified in the 'slide_val' + */ +#define SLDI_B(RTYPE, in0, in1, slide_val) \ + (RTYPE)__msa_sldi_b((v16i8)in0, (v16i8)in1, slide_val) \ + +#define SLDI_UB(...) SLDI_B(v16u8, __VA_ARGS__) +#define SLDI_SB(...) SLDI_B(v16i8, __VA_ARGS__) +#define SLDI_SH(...) SLDI_B(v8i16, __VA_ARGS__) + +/* Description : Shuffle halfword vector elements as per mask vector + * Arguments : Inputs - in0, in1, in2, in3, mask0, mask1 + * Outputs - out0, out1 + * Return Type - as per RTYPE + * Details : halfword elements from 'in0' & 'in1' are copied selectively to + * 'out0' as per control vector 'mask0' + */ +#define VSHF_H2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1) { \ + out0 = (RTYPE)__msa_vshf_h((v8i16)mask0, (v8i16)in1, (v8i16)in0); \ + out1 = (RTYPE)__msa_vshf_h((v8i16)mask1, (v8i16)in3, (v8i16)in2); \ +} +#define VSHF_H2_UH(...) VSHF_H2(v8u16, __VA_ARGS__) +#define VSHF_H2_SH(...) VSHF_H2(v8i16, __VA_ARGS__) + +/* Description : Clips all signed halfword elements of input vector + * between 0 & 255 + * Arguments : Input/output - val + * Return Type - signed halfword + */ +#define CLIP_SH_0_255(val) { \ + const v8i16 max_m = __msa_ldi_h(255); \ + val = __msa_maxi_s_h((v8i16)val, 0); \ + val = __msa_min_s_h(max_m, (v8i16)val); \ +} +#define CLIP_SH2_0_255(in0, in1) { \ + CLIP_SH_0_255(in0); \ + CLIP_SH_0_255(in1); \ +} + +/* Description : Clips all signed word elements of input vector + * between 0 & 255 + * Arguments : Input/output - val + * Return Type - signed word + */ +#define CLIP_SW_0_255(val) { \ + const v4i32 max_m = __msa_ldi_w(255); \ + val = __msa_maxi_s_w((v4i32)val, 0); \ + val = __msa_min_s_w(max_m, (v4i32)val); \ +} +#define CLIP_SW4_0_255(in0, in1, in2, in3) { \ + CLIP_SW_0_255(in0); \ + CLIP_SW_0_255(in1); \ + CLIP_SW_0_255(in2); \ + CLIP_SW_0_255(in3); \ +} + +/* Description : Set element n input vector to GPR value + * Arguments : Inputs - in0, in1, in2, in3 + * Output - out + * Return Type - as per RTYPE + * Details : Set element 0 in vector 'out' to value specified in 'in0' + */ +#define INSERT_W2(RTYPE, in0, in1, out) { \ + out = (RTYPE)__msa_insert_w((v4i32)out, 0, in0); \ + out = (RTYPE)__msa_insert_w((v4i32)out, 1, in1); \ +} +#define INSERT_W2_UB(...) INSERT_W2(v16u8, __VA_ARGS__) +#define INSERT_W2_SB(...) 
INSERT_W2(v16i8, __VA_ARGS__) + +#define INSERT_W4(RTYPE, in0, in1, in2, in3, out) { \ + out = (RTYPE)__msa_insert_w((v4i32)out, 0, in0); \ + out = (RTYPE)__msa_insert_w((v4i32)out, 1, in1); \ + out = (RTYPE)__msa_insert_w((v4i32)out, 2, in2); \ + out = (RTYPE)__msa_insert_w((v4i32)out, 3, in3); \ +} +#define INSERT_W4_UB(...) INSERT_W4(v16u8, __VA_ARGS__) +#define INSERT_W4_SB(...) INSERT_W4(v16i8, __VA_ARGS__) +#define INSERT_W4_SW(...) INSERT_W4(v4i32, __VA_ARGS__) + +/* Description : Interleave right half of byte elements from vectors + * Arguments : Inputs - in0, in1, in2, in3 + * Outputs - out0, out1 + * Return Type - as per RTYPE + * Details : Right half of byte elements of 'in0' and 'in1' are interleaved + * and written to out0. + */ +#define ILVR_B2(RTYPE, in0, in1, in2, in3, out0, out1) { \ + out0 = (RTYPE)__msa_ilvr_b((v16i8)in0, (v16i8)in1); \ + out1 = (RTYPE)__msa_ilvr_b((v16i8)in2, (v16i8)in3); \ +} +#define ILVR_B2_UB(...) ILVR_B2(v16u8, __VA_ARGS__) +#define ILVR_B2_SB(...) ILVR_B2(v16i8, __VA_ARGS__) +#define ILVR_B2_UH(...) ILVR_B2(v8u16, __VA_ARGS__) +#define ILVR_B2_SH(...) ILVR_B2(v8i16, __VA_ARGS__) +#define ILVR_B2_SW(...) ILVR_B2(v4i32, __VA_ARGS__) + +#define ILVR_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \ + out0, out1, out2, out3) { \ + ILVR_B2(RTYPE, in0, in1, in2, in3, out0, out1); \ + ILVR_B2(RTYPE, in4, in5, in6, in7, out2, out3); \ +} +#define ILVR_B4_UB(...) ILVR_B4(v16u8, __VA_ARGS__) +#define ILVR_B4_SB(...) ILVR_B4(v16i8, __VA_ARGS__) +#define ILVR_B4_UH(...) ILVR_B4(v8u16, __VA_ARGS__) +#define ILVR_B4_SH(...) ILVR_B4(v8i16, __VA_ARGS__) +#define ILVR_B4_SW(...) ILVR_B4(v4i32, __VA_ARGS__) + +/* Description : Interleave right half of halfword elements from vectors + * Arguments : Inputs - in0, in1, in2, in3 + * Outputs - out0, out1 + * Return Type - as per RTYPE + * Details : Right half of halfword elements of 'in0' and 'in1' are + * interleaved and written to 'out0'. + */ +#define ILVR_H2(RTYPE, in0, in1, in2, in3, out0, out1) { \ + out0 = (RTYPE)__msa_ilvr_h((v8i16)in0, (v8i16)in1); \ + out1 = (RTYPE)__msa_ilvr_h((v8i16)in2, (v8i16)in3); \ +} +#define ILVR_H2_UB(...) ILVR_H2(v16u8, __VA_ARGS__) +#define ILVR_H2_SH(...) ILVR_H2(v8i16, __VA_ARGS__) +#define ILVR_H2_SW(...) ILVR_H2(v4i32, __VA_ARGS__) + +#define ILVR_H4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \ + out0, out1, out2, out3) { \ + ILVR_H2(RTYPE, in0, in1, in2, in3, out0, out1); \ + ILVR_H2(RTYPE, in4, in5, in6, in7, out2, out3); \ +} +#define ILVR_H4_UB(...) ILVR_H4(v16u8, __VA_ARGS__) +#define ILVR_H4_SH(...) ILVR_H4(v8i16, __VA_ARGS__) +#define ILVR_H4_SW(...) ILVR_H4(v4i32, __VA_ARGS__) + +/* Description : Interleave right half of double word elements from vectors + * Arguments : Inputs - in0, in1, in2, in3 + * Outputs - out0, out1 + * Return Type - as per RTYPE + * Details : Right half of double word elements of 'in0' and 'in1' are + * interleaved and written to 'out0'. + */ +#define ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1) { \ + out0 = (RTYPE)__msa_ilvr_d((v2i64)in0, (v2i64)in1); \ + out1 = (RTYPE)__msa_ilvr_d((v2i64)in2, (v2i64)in3); \ +} +#define ILVR_D2_UB(...) ILVR_D2(v16u8, __VA_ARGS__) +#define ILVR_D2_SB(...) ILVR_D2(v16i8, __VA_ARGS__) +#define ILVR_D2_SH(...) ILVR_D2(v8i16, __VA_ARGS__) + +#define ILVRL_H2(RTYPE, in0, in1, out0, out1) { \ + out0 = (RTYPE)__msa_ilvr_h((v8i16)in0, (v8i16)in1); \ + out1 = (RTYPE)__msa_ilvl_h((v8i16)in0, (v8i16)in1); \ +} +#define ILVRL_H2_UB(...) ILVRL_H2(v16u8, __VA_ARGS__) +#define ILVRL_H2_SB(...) 
ILVRL_H2(v16i8, __VA_ARGS__) +#define ILVRL_H2_SH(...) ILVRL_H2(v8i16, __VA_ARGS__) +#define ILVRL_H2_SW(...) ILVRL_H2(v4i32, __VA_ARGS__) +#define ILVRL_H2_UW(...) ILVRL_H2(v4u32, __VA_ARGS__) + +#define ILVRL_W2(RTYPE, in0, in1, out0, out1) { \ + out0 = (RTYPE)__msa_ilvr_w((v4i32)in0, (v4i32)in1); \ + out1 = (RTYPE)__msa_ilvl_w((v4i32)in0, (v4i32)in1); \ +} +#define ILVRL_W2_UB(...) ILVRL_W2(v16u8, __VA_ARGS__) +#define ILVRL_W2_SH(...) ILVRL_W2(v8i16, __VA_ARGS__) +#define ILVRL_W2_SW(...) ILVRL_W2(v4i32, __VA_ARGS__) + +/* Description : Pack even byte elements of vector pairs + * Arguments : Inputs - in0, in1, in2, in3 + * Outputs - out0, out1 + * Return Type - as per RTYPE + * Details : Even byte elements of 'in0' are copied to the left half of + * 'out0' & even byte elements of 'in1' are copied to the right + * half of 'out0'. + */ +#define PCKEV_B2(RTYPE, in0, in1, in2, in3, out0, out1) { \ + out0 = (RTYPE)__msa_pckev_b((v16i8)in0, (v16i8)in1); \ + out1 = (RTYPE)__msa_pckev_b((v16i8)in2, (v16i8)in3); \ +} +#define PCKEV_B2_SB(...) PCKEV_B2(v16i8, __VA_ARGS__) +#define PCKEV_B2_UB(...) PCKEV_B2(v16u8, __VA_ARGS__) +#define PCKEV_B2_SH(...) PCKEV_B2(v8i16, __VA_ARGS__) +#define PCKEV_B2_SW(...) PCKEV_B2(v4i32, __VA_ARGS__) + +/* Description : Arithmetic immediate shift right all elements of word vector + * Arguments : Inputs - in0, in1, shift + * Outputs - in place operation + * Return Type - as per input vector RTYPE + * Details : Each element of vector 'in0' is right shifted by 'shift' and + * the result is written in-place. 'shift' is a GP variable. + */ +#define SRAI_W2(RTYPE, in0, in1, shift_val) { \ + in0 = (RTYPE)SRAI_W(in0, shift_val); \ + in1 = (RTYPE)SRAI_W(in1, shift_val); \ +} +#define SRAI_W2_SW(...) SRAI_W2(v4i32, __VA_ARGS__) +#define SRAI_W2_UW(...) SRAI_W2(v4u32, __VA_ARGS__) + +#define SRAI_W4(RTYPE, in0, in1, in2, in3, shift_val) { \ + SRAI_W2(RTYPE, in0, in1, shift_val); \ + SRAI_W2(RTYPE, in2, in3, shift_val); \ +} +#define SRAI_W4_SW(...) SRAI_W4(v4i32, __VA_ARGS__) +#define SRAI_W4_UW(...) SRAI_W4(v4u32, __VA_ARGS__) + +/* Description : Arithmetic shift right all elements of half-word vector + * Arguments : Inputs - in0, in1, shift + * Outputs - in place operation + * Return Type - as per input vector RTYPE + * Details : Each element of vector 'in0' is right shifted by 'shift' and + * the result is written in-place. 'shift' is a GP variable. + */ +#define SRAI_H2(RTYPE, in0, in1, shift_val) { \ + in0 = (RTYPE)SRAI_H(in0, shift_val); \ + in1 = (RTYPE)SRAI_H(in1, shift_val); \ +} +#define SRAI_H2_SH(...) SRAI_H2(v8i16, __VA_ARGS__) +#define SRAI_H2_UH(...) SRAI_H2(v8u16, __VA_ARGS__) + +/* Description : Arithmetic rounded shift right all elements of word vector + * Arguments : Inputs - in0, in1, shift + * Outputs - in place operation + * Return Type - as per input vector RTYPE + * Details : Each element of vector 'in0' is right shifted by 'shift' and + * the result is written in-place. 'shift' is a GP variable. + */ +#define SRARI_W2(RTYPE, in0, in1, shift) { \ + in0 = (RTYPE)__msa_srari_w((v4i32)in0, shift); \ + in1 = (RTYPE)__msa_srari_w((v4i32)in1, shift); \ +} +#define SRARI_W2_SW(...) SRARI_W2(v4i32, __VA_ARGS__) + +#define SRARI_W4(RTYPE, in0, in1, in2, in3, shift) { \ + SRARI_W2(RTYPE, in0, in1, shift); \ + SRARI_W2(RTYPE, in2, in3, shift); \ +} +#define SRARI_W4_SH(...) SRARI_W4(v8i16, __VA_ARGS__) +#define SRARI_W4_UW(...) SRARI_W4(v4u32, __VA_ARGS__) +#define SRARI_W4_SW(...) 
SRARI_W4(v4i32, __VA_ARGS__) + +/* Description : Addition of 2 pairs of half-word vectors + * Arguments : Inputs - in0, in1, in2, in3 + * Outputs - out0, out1 + * Details : Each element in 'in0' is added to 'in1' and result is written + * to 'out0'. + */ +#define ADDVI_H2(RTYPE, in0, in1, in2, in3, out0, out1) { \ + out0 = (RTYPE)ADDVI_H(in0, in1); \ + out1 = (RTYPE)ADDVI_H(in2, in3); \ +} +#define ADDVI_H2_SH(...) ADDVI_H2(v8i16, __VA_ARGS__) +#define ADDVI_H2_UH(...) ADDVI_H2(v8u16, __VA_ARGS__) + +/* Description : Addition of 2 pairs of vectors + * Arguments : Inputs - in0, in1, in2, in3 + * Outputs - out0, out1 + * Details : Each element in 'in0' is added to 'in1' and result is written + * to 'out0'. + */ +#define ADD2(in0, in1, in2, in3, out0, out1) { \ + out0 = in0 + in1; \ + out1 = in2 + in3; \ +} +#define ADD4(in0, in1, in2, in3, in4, in5, in6, in7, \ + out0, out1, out2, out3) { \ + ADD2(in0, in1, in2, in3, out0, out1); \ + ADD2(in4, in5, in6, in7, out2, out3); \ +} + +/* Description : Sign extend halfword elements from input vector and return + * the result in pair of vectors + * Arguments : Input - in (halfword vector) + * Outputs - out0, out1 (sign extended word vectors) + * Return Type - signed word + * Details : Sign bit of halfword elements from input vector 'in' is + * extracted and interleaved right with same vector 'in0' to + * generate 4 signed word elements in 'out0' + * Then interleaved left with same vector 'in0' to + * generate 4 signed word elements in 'out1' + */ +#define UNPCK_SH_SW(in, out0, out1) { \ + const v8i16 tmp_m = __msa_clti_s_h((v8i16)in, 0); \ + ILVRL_H2_SW(tmp_m, in, out0, out1); \ +} + +/* Description : Butterfly of 4 input vectors + * Arguments : Inputs - in0, in1, in2, in3 + * Outputs - out0, out1, out2, out3 + * Details : Butterfly operation + */ +#define BUTTERFLY_4(in0, in1, in2, in3, out0, out1, out2, out3) { \ + out0 = in0 + in3; \ + out1 = in1 + in2; \ + out2 = in1 - in2; \ + out3 = in0 - in3; \ +} + +/* Description : Transpose 4x4 block with word elements in vectors + * Arguments : Inputs - in0, in1, in2, in3 + * Outputs - out0, out1, out2, out3 + * Return Type - as per RTYPE + */ +#define TRANSPOSE4x4_W(RTYPE, in0, in1, in2, in3, out0, out1, out2, out3) { \ + v4i32 s0_m, s1_m, s2_m, s3_m; \ + ILVRL_W2_SW(in1, in0, s0_m, s1_m); \ + ILVRL_W2_SW(in3, in2, s2_m, s3_m); \ + out0 = (RTYPE)__msa_ilvr_d((v2i64)s2_m, (v2i64)s0_m); \ + out1 = (RTYPE)__msa_ilvl_d((v2i64)s2_m, (v2i64)s0_m); \ + out2 = (RTYPE)__msa_ilvr_d((v2i64)s3_m, (v2i64)s1_m); \ + out3 = (RTYPE)__msa_ilvl_d((v2i64)s3_m, (v2i64)s1_m); \ +} +#define TRANSPOSE4x4_SW_SW(...) TRANSPOSE4x4_W(v4i32, __VA_ARGS__) + +/* Description : Add block 4x4 + * Arguments : Inputs - in0, in1, in2, in3, pdst, stride + * Details : Least significant 4 bytes from each input vector are added to + * the destination bytes, clipped between 0-255 and stored. 
+ */ +#define ADDBLK_ST4x4_UB(in0, in1, in2, in3, pdst, stride) { \ + uint32_t src0_m, src1_m, src2_m, src3_m; \ + v8i16 inp0_m, inp1_m, res0_m, res1_m; \ + v16i8 dst0_m = { 0 }; \ + v16i8 dst1_m = { 0 }; \ + const v16i8 zero_m = { 0 }; \ + ILVR_D2_SH(in1, in0, in3, in2, inp0_m, inp1_m); \ + LW4(pdst, stride, src0_m, src1_m, src2_m, src3_m); \ + INSERT_W2_SB(src0_m, src1_m, dst0_m); \ + INSERT_W2_SB(src2_m, src3_m, dst1_m); \ + ILVR_B2_SH(zero_m, dst0_m, zero_m, dst1_m, res0_m, res1_m); \ + ADD2(res0_m, inp0_m, res1_m, inp1_m, res0_m, res1_m); \ + CLIP_SH2_0_255(res0_m, res1_m); \ + PCKEV_B2_SB(res0_m, res0_m, res1_m, res1_m, dst0_m, dst1_m); \ + ST4x4_UB(dst0_m, dst1_m, 0, 1, 0, 1, pdst, stride); \ +} + +#endif /* WEBP_DSP_MSA_MACRO_H_ */ diff --git a/drivers/webp/dsp/rescaler_sse2.c b/drivers/webp/dsp/rescaler_sse2.c index 186edb653e..5b9702817c 100644 --- a/drivers/webp/dsp/rescaler_sse2.c +++ b/drivers/webp/dsp/rescaler_sse2.c @@ -18,6 +18,7 @@ #include <assert.h> #include "../utils/rescaler.h" +#include "../utils/utils.h" //------------------------------------------------------------------------------ // Implementations of critical functions ImportRow / ExportRow @@ -84,7 +85,8 @@ static void RescalerImportRowExpandSSE2(WebPRescaler* const wrk, while (1) { const __m128i mult = _mm_cvtsi32_si128(((x_add - accum) << 16) | accum); const __m128i out = _mm_madd_epi16(cur_pixels, mult); - *(uint32_t*)frow = _mm_cvtsi128_si32(out); + assert(sizeof(*frow) == sizeof(uint32_t)); + WebPUint32ToMem((uint8_t*)frow, _mm_cvtsi128_si32(out)); frow += 1; if (frow >= frow_end) break; accum -= wrk->x_sub; @@ -131,7 +133,7 @@ static void RescalerImportRowShrinkSSE2(WebPRescaler* const wrk, __m128i base = zero; accum += wrk->x_add; while (accum > 0) { - const __m128i A = _mm_cvtsi32_si128(*(int*)src); + const __m128i A = _mm_cvtsi32_si128(WebPMemToUint32(src)); src += 4; base = _mm_unpacklo_epi8(A, zero); // To avoid overflow, we need: base * x_add / x_sub < 32768 diff --git a/drivers/webp/dsp/upsampling_mips_dsp_r2.c b/drivers/webp/dsp/upsampling_mips_dsp_r2.c index 46f207b43e..ed2eb74825 100644 --- a/drivers/webp/dsp/upsampling_mips_dsp_r2.c +++ b/drivers/webp/dsp/upsampling_mips_dsp_r2.c @@ -22,21 +22,21 @@ #if !defined(WEBP_YUV_USE_TABLE) #define YUV_TO_RGB(Y, U, V, R, G, B) do { \ - const int t1 = kYScale * Y; \ - const int t2 = kVToG * V; \ - R = kVToR * V; \ - G = kUToG * U; \ - B = kUToB * U; \ + const int t1 = MultHi(Y, 19077); \ + const int t2 = MultHi(V, 13320); \ + R = MultHi(V, 26149); \ + G = MultHi(U, 6419); \ + B = MultHi(U, 33050); \ R = t1 + R; \ G = t1 - G; \ B = t1 + B; \ - R = R + kRCst; \ - G = G - t2 + kGCst; \ - B = B + kBCst; \ + R = R - 14234; \ + G = G - t2 + 8708; \ + B = B - 17685; \ __asm__ volatile ( \ - "shll_s.w %[" #R "], %[" #R "], 9 \n\t" \ - "shll_s.w %[" #G "], %[" #G "], 9 \n\t" \ - "shll_s.w %[" #B "], %[" #B "], 9 \n\t" \ + "shll_s.w %[" #R "], %[" #R "], 17 \n\t" \ + "shll_s.w %[" #G "], %[" #G "], 17 \n\t" \ + "shll_s.w %[" #B "], %[" #B "], 17 \n\t" \ "precrqu_s.qb.ph %[" #R "], %[" #R "], $zero \n\t" \ "precrqu_s.qb.ph %[" #G "], %[" #G "], $zero \n\t" \ "precrqu_s.qb.ph %[" #B "], %[" #B "], $zero \n\t" \ diff --git a/drivers/webp/dsp/upsampling_neon.c b/drivers/webp/dsp/upsampling_neon.c index a8384c2149..2b0c99bddb 100644 --- a/drivers/webp/dsp/upsampling_neon.c +++ b/drivers/webp/dsp/upsampling_neon.c @@ -89,9 +89,11 @@ static void Upsample16Pixels(const uint8_t *r1, const uint8_t *r2, 
//----------------------------------------------------------------------------- // YUV->RGB conversion -static const int16_t kCoeffs[4] = { kYScale, kVToR, kUToG, kVToG }; +// note: we represent the 33050 large constant as 32768 + 282 +static const int16_t kCoeffs1[4] = { 19077, 26149, 6419, 13320 }; #define v255 vdup_n_u8(255) +#define v_0x0f vdup_n_u8(15) #define STORE_Rgb(out, r, g, b) do { \ uint8x8x3_t r_g_b; \ @@ -117,38 +119,67 @@ static const int16_t kCoeffs[4] = { kYScale, kVToR, kUToG, kVToG }; vst4_u8(out, b_g_r_v255); \ } while (0) -#define CONVERT8(FMT, XSTEP, N, src_y, src_uv, out, cur_x) { \ +#define STORE_Argb(out, r, g, b) do { \ + uint8x8x4_t v255_r_g_b; \ + INIT_VECTOR4(v255_r_g_b, v255, r, g, b); \ + vst4_u8(out, v255_r_g_b); \ +} while (0) + +#if !defined(WEBP_SWAP_16BIT_CSP) +#define ZIP_U8(lo, hi) vzip_u8((lo), (hi)) +#else +#define ZIP_U8(lo, hi) vzip_u8((hi), (lo)) +#endif + +#define STORE_Rgba4444(out, r, g, b) do { \ + const uint8x8_t r1 = vshl_n_u8(vshr_n_u8(r, 4), 4); /* 4bits */ \ + const uint8x8_t g1 = vshr_n_u8(g, 4); \ + const uint8x8_t ba = vorr_u8(b, v_0x0f); \ + const uint8x8_t rg = vorr_u8(r1, g1); \ + const uint8x8x2_t rgba4444 = ZIP_U8(rg, ba); \ + vst1q_u8(out, vcombine_u8(rgba4444.val[0], rgba4444.val[1])); \ +} while (0) + +#define STORE_Rgb565(out, r, g, b) do { \ + const uint8x8_t r1 = vshl_n_u8(vshr_n_u8(r, 3), 3); /* 5bits */ \ + const uint8x8_t g1 = vshr_n_u8(g, 5); /* upper 3bits */\ + const uint8x8_t g2 = vshl_n_u8(vshr_n_u8(g, 2), 5); /* lower 3bits */\ + const uint8x8_t b1 = vshr_n_u8(b, 3); /* 5bits */ \ + const uint8x8_t rg = vorr_u8(r1, g1); \ + const uint8x8_t gb = vorr_u8(g2, b1); \ + const uint8x8x2_t rgb565 = ZIP_U8(rg, gb); \ + vst1q_u8(out, vcombine_u8(rgb565.val[0], rgb565.val[1])); \ +} while (0) + +#define CONVERT8(FMT, XSTEP, N, src_y, src_uv, out, cur_x) do { \ int i; \ for (i = 0; i < N; i += 8) { \ const int off = ((cur_x) + i) * XSTEP; \ - uint8x8_t y = vld1_u8((src_y) + (cur_x) + i); \ - uint8x8_t u = vld1_u8((src_uv) + i); \ - uint8x8_t v = vld1_u8((src_uv) + i + 16); \ - const int16x8_t yy = vreinterpretq_s16_u16(vsubl_u8(y, u16)); \ - const int16x8_t uu = vreinterpretq_s16_u16(vsubl_u8(u, u128)); \ - const int16x8_t vv = vreinterpretq_s16_u16(vsubl_u8(v, u128)); \ - int32x4_t yl = vmull_lane_s16(vget_low_s16(yy), cf16, 0); \ - int32x4_t yh = vmull_lane_s16(vget_high_s16(yy), cf16, 0); \ - const int32x4_t rl = vmlal_lane_s16(yl, vget_low_s16(vv), cf16, 1);\ - const int32x4_t rh = vmlal_lane_s16(yh, vget_high_s16(vv), cf16, 1);\ - int32x4_t gl = vmlsl_lane_s16(yl, vget_low_s16(uu), cf16, 2); \ - int32x4_t gh = vmlsl_lane_s16(yh, vget_high_s16(uu), cf16, 2); \ - const int32x4_t bl = vmovl_s16(vget_low_s16(uu)); \ - const int32x4_t bh = vmovl_s16(vget_high_s16(uu)); \ - gl = vmlsl_lane_s16(gl, vget_low_s16(vv), cf16, 3); \ - gh = vmlsl_lane_s16(gh, vget_high_s16(vv), cf16, 3); \ - yl = vmlaq_lane_s32(yl, bl, cf32, 0); \ - yh = vmlaq_lane_s32(yh, bh, cf32, 0); \ - /* vrshrn_n_s32() already incorporates the rounding constant */ \ - y = vqmovun_s16(vcombine_s16(vrshrn_n_s32(rl, YUV_FIX2), \ - vrshrn_n_s32(rh, YUV_FIX2))); \ - u = vqmovun_s16(vcombine_s16(vrshrn_n_s32(gl, YUV_FIX2), \ - vrshrn_n_s32(gh, YUV_FIX2))); \ - v = vqmovun_s16(vcombine_s16(vrshrn_n_s32(yl, YUV_FIX2), \ - vrshrn_n_s32(yh, YUV_FIX2))); \ - STORE_ ## FMT(out + off, y, u, v); \ + const uint8x8_t y = vld1_u8((src_y) + (cur_x) + i); \ + const uint8x8_t u = vld1_u8((src_uv) + i + 0); \ + const uint8x8_t v = vld1_u8((src_uv) + i + 16); \ + const int16x8_t Y0 
= vreinterpretq_s16_u16(vshll_n_u8(y, 7)); \ + const int16x8_t U0 = vreinterpretq_s16_u16(vshll_n_u8(u, 7)); \ + const int16x8_t V0 = vreinterpretq_s16_u16(vshll_n_u8(v, 7)); \ + const int16x8_t Y1 = vqdmulhq_lane_s16(Y0, coeff1, 0); \ + const int16x8_t R0 = vqdmulhq_lane_s16(V0, coeff1, 1); \ + const int16x8_t G0 = vqdmulhq_lane_s16(U0, coeff1, 2); \ + const int16x8_t G1 = vqdmulhq_lane_s16(V0, coeff1, 3); \ + const int16x8_t B0 = vqdmulhq_n_s16(U0, 282); \ + const int16x8_t R1 = vqaddq_s16(Y1, R_Rounder); \ + const int16x8_t G2 = vqaddq_s16(Y1, G_Rounder); \ + const int16x8_t B1 = vqaddq_s16(Y1, B_Rounder); \ + const int16x8_t R2 = vqaddq_s16(R0, R1); \ + const int16x8_t G3 = vqaddq_s16(G0, G1); \ + const int16x8_t B2 = vqaddq_s16(B0, B1); \ + const int16x8_t G4 = vqsubq_s16(G2, G3); \ + const int16x8_t B3 = vqaddq_s16(B2, U0); \ + const uint8x8_t R = vqshrun_n_s16(R2, YUV_FIX2); \ + const uint8x8_t G = vqshrun_n_s16(G4, YUV_FIX2); \ + const uint8x8_t B = vqshrun_n_s16(B3, YUV_FIX2); \ + STORE_ ## FMT(out + off, R, G, B); \ } \ -} +} while (0) #define CONVERT1(FUNC, XSTEP, N, src_y, src_uv, rgb, cur_x) { \ int i; \ @@ -163,9 +194,9 @@ static const int16_t kCoeffs[4] = { kYScale, kVToR, kUToG, kVToG }; #define CONVERT2RGB_8(FMT, XSTEP, top_y, bottom_y, uv, \ top_dst, bottom_dst, cur_x, len) { \ - CONVERT8(FMT, XSTEP, len, top_y, uv, top_dst, cur_x) \ + CONVERT8(FMT, XSTEP, len, top_y, uv, top_dst, cur_x); \ if (bottom_y != NULL) { \ - CONVERT8(FMT, XSTEP, len, bottom_y, (uv) + 32, bottom_dst, cur_x) \ + CONVERT8(FMT, XSTEP, len, bottom_y, (uv) + 32, bottom_dst, cur_x); \ } \ } @@ -195,10 +226,10 @@ static void FUNC_NAME(const uint8_t *top_y, const uint8_t *bottom_y, \ const int u_diag = ((top_u[0] + cur_u[0]) >> 1) + 1; \ const int v_diag = ((top_v[0] + cur_v[0]) >> 1) + 1; \ \ - const int16x4_t cf16 = vld1_s16(kCoeffs); \ - const int32x2_t cf32 = vdup_n_s32(kUToB); \ - const uint8x8_t u16 = vdup_n_u8(16); \ - const uint8x8_t u128 = vdup_n_u8(128); \ + const int16x4_t coeff1 = vld1_s16(kCoeffs1); \ + const int16x8_t R_Rounder = vdupq_n_s16(-14234); \ + const int16x8_t G_Rounder = vdupq_n_s16(8708); \ + const int16x8_t B_Rounder = vdupq_n_s16(-17685); \ \ /* Treat the first pixel in regular way */ \ assert(top_y != NULL); \ @@ -235,6 +266,9 @@ NEON_UPSAMPLE_FUNC(UpsampleRgbLinePair, Rgb, 3) NEON_UPSAMPLE_FUNC(UpsampleBgrLinePair, Bgr, 3) NEON_UPSAMPLE_FUNC(UpsampleRgbaLinePair, Rgba, 4) NEON_UPSAMPLE_FUNC(UpsampleBgraLinePair, Bgra, 4) +NEON_UPSAMPLE_FUNC(UpsampleArgbLinePair, Argb, 4) +NEON_UPSAMPLE_FUNC(UpsampleRgba4444LinePair, Rgba4444, 2) +NEON_UPSAMPLE_FUNC(UpsampleRgb565LinePair, Rgb565, 2) //------------------------------------------------------------------------------ // Entry point @@ -248,8 +282,13 @@ WEBP_TSAN_IGNORE_FUNCTION void WebPInitUpsamplersNEON(void) { WebPUpsamplers[MODE_RGBA] = UpsampleRgbaLinePair; WebPUpsamplers[MODE_BGR] = UpsampleBgrLinePair; WebPUpsamplers[MODE_BGRA] = UpsampleBgraLinePair; + WebPUpsamplers[MODE_ARGB] = UpsampleArgbLinePair; WebPUpsamplers[MODE_rgbA] = UpsampleRgbaLinePair; WebPUpsamplers[MODE_bgrA] = UpsampleBgraLinePair; + WebPUpsamplers[MODE_Argb] = UpsampleArgbLinePair; + WebPUpsamplers[MODE_RGB_565] = UpsampleRgb565LinePair; + WebPUpsamplers[MODE_RGBA_4444] = UpsampleRgba4444LinePair; + WebPUpsamplers[MODE_rgbA_4444] = UpsampleRgba4444LinePair; } #endif // FANCY_UPSAMPLING diff --git a/drivers/webp/dsp/upsampling_sse2.c b/drivers/webp/dsp/upsampling_sse2.c index b85808e271..b5b668900f 100644 --- a/drivers/webp/dsp/upsampling_sse2.c +++ 
b/drivers/webp/dsp/upsampling_sse2.c @@ -173,6 +173,9 @@ SSE2_UPSAMPLE_FUNC(UpsampleRgbLinePair, VP8YuvToRgb, 3) SSE2_UPSAMPLE_FUNC(UpsampleBgrLinePair, VP8YuvToBgr, 3) SSE2_UPSAMPLE_FUNC(UpsampleRgbaLinePair, VP8YuvToRgba, 4) SSE2_UPSAMPLE_FUNC(UpsampleBgraLinePair, VP8YuvToBgra, 4) +SSE2_UPSAMPLE_FUNC(UpsampleArgbLinePair, VP8YuvToArgb, 4) +SSE2_UPSAMPLE_FUNC(UpsampleRgba4444LinePair, VP8YuvToRgba4444, 2) +SSE2_UPSAMPLE_FUNC(UpsampleRgb565LinePair, VP8YuvToRgb565, 2) #undef GET_M #undef PACK_AND_STORE @@ -190,13 +193,17 @@ extern WebPUpsampleLinePairFunc WebPUpsamplers[/* MODE_LAST */]; extern void WebPInitUpsamplersSSE2(void); WEBP_TSAN_IGNORE_FUNCTION void WebPInitUpsamplersSSE2(void) { - VP8YUVInitSSE2(); WebPUpsamplers[MODE_RGB] = UpsampleRgbLinePair; WebPUpsamplers[MODE_RGBA] = UpsampleRgbaLinePair; WebPUpsamplers[MODE_BGR] = UpsampleBgrLinePair; WebPUpsamplers[MODE_BGRA] = UpsampleBgraLinePair; + WebPUpsamplers[MODE_ARGB] = UpsampleArgbLinePair; WebPUpsamplers[MODE_rgbA] = UpsampleRgbaLinePair; WebPUpsamplers[MODE_bgrA] = UpsampleBgraLinePair; + WebPUpsamplers[MODE_Argb] = UpsampleArgbLinePair; + WebPUpsamplers[MODE_RGB_565] = UpsampleRgb565LinePair; + WebPUpsamplers[MODE_RGBA_4444] = UpsampleRgba4444LinePair; + WebPUpsamplers[MODE_rgbA_4444] = UpsampleRgba4444LinePair; } #endif // FANCY_UPSAMPLING @@ -225,7 +232,6 @@ YUV444_FUNC(Yuv444ToRgb, VP8YuvToRgb32, 3); YUV444_FUNC(Yuv444ToBgr, VP8YuvToBgr32, 3); WEBP_TSAN_IGNORE_FUNCTION void WebPInitYUV444ConvertersSSE2(void) { - VP8YUVInitSSE2(); WebPYUV444Converters[MODE_RGBA] = Yuv444ToRgba; WebPYUV444Converters[MODE_BGRA] = Yuv444ToBgra; WebPYUV444Converters[MODE_RGB] = Yuv444ToRgb; diff --git a/drivers/webp/dsp/yuv.h b/drivers/webp/dsp/yuv.h index af435a5b3e..01c40fcb84 100644 --- a/drivers/webp/dsp/yuv.h +++ b/drivers/webp/dsp/yuv.h @@ -21,16 +21,15 @@ // G = 1.164 * (Y-16) - 0.813 * (V-128) - 0.391 * (U-128) // B = 1.164 * (Y-16) + 2.018 * (U-128) // where Y is in the [16,235] range, and U/V in the [16,240] range. -// In the table-lookup version (WEBP_YUV_USE_TABLE), the common factor -// "1.164 * (Y-16)" can be handled as an offset in the VP8kClip[] table. -// So in this case the formulae should read: -// R = 1.164 * [Y + 1.371 * (V-128) ] - 18.624 -// G = 1.164 * [Y - 0.698 * (V-128) - 0.336 * (U-128)] - 18.624 -// B = 1.164 * [Y + 1.733 * (U-128)] - 18.624 -// once factorized. -// For YUV->RGB conversion, only 14bit fixed precision is used (YUV_FIX2). -// That's the maximum possible for a convenient ARM implementation. // +// The fixed-point implementation used here is: +// R = (19077 . y + 26149 . v - 14234) >> 6 +// G = (19077 . y - 6419 . u - 13320 . v + 8708) >> 6 +// B = (19077 . y + 33050 . u - 17685) >> 6 +// where the '.' operator is the mulhi_epu16 variant: +// a . b = ((a << 8) * b) >> 16 +// that preserves 8 bits of fractional precision before final descaling. + // Author: Skal (pascal.massimino@gmail.com) #ifndef WEBP_DSP_YUV_H_ @@ -39,9 +38,6 @@ #include "./dsp.h" #include "../dec/decode_vp8.h" -// Define the following to use the LUT-based code: -// #define WEBP_YUV_USE_TABLE - #if defined(WEBP_EXPERIMENTAL_FEATURES) // Do NOT activate this feature for real compression. This is only experimental! // This flag is for comparison purpose against JPEG's "YUVj" natural colorspace. 
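// (Editorial sketch, not part of the patch: a standalone scalar model of the
//  fixed-point formulas documented in the header comment above. mult_hi() and
//  clip8() mirror the behaviour of the MultHi()/VP8Clip8() helpers introduced
//  further down in yuv.h; these names and the main() driver are illustrative
//  only.)
#include <stdio.h>

static int mult_hi(int v, int coeff) {   // (v * coeff) >> 8, like MultHi()
  return (v * coeff) >> 8;
}

static int clip8(int v) {                // descale by YUV_FIX2 = 6, clamp to [0,255]
  v >>= 6;
  return (v < 0) ? 0 : (v > 255) ? 255 : v;
}

int main(void) {
  const int y = 128, u = 128, v = 128;   // neutral chroma, mid-range luma
  const int r = clip8(mult_hi(y, 19077) + mult_hi(v, 26149) - 14234);
  const int g = clip8(mult_hi(y, 19077) - mult_hi(u, 6419)
                                        - mult_hi(v, 13320) + 8708);
  const int b = clip8(mult_hi(y, 19077) + mult_hi(u, 33050) - 17685);
  printf("%d %d %d\n", r, g, b);         // prints "130 130 130", i.e. 1.164*(128-16)
  return 0;
}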
@@ -66,41 +62,32 @@ enum { YUV_RANGE_MIN = -227, // min value of r/g/b output YUV_RANGE_MAX = 256 + 226, // max value of r/g/b output - YUV_FIX2 = 14, // fixed-point precision for YUV->RGB - YUV_HALF2 = 1 << (YUV_FIX2 - 1), + YUV_FIX2 = 6, // fixed-point precision for YUV->RGB + YUV_HALF2 = 1 << YUV_FIX2 >> 1, YUV_MASK2 = (256 << YUV_FIX2) - 1 }; -// These constants are 14b fixed-point version of ITU-R BT.601 constants. -#define kYScale 19077 // 1.164 = 255 / 219 -#define kVToR 26149 // 1.596 = 255 / 112 * 0.701 -#define kUToG 6419 // 0.391 = 255 / 112 * 0.886 * 0.114 / 0.587 -#define kVToG 13320 // 0.813 = 255 / 112 * 0.701 * 0.299 / 0.587 -#define kUToB 33050 // 2.018 = 255 / 112 * 0.886 -#define kRCst (-kYScale * 16 - kVToR * 128 + YUV_HALF2) -#define kGCst (-kYScale * 16 + kUToG * 128 + kVToG * 128 + YUV_HALF2) -#define kBCst (-kYScale * 16 - kUToB * 128 + YUV_HALF2) - //------------------------------------------------------------------------------ +// slower on x86 by ~7-8%, but bit-exact with the SSE2/NEON version -#if !defined(WEBP_YUV_USE_TABLE) - -// slower on x86 by ~7-8%, but bit-exact with the SSE2 version +static WEBP_INLINE int MultHi(int v, int coeff) { // _mm_mulhi_epu16 emulation + return (v * coeff) >> 8; +} static WEBP_INLINE int VP8Clip8(int v) { return ((v & ~YUV_MASK2) == 0) ? (v >> YUV_FIX2) : (v < 0) ? 0 : 255; } static WEBP_INLINE int VP8YUVToR(int y, int v) { - return VP8Clip8(kYScale * y + kVToR * v + kRCst); + return VP8Clip8(MultHi(y, 19077) + MultHi(v, 26149) - 14234); } static WEBP_INLINE int VP8YUVToG(int y, int u, int v) { - return VP8Clip8(kYScale * y - kUToG * u - kVToG * v + kGCst); + return VP8Clip8(MultHi(y, 19077) - MultHi(u, 6419) - MultHi(v, 13320) + 8708); } static WEBP_INLINE int VP8YUVToB(int y, int u) { - return VP8Clip8(kYScale * y + kUToB * u + kBCst); + return VP8Clip8(MultHi(y, 19077) + MultHi(u, 33050) - 17685); } static WEBP_INLINE void VP8YuvToRgb(int y, int u, int v, @@ -149,73 +136,6 @@ static WEBP_INLINE void VP8YuvToRgba4444(int y, int u, int v, #endif } -#else - -// Table-based version, not totally equivalent to the SSE2 version. -// Rounding diff is only +/-1 though. 
- -extern int16_t VP8kVToR[256], VP8kUToB[256]; -extern int32_t VP8kVToG[256], VP8kUToG[256]; -extern uint8_t VP8kClip[YUV_RANGE_MAX - YUV_RANGE_MIN]; -extern uint8_t VP8kClip4Bits[YUV_RANGE_MAX - YUV_RANGE_MIN]; - -static WEBP_INLINE void VP8YuvToRgb(int y, int u, int v, - uint8_t* const rgb) { - const int r_off = VP8kVToR[v]; - const int g_off = (VP8kVToG[v] + VP8kUToG[u]) >> YUV_FIX; - const int b_off = VP8kUToB[u]; - rgb[0] = VP8kClip[y + r_off - YUV_RANGE_MIN]; - rgb[1] = VP8kClip[y + g_off - YUV_RANGE_MIN]; - rgb[2] = VP8kClip[y + b_off - YUV_RANGE_MIN]; -} - -static WEBP_INLINE void VP8YuvToBgr(int y, int u, int v, - uint8_t* const bgr) { - const int r_off = VP8kVToR[v]; - const int g_off = (VP8kVToG[v] + VP8kUToG[u]) >> YUV_FIX; - const int b_off = VP8kUToB[u]; - bgr[0] = VP8kClip[y + b_off - YUV_RANGE_MIN]; - bgr[1] = VP8kClip[y + g_off - YUV_RANGE_MIN]; - bgr[2] = VP8kClip[y + r_off - YUV_RANGE_MIN]; -} - -static WEBP_INLINE void VP8YuvToRgb565(int y, int u, int v, - uint8_t* const rgb) { - const int r_off = VP8kVToR[v]; - const int g_off = (VP8kVToG[v] + VP8kUToG[u]) >> YUV_FIX; - const int b_off = VP8kUToB[u]; - const int rg = ((VP8kClip[y + r_off - YUV_RANGE_MIN] & 0xf8) | - (VP8kClip[y + g_off - YUV_RANGE_MIN] >> 5)); - const int gb = (((VP8kClip[y + g_off - YUV_RANGE_MIN] << 3) & 0xe0) | - (VP8kClip[y + b_off - YUV_RANGE_MIN] >> 3)); -#ifdef WEBP_SWAP_16BIT_CSP - rgb[0] = gb; - rgb[1] = rg; -#else - rgb[0] = rg; - rgb[1] = gb; -#endif -} - -static WEBP_INLINE void VP8YuvToRgba4444(int y, int u, int v, - uint8_t* const argb) { - const int r_off = VP8kVToR[v]; - const int g_off = (VP8kVToG[v] + VP8kUToG[u]) >> YUV_FIX; - const int b_off = VP8kUToB[u]; - const int rg = ((VP8kClip4Bits[y + r_off - YUV_RANGE_MIN] << 4) | - VP8kClip4Bits[y + g_off - YUV_RANGE_MIN]); - const int ba = (VP8kClip4Bits[y + b_off - YUV_RANGE_MIN] << 4) | 0x0f; -#ifdef WEBP_SWAP_16BIT_CSP - argb[0] = ba; - argb[1] = rg; -#else - argb[0] = rg; - argb[1] = ba; -#endif -} - -#endif // WEBP_YUV_USE_TABLE - //----------------------------------------------------------------------------- // Alpha handling variants @@ -245,11 +165,7 @@ void VP8YUVInit(void); #if defined(WEBP_USE_SSE2) -// When the following is defined, tables are initialized statically, adding ~12k -// to the binary size. Otherwise, they are initialized at run-time (small cost). -#define WEBP_YUV_USE_SSE2_TABLES - -// Process 32 pixels and store the result (24b or 32b per pixel) in *dst. +// Process 32 pixels and store the result (16b, 24b or 32b per pixel) in *dst. void VP8YuvToRgba32(const uint8_t* y, const uint8_t* u, const uint8_t* v, uint8_t* dst); void VP8YuvToRgb32(const uint8_t* y, const uint8_t* u, const uint8_t* v, @@ -258,9 +174,12 @@ void VP8YuvToBgra32(const uint8_t* y, const uint8_t* u, const uint8_t* v, uint8_t* dst); void VP8YuvToBgr32(const uint8_t* y, const uint8_t* u, const uint8_t* v, uint8_t* dst); - -// Must be called to initialize tables before using the functions. 
-void VP8YUVInitSSE2(void); +void VP8YuvToArgb32(const uint8_t* y, const uint8_t* u, const uint8_t* v, + uint8_t* dst); +void VP8YuvToRgba444432(const uint8_t* y, const uint8_t* u, const uint8_t* v, + uint8_t* dst); +void VP8YuvToRgb56532(const uint8_t* y, const uint8_t* u, const uint8_t* v, + uint8_t* dst); #endif // WEBP_USE_SSE2 diff --git a/drivers/webp/dsp/yuv_mips32.c b/drivers/webp/dsp/yuv_mips32.c index 018f8ab774..e61aac571f 100644 --- a/drivers/webp/dsp/yuv_mips32.c +++ b/drivers/webp/dsp/yuv_mips32.c @@ -28,19 +28,19 @@ static void FUNC_NAME(const uint8_t* y, \ int i, r, g, b; \ int temp0, temp1, temp2, temp3, temp4; \ for (i = 0; i < (len >> 1); i++) { \ - temp1 = kVToR * v[0]; \ - temp3 = kVToG * v[0]; \ - temp2 = kUToG * u[0]; \ - temp4 = kUToB * u[0]; \ - temp0 = kYScale * y[0]; \ - temp1 += kRCst; \ - temp3 -= kGCst; \ + temp1 = MultHi(v[0], 26149); \ + temp3 = MultHi(v[0], 13320); \ + temp2 = MultHi(u[0], 6419); \ + temp4 = MultHi(u[0], 33050); \ + temp0 = MultHi(y[0], 19077); \ + temp1 -= 14234; \ + temp3 -= 8708; \ temp2 += temp3; \ - temp4 += kBCst; \ + temp4 -= 17685; \ r = VP8Clip8(temp0 + temp1); \ g = VP8Clip8(temp0 - temp2); \ b = VP8Clip8(temp0 + temp4); \ - temp0 = kYScale * y[1]; \ + temp0 = MultHi(y[1], 19077); \ dst[R] = r; \ dst[G] = g; \ dst[B] = b; \ @@ -58,15 +58,15 @@ static void FUNC_NAME(const uint8_t* y, \ dst += 2 * XSTEP; \ } \ if (len & 1) { \ - temp1 = kVToR * v[0]; \ - temp3 = kVToG * v[0]; \ - temp2 = kUToG * u[0]; \ - temp4 = kUToB * u[0]; \ - temp0 = kYScale * y[0]; \ - temp1 += kRCst; \ - temp3 -= kGCst; \ + temp1 = MultHi(v[0], 26149); \ + temp3 = MultHi(v[0], 13320); \ + temp2 = MultHi(u[0], 6419); \ + temp4 = MultHi(u[0], 33050); \ + temp0 = MultHi(y[0], 19077); \ + temp1 -= 14234; \ + temp3 -= 8708; \ temp2 += temp3; \ - temp4 += kBCst; \ + temp4 -= 17685; \ r = VP8Clip8(temp0 + temp1); \ g = VP8Clip8(temp0 - temp2); \ b = VP8Clip8(temp0 + temp4); \ diff --git a/drivers/webp/dsp/yuv_mips_dsp_r2.c b/drivers/webp/dsp/yuv_mips_dsp_r2.c index 45a2200352..1720d4190f 100644 --- a/drivers/webp/dsp/yuv_mips_dsp_r2.c +++ b/drivers/webp/dsp/yuv_mips_dsp_r2.c @@ -30,10 +30,10 @@ "mul %[temp2], %[t_con_3], %[temp4] \n\t" \ "mul %[temp4], %[t_con_4], %[temp4] \n\t" \ "mul %[temp0], %[t_con_5], %[temp0] \n\t" \ - "addu %[temp1], %[temp1], %[t_con_6] \n\t" \ + "subu %[temp1], %[temp1], %[t_con_6] \n\t" \ "subu %[temp3], %[temp3], %[t_con_7] \n\t" \ "addu %[temp2], %[temp2], %[temp3] \n\t" \ - "addu %[temp4], %[temp4], %[t_con_8] \n\t" \ + "subu %[temp4], %[temp4], %[t_con_8] \n\t" \ #define ROW_FUNC_PART_2(R, G, B, K) \ "addu %[temp5], %[temp0], %[temp1] \n\t" \ @@ -42,12 +42,12 @@ ".if " #K " \n\t" \ "lbu %[temp0], 1(%[y]) \n\t" \ ".endif \n\t" \ - "shll_s.w %[temp5], %[temp5], 9 \n\t" \ - "shll_s.w %[temp6], %[temp6], 9 \n\t" \ + "shll_s.w %[temp5], %[temp5], 17 \n\t" \ + "shll_s.w %[temp6], %[temp6], 17 \n\t" \ ".if " #K " \n\t" \ "mul %[temp0], %[t_con_5], %[temp0] \n\t" \ ".endif \n\t" \ - "shll_s.w %[temp7], %[temp7], 9 \n\t" \ + "shll_s.w %[temp7], %[temp7], 17 \n\t" \ "precrqu_s.qb.ph %[temp5], %[temp5], $zero \n\t" \ "precrqu_s.qb.ph %[temp6], %[temp6], $zero \n\t" \ "precrqu_s.qb.ph %[temp7], %[temp7], $zero \n\t" \ @@ -74,14 +74,14 @@ static void FUNC_NAME(const uint8_t* y, \ uint8_t* dst, int len) { \ int i; \ uint32_t temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7; \ - const int t_con_1 = kVToR; \ - const int t_con_2 = kVToG; \ - const int t_con_3 = kUToG; \ - const int t_con_4 = kUToB; \ - const int t_con_5 = kYScale; \ - const int 
t_con_6 = kRCst; \ - const int t_con_7 = kGCst; \ - const int t_con_8 = kBCst; \ + const int t_con_1 = 26149; \ + const int t_con_2 = 13320; \ + const int t_con_3 = 6419; \ + const int t_con_4 = 33050; \ + const int t_con_5 = 19077; \ + const int t_con_6 = 14234; \ + const int t_con_7 = 8708; \ + const int t_con_8 = 17685; \ for (i = 0; i < (len >> 1); i++) { \ __asm__ volatile ( \ ROW_FUNC_PART_1() \ diff --git a/drivers/webp/dsp/yuv_sse2.c b/drivers/webp/dsp/yuv_sse2.c index 283b3af228..e19bddff6c 100644 --- a/drivers/webp/dsp/yuv_sse2.c +++ b/drivers/webp/dsp/yuv_sse2.c @@ -16,172 +16,294 @@ #if defined(WEBP_USE_SSE2) #include <emmintrin.h> -#include <string.h> // for memcpy -typedef union { // handy struct for converting SSE2 registers - int32_t i32[4]; - uint8_t u8[16]; - __m128i m; -} VP8kCstSSE2; - -#if defined(WEBP_YUV_USE_SSE2_TABLES) - -#include "./yuv_tables_sse2.h" - -WEBP_TSAN_IGNORE_FUNCTION void VP8YUVInitSSE2(void) {} +//----------------------------------------------------------------------------- +// Convert spans of 32 pixels to various RGB formats for the fancy upsampler. -#else +// These constants are 14b fixed-point version of ITU-R BT.601 constants. +// R = (19077 * y + 26149 * v - 14234) >> 6 +// G = (19077 * y - 6419 * u - 13320 * v + 8708) >> 6 +// B = (19077 * y + 33050 * u - 17685) >> 6 +static void ConvertYUV444ToRGB(const __m128i* const Y0, + const __m128i* const U0, + const __m128i* const V0, + __m128i* const R, + __m128i* const G, + __m128i* const B) { + const __m128i k19077 = _mm_set1_epi16(19077); + const __m128i k26149 = _mm_set1_epi16(26149); + const __m128i k14234 = _mm_set1_epi16(14234); + // 33050 doesn't fit in a signed short: only use this with unsigned arithmetic + const __m128i k33050 = _mm_set1_epi16((short)33050); + const __m128i k17685 = _mm_set1_epi16(17685); + const __m128i k6419 = _mm_set1_epi16(6419); + const __m128i k13320 = _mm_set1_epi16(13320); + const __m128i k8708 = _mm_set1_epi16(8708); + + const __m128i Y1 = _mm_mulhi_epu16(*Y0, k19077); + + const __m128i R0 = _mm_mulhi_epu16(*V0, k26149); + const __m128i R1 = _mm_sub_epi16(Y1, k14234); + const __m128i R2 = _mm_add_epi16(R1, R0); + + const __m128i G0 = _mm_mulhi_epu16(*U0, k6419); + const __m128i G1 = _mm_mulhi_epu16(*V0, k13320); + const __m128i G2 = _mm_add_epi16(Y1, k8708); + const __m128i G3 = _mm_add_epi16(G0, G1); + const __m128i G4 = _mm_sub_epi16(G2, G3); + + // be careful with the saturated *unsigned* arithmetic here! 
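  // (Editorial note, not part of the patch: Y1 and the blue contribution
  //  computed just below are each close to the 16-bit signed limit, so their
  //  sum can exceed 32767. The blue channel therefore uses the unsigned
  //  saturating _mm_adds_epu16/_mm_subs_epu16, and B2 is descaled further
  //  down with a logical rather than arithmetic shift.)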
+ const __m128i B0 = _mm_mulhi_epu16(*U0, k33050); + const __m128i B1 = _mm_adds_epu16(B0, Y1); + const __m128i B2 = _mm_subs_epu16(B1, k17685); + + // use logical shift for B2, which can be larger than 32767 + *R = _mm_srai_epi16(R2, 6); // range: [-14234, 30815] + *G = _mm_srai_epi16(G4, 6); // range: [-10953, 27710] + *B = _mm_srli_epi16(B2, 6); // range: [0, 34238] +} -static int done_sse2 = 0; -static VP8kCstSSE2 VP8kUtoRGBA[256], VP8kVtoRGBA[256], VP8kYtoRGBA[256]; - -WEBP_TSAN_IGNORE_FUNCTION void VP8YUVInitSSE2(void) { - if (!done_sse2) { - int i; - for (i = 0; i < 256; ++i) { - VP8kYtoRGBA[i].i32[0] = - VP8kYtoRGBA[i].i32[1] = - VP8kYtoRGBA[i].i32[2] = (i - 16) * kYScale + YUV_HALF2; - VP8kYtoRGBA[i].i32[3] = 0xff << YUV_FIX2; - - VP8kUtoRGBA[i].i32[0] = 0; - VP8kUtoRGBA[i].i32[1] = -kUToG * (i - 128); - VP8kUtoRGBA[i].i32[2] = kUToB * (i - 128); - VP8kUtoRGBA[i].i32[3] = 0; - - VP8kVtoRGBA[i].i32[0] = kVToR * (i - 128); - VP8kVtoRGBA[i].i32[1] = -kVToG * (i - 128); - VP8kVtoRGBA[i].i32[2] = 0; - VP8kVtoRGBA[i].i32[3] = 0; - } - done_sse2 = 1; - -#if 0 // code used to generate 'yuv_tables_sse2.h' - printf("static const VP8kCstSSE2 VP8kYtoRGBA[256] = {\n"); - for (i = 0; i < 256; ++i) { - printf(" {{0x%.8x, 0x%.8x, 0x%.8x, 0x%.8x}},\n", - VP8kYtoRGBA[i].i32[0], VP8kYtoRGBA[i].i32[1], - VP8kYtoRGBA[i].i32[2], VP8kYtoRGBA[i].i32[3]); - } - printf("};\n\n"); - printf("static const VP8kCstSSE2 VP8kUtoRGBA[256] = {\n"); - for (i = 0; i < 256; ++i) { - printf(" {{0, 0x%.8x, 0x%.8x, 0}},\n", - VP8kUtoRGBA[i].i32[1], VP8kUtoRGBA[i].i32[2]); - } - printf("};\n\n"); - printf("static VP8kCstSSE2 VP8kVtoRGBA[256] = {\n"); - for (i = 0; i < 256; ++i) { - printf(" {{0x%.8x, 0x%.8x, 0, 0}},\n", - VP8kVtoRGBA[i].i32[0], VP8kVtoRGBA[i].i32[1]); - } - printf("};\n\n"); -#endif - } +// Load the bytes into the *upper* part of 16b words. That's "<< 8", basically. 
+static WEBP_INLINE __m128i Load_HI_16(const uint8_t* src) { + const __m128i zero = _mm_setzero_si128(); + return _mm_unpacklo_epi8(zero, _mm_loadl_epi64((const __m128i*)src)); } -#endif // WEBP_YUV_USE_SSE2_TABLES +// Load and replicate the U/V samples +static WEBP_INLINE __m128i Load_UV_HI_8(const uint8_t* src) { + const __m128i zero = _mm_setzero_si128(); + const __m128i tmp0 = _mm_cvtsi32_si128(*(const uint32_t*)src); + const __m128i tmp1 = _mm_unpacklo_epi8(zero, tmp0); + return _mm_unpacklo_epi16(tmp1, tmp1); // replicate samples +} -//----------------------------------------------------------------------------- +// Convert 32 samples of YUV444 to R/G/B +static void YUV444ToRGB(const uint8_t* const y, + const uint8_t* const u, + const uint8_t* const v, + __m128i* const R, __m128i* const G, __m128i* const B) { + const __m128i Y0 = Load_HI_16(y), U0 = Load_HI_16(u), V0 = Load_HI_16(v); + ConvertYUV444ToRGB(&Y0, &U0, &V0, R, G, B); +} -static WEBP_INLINE __m128i LoadUVPart(int u, int v) { - const __m128i u_part = _mm_loadu_si128(&VP8kUtoRGBA[u].m); - const __m128i v_part = _mm_loadu_si128(&VP8kVtoRGBA[v].m); - const __m128i uv_part = _mm_add_epi32(u_part, v_part); - return uv_part; +// Convert 32 samples of YUV420 to R/G/B +static void YUV420ToRGB(const uint8_t* const y, + const uint8_t* const u, + const uint8_t* const v, + __m128i* const R, __m128i* const G, __m128i* const B) { + const __m128i Y0 = Load_HI_16(y), U0 = Load_UV_HI_8(u), V0 = Load_UV_HI_8(v); + ConvertYUV444ToRGB(&Y0, &U0, &V0, R, G, B); } -static WEBP_INLINE __m128i GetRGBA32bWithUV(int y, const __m128i uv_part) { - const __m128i y_part = _mm_loadu_si128(&VP8kYtoRGBA[y].m); - const __m128i rgba1 = _mm_add_epi32(y_part, uv_part); - const __m128i rgba2 = _mm_srai_epi32(rgba1, YUV_FIX2); - return rgba2; +// Pack R/G/B/A results into 32b output. +static WEBP_INLINE void PackAndStore4(const __m128i* const R, + const __m128i* const G, + const __m128i* const B, + const __m128i* const A, + uint8_t* const dst) { + const __m128i rb = _mm_packus_epi16(*R, *B); + const __m128i ga = _mm_packus_epi16(*G, *A); + const __m128i rg = _mm_unpacklo_epi8(rb, ga); + const __m128i ba = _mm_unpackhi_epi8(rb, ga); + const __m128i RGBA_lo = _mm_unpacklo_epi16(rg, ba); + const __m128i RGBA_hi = _mm_unpackhi_epi16(rg, ba); + _mm_storeu_si128((__m128i*)(dst + 0), RGBA_lo); + _mm_storeu_si128((__m128i*)(dst + 16), RGBA_hi); } -static WEBP_INLINE __m128i GetRGBA32b(int y, int u, int v) { - const __m128i uv_part = LoadUVPart(u, v); - return GetRGBA32bWithUV(y, uv_part); +// Pack R/G/B/A results into 16b output. +static WEBP_INLINE void PackAndStore4444(const __m128i* const R, + const __m128i* const G, + const __m128i* const B, + const __m128i* const A, + uint8_t* const dst) { +#if !defined(WEBP_SWAP_16BIT_CSP) + const __m128i rg0 = _mm_packus_epi16(*R, *G); + const __m128i ba0 = _mm_packus_epi16(*B, *A); +#else + const __m128i rg0 = _mm_packus_epi16(*B, *A); + const __m128i ba0 = _mm_packus_epi16(*R, *G); +#endif + const __m128i mask_0xf0 = _mm_set1_epi8(0xf0); + const __m128i rb1 = _mm_unpacklo_epi8(rg0, ba0); // rbrbrbrbrb... + const __m128i ga1 = _mm_unpackhi_epi8(rg0, ba0); // gagagagaga... 
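  // (Editorial note, not part of the patch: the masking below keeps only the
  //  top nibble of every interleaved byte -- r and b keep theirs in place,
  //  g and a have theirs shifted down by 4 -- so the final OR yields the
  //  packed r|g and b|a byte pairs of the RGBA4444 output.)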
+ const __m128i rb2 = _mm_and_si128(rb1, mask_0xf0); + const __m128i ga2 = _mm_srli_epi16(_mm_and_si128(ga1, mask_0xf0), 4); + const __m128i rgba4444 = _mm_or_si128(rb2, ga2); + _mm_storeu_si128((__m128i*)dst, rgba4444); } -static WEBP_INLINE void YuvToRgbSSE2(uint8_t y, uint8_t u, uint8_t v, - uint8_t* const rgb) { - const __m128i tmp0 = GetRGBA32b(y, u, v); - const __m128i tmp1 = _mm_packs_epi32(tmp0, tmp0); - const __m128i tmp2 = _mm_packus_epi16(tmp1, tmp1); - // Note: we store 8 bytes at a time, not 3 bytes! -> memory stomp - _mm_storel_epi64((__m128i*)rgb, tmp2); +// Pack R/G/B results into 16b output. +static WEBP_INLINE void PackAndStore565(const __m128i* const R, + const __m128i* const G, + const __m128i* const B, + uint8_t* const dst) { + const __m128i r0 = _mm_packus_epi16(*R, *R); + const __m128i g0 = _mm_packus_epi16(*G, *G); + const __m128i b0 = _mm_packus_epi16(*B, *B); + const __m128i r1 = _mm_and_si128(r0, _mm_set1_epi8(0xf8)); + const __m128i b1 = _mm_and_si128(_mm_srli_epi16(b0, 3), _mm_set1_epi8(0x1f)); + const __m128i g1 = _mm_srli_epi16(_mm_and_si128(g0, _mm_set1_epi8(0xe0)), 5); + const __m128i g2 = _mm_slli_epi16(_mm_and_si128(g0, _mm_set1_epi8(0x1c)), 3); + const __m128i rg = _mm_or_si128(r1, g1); + const __m128i gb = _mm_or_si128(g2, b1); +#if !defined(WEBP_SWAP_16BIT_CSP) + const __m128i rgb565 = _mm_unpacklo_epi8(rg, gb); +#else + const __m128i rgb565 = _mm_unpacklo_epi8(gb, rg); +#endif + _mm_storeu_si128((__m128i*)dst, rgb565); } -static WEBP_INLINE void YuvToBgrSSE2(uint8_t y, uint8_t u, uint8_t v, - uint8_t* const bgr) { - const __m128i tmp0 = GetRGBA32b(y, u, v); - const __m128i tmp1 = _mm_shuffle_epi32(tmp0, _MM_SHUFFLE(3, 0, 1, 2)); - const __m128i tmp2 = _mm_packs_epi32(tmp1, tmp1); - const __m128i tmp3 = _mm_packus_epi16(tmp2, tmp2); - // Note: we store 8 bytes at a time, not 3 bytes! -> memory stomp - _mm_storel_epi64((__m128i*)bgr, tmp3); +// Function used several times in PlanarTo24b. +// It samples the in buffer as follows: one every two unsigned char is stored +// at the beginning of the buffer, while the other half is stored at the end. +static WEBP_INLINE void PlanarTo24bHelper(const __m128i* const in /*in[6]*/, + __m128i* const out /*out[6]*/) { + const __m128i v_mask = _mm_set1_epi16(0x00ff); + + // Take one every two upper 8b values. + out[0] = _mm_packus_epi16(_mm_and_si128(in[0], v_mask), + _mm_and_si128(in[1], v_mask)); + out[1] = _mm_packus_epi16(_mm_and_si128(in[2], v_mask), + _mm_and_si128(in[3], v_mask)); + out[2] = _mm_packus_epi16(_mm_and_si128(in[4], v_mask), + _mm_and_si128(in[5], v_mask)); + // Take one every two lower 8b values. + out[3] = _mm_packus_epi16(_mm_srli_epi16(in[0], 8), _mm_srli_epi16(in[1], 8)); + out[4] = _mm_packus_epi16(_mm_srli_epi16(in[2], 8), _mm_srli_epi16(in[3], 8)); + out[5] = _mm_packus_epi16(_mm_srli_epi16(in[4], 8), _mm_srli_epi16(in[5], 8)); } -//----------------------------------------------------------------------------- -// Convert spans of 32 pixels to various RGB formats for the fancy upsampler. +// Pack the planar buffers +// rrrr... rrrr... gggg... gggg... bbbb... bbbb.... +// triplet by triplet in the output buffer rgb as rgbrgbrgbrgb ... +static WEBP_INLINE void PlanarTo24b(__m128i* const in /*in[6]*/, uint8_t* rgb) { + // The input is 6 registers of sixteen 8b but for the sake of explanation, + // let's take 6 registers of four 8b values. 
+ // To pack, we will keep taking one every two 8b integer and move it + // around as follows: + // Input: + // r0r1r2r3 | r4r5r6r7 | g0g1g2g3 | g4g5g6g7 | b0b1b2b3 | b4b5b6b7 + // Split the 6 registers in two sets of 3 registers: the first set as the even + // 8b bytes, the second the odd ones: + // r0r2r4r6 | g0g2g4g6 | b0b2b4b6 | r1r3r5r7 | g1g3g5g7 | b1b3b5b7 + // Repeat the same permutations twice more: + // r0r4g0g4 | b0b4r1r5 | g1g5b1b5 | r2r6g2g6 | b2b6r3r7 | g3g7b3b7 + // r0g0b0r1 | g1b1r2g2 | b2r3g3b3 | r4g4b4r5 | g5b5r6g6 | b6r7g7b7 + __m128i tmp[6]; + PlanarTo24bHelper(in, tmp); + PlanarTo24bHelper(tmp, in); + PlanarTo24bHelper(in, tmp); + // We need to do it two more times than the example as we have sixteen bytes. + PlanarTo24bHelper(tmp, in); + PlanarTo24bHelper(in, tmp); + + _mm_storeu_si128((__m128i*)(rgb + 0), tmp[0]); + _mm_storeu_si128((__m128i*)(rgb + 16), tmp[1]); + _mm_storeu_si128((__m128i*)(rgb + 32), tmp[2]); + _mm_storeu_si128((__m128i*)(rgb + 48), tmp[3]); + _mm_storeu_si128((__m128i*)(rgb + 64), tmp[4]); + _mm_storeu_si128((__m128i*)(rgb + 80), tmp[5]); +} +#undef MK_UINT32 void VP8YuvToRgba32(const uint8_t* y, const uint8_t* u, const uint8_t* v, uint8_t* dst) { + const __m128i kAlpha = _mm_set1_epi16(255); int n; - for (n = 0; n < 32; n += 4) { - const __m128i tmp0_1 = GetRGBA32b(y[n + 0], u[n + 0], v[n + 0]); - const __m128i tmp0_2 = GetRGBA32b(y[n + 1], u[n + 1], v[n + 1]); - const __m128i tmp0_3 = GetRGBA32b(y[n + 2], u[n + 2], v[n + 2]); - const __m128i tmp0_4 = GetRGBA32b(y[n + 3], u[n + 3], v[n + 3]); - const __m128i tmp1_1 = _mm_packs_epi32(tmp0_1, tmp0_2); - const __m128i tmp1_2 = _mm_packs_epi32(tmp0_3, tmp0_4); - const __m128i tmp2 = _mm_packus_epi16(tmp1_1, tmp1_2); - _mm_storeu_si128((__m128i*)dst, tmp2); - dst += 4 * 4; + for (n = 0; n < 32; n += 8, dst += 32) { + __m128i R, G, B; + YUV444ToRGB(y + n, u + n, v + n, &R, &G, &B); + PackAndStore4(&R, &G, &B, &kAlpha, dst); } } void VP8YuvToBgra32(const uint8_t* y, const uint8_t* u, const uint8_t* v, uint8_t* dst) { + const __m128i kAlpha = _mm_set1_epi16(255); int n; - for (n = 0; n < 32; n += 2) { - const __m128i tmp0_1 = GetRGBA32b(y[n + 0], u[n + 0], v[n + 0]); - const __m128i tmp0_2 = GetRGBA32b(y[n + 1], u[n + 1], v[n + 1]); - const __m128i tmp1_1 = _mm_shuffle_epi32(tmp0_1, _MM_SHUFFLE(3, 0, 1, 2)); - const __m128i tmp1_2 = _mm_shuffle_epi32(tmp0_2, _MM_SHUFFLE(3, 0, 1, 2)); - const __m128i tmp2_1 = _mm_packs_epi32(tmp1_1, tmp1_2); - const __m128i tmp3 = _mm_packus_epi16(tmp2_1, tmp2_1); - _mm_storel_epi64((__m128i*)dst, tmp3); - dst += 4 * 2; + for (n = 0; n < 32; n += 8, dst += 32) { + __m128i R, G, B; + YUV444ToRGB(y + n, u + n, v + n, &R, &G, &B); + PackAndStore4(&B, &G, &R, &kAlpha, dst); } } -void VP8YuvToRgb32(const uint8_t* y, const uint8_t* u, const uint8_t* v, - uint8_t* dst) { +void VP8YuvToArgb32(const uint8_t* y, const uint8_t* u, const uint8_t* v, + uint8_t* dst) { + const __m128i kAlpha = _mm_set1_epi16(255); int n; - uint8_t tmp0[2 * 3 + 5 + 15]; - uint8_t* const tmp = (uint8_t*)((uintptr_t)(tmp0 + 15) & ~15); // align - for (n = 0; n < 30; ++n) { // we directly stomp the *dst memory - YuvToRgbSSE2(y[n], u[n], v[n], dst + n * 3); + for (n = 0; n < 32; n += 8, dst += 32) { + __m128i R, G, B; + YUV444ToRGB(y + n, u + n, v + n, &R, &G, &B); + PackAndStore4(&kAlpha, &R, &G, &B, dst); } - // Last two pixels are special: we write in a tmp buffer before sending - // to dst. 
- YuvToRgbSSE2(y[n + 0], u[n + 0], v[n + 0], tmp + 0); - YuvToRgbSSE2(y[n + 1], u[n + 1], v[n + 1], tmp + 3); - memcpy(dst + n * 3, tmp, 2 * 3); } -void VP8YuvToBgr32(const uint8_t* y, const uint8_t* u, const uint8_t* v, - uint8_t* dst) { +void VP8YuvToRgba444432(const uint8_t* y, const uint8_t* u, const uint8_t* v, + uint8_t* dst) { + const __m128i kAlpha = _mm_set1_epi16(255); int n; - uint8_t tmp0[2 * 3 + 5 + 15]; - uint8_t* const tmp = (uint8_t*)((uintptr_t)(tmp0 + 15) & ~15); // align - for (n = 0; n < 30; ++n) { - YuvToBgrSSE2(y[n], u[n], v[n], dst + n * 3); + for (n = 0; n < 32; n += 8, dst += 16) { + __m128i R, G, B; + YUV444ToRGB(y + n, u + n, v + n, &R, &G, &B); + PackAndStore4444(&R, &G, &B, &kAlpha, dst); } - YuvToBgrSSE2(y[n + 0], u[n + 0], v[n + 0], tmp + 0); - YuvToBgrSSE2(y[n + 1], u[n + 1], v[n + 1], tmp + 3); - memcpy(dst + n * 3, tmp, 2 * 3); +} + +void VP8YuvToRgb56532(const uint8_t* y, const uint8_t* u, const uint8_t* v, + uint8_t* dst) { + int n; + for (n = 0; n < 32; n += 8, dst += 16) { + __m128i R, G, B; + YUV444ToRGB(y + n, u + n, v + n, &R, &G, &B); + PackAndStore565(&R, &G, &B, dst); + } +} + +void VP8YuvToRgb32(const uint8_t* y, const uint8_t* u, const uint8_t* v, + uint8_t* dst) { + __m128i R0, R1, R2, R3, G0, G1, G2, G3, B0, B1, B2, B3; + __m128i rgb[6]; + + YUV444ToRGB(y + 0, u + 0, v + 0, &R0, &G0, &B0); + YUV444ToRGB(y + 8, u + 8, v + 8, &R1, &G1, &B1); + YUV444ToRGB(y + 16, u + 16, v + 16, &R2, &G2, &B2); + YUV444ToRGB(y + 24, u + 24, v + 24, &R3, &G3, &B3); + + // Cast to 8b and store as RRRRGGGGBBBB. + rgb[0] = _mm_packus_epi16(R0, R1); + rgb[1] = _mm_packus_epi16(R2, R3); + rgb[2] = _mm_packus_epi16(G0, G1); + rgb[3] = _mm_packus_epi16(G2, G3); + rgb[4] = _mm_packus_epi16(B0, B1); + rgb[5] = _mm_packus_epi16(B2, B3); + + // Pack as RGBRGBRGBRGB. + PlanarTo24b(rgb, dst); +} + +void VP8YuvToBgr32(const uint8_t* y, const uint8_t* u, const uint8_t* v, + uint8_t* dst) { + __m128i R0, R1, R2, R3, G0, G1, G2, G3, B0, B1, B2, B3; + __m128i bgr[6]; + + YUV444ToRGB(y + 0, u + 0, v + 0, &R0, &G0, &B0); + YUV444ToRGB(y + 8, u + 8, v + 8, &R1, &G1, &B1); + YUV444ToRGB(y + 16, u + 16, v + 16, &R2, &G2, &B2); + YUV444ToRGB(y + 24, u + 24, v + 24, &R3, &G3, &B3); + + // Cast to 8b and store as BBBBGGGGRRRR. + bgr[0] = _mm_packus_epi16(B0, B1); + bgr[1] = _mm_packus_epi16(B2, B3); + bgr[2] = _mm_packus_epi16(G0, G1); + bgr[3] = _mm_packus_epi16(G2, G3); + bgr[4] = _mm_packus_epi16(R0, R1); + bgr[5] = _mm_packus_epi16(R2, R3); + + // Pack as BGRBGRBGRBGR. 
+ PlanarTo24b(bgr, dst); } //----------------------------------------------------------------------------- @@ -189,110 +311,137 @@ void VP8YuvToBgr32(const uint8_t* y, const uint8_t* u, const uint8_t* v, static void YuvToRgbaRow(const uint8_t* y, const uint8_t* u, const uint8_t* v, uint8_t* dst, int len) { + const __m128i kAlpha = _mm_set1_epi16(255); int n; - for (n = 0; n + 4 <= len; n += 4) { - const __m128i uv_0 = LoadUVPart(u[0], v[0]); - const __m128i uv_1 = LoadUVPart(u[1], v[1]); - const __m128i tmp0_1 = GetRGBA32bWithUV(y[0], uv_0); - const __m128i tmp0_2 = GetRGBA32bWithUV(y[1], uv_0); - const __m128i tmp0_3 = GetRGBA32bWithUV(y[2], uv_1); - const __m128i tmp0_4 = GetRGBA32bWithUV(y[3], uv_1); - const __m128i tmp1_1 = _mm_packs_epi32(tmp0_1, tmp0_2); - const __m128i tmp1_2 = _mm_packs_epi32(tmp0_3, tmp0_4); - const __m128i tmp2 = _mm_packus_epi16(tmp1_1, tmp1_2); - _mm_storeu_si128((__m128i*)dst, tmp2); - dst += 4 * 4; - y += 4; - u += 2; - v += 2; + for (n = 0; n + 8 <= len; n += 8, dst += 32) { + __m128i R, G, B; + YUV420ToRGB(y, u, v, &R, &G, &B); + PackAndStore4(&R, &G, &B, &kAlpha, dst); + y += 8; + u += 4; + v += 4; } - // Finish off - while (n < len) { + for (; n < len; ++n) { // Finish off VP8YuvToRgba(y[0], u[0], v[0], dst); dst += 4; - ++y; + y += 1; u += (n & 1); v += (n & 1); - ++n; } } static void YuvToBgraRow(const uint8_t* y, const uint8_t* u, const uint8_t* v, uint8_t* dst, int len) { + const __m128i kAlpha = _mm_set1_epi16(255); int n; - for (n = 0; n + 2 <= len; n += 2) { - const __m128i uv_0 = LoadUVPart(u[0], v[0]); - const __m128i tmp0_1 = GetRGBA32bWithUV(y[0], uv_0); - const __m128i tmp0_2 = GetRGBA32bWithUV(y[1], uv_0); - const __m128i tmp1_1 = _mm_shuffle_epi32(tmp0_1, _MM_SHUFFLE(3, 0, 1, 2)); - const __m128i tmp1_2 = _mm_shuffle_epi32(tmp0_2, _MM_SHUFFLE(3, 0, 1, 2)); - const __m128i tmp2_1 = _mm_packs_epi32(tmp1_1, tmp1_2); - const __m128i tmp3 = _mm_packus_epi16(tmp2_1, tmp2_1); - _mm_storel_epi64((__m128i*)dst, tmp3); - dst += 4 * 2; - y += 2; - ++u; - ++v; + for (n = 0; n + 8 <= len; n += 8, dst += 32) { + __m128i R, G, B; + YUV420ToRGB(y, u, v, &R, &G, &B); + PackAndStore4(&B, &G, &R, &kAlpha, dst); + y += 8; + u += 4; + v += 4; } - // Finish off - if (len & 1) { + for (; n < len; ++n) { // Finish off VP8YuvToBgra(y[0], u[0], v[0], dst); + dst += 4; + y += 1; + u += (n & 1); + v += (n & 1); } } static void YuvToArgbRow(const uint8_t* y, const uint8_t* u, const uint8_t* v, uint8_t* dst, int len) { + const __m128i kAlpha = _mm_set1_epi16(255); int n; - for (n = 0; n + 2 <= len; n += 2) { - const __m128i uv_0 = LoadUVPart(u[0], v[0]); - const __m128i tmp0_1 = GetRGBA32bWithUV(y[0], uv_0); - const __m128i tmp0_2 = GetRGBA32bWithUV(y[1], uv_0); - const __m128i tmp1_1 = _mm_shuffle_epi32(tmp0_1, _MM_SHUFFLE(2, 1, 0, 3)); - const __m128i tmp1_2 = _mm_shuffle_epi32(tmp0_2, _MM_SHUFFLE(2, 1, 0, 3)); - const __m128i tmp2_1 = _mm_packs_epi32(tmp1_1, tmp1_2); - const __m128i tmp3 = _mm_packus_epi16(tmp2_1, tmp2_1); - _mm_storel_epi64((__m128i*)dst, tmp3); - dst += 4 * 2; - y += 2; - ++u; - ++v; + for (n = 0; n + 8 <= len; n += 8, dst += 32) { + __m128i R, G, B; + YUV420ToRGB(y, u, v, &R, &G, &B); + PackAndStore4(&kAlpha, &R, &G, &B, dst); + y += 8; + u += 4; + v += 4; } - // Finish off - if (len & 1) { + for (; n < len; ++n) { // Finish off VP8YuvToArgb(y[0], u[0], v[0], dst); + dst += 4; + y += 1; + u += (n & 1); + v += (n & 1); } } static void YuvToRgbRow(const uint8_t* y, const uint8_t* u, const uint8_t* v, uint8_t* dst, int len) { int n; - for (n = 0; n + 
2 < len; ++n) { // we directly stomp the *dst memory - YuvToRgbSSE2(y[0], u[0], v[0], dst); // stomps 8 bytes + for (n = 0; n + 32 <= len; n += 32, dst += 32 * 3) { + __m128i R0, R1, R2, R3, G0, G1, G2, G3, B0, B1, B2, B3; + __m128i rgb[6]; + + YUV420ToRGB(y + 0, u + 0, v + 0, &R0, &G0, &B0); + YUV420ToRGB(y + 8, u + 4, v + 4, &R1, &G1, &B1); + YUV420ToRGB(y + 16, u + 8, v + 8, &R2, &G2, &B2); + YUV420ToRGB(y + 24, u + 12, v + 12, &R3, &G3, &B3); + + // Cast to 8b and store as RRRRGGGGBBBB. + rgb[0] = _mm_packus_epi16(R0, R1); + rgb[1] = _mm_packus_epi16(R2, R3); + rgb[2] = _mm_packus_epi16(G0, G1); + rgb[3] = _mm_packus_epi16(G2, G3); + rgb[4] = _mm_packus_epi16(B0, B1); + rgb[5] = _mm_packus_epi16(B2, B3); + + // Pack as RGBRGBRGBRGB. + PlanarTo24b(rgb, dst); + + y += 32; + u += 16; + v += 16; + } + for (; n < len; ++n) { // Finish off + VP8YuvToRgb(y[0], u[0], v[0], dst); dst += 3; - ++y; + y += 1; u += (n & 1); v += (n & 1); } - VP8YuvToRgb(y[0], u[0], v[0], dst); - if (len > 1) { - VP8YuvToRgb(y[1], u[n & 1], v[n & 1], dst + 3); - } } static void YuvToBgrRow(const uint8_t* y, const uint8_t* u, const uint8_t* v, uint8_t* dst, int len) { int n; - for (n = 0; n + 2 < len; ++n) { // we directly stomp the *dst memory - YuvToBgrSSE2(y[0], u[0], v[0], dst); // stomps 8 bytes + for (n = 0; n + 32 <= len; n += 32, dst += 32 * 3) { + __m128i R0, R1, R2, R3, G0, G1, G2, G3, B0, B1, B2, B3; + __m128i bgr[6]; + + YUV420ToRGB(y + 0, u + 0, v + 0, &R0, &G0, &B0); + YUV420ToRGB(y + 8, u + 4, v + 4, &R1, &G1, &B1); + YUV420ToRGB(y + 16, u + 8, v + 8, &R2, &G2, &B2); + YUV420ToRGB(y + 24, u + 12, v + 12, &R3, &G3, &B3); + + // Cast to 8b and store as BBBBGGGGRRRR. + bgr[0] = _mm_packus_epi16(B0, B1); + bgr[1] = _mm_packus_epi16(B2, B3); + bgr[2] = _mm_packus_epi16(G0, G1); + bgr[3] = _mm_packus_epi16(G2, G3); + bgr[4] = _mm_packus_epi16(R0, R1); + bgr[5] = _mm_packus_epi16(R2, R3); + + // Pack as BGRBGRBGRBGR. + PlanarTo24b(bgr, dst); + + y += 32; + u += 16; + v += 16; + } + for (; n < len; ++n) { // Finish off + VP8YuvToBgr(y[0], u[0], v[0], dst); dst += 3; - ++y; + y += 1; u += (n & 1); v += (n & 1); } - VP8YuvToBgr(y[0], u[0], v[0], dst + 0); - if (len > 1) { - VP8YuvToBgr(y[1], u[n & 1], v[n & 1], dst + 3); - } } //------------------------------------------------------------------------------ @@ -316,52 +465,36 @@ WEBP_TSAN_IGNORE_FUNCTION void WebPInitSamplersSSE2(void) { // Store either 16b-words into *dst #define STORE_16(V, dst) _mm_storeu_si128((__m128i*)(dst), (V)) -// Convert 8 packed RGB or BGR samples to r[], g[], b[] +// Function that inserts a value of the second half of the in buffer in between +// every two char of the first half. +static WEBP_INLINE void RGB24PackedToPlanarHelper( + const __m128i* const in /*in[6]*/, __m128i* const out /*out[6]*/) { + out[0] = _mm_unpacklo_epi8(in[0], in[3]); + out[1] = _mm_unpackhi_epi8(in[0], in[3]); + out[2] = _mm_unpacklo_epi8(in[1], in[4]); + out[3] = _mm_unpackhi_epi8(in[1], in[4]); + out[4] = _mm_unpacklo_epi8(in[2], in[5]); + out[5] = _mm_unpackhi_epi8(in[2], in[5]); +} + +// Unpack the 8b input rgbrgbrgbrgb ... as contiguous registers: +// rrrr... rrrr... gggg... gggg... bbbb... bbbb.... +// Similar to PlanarTo24bHelper(), but in reverse order. 
static WEBP_INLINE void RGB24PackedToPlanar(const uint8_t* const rgb, - __m128i* const r, - __m128i* const g, - __m128i* const b, - int input_is_bgr) { - const __m128i zero = _mm_setzero_si128(); - // in0: r0 g0 b0 r1 | g1 b1 r2 g2 | b2 r3 g3 b3 | r4 g4 b4 r5 - // in1: b2 r3 g3 b3 | r4 g4 b4 r5 | g5 b5 r6 g6 | b6 r7 g7 b7 - const __m128i in0 = LOAD_16(rgb + 0); - const __m128i in1 = LOAD_16(rgb + 8); - // A0: | r2 g2 b2 r3 | g3 b3 r4 g4 | b4 r5 ... - // A1: ... b2 r3 | g3 b3 r4 g4 | b4 r5 g5 b5 | - const __m128i A0 = _mm_srli_si128(in0, 6); - const __m128i A1 = _mm_slli_si128(in1, 6); - // B0: r0 r2 g0 g2 | b0 b2 r1 r3 | g1 g3 b1 b3 | r2 r4 b2 b4 - // B1: g3 g5 b3 b5 | r4 r6 g4 g6 | b4 b6 r5 r7 | g5 g7 b5 b7 - const __m128i B0 = _mm_unpacklo_epi8(in0, A0); - const __m128i B1 = _mm_unpackhi_epi8(A1, in1); - // C0: r1 r3 g1 g3 | b1 b3 r2 r4 | b2 b4 ... - // C1: ... g3 g5 | b3 b5 r4 r6 | g4 g6 b4 b6 - const __m128i C0 = _mm_srli_si128(B0, 6); - const __m128i C1 = _mm_slli_si128(B1, 6); - // D0: r0 r1 r2 r3 | g0 g1 g2 g3 | b0 b1 b2 b3 | r1 r2 r3 r4 - // D1: b3 b4 b5 b6 | r4 r5 r6 r7 | g4 g5 g6 g7 | b4 b5 b6 b7 | - const __m128i D0 = _mm_unpacklo_epi8(B0, C0); - const __m128i D1 = _mm_unpackhi_epi8(C1, B1); - // r4 r5 r6 r7 | g4 g5 g6 g7 | b4 b5 b6 b7 | 0 - const __m128i D2 = _mm_srli_si128(D1, 4); - // r0 r1 r2 r3 | r4 r5 r6 r7 | g0 g1 g2 g3 | g4 g5 g6 g7 - const __m128i E0 = _mm_unpacklo_epi32(D0, D2); - // b0 b1 b2 b3 | b4 b5 b6 b7 | r1 r2 r3 r4 | 0 - const __m128i E1 = _mm_unpackhi_epi32(D0, D2); - // g0 g1 g2 g3 | g4 g5 g6 g7 | 0 - const __m128i E2 = _mm_srli_si128(E0, 8); - const __m128i F0 = _mm_unpacklo_epi8(E0, zero); // -> R - const __m128i F1 = _mm_unpacklo_epi8(E1, zero); // -> B - const __m128i F2 = _mm_unpacklo_epi8(E2, zero); // -> G - *g = F2; - if (input_is_bgr) { - *r = F1; - *b = F0; - } else { - *r = F0; - *b = F1; - } + __m128i* const out /*out[6]*/) { + __m128i tmp[6]; + tmp[0] = _mm_loadu_si128((const __m128i*)(rgb + 0)); + tmp[1] = _mm_loadu_si128((const __m128i*)(rgb + 16)); + tmp[2] = _mm_loadu_si128((const __m128i*)(rgb + 32)); + tmp[3] = _mm_loadu_si128((const __m128i*)(rgb + 48)); + tmp[4] = _mm_loadu_si128((const __m128i*)(rgb + 64)); + tmp[5] = _mm_loadu_si128((const __m128i*)(rgb + 80)); + + RGB24PackedToPlanarHelper(tmp, out); + RGB24PackedToPlanarHelper(out, tmp); + RGB24PackedToPlanarHelper(tmp, out); + RGB24PackedToPlanarHelper(out, tmp); + RGB24PackedToPlanarHelper(tmp, out); } // Convert 8 packed ARGB to r[], g[], b[] @@ -445,15 +578,33 @@ static WEBP_INLINE void ConvertRGBToUV(const __m128i* const R, #undef TRANSFORM static void ConvertRGB24ToY(const uint8_t* rgb, uint8_t* y, int width) { - const int max_width = width & ~15; + const int max_width = width & ~31; int i; - for (i = 0; i < max_width; i += 16, rgb += 3 * 16) { - __m128i r, g, b, Y0, Y1; - RGB24PackedToPlanar(rgb + 0 * 8, &r, &g, &b, 0); - ConvertRGBToY(&r, &g, &b, &Y0); - RGB24PackedToPlanar(rgb + 3 * 8, &r, &g, &b, 0); - ConvertRGBToY(&r, &g, &b, &Y1); - STORE_16(_mm_packus_epi16(Y0, Y1), y + i); + for (i = 0; i < max_width; rgb += 3 * 16 * 2) { + __m128i rgb_plane[6]; + int j; + + RGB24PackedToPlanar(rgb, rgb_plane); + + for (j = 0; j < 2; ++j, i += 16) { + const __m128i zero = _mm_setzero_si128(); + __m128i r, g, b, Y0, Y1; + + // Convert to 16-bit Y. + r = _mm_unpacklo_epi8(rgb_plane[0 + j], zero); + g = _mm_unpacklo_epi8(rgb_plane[2 + j], zero); + b = _mm_unpacklo_epi8(rgb_plane[4 + j], zero); + ConvertRGBToY(&r, &g, &b, &Y0); + + // Convert to 16-bit Y. 
+ r = _mm_unpackhi_epi8(rgb_plane[0 + j], zero); + g = _mm_unpackhi_epi8(rgb_plane[2 + j], zero); + b = _mm_unpackhi_epi8(rgb_plane[4 + j], zero); + ConvertRGBToY(&r, &g, &b, &Y1); + + // Cast to 8-bit and store. + STORE_16(_mm_packus_epi16(Y0, Y1), y + i); + } } for (; i < width; ++i, rgb += 3) { // left-over y[i] = VP8RGBToY(rgb[0], rgb[1], rgb[2], YUV_HALF); @@ -461,15 +612,33 @@ static void ConvertRGB24ToY(const uint8_t* rgb, uint8_t* y, int width) { } static void ConvertBGR24ToY(const uint8_t* bgr, uint8_t* y, int width) { + const int max_width = width & ~31; int i; - const int max_width = width & ~15; - for (i = 0; i < max_width; i += 16, bgr += 3 * 16) { - __m128i r, g, b, Y0, Y1; - RGB24PackedToPlanar(bgr + 0 * 8, &r, &g, &b, 1); - ConvertRGBToY(&r, &g, &b, &Y0); - RGB24PackedToPlanar(bgr + 3 * 8, &r, &g, &b, 1); - ConvertRGBToY(&r, &g, &b, &Y1); - STORE_16(_mm_packus_epi16(Y0, Y1), y + i); + for (i = 0; i < max_width; bgr += 3 * 16 * 2) { + __m128i bgr_plane[6]; + int j; + + RGB24PackedToPlanar(bgr, bgr_plane); + + for (j = 0; j < 2; ++j, i += 16) { + const __m128i zero = _mm_setzero_si128(); + __m128i r, g, b, Y0, Y1; + + // Convert to 16-bit Y. + b = _mm_unpacklo_epi8(bgr_plane[0 + j], zero); + g = _mm_unpacklo_epi8(bgr_plane[2 + j], zero); + r = _mm_unpacklo_epi8(bgr_plane[4 + j], zero); + ConvertRGBToY(&r, &g, &b, &Y0); + + // Convert to 16-bit Y. + b = _mm_unpackhi_epi8(bgr_plane[0 + j], zero); + g = _mm_unpackhi_epi8(bgr_plane[2 + j], zero); + r = _mm_unpackhi_epi8(bgr_plane[4 + j], zero); + ConvertRGBToY(&r, &g, &b, &Y1); + + // Cast to 8-bit and store. + STORE_16(_mm_packus_epi16(Y0, Y1), y + i); + } } for (; i < width; ++i, bgr += 3) { // left-over y[i] = VP8RGBToY(bgr[2], bgr[1], bgr[0], YUV_HALF); |