diff options
Diffstat (limited to 'thirdparty/libwebp/dsp/yuv_sse2.c')
-rw-r--r-- | thirdparty/libwebp/dsp/yuv_sse2.c | 304 |
1 files changed, 196 insertions, 108 deletions
diff --git a/thirdparty/libwebp/dsp/yuv_sse2.c b/thirdparty/libwebp/dsp/yuv_sse2.c index e19bddff6c..e33c2bbafd 100644 --- a/thirdparty/libwebp/dsp/yuv_sse2.c +++ b/thirdparty/libwebp/dsp/yuv_sse2.c @@ -15,6 +15,8 @@ #if defined(WEBP_USE_SSE2) +#include "./common_sse2.h" +#include <stdlib.h> #include <emmintrin.h> //----------------------------------------------------------------------------- @@ -155,30 +157,13 @@ static WEBP_INLINE void PackAndStore565(const __m128i* const R, _mm_storeu_si128((__m128i*)dst, rgb565); } -// Function used several times in PlanarTo24b. -// It samples the in buffer as follows: one every two unsigned char is stored -// at the beginning of the buffer, while the other half is stored at the end. -static WEBP_INLINE void PlanarTo24bHelper(const __m128i* const in /*in[6]*/, - __m128i* const out /*out[6]*/) { - const __m128i v_mask = _mm_set1_epi16(0x00ff); - - // Take one every two upper 8b values. - out[0] = _mm_packus_epi16(_mm_and_si128(in[0], v_mask), - _mm_and_si128(in[1], v_mask)); - out[1] = _mm_packus_epi16(_mm_and_si128(in[2], v_mask), - _mm_and_si128(in[3], v_mask)); - out[2] = _mm_packus_epi16(_mm_and_si128(in[4], v_mask), - _mm_and_si128(in[5], v_mask)); - // Take one every two lower 8b values. - out[3] = _mm_packus_epi16(_mm_srli_epi16(in[0], 8), _mm_srli_epi16(in[1], 8)); - out[4] = _mm_packus_epi16(_mm_srli_epi16(in[2], 8), _mm_srli_epi16(in[3], 8)); - out[5] = _mm_packus_epi16(_mm_srli_epi16(in[4], 8), _mm_srli_epi16(in[5], 8)); -} - // Pack the planar buffers // rrrr... rrrr... gggg... gggg... bbbb... bbbb.... // triplet by triplet in the output buffer rgb as rgbrgbrgbrgb ... -static WEBP_INLINE void PlanarTo24b(__m128i* const in /*in[6]*/, uint8_t* rgb) { +static WEBP_INLINE void PlanarTo24b(__m128i* const in0, __m128i* const in1, + __m128i* const in2, __m128i* const in3, + __m128i* const in4, __m128i* const in5, + uint8_t* const rgb) { // The input is 6 registers of sixteen 8b but for the sake of explanation, // let's take 6 registers of four 8b values. // To pack, we will keep taking one every two 8b integer and move it @@ -191,22 +176,15 @@ static WEBP_INLINE void PlanarTo24b(__m128i* const in /*in[6]*/, uint8_t* rgb) { // Repeat the same permutations twice more: // r0r4g0g4 | b0b4r1r5 | g1g5b1b5 | r2r6g2g6 | b2b6r3r7 | g3g7b3b7 // r0g0b0r1 | g1b1r2g2 | b2r3g3b3 | r4g4b4r5 | g5b5r6g6 | b6r7g7b7 - __m128i tmp[6]; - PlanarTo24bHelper(in, tmp); - PlanarTo24bHelper(tmp, in); - PlanarTo24bHelper(in, tmp); - // We need to do it two more times than the example as we have sixteen bytes. - PlanarTo24bHelper(tmp, in); - PlanarTo24bHelper(in, tmp); - - _mm_storeu_si128((__m128i*)(rgb + 0), tmp[0]); - _mm_storeu_si128((__m128i*)(rgb + 16), tmp[1]); - _mm_storeu_si128((__m128i*)(rgb + 32), tmp[2]); - _mm_storeu_si128((__m128i*)(rgb + 48), tmp[3]); - _mm_storeu_si128((__m128i*)(rgb + 64), tmp[4]); - _mm_storeu_si128((__m128i*)(rgb + 80), tmp[5]); -} -#undef MK_UINT32 + VP8PlanarTo24b(in0, in1, in2, in3, in4, in5); + + _mm_storeu_si128((__m128i*)(rgb + 0), *in0); + _mm_storeu_si128((__m128i*)(rgb + 16), *in1); + _mm_storeu_si128((__m128i*)(rgb + 32), *in2); + _mm_storeu_si128((__m128i*)(rgb + 48), *in3); + _mm_storeu_si128((__m128i*)(rgb + 64), *in4); + _mm_storeu_si128((__m128i*)(rgb + 80), *in5); +} void VP8YuvToRgba32(const uint8_t* y, const uint8_t* u, const uint8_t* v, uint8_t* dst) { @@ -265,29 +243,29 @@ void VP8YuvToRgb56532(const uint8_t* y, const uint8_t* u, const uint8_t* v, void VP8YuvToRgb32(const uint8_t* y, const uint8_t* u, const uint8_t* v, uint8_t* dst) { __m128i R0, R1, R2, R3, G0, G1, G2, G3, B0, B1, B2, B3; - __m128i rgb[6]; + __m128i rgb0, rgb1, rgb2, rgb3, rgb4, rgb5; - YUV444ToRGB(y + 0, u + 0, v + 0, &R0, &G0, &B0); - YUV444ToRGB(y + 8, u + 8, v + 8, &R1, &G1, &B1); + YUV444ToRGB(y + 0, u + 0, v + 0, &R0, &G0, &B0); + YUV444ToRGB(y + 8, u + 8, v + 8, &R1, &G1, &B1); YUV444ToRGB(y + 16, u + 16, v + 16, &R2, &G2, &B2); YUV444ToRGB(y + 24, u + 24, v + 24, &R3, &G3, &B3); // Cast to 8b and store as RRRRGGGGBBBB. - rgb[0] = _mm_packus_epi16(R0, R1); - rgb[1] = _mm_packus_epi16(R2, R3); - rgb[2] = _mm_packus_epi16(G0, G1); - rgb[3] = _mm_packus_epi16(G2, G3); - rgb[4] = _mm_packus_epi16(B0, B1); - rgb[5] = _mm_packus_epi16(B2, B3); + rgb0 = _mm_packus_epi16(R0, R1); + rgb1 = _mm_packus_epi16(R2, R3); + rgb2 = _mm_packus_epi16(G0, G1); + rgb3 = _mm_packus_epi16(G2, G3); + rgb4 = _mm_packus_epi16(B0, B1); + rgb5 = _mm_packus_epi16(B2, B3); // Pack as RGBRGBRGBRGB. - PlanarTo24b(rgb, dst); + PlanarTo24b(&rgb0, &rgb1, &rgb2, &rgb3, &rgb4, &rgb5, dst); } void VP8YuvToBgr32(const uint8_t* y, const uint8_t* u, const uint8_t* v, uint8_t* dst) { __m128i R0, R1, R2, R3, G0, G1, G2, G3, B0, B1, B2, B3; - __m128i bgr[6]; + __m128i bgr0, bgr1, bgr2, bgr3, bgr4, bgr5; YUV444ToRGB(y + 0, u + 0, v + 0, &R0, &G0, &B0); YUV444ToRGB(y + 8, u + 8, v + 8, &R1, &G1, &B1); @@ -295,15 +273,15 @@ void VP8YuvToBgr32(const uint8_t* y, const uint8_t* u, const uint8_t* v, YUV444ToRGB(y + 24, u + 24, v + 24, &R3, &G3, &B3); // Cast to 8b and store as BBBBGGGGRRRR. - bgr[0] = _mm_packus_epi16(B0, B1); - bgr[1] = _mm_packus_epi16(B2, B3); - bgr[2] = _mm_packus_epi16(G0, G1); - bgr[3] = _mm_packus_epi16(G2, G3); - bgr[4] = _mm_packus_epi16(R0, R1); - bgr[5] = _mm_packus_epi16(R2, R3); + bgr0 = _mm_packus_epi16(B0, B1); + bgr1 = _mm_packus_epi16(B2, B3); + bgr2 = _mm_packus_epi16(G0, G1); + bgr3 = _mm_packus_epi16(G2, G3); + bgr4 = _mm_packus_epi16(R0, R1); + bgr5= _mm_packus_epi16(R2, R3); // Pack as BGRBGRBGRBGR. - PlanarTo24b(bgr, dst); + PlanarTo24b(&bgr0, &bgr1, &bgr2, &bgr3, &bgr4, &bgr5, dst); } //----------------------------------------------------------------------------- @@ -377,7 +355,7 @@ static void YuvToRgbRow(const uint8_t* y, const uint8_t* u, const uint8_t* v, int n; for (n = 0; n + 32 <= len; n += 32, dst += 32 * 3) { __m128i R0, R1, R2, R3, G0, G1, G2, G3, B0, B1, B2, B3; - __m128i rgb[6]; + __m128i rgb0, rgb1, rgb2, rgb3, rgb4, rgb5; YUV420ToRGB(y + 0, u + 0, v + 0, &R0, &G0, &B0); YUV420ToRGB(y + 8, u + 4, v + 4, &R1, &G1, &B1); @@ -385,15 +363,15 @@ static void YuvToRgbRow(const uint8_t* y, const uint8_t* u, const uint8_t* v, YUV420ToRGB(y + 24, u + 12, v + 12, &R3, &G3, &B3); // Cast to 8b and store as RRRRGGGGBBBB. - rgb[0] = _mm_packus_epi16(R0, R1); - rgb[1] = _mm_packus_epi16(R2, R3); - rgb[2] = _mm_packus_epi16(G0, G1); - rgb[3] = _mm_packus_epi16(G2, G3); - rgb[4] = _mm_packus_epi16(B0, B1); - rgb[5] = _mm_packus_epi16(B2, B3); + rgb0 = _mm_packus_epi16(R0, R1); + rgb1 = _mm_packus_epi16(R2, R3); + rgb2 = _mm_packus_epi16(G0, G1); + rgb3 = _mm_packus_epi16(G2, G3); + rgb4 = _mm_packus_epi16(B0, B1); + rgb5 = _mm_packus_epi16(B2, B3); // Pack as RGBRGBRGBRGB. - PlanarTo24b(rgb, dst); + PlanarTo24b(&rgb0, &rgb1, &rgb2, &rgb3, &rgb4, &rgb5, dst); y += 32; u += 16; @@ -413,7 +391,7 @@ static void YuvToBgrRow(const uint8_t* y, const uint8_t* u, const uint8_t* v, int n; for (n = 0; n + 32 <= len; n += 32, dst += 32 * 3) { __m128i R0, R1, R2, R3, G0, G1, G2, G3, B0, B1, B2, B3; - __m128i bgr[6]; + __m128i bgr0, bgr1, bgr2, bgr3, bgr4, bgr5; YUV420ToRGB(y + 0, u + 0, v + 0, &R0, &G0, &B0); YUV420ToRGB(y + 8, u + 4, v + 4, &R1, &G1, &B1); @@ -421,15 +399,15 @@ static void YuvToBgrRow(const uint8_t* y, const uint8_t* u, const uint8_t* v, YUV420ToRGB(y + 24, u + 12, v + 12, &R3, &G3, &B3); // Cast to 8b and store as BBBBGGGGRRRR. - bgr[0] = _mm_packus_epi16(B0, B1); - bgr[1] = _mm_packus_epi16(B2, B3); - bgr[2] = _mm_packus_epi16(G0, G1); - bgr[3] = _mm_packus_epi16(G2, G3); - bgr[4] = _mm_packus_epi16(R0, R1); - bgr[5] = _mm_packus_epi16(R2, R3); + bgr0 = _mm_packus_epi16(B0, B1); + bgr1 = _mm_packus_epi16(B2, B3); + bgr2 = _mm_packus_epi16(G0, G1); + bgr3 = _mm_packus_epi16(G2, G3); + bgr4 = _mm_packus_epi16(R0, R1); + bgr5 = _mm_packus_epi16(R2, R3); // Pack as BGRBGRBGRBGR. - PlanarTo24b(bgr, dst); + PlanarTo24b(&bgr0, &bgr1, &bgr2, &bgr3, &bgr4, &bgr5, dst); y += 32; u += 16; @@ -499,25 +477,19 @@ static WEBP_INLINE void RGB24PackedToPlanar(const uint8_t* const rgb, // Convert 8 packed ARGB to r[], g[], b[] static WEBP_INLINE void RGB32PackedToPlanar(const uint32_t* const argb, - __m128i* const r, - __m128i* const g, - __m128i* const b) { + __m128i* const rgb /*in[6]*/) { const __m128i zero = _mm_setzero_si128(); - const __m128i in0 = LOAD_16(argb + 0); // argb3 | argb2 | argb1 | argb0 - const __m128i in1 = LOAD_16(argb + 4); // argb7 | argb6 | argb5 | argb4 - // column-wise transpose - const __m128i A0 = _mm_unpacklo_epi8(in0, in1); - const __m128i A1 = _mm_unpackhi_epi8(in0, in1); - const __m128i B0 = _mm_unpacklo_epi8(A0, A1); - const __m128i B1 = _mm_unpackhi_epi8(A0, A1); - // C0 = g7 g6 ... g1 g0 | b7 b6 ... b1 b0 - // C1 = a7 a6 ... a1 a0 | r7 r6 ... r1 r0 - const __m128i C0 = _mm_unpacklo_epi8(B0, B1); - const __m128i C1 = _mm_unpackhi_epi8(B0, B1); - // store 16b - *r = _mm_unpacklo_epi8(C1, zero); - *g = _mm_unpackhi_epi8(C0, zero); - *b = _mm_unpacklo_epi8(C0, zero); + __m128i a0 = LOAD_16(argb + 0); + __m128i a1 = LOAD_16(argb + 4); + __m128i a2 = LOAD_16(argb + 8); + __m128i a3 = LOAD_16(argb + 12); + VP8L32bToPlanar(&a0, &a1, &a2, &a3); + rgb[0] = _mm_unpacklo_epi8(a1, zero); + rgb[1] = _mm_unpackhi_epi8(a1, zero); + rgb[2] = _mm_unpacklo_epi8(a2, zero); + rgb[3] = _mm_unpackhi_epi8(a2, zero); + rgb[4] = _mm_unpacklo_epi8(a3, zero); + rgb[5] = _mm_unpackhi_epi8(a3, zero); } // This macro computes (RG * MULT_RG + GB * MULT_GB + ROUNDER) >> DESCALE_FIX @@ -649,11 +621,10 @@ static void ConvertARGBToY(const uint32_t* argb, uint8_t* y, int width) { const int max_width = width & ~15; int i; for (i = 0; i < max_width; i += 16) { - __m128i r, g, b, Y0, Y1; - RGB32PackedToPlanar(&argb[i + 0], &r, &g, &b); - ConvertRGBToY(&r, &g, &b, &Y0); - RGB32PackedToPlanar(&argb[i + 8], &r, &g, &b); - ConvertRGBToY(&r, &g, &b, &Y1); + __m128i Y0, Y1, rgb[6]; + RGB32PackedToPlanar(&argb[i], rgb); + ConvertRGBToY(&rgb[0], &rgb[2], &rgb[4], &Y0); + ConvertRGBToY(&rgb[1], &rgb[3], &rgb[5], &Y1); STORE_16(_mm_packus_epi16(Y0, Y1), y + i); } for (; i < width; ++i) { // left-over @@ -678,20 +649,18 @@ static void ConvertARGBToUV(const uint32_t* argb, uint8_t* u, uint8_t* v, const int max_width = src_width & ~31; int i; for (i = 0; i < max_width; i += 32, u += 16, v += 16) { - __m128i r0, g0, b0, r1, g1, b1, U0, V0, U1, V1; - RGB32PackedToPlanar(&argb[i + 0], &r0, &g0, &b0); - RGB32PackedToPlanar(&argb[i + 8], &r1, &g1, &b1); - HorizontalAddPack(&r0, &r1, &r0); - HorizontalAddPack(&g0, &g1, &g0); - HorizontalAddPack(&b0, &b1, &b0); - ConvertRGBToUV(&r0, &g0, &b0, &U0, &V0); - - RGB32PackedToPlanar(&argb[i + 16], &r0, &g0, &b0); - RGB32PackedToPlanar(&argb[i + 24], &r1, &g1, &b1); - HorizontalAddPack(&r0, &r1, &r0); - HorizontalAddPack(&g0, &g1, &g0); - HorizontalAddPack(&b0, &b1, &b0); - ConvertRGBToUV(&r0, &g0, &b0, &U1, &V1); + __m128i rgb[6], U0, V0, U1, V1; + RGB32PackedToPlanar(&argb[i], rgb); + HorizontalAddPack(&rgb[0], &rgb[1], &rgb[0]); + HorizontalAddPack(&rgb[2], &rgb[3], &rgb[2]); + HorizontalAddPack(&rgb[4], &rgb[5], &rgb[4]); + ConvertRGBToUV(&rgb[0], &rgb[2], &rgb[4], &U0, &V0); + + RGB32PackedToPlanar(&argb[i + 16], rgb); + HorizontalAddPack(&rgb[0], &rgb[1], &rgb[0]); + HorizontalAddPack(&rgb[2], &rgb[3], &rgb[2]); + HorizontalAddPack(&rgb[4], &rgb[5], &rgb[4]); + ConvertRGBToUV(&rgb[0], &rgb[2], &rgb[4], &U1, &V1); U0 = _mm_packus_epi16(U0, U1); V0 = _mm_packus_epi16(V0, V1); @@ -767,9 +736,128 @@ WEBP_TSAN_IGNORE_FUNCTION void WebPInitConvertARGBToYUVSSE2(void) { WebPConvertRGBA32ToUV = ConvertRGBA32ToUV; } +//------------------------------------------------------------------------------ + +#define MAX_Y ((1 << 10) - 1) // 10b precision over 16b-arithmetic +static uint16_t clip_y(int v) { + return (v < 0) ? 0 : (v > MAX_Y) ? MAX_Y : (uint16_t)v; +} + +static uint64_t SharpYUVUpdateY_SSE2(const uint16_t* ref, const uint16_t* src, + uint16_t* dst, int len) { + uint64_t diff = 0; + uint32_t tmp[4]; + int i; + const __m128i zero = _mm_setzero_si128(); + const __m128i max = _mm_set1_epi16(MAX_Y); + const __m128i one = _mm_set1_epi16(1); + __m128i sum = zero; + + for (i = 0; i + 8 <= len; i += 8) { + const __m128i A = _mm_loadu_si128((const __m128i*)(ref + i)); + const __m128i B = _mm_loadu_si128((const __m128i*)(src + i)); + const __m128i C = _mm_loadu_si128((const __m128i*)(dst + i)); + const __m128i D = _mm_sub_epi16(A, B); // diff_y + const __m128i E = _mm_cmpgt_epi16(zero, D); // sign (-1 or 0) + const __m128i F = _mm_add_epi16(C, D); // new_y + const __m128i G = _mm_or_si128(E, one); // -1 or 1 + const __m128i H = _mm_max_epi16(_mm_min_epi16(F, max), zero); + const __m128i I = _mm_madd_epi16(D, G); // sum(abs(...)) + _mm_storeu_si128((__m128i*)(dst + i), H); + sum = _mm_add_epi32(sum, I); + } + _mm_storeu_si128((__m128i*)tmp, sum); + diff = tmp[3] + tmp[2] + tmp[1] + tmp[0]; + for (; i < len; ++i) { + const int diff_y = ref[i] - src[i]; + const int new_y = (int)dst[i] + diff_y; + dst[i] = clip_y(new_y); + diff += (uint64_t)abs(diff_y); + } + return diff; +} + +static void SharpYUVUpdateRGB_SSE2(const int16_t* ref, const int16_t* src, + int16_t* dst, int len) { + int i = 0; + for (i = 0; i + 8 <= len; i += 8) { + const __m128i A = _mm_loadu_si128((const __m128i*)(ref + i)); + const __m128i B = _mm_loadu_si128((const __m128i*)(src + i)); + const __m128i C = _mm_loadu_si128((const __m128i*)(dst + i)); + const __m128i D = _mm_sub_epi16(A, B); // diff_uv + const __m128i E = _mm_add_epi16(C, D); // new_uv + _mm_storeu_si128((__m128i*)(dst + i), E); + } + for (; i < len; ++i) { + const int diff_uv = ref[i] - src[i]; + dst[i] += diff_uv; + } +} + +static void SharpYUVFilterRow_SSE2(const int16_t* A, const int16_t* B, int len, + const uint16_t* best_y, uint16_t* out) { + int i; + const __m128i kCst8 = _mm_set1_epi16(8); + const __m128i max = _mm_set1_epi16(MAX_Y); + const __m128i zero = _mm_setzero_si128(); + for (i = 0; i + 8 <= len; i += 8) { + const __m128i a0 = _mm_loadu_si128((const __m128i*)(A + i + 0)); + const __m128i a1 = _mm_loadu_si128((const __m128i*)(A + i + 1)); + const __m128i b0 = _mm_loadu_si128((const __m128i*)(B + i + 0)); + const __m128i b1 = _mm_loadu_si128((const __m128i*)(B + i + 1)); + const __m128i a0b1 = _mm_add_epi16(a0, b1); + const __m128i a1b0 = _mm_add_epi16(a1, b0); + const __m128i a0a1b0b1 = _mm_add_epi16(a0b1, a1b0); // A0+A1+B0+B1 + const __m128i a0a1b0b1_8 = _mm_add_epi16(a0a1b0b1, kCst8); + const __m128i a0b1_2 = _mm_add_epi16(a0b1, a0b1); // 2*(A0+B1) + const __m128i a1b0_2 = _mm_add_epi16(a1b0, a1b0); // 2*(A1+B0) + const __m128i c0 = _mm_srai_epi16(_mm_add_epi16(a0b1_2, a0a1b0b1_8), 3); + const __m128i c1 = _mm_srai_epi16(_mm_add_epi16(a1b0_2, a0a1b0b1_8), 3); + const __m128i d0 = _mm_add_epi16(c1, a0); + const __m128i d1 = _mm_add_epi16(c0, a1); + const __m128i e0 = _mm_srai_epi16(d0, 1); + const __m128i e1 = _mm_srai_epi16(d1, 1); + const __m128i f0 = _mm_unpacklo_epi16(e0, e1); + const __m128i f1 = _mm_unpackhi_epi16(e0, e1); + const __m128i g0 = _mm_loadu_si128((const __m128i*)(best_y + 2 * i + 0)); + const __m128i g1 = _mm_loadu_si128((const __m128i*)(best_y + 2 * i + 8)); + const __m128i h0 = _mm_add_epi16(g0, f0); + const __m128i h1 = _mm_add_epi16(g1, f1); + const __m128i i0 = _mm_max_epi16(_mm_min_epi16(h0, max), zero); + const __m128i i1 = _mm_max_epi16(_mm_min_epi16(h1, max), zero); + _mm_storeu_si128((__m128i*)(out + 2 * i + 0), i0); + _mm_storeu_si128((__m128i*)(out + 2 * i + 8), i1); + } + for (; i < len; ++i) { + // (9 * A0 + 3 * A1 + 3 * B0 + B1 + 8) >> 4 = + // = (8 * A0 + 2 * (A1 + B0) + (A0 + A1 + B0 + B1 + 8)) >> 4 + // We reuse the common sub-expressions. + const int a0b1 = A[i + 0] + B[i + 1]; + const int a1b0 = A[i + 1] + B[i + 0]; + const int a0a1b0b1 = a0b1 + a1b0 + 8; + const int v0 = (8 * A[i + 0] + 2 * a1b0 + a0a1b0b1) >> 4; + const int v1 = (8 * A[i + 1] + 2 * a0b1 + a0a1b0b1) >> 4; + out[2 * i + 0] = clip_y(best_y[2 * i + 0] + v0); + out[2 * i + 1] = clip_y(best_y[2 * i + 1] + v1); + } +} + +#undef MAX_Y + +//------------------------------------------------------------------------------ + +extern void WebPInitSharpYUVSSE2(void); + +WEBP_TSAN_IGNORE_FUNCTION void WebPInitSharpYUVSSE2(void) { + WebPSharpYUVUpdateY = SharpYUVUpdateY_SSE2; + WebPSharpYUVUpdateRGB = SharpYUVUpdateRGB_SSE2; + WebPSharpYUVFilterRow = SharpYUVFilterRow_SSE2; +} + #else // !WEBP_USE_SSE2 WEBP_DSP_INIT_STUB(WebPInitSamplersSSE2) WEBP_DSP_INIT_STUB(WebPInitConvertARGBToYUVSSE2) +WEBP_DSP_INIT_STUB(WebPInitSharpYUVSSE2) #endif // WEBP_USE_SSE2 |