path: root/drivers/webp/dsp
author     Juan Linietsky <reduzio@gmail.com>  2015-12-04 10:18:28 -0300
committer  Juan Linietsky <reduzio@gmail.com>  2015-12-04 10:18:28 -0300
commit     da113fe40d0a9410859912473d53e43903dc6c8e (patch)
tree       23c6019a28a11d67241789721d1feecdd19410e6  /drivers/webp/dsp
parent     064fd762fae75371658e773a3acf39616e813b08 (diff)
-Upgraded webp to a MUCH newer version. Hoping it fixes some bugs in the process. Keeping the old version just in case for now.
-Added the ability to convert xml and tscn scenes to binary on export, which makes loading of larger scenes faster.
Diffstat (limited to 'drivers/webp/dsp')
-rw-r--r--   drivers/webp/dsp/cpu.c                  95
-rw-r--r--   drivers/webp/dsp/dec.c                 338
-rw-r--r--   drivers/webp/dsp/dec_neon.c           1478
-rw-r--r--   drivers/webp/dsp/dec_sse2.c           1056
-rw-r--r--   drivers/webp/dsp/dsp.h                 368
-rw-r--r--   drivers/webp/dsp/enc.c                 347
-rw-r--r--   drivers/webp/dsp/enc_sse2.c           1364
-rw-r--r--   drivers/webp/dsp/lossless.c           1013
-rw-r--r--   drivers/webp/dsp/lossless.h            292
-rw-r--r--   drivers/webp/dsp/upsampling.c          259
-rw-r--r--   drivers/webp/dsp/upsampling_sse2.c     196
-rw-r--r--   drivers/webp/dsp/yuv.c                 264
-rw-r--r--   drivers/webp/dsp/yuv.h                 287
13 files changed, 5123 insertions, 2234 deletions
diff --git a/drivers/webp/dsp/cpu.c b/drivers/webp/dsp/cpu.c
index 0228734457..35c2af7f58 100644
--- a/drivers/webp/dsp/cpu.c
+++ b/drivers/webp/dsp/cpu.c
@@ -1,8 +1,10 @@
// Copyright 2011 Google Inc. All Rights Reserved.
//
-// This code is licensed under the same terms as WebM:
-// Software License Agreement: http://www.webmproject.org/license/software/
-// Additional IP Rights Grant: http://www.webmproject.org/license/additional/
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// CPU detection
@@ -11,14 +13,10 @@
#include "./dsp.h"
-#if defined(__ANDROID__)
+#if defined(WEBP_ANDROID_NEON)
#include <cpu-features.h>
#endif
-#if defined(__cplusplus) || defined(c_plusplus)
-extern "C" {
-#endif
-
//------------------------------------------------------------------------------
// SSE2 detection.
//
@@ -31,22 +29,66 @@ static WEBP_INLINE void GetCPUInfo(int cpu_info[4], int info_type) {
"cpuid\n"
"xchg %%edi, %%ebx\n"
: "=a"(cpu_info[0]), "=D"(cpu_info[1]), "=c"(cpu_info[2]), "=d"(cpu_info[3])
- : "a"(info_type));
+ : "a"(info_type), "c"(0));
}
#elif defined(__i386__) || defined(__x86_64__)
static WEBP_INLINE void GetCPUInfo(int cpu_info[4], int info_type) {
__asm__ volatile (
"cpuid\n"
: "=a"(cpu_info[0]), "=b"(cpu_info[1]), "=c"(cpu_info[2]), "=d"(cpu_info[3])
- : "a"(info_type));
+ : "a"(info_type), "c"(0));
}
+#elif (defined(_M_X64) || defined(_M_IX86)) && \
+ defined(_MSC_FULL_VER) && _MSC_FULL_VER >= 150030729 // >= VS2008 SP1
+#include <intrin.h>
+#define GetCPUInfo(info, type) __cpuidex(info, type, 0) // set ecx=0
#elif defined(WEBP_MSC_SSE2)
#define GetCPUInfo __cpuid
#endif
+// NaCl has no support for xgetbv or the raw opcode.
+#if !defined(__native_client__) && (defined(__i386__) || defined(__x86_64__))
+static WEBP_INLINE uint64_t xgetbv(void) {
+ const uint32_t ecx = 0;
+ uint32_t eax, edx;
+ // Use the raw opcode for xgetbv for compatibility with older toolchains.
+ __asm__ volatile (
+ ".byte 0x0f, 0x01, 0xd0\n"
+ : "=a"(eax), "=d"(edx) : "c" (ecx));
+ return ((uint64_t)edx << 32) | eax;
+}
+#elif (defined(_M_X64) || defined(_M_IX86)) && \
+ defined(_MSC_FULL_VER) && _MSC_FULL_VER >= 160040219 // >= VS2010 SP1
+#include <immintrin.h>
+#define xgetbv() _xgetbv(0)
+#elif defined(_MSC_VER) && defined(_M_IX86)
+static WEBP_INLINE uint64_t xgetbv(void) {
+ uint32_t eax_, edx_;
+ __asm {
+ xor ecx, ecx // ecx = 0
+ // Use the raw opcode for xgetbv for compatibility with older toolchains.
+ __asm _emit 0x0f __asm _emit 0x01 __asm _emit 0xd0
+ mov eax_, eax
+ mov edx_, edx
+ }
+ return ((uint64_t)edx_ << 32) | eax_;
+}
+#else
+#define xgetbv() 0U // no AVX for older x64 or unrecognized toolchains.
+#endif
+
#if defined(__i386__) || defined(__x86_64__) || defined(WEBP_MSC_SSE2)
static int x86CPUInfo(CPUFeature feature) {
+ int max_cpuid_value;
int cpu_info[4];
+
+ // get the highest feature value cpuid supports
+ GetCPUInfo(cpu_info, 0);
+ max_cpuid_value = cpu_info[0];
+ if (max_cpuid_value < 1) {
+ return 0;
+ }
+
GetCPUInfo(cpu_info, 1);
if (feature == kSSE2) {
return 0 != (cpu_info[3] & 0x04000000);
@@ -54,10 +96,26 @@ static int x86CPUInfo(CPUFeature feature) {
if (feature == kSSE3) {
return 0 != (cpu_info[2] & 0x00000001);
}
+ if (feature == kSSE4_1) {
+ return 0 != (cpu_info[2] & 0x00080000);
+ }
+ if (feature == kAVX) {
+ // bits 27 (OSXSAVE) & 28 (256-bit AVX)
+ if ((cpu_info[2] & 0x18000000) == 0x18000000) {
+ // XMM state and YMM state enabled by the OS.
+ return (xgetbv() & 0x6) == 0x6;
+ }
+ }
+ if (feature == kAVX2) {
+ if (x86CPUInfo(kAVX) && max_cpuid_value >= 7) {
+ GetCPUInfo(cpu_info, 7);
+ return ((cpu_info[1] & 0x00000020) == 0x00000020);
+ }
+ }
return 0;
}
VP8CPUInfo VP8GetCPUInfo = x86CPUInfo;
-#elif defined(WEBP_ANDROID_NEON)
+#elif defined(WEBP_ANDROID_NEON) // NB: needs to be before generic NEON test.
static int AndroidCPUInfo(CPUFeature feature) {
const AndroidCpuFamily cpu_family = android_getCpuFamily();
const uint64_t cpu_features = android_getCpuFeatures();
@@ -68,7 +126,7 @@ static int AndroidCPUInfo(CPUFeature feature) {
return 0;
}
VP8CPUInfo VP8GetCPUInfo = AndroidCPUInfo;
-#elif defined(__ARM_NEON__)
+#elif defined(WEBP_USE_NEON)
// define a dummy function to enable turning off NEON at runtime by setting
// VP8DecGetCPUInfo = NULL
static int armCPUInfo(CPUFeature feature) {
@@ -76,10 +134,17 @@ static int armCPUInfo(CPUFeature feature) {
return 1;
}
VP8CPUInfo VP8GetCPUInfo = armCPUInfo;
+#elif defined(WEBP_USE_MIPS32) || defined(WEBP_USE_MIPS_DSP_R2)
+static int mipsCPUInfo(CPUFeature feature) {
+ if ((feature == kMIPS32) || (feature == kMIPSdspR2)) {
+ return 1;
+ } else {
+ return 0;
+ }
+
+}
+VP8CPUInfo VP8GetCPUInfo = mipsCPUInfo;
#else
VP8CPUInfo VP8GetCPUInfo = NULL;
#endif
-#if defined(__cplusplus) || defined(c_plusplus)
-} // extern "C"
-#endif
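
Editorial aside (not part of this commit): the x86 path added above boils down to the following standalone check, written here against GCC/Clang's <cpuid.h> helper instead of the hand-rolled inline assembly. The bit positions and the xgetbv() & 0x6 test are the same ones x86CPUInfo() uses.

// Editor's sketch, x86 + GCC/Clang only -- mirrors the detection logic in the
// patched cpu.c using __get_cpuid() from <cpuid.h>.
#include <cpuid.h>
#include <stdint.h>
#include <stdio.h>

static uint64_t read_xcr0(void) {            // same raw xgetbv opcode as above
  uint32_t eax, edx;
  __asm__ volatile (".byte 0x0f, 0x01, 0xd0" : "=a"(eax), "=d"(edx) : "c"(0));
  return ((uint64_t)edx << 32) | eax;
}

int main(void) {
  unsigned eax, ebx, ecx, edx;
  if (!__get_cpuid(1, &eax, &ebx, &ecx, &edx)) return 1;   // leaf 1 unsupported
  const int sse2  = !!(edx & (1u << 26));                  // 0x04000000
  const int sse41 = !!(ecx & (1u << 19));                  // 0x00080000
  // AVX needs CPUID bits 27 (OSXSAVE) and 28 (AVX), plus XMM+YMM state
  // enabled by the OS (XCR0 bits 1 and 2, i.e. xgetbv() & 0x6 == 0x6).
  const int avx = ((ecx & 0x18000000u) == 0x18000000u) &&
                  ((read_xcr0() & 0x6) == 0x6);
  printf("SSE2:%d SSE4.1:%d AVX:%d\n", sse2, sse41, avx);
  return 0;
}
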
diff --git a/drivers/webp/dsp/dec.c b/drivers/webp/dsp/dec.c
index 9ae7b6fa76..77a00381c5 100644
--- a/drivers/webp/dsp/dec.c
+++ b/drivers/webp/dsp/dec.c
@@ -1,53 +1,20 @@
// Copyright 2010 Google Inc. All Rights Reserved.
//
-// This code is licensed under the same terms as WebM:
-// Software License Agreement: http://www.webmproject.org/license/software/
-// Additional IP Rights Grant: http://www.webmproject.org/license/additional/
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
-// Speed-critical decoding functions.
+// Speed-critical decoding functions, default plain-C implementations.
//
// Author: Skal (pascal.massimino@gmail.com)
#include "./dsp.h"
#include "../dec/vp8i.h"
-#if defined(__cplusplus) || defined(c_plusplus)
-extern "C" {
-#endif
-
//------------------------------------------------------------------------------
-// run-time tables (~4k)
-
-static uint8_t abs0[255 + 255 + 1]; // abs(i)
-static uint8_t abs1[255 + 255 + 1]; // abs(i)>>1
-static int8_t sclip1[1020 + 1020 + 1]; // clips [-1020, 1020] to [-128, 127]
-static int8_t sclip2[112 + 112 + 1]; // clips [-112, 112] to [-16, 15]
-static uint8_t clip1[255 + 510 + 1]; // clips [-255,510] to [0,255]
-
-// We declare this variable 'volatile' to prevent instruction reordering
-// and make sure it's set to true _last_ (so as to be thread-safe)
-static volatile int tables_ok = 0;
-
-static void DspInitTables(void) {
- if (!tables_ok) {
- int i;
- for (i = -255; i <= 255; ++i) {
- abs0[255 + i] = (i < 0) ? -i : i;
- abs1[255 + i] = abs0[255 + i] >> 1;
- }
- for (i = -1020; i <= 1020; ++i) {
- sclip1[1020 + i] = (i < -128) ? -128 : (i > 127) ? 127 : i;
- }
- for (i = -112; i <= 112; ++i) {
- sclip2[112 + i] = (i < -16) ? -16 : (i > 15) ? 15 : i;
- }
- for (i = -255; i <= 255 + 255; ++i) {
- clip1[255 + i] = (i < 0) ? 0 : (i > 255) ? 255 : i;
- }
- tables_ok = 1;
- }
-}
static WEBP_INLINE uint8_t clip_8b(int v) {
return (!(v & ~0xff)) ? v : (v < 0) ? 0 : 255;
@@ -59,9 +26,16 @@ static WEBP_INLINE uint8_t clip_8b(int v) {
#define STORE(x, y, v) \
dst[x + y * BPS] = clip_8b(dst[x + y * BPS] + ((v) >> 3))
-static const int kC1 = 20091 + (1 << 16);
-static const int kC2 = 35468;
-#define MUL(a, b) (((a) * (b)) >> 16)
+#define STORE2(y, dc, d, c) do { \
+ const int DC = (dc); \
+ STORE(0, y, DC + (d)); \
+ STORE(1, y, DC + (c)); \
+ STORE(2, y, DC - (c)); \
+ STORE(3, y, DC - (d)); \
+} while (0)
+
+#define MUL1(a) ((((a) * 20091) >> 16) + (a))
+#define MUL2(a) (((a) * 35468) >> 16)
static void TransformOne(const int16_t* in, uint8_t* dst) {
int C[4 * 4], *tmp;
@@ -70,8 +44,8 @@ static void TransformOne(const int16_t* in, uint8_t* dst) {
for (i = 0; i < 4; ++i) { // vertical pass
const int a = in[0] + in[8]; // [-4096, 4094]
const int b = in[0] - in[8]; // [-4095, 4095]
- const int c = MUL(in[4], kC2) - MUL(in[12], kC1); // [-3783, 3783]
- const int d = MUL(in[4], kC1) + MUL(in[12], kC2); // [-3785, 3781]
+ const int c = MUL2(in[4]) - MUL1(in[12]); // [-3783, 3783]
+ const int d = MUL1(in[4]) + MUL2(in[12]); // [-3785, 3781]
tmp[0] = a + d; // [-7881, 7875]
tmp[1] = b + c; // [-7878, 7878]
tmp[2] = b - c; // [-7878, 7878]
@@ -80,7 +54,7 @@ static void TransformOne(const int16_t* in, uint8_t* dst) {
in++;
}
// Each pass is expanding the dynamic range by ~3.85 (upper bound).
- // The exact value is (2. + (kC1 + kC2) / 65536).
+ // The exact value is (2. + (20091 + 35468) / 65536).
// After the second pass, maximum interval is [-3794, 3794], assuming
// an input in [-2048, 2047] interval. We then need to add a dst value
// in the [0, 255] range.
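
Editorial aside (not part of the diff): the two magic numbers referenced in the comment above are VP8's inverse-DCT multipliers in 16.16 fixed point. MUL1(a) approximates a * sqrt(2) * cos(pi/8); the old kC1 carried an extra 1 << 16, which MUL1 now adds back explicitly as "+ (a)". MUL2(a) approximates a * sqrt(2) * sin(pi/8). A quick standalone check:

// Editor's check (standalone, not libwebp code): derive 20091 and 35468.
//   MUL1(a) = (a) + ((a * 20091) >> 16)  ~=  a * sqrt(2) * cos(pi/8)  (~1.30656)
//   MUL2(a) =       ((a * 35468) >> 16)  ~=  a * sqrt(2) * sin(pi/8)  (~0.54120)
#include <math.h>
#include <stdio.h>

int main(void) {
  const double kPi = 3.14159265358979323846;
  const double c = sqrt(2.0) * cos(kPi / 8.0);          // 1.30656...
  const double s = sqrt(2.0) * sin(kPi / 8.0);          // 0.54120...
  printf("%d %d\n", (int)((c - 1.0) * 65536.0 + 0.5),   // prints 20091
                    (int)(s * 65536.0 + 0.5));          // prints 35468
  return 0;                                             // build with -lm
}
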
@@ -91,8 +65,8 @@ static void TransformOne(const int16_t* in, uint8_t* dst) {
const int dc = tmp[0] + 4;
const int a = dc + tmp[8];
const int b = dc - tmp[8];
- const int c = MUL(tmp[4], kC2) - MUL(tmp[12], kC1);
- const int d = MUL(tmp[4], kC1) + MUL(tmp[12], kC2);
+ const int c = MUL2(tmp[4]) - MUL1(tmp[12]);
+ const int d = MUL1(tmp[4]) + MUL2(tmp[12]);
STORE(0, 0, a + d);
STORE(1, 0, b + c);
STORE(2, 0, b - c);
@@ -101,7 +75,22 @@ static void TransformOne(const int16_t* in, uint8_t* dst) {
dst += BPS;
}
}
-#undef MUL
+
+// Simplified transform when only in[0], in[1] and in[4] are non-zero
+static void TransformAC3(const int16_t* in, uint8_t* dst) {
+ const int a = in[0] + 4;
+ const int c4 = MUL2(in[4]);
+ const int d4 = MUL1(in[4]);
+ const int c1 = MUL2(in[1]);
+ const int d1 = MUL1(in[1]);
+ STORE2(0, a + d4, d1, c1);
+ STORE2(1, a + c4, d1, c1);
+ STORE2(2, a - c4, d1, c1);
+ STORE2(3, a - d4, d1, c1);
+}
+#undef MUL1
+#undef MUL2
+#undef STORE2
static void TransformTwo(const int16_t* in, uint8_t* dst, int do_two) {
TransformOne(in, dst);
@@ -115,7 +104,7 @@ static void TransformUV(const int16_t* in, uint8_t* dst) {
VP8Transform(in + 2 * 16, dst + 4 * BPS, 1);
}
-static void TransformDC(const int16_t *in, uint8_t* dst) {
+static void TransformDC(const int16_t* in, uint8_t* dst) {
const int DC = in[0] + 4;
int i, j;
for (j = 0; j < 4; ++j) {
@@ -126,10 +115,10 @@ static void TransformDC(const int16_t *in, uint8_t* dst) {
}
static void TransformDCUV(const int16_t* in, uint8_t* dst) {
- if (in[0 * 16]) TransformDC(in + 0 * 16, dst);
- if (in[1 * 16]) TransformDC(in + 1 * 16, dst + 4);
- if (in[2 * 16]) TransformDC(in + 2 * 16, dst + 4 * BPS);
- if (in[3 * 16]) TransformDC(in + 3 * 16, dst + 4 * BPS + 4);
+ if (in[0 * 16]) VP8TransformDC(in + 0 * 16, dst);
+ if (in[1 * 16]) VP8TransformDC(in + 1 * 16, dst + 4);
+ if (in[2 * 16]) VP8TransformDC(in + 2 * 16, dst + 4 * BPS);
+ if (in[3 * 16]) VP8TransformDC(in + 3 * 16, dst + 4 * BPS + 4);
}
#undef STORE
@@ -164,16 +153,16 @@ static void TransformWHT(const int16_t* in, int16_t* out) {
}
}
-void (*VP8TransformWHT)(const int16_t* in, int16_t* out) = TransformWHT;
+void (*VP8TransformWHT)(const int16_t* in, int16_t* out);
//------------------------------------------------------------------------------
// Intra predictions
#define DST(x, y) dst[(x) + (y) * BPS]
-static WEBP_INLINE void TrueMotion(uint8_t *dst, int size) {
+static WEBP_INLINE void TrueMotion(uint8_t* dst, int size) {
const uint8_t* top = dst - BPS;
- const uint8_t* const clip0 = clip1 + 255 - top[-1];
+ const uint8_t* const clip0 = VP8kclip1 - top[-1];
int y;
for (y = 0; y < size; ++y) {
const uint8_t* const clip = clip0 + dst[-1];
@@ -184,21 +173,21 @@ static WEBP_INLINE void TrueMotion(uint8_t *dst, int size) {
dst += BPS;
}
}
-static void TM4(uint8_t *dst) { TrueMotion(dst, 4); }
-static void TM8uv(uint8_t *dst) { TrueMotion(dst, 8); }
-static void TM16(uint8_t *dst) { TrueMotion(dst, 16); }
+static void TM4(uint8_t* dst) { TrueMotion(dst, 4); }
+static void TM8uv(uint8_t* dst) { TrueMotion(dst, 8); }
+static void TM16(uint8_t* dst) { TrueMotion(dst, 16); }
//------------------------------------------------------------------------------
// 16x16
-static void VE16(uint8_t *dst) { // vertical
+static void VE16(uint8_t* dst) { // vertical
int j;
for (j = 0; j < 16; ++j) {
memcpy(dst + j * BPS, dst - BPS, 16);
}
}
-static void HE16(uint8_t *dst) { // horizontal
+static void HE16(uint8_t* dst) { // horizontal
int j;
for (j = 16; j > 0; --j) {
memset(dst, dst[-1], 16);
@@ -213,7 +202,7 @@ static WEBP_INLINE void Put16(int v, uint8_t* dst) {
}
}
-static void DC16(uint8_t *dst) { // DC
+static void DC16(uint8_t* dst) { // DC
int DC = 16;
int j;
for (j = 0; j < 16; ++j) {
@@ -222,7 +211,7 @@ static void DC16(uint8_t *dst) { // DC
Put16(DC >> 5, dst);
}
-static void DC16NoTop(uint8_t *dst) { // DC with top samples not available
+static void DC16NoTop(uint8_t* dst) { // DC with top samples not available
int DC = 8;
int j;
for (j = 0; j < 16; ++j) {
@@ -231,7 +220,7 @@ static void DC16NoTop(uint8_t *dst) { // DC with top samples not available
Put16(DC >> 4, dst);
}
-static void DC16NoLeft(uint8_t *dst) { // DC with left samples not available
+static void DC16NoLeft(uint8_t* dst) { // DC with left samples not available
int DC = 8;
int i;
for (i = 0; i < 16; ++i) {
@@ -240,17 +229,19 @@ static void DC16NoLeft(uint8_t *dst) { // DC with left samples not available
Put16(DC >> 4, dst);
}
-static void DC16NoTopLeft(uint8_t *dst) { // DC with no top and left samples
+static void DC16NoTopLeft(uint8_t* dst) { // DC with no top and left samples
Put16(0x80, dst);
}
+VP8PredFunc VP8PredLuma16[NUM_B_DC_MODES];
+
//------------------------------------------------------------------------------
// 4x4
#define AVG3(a, b, c) (((a) + 2 * (b) + (c) + 2) >> 2)
#define AVG2(a, b) (((a) + (b) + 1) >> 1)
-static void VE4(uint8_t *dst) { // vertical
+static void VE4(uint8_t* dst) { // vertical
const uint8_t* top = dst - BPS;
const uint8_t vals[4] = {
AVG3(top[-1], top[0], top[1]),
@@ -264,7 +255,7 @@ static void VE4(uint8_t *dst) { // vertical
}
}
-static void HE4(uint8_t *dst) { // horizontal
+static void HE4(uint8_t* dst) { // horizontal
const int A = dst[-1 - BPS];
const int B = dst[-1];
const int C = dst[-1 + BPS];
@@ -276,7 +267,7 @@ static void HE4(uint8_t *dst) { // horizontal
*(uint32_t*)(dst + 3 * BPS) = 0x01010101U * AVG3(D, E, E);
}
-static void DC4(uint8_t *dst) { // DC
+static void DC4(uint8_t* dst) { // DC
uint32_t dc = 4;
int i;
for (i = 0; i < 4; ++i) dc += dst[i - BPS] + dst[-1 + i * BPS];
@@ -284,7 +275,7 @@ static void DC4(uint8_t *dst) { // DC
for (i = 0; i < 4; ++i) memset(dst + i * BPS, dc, 4);
}
-static void RD4(uint8_t *dst) { // Down-right
+static void RD4(uint8_t* dst) { // Down-right
const int I = dst[-1 + 0 * BPS];
const int J = dst[-1 + 1 * BPS];
const int K = dst[-1 + 2 * BPS];
@@ -295,15 +286,15 @@ static void RD4(uint8_t *dst) { // Down-right
const int C = dst[2 - BPS];
const int D = dst[3 - BPS];
DST(0, 3) = AVG3(J, K, L);
- DST(0, 2) = DST(1, 3) = AVG3(I, J, K);
- DST(0, 1) = DST(1, 2) = DST(2, 3) = AVG3(X, I, J);
- DST(0, 0) = DST(1, 1) = DST(2, 2) = DST(3, 3) = AVG3(A, X, I);
- DST(1, 0) = DST(2, 1) = DST(3, 2) = AVG3(B, A, X);
- DST(2, 0) = DST(3, 1) = AVG3(C, B, A);
- DST(3, 0) = AVG3(D, C, B);
+ DST(1, 3) = DST(0, 2) = AVG3(I, J, K);
+ DST(2, 3) = DST(1, 2) = DST(0, 1) = AVG3(X, I, J);
+ DST(3, 3) = DST(2, 2) = DST(1, 1) = DST(0, 0) = AVG3(A, X, I);
+ DST(3, 2) = DST(2, 1) = DST(1, 0) = AVG3(B, A, X);
+ DST(3, 1) = DST(2, 0) = AVG3(C, B, A);
+ DST(3, 0) = AVG3(D, C, B);
}
-static void LD4(uint8_t *dst) { // Down-Left
+static void LD4(uint8_t* dst) { // Down-Left
const int A = dst[0 - BPS];
const int B = dst[1 - BPS];
const int C = dst[2 - BPS];
@@ -316,12 +307,12 @@ static void LD4(uint8_t *dst) { // Down-Left
DST(1, 0) = DST(0, 1) = AVG3(B, C, D);
DST(2, 0) = DST(1, 1) = DST(0, 2) = AVG3(C, D, E);
DST(3, 0) = DST(2, 1) = DST(1, 2) = DST(0, 3) = AVG3(D, E, F);
- DST(3, 1) = DST(2, 2) = DST(1, 3) = AVG3(E, F, G);
- DST(3, 2) = DST(2, 3) = AVG3(F, G, H);
- DST(3, 3) = AVG3(G, H, H);
+ DST(3, 1) = DST(2, 2) = DST(1, 3) = AVG3(E, F, G);
+ DST(3, 2) = DST(2, 3) = AVG3(F, G, H);
+ DST(3, 3) = AVG3(G, H, H);
}
-static void VR4(uint8_t *dst) { // Vertical-Right
+static void VR4(uint8_t* dst) { // Vertical-Right
const int I = dst[-1 + 0 * BPS];
const int J = dst[-1 + 1 * BPS];
const int K = dst[-1 + 2 * BPS];
@@ -343,7 +334,7 @@ static void VR4(uint8_t *dst) { // Vertical-Right
DST(3, 1) = AVG3(B, C, D);
}
-static void VL4(uint8_t *dst) { // Vertical-Left
+static void VL4(uint8_t* dst) { // Vertical-Left
const int A = dst[0 - BPS];
const int B = dst[1 - BPS];
const int C = dst[2 - BPS];
@@ -365,7 +356,7 @@ static void VL4(uint8_t *dst) { // Vertical-Left
DST(3, 3) = AVG3(F, G, H);
}
-static void HU4(uint8_t *dst) { // Horizontal-Up
+static void HU4(uint8_t* dst) { // Horizontal-Up
const int I = dst[-1 + 0 * BPS];
const int J = dst[-1 + 1 * BPS];
const int K = dst[-1 + 2 * BPS];
@@ -380,7 +371,7 @@ static void HU4(uint8_t *dst) { // Horizontal-Up
DST(0, 3) = DST(1, 3) = DST(2, 3) = DST(3, 3) = L;
}
-static void HD4(uint8_t *dst) { // Horizontal-Down
+static void HD4(uint8_t* dst) { // Horizontal-Down
const int I = dst[-1 + 0 * BPS];
const int J = dst[-1 + 1 * BPS];
const int K = dst[-1 + 2 * BPS];
@@ -407,17 +398,19 @@ static void HD4(uint8_t *dst) { // Horizontal-Down
#undef AVG3
#undef AVG2
+VP8PredFunc VP8PredLuma4[NUM_BMODES];
+
//------------------------------------------------------------------------------
// Chroma
-static void VE8uv(uint8_t *dst) { // vertical
+static void VE8uv(uint8_t* dst) { // vertical
int j;
for (j = 0; j < 8; ++j) {
memcpy(dst + j * BPS, dst - BPS, 8);
}
}
-static void HE8uv(uint8_t *dst) { // horizontal
+static void HE8uv(uint8_t* dst) { // horizontal
int j;
for (j = 0; j < 8; ++j) {
memset(dst, dst[-1], 8);
@@ -426,60 +419,45 @@ static void HE8uv(uint8_t *dst) { // horizontal
}
// helper for chroma-DC predictions
-static WEBP_INLINE void Put8x8uv(uint64_t v, uint8_t* dst) {
+static WEBP_INLINE void Put8x8uv(uint8_t value, uint8_t* dst) {
int j;
for (j = 0; j < 8; ++j) {
- *(uint64_t*)(dst + j * BPS) = v;
+ memset(dst + j * BPS, value, 8);
}
}
-static void DC8uv(uint8_t *dst) { // DC
+static void DC8uv(uint8_t* dst) { // DC
int dc0 = 8;
int i;
for (i = 0; i < 8; ++i) {
dc0 += dst[i - BPS] + dst[-1 + i * BPS];
}
- Put8x8uv((uint64_t)((dc0 >> 4) * 0x0101010101010101ULL), dst);
+ Put8x8uv(dc0 >> 4, dst);
}
-static void DC8uvNoLeft(uint8_t *dst) { // DC with no left samples
+static void DC8uvNoLeft(uint8_t* dst) { // DC with no left samples
int dc0 = 4;
int i;
for (i = 0; i < 8; ++i) {
dc0 += dst[i - BPS];
}
- Put8x8uv((uint64_t)((dc0 >> 3) * 0x0101010101010101ULL), dst);
+ Put8x8uv(dc0 >> 3, dst);
}
-static void DC8uvNoTop(uint8_t *dst) { // DC with no top samples
+static void DC8uvNoTop(uint8_t* dst) { // DC with no top samples
int dc0 = 4;
int i;
for (i = 0; i < 8; ++i) {
dc0 += dst[-1 + i * BPS];
}
- Put8x8uv((uint64_t)((dc0 >> 3) * 0x0101010101010101ULL), dst);
+ Put8x8uv(dc0 >> 3, dst);
}
-static void DC8uvNoTopLeft(uint8_t *dst) { // DC with nothing
- Put8x8uv(0x8080808080808080ULL, dst);
+static void DC8uvNoTopLeft(uint8_t* dst) { // DC with nothing
+ Put8x8uv(0x80, dst);
}
-//------------------------------------------------------------------------------
-// default C implementations
-
-const VP8PredFunc VP8PredLuma4[NUM_BMODES] = {
- DC4, TM4, VE4, HE4, RD4, VR4, LD4, VL4, HD4, HU4
-};
-
-const VP8PredFunc VP8PredLuma16[NUM_B_DC_MODES] = {
- DC16, TM16, VE16, HE16,
- DC16NoTop, DC16NoLeft, DC16NoTopLeft
-};
-
-const VP8PredFunc VP8PredChroma8[NUM_B_DC_MODES] = {
- DC8uv, TM8uv, VE8uv, HE8uv,
- DC8uvNoTop, DC8uvNoLeft, DC8uvNoTopLeft
-};
+VP8PredFunc VP8PredChroma8[NUM_B_DC_MODES];
//------------------------------------------------------------------------------
// Edge filtering functions
@@ -487,61 +465,62 @@ const VP8PredFunc VP8PredChroma8[NUM_B_DC_MODES] = {
// 4 pixels in, 2 pixels out
static WEBP_INLINE void do_filter2(uint8_t* p, int step) {
const int p1 = p[-2*step], p0 = p[-step], q0 = p[0], q1 = p[step];
- const int a = 3 * (q0 - p0) + sclip1[1020 + p1 - q1];
- const int a1 = sclip2[112 + ((a + 4) >> 3)];
- const int a2 = sclip2[112 + ((a + 3) >> 3)];
- p[-step] = clip1[255 + p0 + a2];
- p[ 0] = clip1[255 + q0 - a1];
+ const int a = 3 * (q0 - p0) + VP8ksclip1[p1 - q1]; // in [-893,892]
+ const int a1 = VP8ksclip2[(a + 4) >> 3]; // in [-16,15]
+ const int a2 = VP8ksclip2[(a + 3) >> 3];
+ p[-step] = VP8kclip1[p0 + a2];
+ p[ 0] = VP8kclip1[q0 - a1];
}
// 4 pixels in, 4 pixels out
static WEBP_INLINE void do_filter4(uint8_t* p, int step) {
const int p1 = p[-2*step], p0 = p[-step], q0 = p[0], q1 = p[step];
const int a = 3 * (q0 - p0);
- const int a1 = sclip2[112 + ((a + 4) >> 3)];
- const int a2 = sclip2[112 + ((a + 3) >> 3)];
+ const int a1 = VP8ksclip2[(a + 4) >> 3];
+ const int a2 = VP8ksclip2[(a + 3) >> 3];
const int a3 = (a1 + 1) >> 1;
- p[-2*step] = clip1[255 + p1 + a3];
- p[- step] = clip1[255 + p0 + a2];
- p[ 0] = clip1[255 + q0 - a1];
- p[ step] = clip1[255 + q1 - a3];
+ p[-2*step] = VP8kclip1[p1 + a3];
+ p[- step] = VP8kclip1[p0 + a2];
+ p[ 0] = VP8kclip1[q0 - a1];
+ p[ step] = VP8kclip1[q1 - a3];
}
// 6 pixels in, 6 pixels out
static WEBP_INLINE void do_filter6(uint8_t* p, int step) {
const int p2 = p[-3*step], p1 = p[-2*step], p0 = p[-step];
const int q0 = p[0], q1 = p[step], q2 = p[2*step];
- const int a = sclip1[1020 + 3 * (q0 - p0) + sclip1[1020 + p1 - q1]];
+ const int a = VP8ksclip1[3 * (q0 - p0) + VP8ksclip1[p1 - q1]];
+ // a is in [-128,127], a1 in [-27,27], a2 in [-18,18] and a3 in [-9,9]
const int a1 = (27 * a + 63) >> 7; // eq. to ((3 * a + 7) * 9) >> 7
const int a2 = (18 * a + 63) >> 7; // eq. to ((2 * a + 7) * 9) >> 7
const int a3 = (9 * a + 63) >> 7; // eq. to ((1 * a + 7) * 9) >> 7
- p[-3*step] = clip1[255 + p2 + a3];
- p[-2*step] = clip1[255 + p1 + a2];
- p[- step] = clip1[255 + p0 + a1];
- p[ 0] = clip1[255 + q0 - a1];
- p[ step] = clip1[255 + q1 - a2];
- p[ 2*step] = clip1[255 + q2 - a3];
+ p[-3*step] = VP8kclip1[p2 + a3];
+ p[-2*step] = VP8kclip1[p1 + a2];
+ p[- step] = VP8kclip1[p0 + a1];
+ p[ 0] = VP8kclip1[q0 - a1];
+ p[ step] = VP8kclip1[q1 - a2];
+ p[ 2*step] = VP8kclip1[q2 - a3];
}
static WEBP_INLINE int hev(const uint8_t* p, int step, int thresh) {
const int p1 = p[-2*step], p0 = p[-step], q0 = p[0], q1 = p[step];
- return (abs0[255 + p1 - p0] > thresh) || (abs0[255 + q1 - q0] > thresh);
+ return (VP8kabs0[p1 - p0] > thresh) || (VP8kabs0[q1 - q0] > thresh);
}
-static WEBP_INLINE int needs_filter(const uint8_t* p, int step, int thresh) {
- const int p1 = p[-2*step], p0 = p[-step], q0 = p[0], q1 = p[step];
- return (2 * abs0[255 + p0 - q0] + abs1[255 + p1 - q1]) <= thresh;
+static WEBP_INLINE int needs_filter(const uint8_t* p, int step, int t) {
+ const int p1 = p[-2 * step], p0 = p[-step], q0 = p[0], q1 = p[step];
+ return ((4 * VP8kabs0[p0 - q0] + VP8kabs0[p1 - q1]) <= t);
}
static WEBP_INLINE int needs_filter2(const uint8_t* p,
int step, int t, int it) {
- const int p3 = p[-4*step], p2 = p[-3*step], p1 = p[-2*step], p0 = p[-step];
- const int q0 = p[0], q1 = p[step], q2 = p[2*step], q3 = p[3*step];
- if ((2 * abs0[255 + p0 - q0] + abs1[255 + p1 - q1]) > t)
- return 0;
- return abs0[255 + p3 - p2] <= it && abs0[255 + p2 - p1] <= it &&
- abs0[255 + p1 - p0] <= it && abs0[255 + q3 - q2] <= it &&
- abs0[255 + q2 - q1] <= it && abs0[255 + q1 - q0] <= it;
+ const int p3 = p[-4 * step], p2 = p[-3 * step], p1 = p[-2 * step];
+ const int p0 = p[-step], q0 = p[0];
+ const int q1 = p[step], q2 = p[2 * step], q3 = p[3 * step];
+ if ((4 * VP8kabs0[p0 - q0] + VP8kabs0[p1 - q1]) > t) return 0;
+ return VP8kabs0[p3 - p2] <= it && VP8kabs0[p2 - p1] <= it &&
+ VP8kabs0[p1 - p0] <= it && VP8kabs0[q3 - q2] <= it &&
+ VP8kabs0[q2 - q1] <= it && VP8kabs0[q1 - q0] <= it;
}
//------------------------------------------------------------------------------
@@ -549,8 +528,9 @@ static WEBP_INLINE int needs_filter2(const uint8_t* p,
static void SimpleVFilter16(uint8_t* p, int stride, int thresh) {
int i;
+ const int thresh2 = 2 * thresh + 1;
for (i = 0; i < 16; ++i) {
- if (needs_filter(p + i, stride, thresh)) {
+ if (needs_filter(p + i, stride, thresh2)) {
do_filter2(p + i, stride);
}
}
@@ -558,8 +538,9 @@ static void SimpleVFilter16(uint8_t* p, int stride, int thresh) {
static void SimpleHFilter16(uint8_t* p, int stride, int thresh) {
int i;
+ const int thresh2 = 2 * thresh + 1;
for (i = 0; i < 16; ++i) {
- if (needs_filter(p + i * stride, 1, thresh)) {
+ if (needs_filter(p + i * stride, 1, thresh2)) {
do_filter2(p + i * stride, 1);
}
}
@@ -587,8 +568,9 @@ static void SimpleHFilter16i(uint8_t* p, int stride, int thresh) {
static WEBP_INLINE void FilterLoop26(uint8_t* p,
int hstride, int vstride, int size,
int thresh, int ithresh, int hev_thresh) {
+ const int thresh2 = 2 * thresh + 1;
while (size-- > 0) {
- if (needs_filter2(p, hstride, thresh, ithresh)) {
+ if (needs_filter2(p, hstride, thresh2, ithresh)) {
if (hev(p, hstride, hev_thresh)) {
do_filter2(p, hstride);
} else {
@@ -602,8 +584,9 @@ static WEBP_INLINE void FilterLoop26(uint8_t* p,
static WEBP_INLINE void FilterLoop24(uint8_t* p,
int hstride, int vstride, int size,
int thresh, int ithresh, int hev_thresh) {
+ const int thresh2 = 2 * thresh + 1;
while (size-- > 0) {
- if (needs_filter2(p, hstride, thresh, ithresh)) {
+ if (needs_filter2(p, hstride, thresh2, ithresh)) {
if (hev(p, hstride, hev_thresh)) {
do_filter2(p, hstride);
} else {
@@ -672,6 +655,7 @@ static void HFilter8i(uint8_t* u, uint8_t* v, int stride,
//------------------------------------------------------------------------------
VP8DecIdct2 VP8Transform;
+VP8DecIdct VP8TransformAC3;
VP8DecIdct VP8TransformUV;
VP8DecIdct VP8TransformDC;
VP8DecIdct VP8TransformDCUV;
@@ -690,15 +674,25 @@ VP8SimpleFilterFunc VP8SimpleVFilter16i;
VP8SimpleFilterFunc VP8SimpleHFilter16i;
extern void VP8DspInitSSE2(void);
+extern void VP8DspInitSSE41(void);
extern void VP8DspInitNEON(void);
+extern void VP8DspInitMIPS32(void);
+extern void VP8DspInitMIPSdspR2(void);
+
+static volatile VP8CPUInfo dec_last_cpuinfo_used =
+ (VP8CPUInfo)&dec_last_cpuinfo_used;
-void VP8DspInit(void) {
- DspInitTables();
+WEBP_TSAN_IGNORE_FUNCTION void VP8DspInit(void) {
+ if (dec_last_cpuinfo_used == VP8GetCPUInfo) return;
+ VP8InitClipTables();
+
+ VP8TransformWHT = TransformWHT;
VP8Transform = TransformTwo;
VP8TransformUV = TransformUV;
VP8TransformDC = TransformDC;
VP8TransformDCUV = TransformDCUV;
+ VP8TransformAC3 = TransformAC3;
VP8VFilter16 = VFilter16;
VP8HFilter16 = HFilter16;
@@ -713,20 +707,60 @@ void VP8DspInit(void) {
VP8SimpleVFilter16i = SimpleVFilter16i;
VP8SimpleHFilter16i = SimpleHFilter16i;
+ VP8PredLuma4[0] = DC4;
+ VP8PredLuma4[1] = TM4;
+ VP8PredLuma4[2] = VE4;
+ VP8PredLuma4[3] = HE4;
+ VP8PredLuma4[4] = RD4;
+ VP8PredLuma4[5] = VR4;
+ VP8PredLuma4[6] = LD4;
+ VP8PredLuma4[7] = VL4;
+ VP8PredLuma4[8] = HD4;
+ VP8PredLuma4[9] = HU4;
+
+ VP8PredLuma16[0] = DC16;
+ VP8PredLuma16[1] = TM16;
+ VP8PredLuma16[2] = VE16;
+ VP8PredLuma16[3] = HE16;
+ VP8PredLuma16[4] = DC16NoTop;
+ VP8PredLuma16[5] = DC16NoLeft;
+ VP8PredLuma16[6] = DC16NoTopLeft;
+
+ VP8PredChroma8[0] = DC8uv;
+ VP8PredChroma8[1] = TM8uv;
+ VP8PredChroma8[2] = VE8uv;
+ VP8PredChroma8[3] = HE8uv;
+ VP8PredChroma8[4] = DC8uvNoTop;
+ VP8PredChroma8[5] = DC8uvNoLeft;
+ VP8PredChroma8[6] = DC8uvNoTopLeft;
+
// If defined, use CPUInfo() to overwrite some pointers with faster versions.
- if (VP8GetCPUInfo) {
+ if (VP8GetCPUInfo != NULL) {
#if defined(WEBP_USE_SSE2)
if (VP8GetCPUInfo(kSSE2)) {
VP8DspInitSSE2();
+#if defined(WEBP_USE_SSE41)
+ if (VP8GetCPUInfo(kSSE4_1)) {
+ VP8DspInitSSE41();
+ }
+#endif
}
-#elif defined(WEBP_USE_NEON)
+#endif
+#if defined(WEBP_USE_NEON)
if (VP8GetCPUInfo(kNEON)) {
VP8DspInitNEON();
}
#endif
+#if defined(WEBP_USE_MIPS32)
+ if (VP8GetCPUInfo(kMIPS32)) {
+ VP8DspInitMIPS32();
+ }
+#endif
+#if defined(WEBP_USE_MIPS_DSP_R2)
+ if (VP8GetCPUInfo(kMIPSdspR2)) {
+ VP8DspInitMIPSdspR2();
+ }
+#endif
}
+ dec_last_cpuinfo_used = VP8GetCPUInfo;
}
-
-#if defined(__cplusplus) || defined(c_plusplus)
-} // extern "C"
-#endif
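
The reworked VP8DspInit() above follows a plain dispatch pattern: fill every function pointer with its C default, let each CPU feature that probes true overwrite the pointers it accelerates, then cache the VP8GetCPUInfo value so later calls return immediately (the patch also marks the cached pointer volatile and the function WEBP_TSAN_IGNORE_FUNCTION to keep thread-sanitizer quiet about the benign race). A minimal editorial sketch of the pattern, with illustrative names rather than the libwebp API:

// Editor's sketch of the init-once dispatch pattern (names are hypothetical).
#include <stddef.h>

typedef void (*IdctFunc)(const short* in, unsigned char* dst);
typedef int  (*CpuInfoFunc)(int feature);

static void IdctC(const short* in, unsigned char* dst)    { (void)in; (void)dst; }
static void IdctSSE2(const short* in, unsigned char* dst) { (void)in; (void)dst; }

IdctFunc    Idct = NULL;        // filled in by DspInit()
CpuInfoFunc GetCpuInfo = NULL;  // stand-in for VP8GetCPUInfo; host may set it

// Same self-pointing sentinel as dec_last_cpuinfo_used: it can never compare
// equal to a real CPU-info function, so the first DspInit() call always runs.
static CpuInfoFunc last_cpuinfo_used = (CpuInfoFunc)&last_cpuinfo_used;

void DspInit(void) {
  if (last_cpuinfo_used == GetCpuInfo) return;   // cheap early-out on re-init
  Idct = IdctC;                                  // plain-C default first
  if (GetCpuInfo != NULL && GetCpuInfo(0 /* e.g. kSSE2 */)) {
    Idct = IdctSSE2;                             // faster override if available
  }
  last_cpuinfo_used = GetCpuInfo;
}
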
diff --git a/drivers/webp/dsp/dec_neon.c b/drivers/webp/dsp/dec_neon.c
index ec824b790b..a63f43fe17 100644
--- a/drivers/webp/dsp/dec_neon.c
+++ b/drivers/webp/dsp/dec_neon.c
@@ -1,8 +1,10 @@
// Copyright 2012 Google Inc. All Rights Reserved.
//
-// This code is licensed under the same terms as WebM:
-// Software License Agreement: http://www.webmproject.org/license/software/
-// Additional IP Rights Grant: http://www.webmproject.org/license/additional/
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// ARM NEON version of dsp functions and loop filtering.
@@ -14,13 +16,535 @@
#if defined(WEBP_USE_NEON)
+#include "./neon.h"
#include "../dec/vp8i.h"
-#if defined(__cplusplus) || defined(c_plusplus)
-extern "C" {
-#endif
+//------------------------------------------------------------------------------
+// NxM Loading functions
+
+// Load/Store vertical edge
+#define LOAD8x4(c1, c2, c3, c4, b1, b2, stride) \
+ "vld4.8 {" #c1 "[0]," #c2 "[0]," #c3 "[0]," #c4 "[0]}," #b1 "," #stride "\n" \
+ "vld4.8 {" #c1 "[1]," #c2 "[1]," #c3 "[1]," #c4 "[1]}," #b2 "," #stride "\n" \
+ "vld4.8 {" #c1 "[2]," #c2 "[2]," #c3 "[2]," #c4 "[2]}," #b1 "," #stride "\n" \
+ "vld4.8 {" #c1 "[3]," #c2 "[3]," #c3 "[3]," #c4 "[3]}," #b2 "," #stride "\n" \
+ "vld4.8 {" #c1 "[4]," #c2 "[4]," #c3 "[4]," #c4 "[4]}," #b1 "," #stride "\n" \
+ "vld4.8 {" #c1 "[5]," #c2 "[5]," #c3 "[5]," #c4 "[5]}," #b2 "," #stride "\n" \
+ "vld4.8 {" #c1 "[6]," #c2 "[6]," #c3 "[6]," #c4 "[6]}," #b1 "," #stride "\n" \
+ "vld4.8 {" #c1 "[7]," #c2 "[7]," #c3 "[7]," #c4 "[7]}," #b2 "," #stride "\n"
+
+#define STORE8x2(c1, c2, p, stride) \
+ "vst2.8 {" #c1 "[0], " #c2 "[0]}," #p "," #stride " \n" \
+ "vst2.8 {" #c1 "[1], " #c2 "[1]}," #p "," #stride " \n" \
+ "vst2.8 {" #c1 "[2], " #c2 "[2]}," #p "," #stride " \n" \
+ "vst2.8 {" #c1 "[3], " #c2 "[3]}," #p "," #stride " \n" \
+ "vst2.8 {" #c1 "[4], " #c2 "[4]}," #p "," #stride " \n" \
+ "vst2.8 {" #c1 "[5], " #c2 "[5]}," #p "," #stride " \n" \
+ "vst2.8 {" #c1 "[6], " #c2 "[6]}," #p "," #stride " \n" \
+ "vst2.8 {" #c1 "[7], " #c2 "[7]}," #p "," #stride " \n"
+
+#if !defined(WORK_AROUND_GCC)
+
+// This intrinsics version makes gcc-4.6.3 crash during Load4x??() compilation
+// (register alloc, probably). The variants somewhat mitigate the problem, but
+// not quite. HFilter16i() remains problematic.
+static WEBP_INLINE uint8x8x4_t Load4x8(const uint8_t* const src, int stride) {
+ const uint8x8_t zero = vdup_n_u8(0);
+ uint8x8x4_t out;
+ INIT_VECTOR4(out, zero, zero, zero, zero);
+ out = vld4_lane_u8(src + 0 * stride, out, 0);
+ out = vld4_lane_u8(src + 1 * stride, out, 1);
+ out = vld4_lane_u8(src + 2 * stride, out, 2);
+ out = vld4_lane_u8(src + 3 * stride, out, 3);
+ out = vld4_lane_u8(src + 4 * stride, out, 4);
+ out = vld4_lane_u8(src + 5 * stride, out, 5);
+ out = vld4_lane_u8(src + 6 * stride, out, 6);
+ out = vld4_lane_u8(src + 7 * stride, out, 7);
+ return out;
+}
+
+static WEBP_INLINE void Load4x16(const uint8_t* const src, int stride,
+ uint8x16_t* const p1, uint8x16_t* const p0,
+ uint8x16_t* const q0, uint8x16_t* const q1) {
+ // row0 = p1[0..7]|p0[0..7]|q0[0..7]|q1[0..7]
+ // row8 = p1[8..15]|p0[8..15]|q0[8..15]|q1[8..15]
+ const uint8x8x4_t row0 = Load4x8(src - 2 + 0 * stride, stride);
+ const uint8x8x4_t row8 = Load4x8(src - 2 + 8 * stride, stride);
+ *p1 = vcombine_u8(row0.val[0], row8.val[0]);
+ *p0 = vcombine_u8(row0.val[1], row8.val[1]);
+ *q0 = vcombine_u8(row0.val[2], row8.val[2]);
+ *q1 = vcombine_u8(row0.val[3], row8.val[3]);
+}
+
+#else // WORK_AROUND_GCC
+
+#define LOADQ_LANE_32b(VALUE, LANE) do { \
+ (VALUE) = vld1q_lane_u32((const uint32_t*)src, (VALUE), (LANE)); \
+ src += stride; \
+} while (0)
+
+static WEBP_INLINE void Load4x16(const uint8_t* src, int stride,
+ uint8x16_t* const p1, uint8x16_t* const p0,
+ uint8x16_t* const q0, uint8x16_t* const q1) {
+ const uint32x4_t zero = vdupq_n_u32(0);
+ uint32x4x4_t in;
+ INIT_VECTOR4(in, zero, zero, zero, zero);
+ src -= 2;
+ LOADQ_LANE_32b(in.val[0], 0);
+ LOADQ_LANE_32b(in.val[1], 0);
+ LOADQ_LANE_32b(in.val[2], 0);
+ LOADQ_LANE_32b(in.val[3], 0);
+ LOADQ_LANE_32b(in.val[0], 1);
+ LOADQ_LANE_32b(in.val[1], 1);
+ LOADQ_LANE_32b(in.val[2], 1);
+ LOADQ_LANE_32b(in.val[3], 1);
+ LOADQ_LANE_32b(in.val[0], 2);
+ LOADQ_LANE_32b(in.val[1], 2);
+ LOADQ_LANE_32b(in.val[2], 2);
+ LOADQ_LANE_32b(in.val[3], 2);
+ LOADQ_LANE_32b(in.val[0], 3);
+ LOADQ_LANE_32b(in.val[1], 3);
+ LOADQ_LANE_32b(in.val[2], 3);
+ LOADQ_LANE_32b(in.val[3], 3);
+ // Transpose four 4x4 parts:
+ {
+ const uint8x16x2_t row01 = vtrnq_u8(vreinterpretq_u8_u32(in.val[0]),
+ vreinterpretq_u8_u32(in.val[1]));
+ const uint8x16x2_t row23 = vtrnq_u8(vreinterpretq_u8_u32(in.val[2]),
+ vreinterpretq_u8_u32(in.val[3]));
+ const uint16x8x2_t row02 = vtrnq_u16(vreinterpretq_u16_u8(row01.val[0]),
+ vreinterpretq_u16_u8(row23.val[0]));
+ const uint16x8x2_t row13 = vtrnq_u16(vreinterpretq_u16_u8(row01.val[1]),
+ vreinterpretq_u16_u8(row23.val[1]));
+ *p1 = vreinterpretq_u8_u16(row02.val[0]);
+ *p0 = vreinterpretq_u8_u16(row13.val[0]);
+ *q0 = vreinterpretq_u8_u16(row02.val[1]);
+ *q1 = vreinterpretq_u8_u16(row13.val[1]);
+ }
+}
+#undef LOADQ_LANE_32b
+
+#endif // !WORK_AROUND_GCC
+
+static WEBP_INLINE void Load8x16(const uint8_t* const src, int stride,
+ uint8x16_t* const p3, uint8x16_t* const p2,
+ uint8x16_t* const p1, uint8x16_t* const p0,
+ uint8x16_t* const q0, uint8x16_t* const q1,
+ uint8x16_t* const q2, uint8x16_t* const q3) {
+ Load4x16(src - 2, stride, p3, p2, p1, p0);
+ Load4x16(src + 2, stride, q0, q1, q2, q3);
+}
+
+static WEBP_INLINE void Load16x4(const uint8_t* const src, int stride,
+ uint8x16_t* const p1, uint8x16_t* const p0,
+ uint8x16_t* const q0, uint8x16_t* const q1) {
+ *p1 = vld1q_u8(src - 2 * stride);
+ *p0 = vld1q_u8(src - 1 * stride);
+ *q0 = vld1q_u8(src + 0 * stride);
+ *q1 = vld1q_u8(src + 1 * stride);
+}
+
+static WEBP_INLINE void Load16x8(const uint8_t* const src, int stride,
+ uint8x16_t* const p3, uint8x16_t* const p2,
+ uint8x16_t* const p1, uint8x16_t* const p0,
+ uint8x16_t* const q0, uint8x16_t* const q1,
+ uint8x16_t* const q2, uint8x16_t* const q3) {
+ Load16x4(src - 2 * stride, stride, p3, p2, p1, p0);
+ Load16x4(src + 2 * stride, stride, q0, q1, q2, q3);
+}
+
+static WEBP_INLINE void Load8x8x2(const uint8_t* const u,
+ const uint8_t* const v,
+ int stride,
+ uint8x16_t* const p3, uint8x16_t* const p2,
+ uint8x16_t* const p1, uint8x16_t* const p0,
+ uint8x16_t* const q0, uint8x16_t* const q1,
+ uint8x16_t* const q2, uint8x16_t* const q3) {
+ // We pack the 8x8 u-samples in the lower half of the uint8x16_t destination
+ // and the v-samples on the higher half.
+ *p3 = vcombine_u8(vld1_u8(u - 4 * stride), vld1_u8(v - 4 * stride));
+ *p2 = vcombine_u8(vld1_u8(u - 3 * stride), vld1_u8(v - 3 * stride));
+ *p1 = vcombine_u8(vld1_u8(u - 2 * stride), vld1_u8(v - 2 * stride));
+ *p0 = vcombine_u8(vld1_u8(u - 1 * stride), vld1_u8(v - 1 * stride));
+ *q0 = vcombine_u8(vld1_u8(u + 0 * stride), vld1_u8(v + 0 * stride));
+ *q1 = vcombine_u8(vld1_u8(u + 1 * stride), vld1_u8(v + 1 * stride));
+ *q2 = vcombine_u8(vld1_u8(u + 2 * stride), vld1_u8(v + 2 * stride));
+ *q3 = vcombine_u8(vld1_u8(u + 3 * stride), vld1_u8(v + 3 * stride));
+}
+
+#if !defined(WORK_AROUND_GCC)
+
+#define LOAD_UV_8(ROW) \
+ vcombine_u8(vld1_u8(u - 4 + (ROW) * stride), vld1_u8(v - 4 + (ROW) * stride))
+
+static WEBP_INLINE void Load8x8x2T(const uint8_t* const u,
+ const uint8_t* const v,
+ int stride,
+ uint8x16_t* const p3, uint8x16_t* const p2,
+ uint8x16_t* const p1, uint8x16_t* const p0,
+ uint8x16_t* const q0, uint8x16_t* const q1,
+ uint8x16_t* const q2, uint8x16_t* const q3) {
+ // We pack the 8x8 u-samples in the lower half of the uint8x16_t destination
+ // and the v-samples on the higher half.
+ const uint8x16_t row0 = LOAD_UV_8(0);
+ const uint8x16_t row1 = LOAD_UV_8(1);
+ const uint8x16_t row2 = LOAD_UV_8(2);
+ const uint8x16_t row3 = LOAD_UV_8(3);
+ const uint8x16_t row4 = LOAD_UV_8(4);
+ const uint8x16_t row5 = LOAD_UV_8(5);
+ const uint8x16_t row6 = LOAD_UV_8(6);
+ const uint8x16_t row7 = LOAD_UV_8(7);
+ // Perform two side-by-side 8x8 transposes
+ // u00 u01 u02 u03 u04 u05 u06 u07 | v00 v01 v02 v03 v04 v05 v06 v07
+ // u10 u11 u12 u13 u14 u15 u16 u17 | v10 v11 v12 ...
+ // u20 u21 u22 u23 u24 u25 u26 u27 | v20 v21 ...
+ // u30 u31 u32 u33 u34 u35 u36 u37 | ...
+ // u40 u41 u42 u43 u44 u45 u46 u47 | ...
+ // u50 u51 u52 u53 u54 u55 u56 u57 | ...
+ // u60 u61 u62 u63 u64 u65 u66 u67 | v60 ...
+ // u70 u71 u72 u73 u74 u75 u76 u77 | v70 v71 v72 ...
+ const uint8x16x2_t row01 = vtrnq_u8(row0, row1); // u00 u10 u02 u12 ...
+ // u01 u11 u03 u13 ...
+ const uint8x16x2_t row23 = vtrnq_u8(row2, row3); // u20 u30 u22 u32 ...
+ // u21 u31 u23 u33 ...
+ const uint8x16x2_t row45 = vtrnq_u8(row4, row5); // ...
+ const uint8x16x2_t row67 = vtrnq_u8(row6, row7); // ...
+ const uint16x8x2_t row02 = vtrnq_u16(vreinterpretq_u16_u8(row01.val[0]),
+ vreinterpretq_u16_u8(row23.val[0]));
+ const uint16x8x2_t row13 = vtrnq_u16(vreinterpretq_u16_u8(row01.val[1]),
+ vreinterpretq_u16_u8(row23.val[1]));
+ const uint16x8x2_t row46 = vtrnq_u16(vreinterpretq_u16_u8(row45.val[0]),
+ vreinterpretq_u16_u8(row67.val[0]));
+ const uint16x8x2_t row57 = vtrnq_u16(vreinterpretq_u16_u8(row45.val[1]),
+ vreinterpretq_u16_u8(row67.val[1]));
+ const uint32x4x2_t row04 = vtrnq_u32(vreinterpretq_u32_u16(row02.val[0]),
+ vreinterpretq_u32_u16(row46.val[0]));
+ const uint32x4x2_t row26 = vtrnq_u32(vreinterpretq_u32_u16(row02.val[1]),
+ vreinterpretq_u32_u16(row46.val[1]));
+ const uint32x4x2_t row15 = vtrnq_u32(vreinterpretq_u32_u16(row13.val[0]),
+ vreinterpretq_u32_u16(row57.val[0]));
+ const uint32x4x2_t row37 = vtrnq_u32(vreinterpretq_u32_u16(row13.val[1]),
+ vreinterpretq_u32_u16(row57.val[1]));
+ *p3 = vreinterpretq_u8_u32(row04.val[0]);
+ *p2 = vreinterpretq_u8_u32(row15.val[0]);
+ *p1 = vreinterpretq_u8_u32(row26.val[0]);
+ *p0 = vreinterpretq_u8_u32(row37.val[0]);
+ *q0 = vreinterpretq_u8_u32(row04.val[1]);
+ *q1 = vreinterpretq_u8_u32(row15.val[1]);
+ *q2 = vreinterpretq_u8_u32(row26.val[1]);
+ *q3 = vreinterpretq_u8_u32(row37.val[1]);
+}
+#undef LOAD_UV_8
+
+#endif // !WORK_AROUND_GCC
+
+static WEBP_INLINE void Store2x8(const uint8x8x2_t v,
+ uint8_t* const dst, int stride) {
+ vst2_lane_u8(dst + 0 * stride, v, 0);
+ vst2_lane_u8(dst + 1 * stride, v, 1);
+ vst2_lane_u8(dst + 2 * stride, v, 2);
+ vst2_lane_u8(dst + 3 * stride, v, 3);
+ vst2_lane_u8(dst + 4 * stride, v, 4);
+ vst2_lane_u8(dst + 5 * stride, v, 5);
+ vst2_lane_u8(dst + 6 * stride, v, 6);
+ vst2_lane_u8(dst + 7 * stride, v, 7);
+}
-#define QRegs "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", \
+static WEBP_INLINE void Store2x16(const uint8x16_t p0, const uint8x16_t q0,
+ uint8_t* const dst, int stride) {
+ uint8x8x2_t lo, hi;
+ lo.val[0] = vget_low_u8(p0);
+ lo.val[1] = vget_low_u8(q0);
+ hi.val[0] = vget_high_u8(p0);
+ hi.val[1] = vget_high_u8(q0);
+ Store2x8(lo, dst - 1 + 0 * stride, stride);
+ Store2x8(hi, dst - 1 + 8 * stride, stride);
+}
+
+#if !defined(WORK_AROUND_GCC)
+static WEBP_INLINE void Store4x8(const uint8x8x4_t v,
+ uint8_t* const dst, int stride) {
+ vst4_lane_u8(dst + 0 * stride, v, 0);
+ vst4_lane_u8(dst + 1 * stride, v, 1);
+ vst4_lane_u8(dst + 2 * stride, v, 2);
+ vst4_lane_u8(dst + 3 * stride, v, 3);
+ vst4_lane_u8(dst + 4 * stride, v, 4);
+ vst4_lane_u8(dst + 5 * stride, v, 5);
+ vst4_lane_u8(dst + 6 * stride, v, 6);
+ vst4_lane_u8(dst + 7 * stride, v, 7);
+}
+
+static WEBP_INLINE void Store4x16(const uint8x16_t p1, const uint8x16_t p0,
+ const uint8x16_t q0, const uint8x16_t q1,
+ uint8_t* const dst, int stride) {
+ uint8x8x4_t lo, hi;
+ INIT_VECTOR4(lo,
+ vget_low_u8(p1), vget_low_u8(p0),
+ vget_low_u8(q0), vget_low_u8(q1));
+ INIT_VECTOR4(hi,
+ vget_high_u8(p1), vget_high_u8(p0),
+ vget_high_u8(q0), vget_high_u8(q1));
+ Store4x8(lo, dst - 2 + 0 * stride, stride);
+ Store4x8(hi, dst - 2 + 8 * stride, stride);
+}
+#endif // !WORK_AROUND_GCC
+
+static WEBP_INLINE void Store16x2(const uint8x16_t p0, const uint8x16_t q0,
+ uint8_t* const dst, int stride) {
+ vst1q_u8(dst - stride, p0);
+ vst1q_u8(dst, q0);
+}
+
+static WEBP_INLINE void Store16x4(const uint8x16_t p1, const uint8x16_t p0,
+ const uint8x16_t q0, const uint8x16_t q1,
+ uint8_t* const dst, int stride) {
+ Store16x2(p1, p0, dst - stride, stride);
+ Store16x2(q0, q1, dst + stride, stride);
+}
+
+static WEBP_INLINE void Store8x2x2(const uint8x16_t p0, const uint8x16_t q0,
+ uint8_t* const u, uint8_t* const v,
+ int stride) {
+ // p0 and q0 contain the u+v samples packed in low/high halves.
+ vst1_u8(u - stride, vget_low_u8(p0));
+ vst1_u8(u, vget_low_u8(q0));
+ vst1_u8(v - stride, vget_high_u8(p0));
+ vst1_u8(v, vget_high_u8(q0));
+}
+
+static WEBP_INLINE void Store8x4x2(const uint8x16_t p1, const uint8x16_t p0,
+ const uint8x16_t q0, const uint8x16_t q1,
+ uint8_t* const u, uint8_t* const v,
+ int stride) {
+ // The p1...q1 registers contain the u+v samples packed in low/high halves.
+ Store8x2x2(p1, p0, u - stride, v - stride, stride);
+ Store8x2x2(q0, q1, u + stride, v + stride, stride);
+}
+
+#if !defined(WORK_AROUND_GCC)
+
+#define STORE6_LANE(DST, VAL0, VAL1, LANE) do { \
+ vst3_lane_u8((DST) - 3, (VAL0), (LANE)); \
+ vst3_lane_u8((DST) + 0, (VAL1), (LANE)); \
+ (DST) += stride; \
+} while (0)
+
+static WEBP_INLINE void Store6x8x2(const uint8x16_t p2, const uint8x16_t p1,
+ const uint8x16_t p0, const uint8x16_t q0,
+ const uint8x16_t q1, const uint8x16_t q2,
+ uint8_t* u, uint8_t* v,
+ int stride) {
+ uint8x8x3_t u0, u1, v0, v1;
+ INIT_VECTOR3(u0, vget_low_u8(p2), vget_low_u8(p1), vget_low_u8(p0));
+ INIT_VECTOR3(u1, vget_low_u8(q0), vget_low_u8(q1), vget_low_u8(q2));
+ INIT_VECTOR3(v0, vget_high_u8(p2), vget_high_u8(p1), vget_high_u8(p0));
+ INIT_VECTOR3(v1, vget_high_u8(q0), vget_high_u8(q1), vget_high_u8(q2));
+ STORE6_LANE(u, u0, u1, 0);
+ STORE6_LANE(u, u0, u1, 1);
+ STORE6_LANE(u, u0, u1, 2);
+ STORE6_LANE(u, u0, u1, 3);
+ STORE6_LANE(u, u0, u1, 4);
+ STORE6_LANE(u, u0, u1, 5);
+ STORE6_LANE(u, u0, u1, 6);
+ STORE6_LANE(u, u0, u1, 7);
+ STORE6_LANE(v, v0, v1, 0);
+ STORE6_LANE(v, v0, v1, 1);
+ STORE6_LANE(v, v0, v1, 2);
+ STORE6_LANE(v, v0, v1, 3);
+ STORE6_LANE(v, v0, v1, 4);
+ STORE6_LANE(v, v0, v1, 5);
+ STORE6_LANE(v, v0, v1, 6);
+ STORE6_LANE(v, v0, v1, 7);
+}
+#undef STORE6_LANE
+
+static WEBP_INLINE void Store4x8x2(const uint8x16_t p1, const uint8x16_t p0,
+ const uint8x16_t q0, const uint8x16_t q1,
+ uint8_t* const u, uint8_t* const v,
+ int stride) {
+ uint8x8x4_t u0, v0;
+ INIT_VECTOR4(u0,
+ vget_low_u8(p1), vget_low_u8(p0),
+ vget_low_u8(q0), vget_low_u8(q1));
+ INIT_VECTOR4(v0,
+ vget_high_u8(p1), vget_high_u8(p0),
+ vget_high_u8(q0), vget_high_u8(q1));
+ vst4_lane_u8(u - 2 + 0 * stride, u0, 0);
+ vst4_lane_u8(u - 2 + 1 * stride, u0, 1);
+ vst4_lane_u8(u - 2 + 2 * stride, u0, 2);
+ vst4_lane_u8(u - 2 + 3 * stride, u0, 3);
+ vst4_lane_u8(u - 2 + 4 * stride, u0, 4);
+ vst4_lane_u8(u - 2 + 5 * stride, u0, 5);
+ vst4_lane_u8(u - 2 + 6 * stride, u0, 6);
+ vst4_lane_u8(u - 2 + 7 * stride, u0, 7);
+ vst4_lane_u8(v - 2 + 0 * stride, v0, 0);
+ vst4_lane_u8(v - 2 + 1 * stride, v0, 1);
+ vst4_lane_u8(v - 2 + 2 * stride, v0, 2);
+ vst4_lane_u8(v - 2 + 3 * stride, v0, 3);
+ vst4_lane_u8(v - 2 + 4 * stride, v0, 4);
+ vst4_lane_u8(v - 2 + 5 * stride, v0, 5);
+ vst4_lane_u8(v - 2 + 6 * stride, v0, 6);
+ vst4_lane_u8(v - 2 + 7 * stride, v0, 7);
+}
+
+#endif // !WORK_AROUND_GCC
+
+// Zero extend 'v' to an int16x8_t.
+static WEBP_INLINE int16x8_t ConvertU8ToS16(uint8x8_t v) {
+ return vreinterpretq_s16_u16(vmovl_u8(v));
+}
+
+// Performs unsigned 8b saturation on 'dst01' and 'dst23' storing the result
+// to the corresponding rows of 'dst'.
+static WEBP_INLINE void SaturateAndStore4x4(uint8_t* const dst,
+ const int16x8_t dst01,
+ const int16x8_t dst23) {
+ // Unsigned saturate to 8b.
+ const uint8x8_t dst01_u8 = vqmovun_s16(dst01);
+ const uint8x8_t dst23_u8 = vqmovun_s16(dst23);
+
+ // Store the results.
+ vst1_lane_u32((uint32_t*)(dst + 0 * BPS), vreinterpret_u32_u8(dst01_u8), 0);
+ vst1_lane_u32((uint32_t*)(dst + 1 * BPS), vreinterpret_u32_u8(dst01_u8), 1);
+ vst1_lane_u32((uint32_t*)(dst + 2 * BPS), vreinterpret_u32_u8(dst23_u8), 0);
+ vst1_lane_u32((uint32_t*)(dst + 3 * BPS), vreinterpret_u32_u8(dst23_u8), 1);
+}
+
+static WEBP_INLINE void Add4x4(const int16x8_t row01, const int16x8_t row23,
+ uint8_t* const dst) {
+ uint32x2_t dst01 = vdup_n_u32(0);
+ uint32x2_t dst23 = vdup_n_u32(0);
+
+ // Load the source pixels.
+ dst01 = vld1_lane_u32((uint32_t*)(dst + 0 * BPS), dst01, 0);
+ dst23 = vld1_lane_u32((uint32_t*)(dst + 2 * BPS), dst23, 0);
+ dst01 = vld1_lane_u32((uint32_t*)(dst + 1 * BPS), dst01, 1);
+ dst23 = vld1_lane_u32((uint32_t*)(dst + 3 * BPS), dst23, 1);
+
+ {
+ // Convert to 16b.
+ const int16x8_t dst01_s16 = ConvertU8ToS16(vreinterpret_u8_u32(dst01));
+ const int16x8_t dst23_s16 = ConvertU8ToS16(vreinterpret_u8_u32(dst23));
+
+ // Descale with rounding.
+ const int16x8_t out01 = vrsraq_n_s16(dst01_s16, row01, 3);
+ const int16x8_t out23 = vrsraq_n_s16(dst23_s16, row23, 3);
+ // Add the inverse transform.
+ SaturateAndStore4x4(dst, out01, out23);
+ }
+}
+
+//-----------------------------------------------------------------------------
+// Simple In-loop filtering (Paragraph 15.2)
+
+static uint8x16_t NeedsFilter(const uint8x16_t p1, const uint8x16_t p0,
+ const uint8x16_t q0, const uint8x16_t q1,
+ int thresh) {
+ const uint8x16_t thresh_v = vdupq_n_u8((uint8_t)thresh);
+ const uint8x16_t a_p0_q0 = vabdq_u8(p0, q0); // abs(p0-q0)
+ const uint8x16_t a_p1_q1 = vabdq_u8(p1, q1); // abs(p1-q1)
+ const uint8x16_t a_p0_q0_2 = vqaddq_u8(a_p0_q0, a_p0_q0); // 2 * abs(p0-q0)
+ const uint8x16_t a_p1_q1_2 = vshrq_n_u8(a_p1_q1, 1); // abs(p1-q1) / 2
+ const uint8x16_t sum = vqaddq_u8(a_p0_q0_2, a_p1_q1_2);
+ const uint8x16_t mask = vcgeq_u8(thresh_v, sum);
+ return mask;
+}
+
+static int8x16_t FlipSign(const uint8x16_t v) {
+ const uint8x16_t sign_bit = vdupq_n_u8(0x80);
+ return vreinterpretq_s8_u8(veorq_u8(v, sign_bit));
+}
+
+static uint8x16_t FlipSignBack(const int8x16_t v) {
+ const int8x16_t sign_bit = vdupq_n_s8(0x80);
+ return vreinterpretq_u8_s8(veorq_s8(v, sign_bit));
+}
+
+static int8x16_t GetBaseDelta(const int8x16_t p1, const int8x16_t p0,
+ const int8x16_t q0, const int8x16_t q1) {
+ const int8x16_t q0_p0 = vqsubq_s8(q0, p0); // (q0-p0)
+ const int8x16_t p1_q1 = vqsubq_s8(p1, q1); // (p1-q1)
+ const int8x16_t s1 = vqaddq_s8(p1_q1, q0_p0); // (p1-q1) + 1 * (q0 - p0)
+ const int8x16_t s2 = vqaddq_s8(q0_p0, s1); // (p1-q1) + 2 * (q0 - p0)
+ const int8x16_t s3 = vqaddq_s8(q0_p0, s2); // (p1-q1) + 3 * (q0 - p0)
+ return s3;
+}
+
+static int8x16_t GetBaseDelta0(const int8x16_t p0, const int8x16_t q0) {
+ const int8x16_t q0_p0 = vqsubq_s8(q0, p0); // (q0-p0)
+ const int8x16_t s1 = vqaddq_s8(q0_p0, q0_p0); // 2 * (q0 - p0)
+ const int8x16_t s2 = vqaddq_s8(q0_p0, s1); // 3 * (q0 - p0)
+ return s2;
+}
+
+//------------------------------------------------------------------------------
+
+static void ApplyFilter2NoFlip(const int8x16_t p0s, const int8x16_t q0s,
+ const int8x16_t delta,
+ int8x16_t* const op0, int8x16_t* const oq0) {
+ const int8x16_t kCst3 = vdupq_n_s8(0x03);
+ const int8x16_t kCst4 = vdupq_n_s8(0x04);
+ const int8x16_t delta_p3 = vqaddq_s8(delta, kCst3);
+ const int8x16_t delta_p4 = vqaddq_s8(delta, kCst4);
+ const int8x16_t delta3 = vshrq_n_s8(delta_p3, 3);
+ const int8x16_t delta4 = vshrq_n_s8(delta_p4, 3);
+ *op0 = vqaddq_s8(p0s, delta3);
+ *oq0 = vqsubq_s8(q0s, delta4);
+}
+
+#if defined(WEBP_USE_INTRINSICS)
+
+static void ApplyFilter2(const int8x16_t p0s, const int8x16_t q0s,
+ const int8x16_t delta,
+ uint8x16_t* const op0, uint8x16_t* const oq0) {
+ const int8x16_t kCst3 = vdupq_n_s8(0x03);
+ const int8x16_t kCst4 = vdupq_n_s8(0x04);
+ const int8x16_t delta_p3 = vqaddq_s8(delta, kCst3);
+ const int8x16_t delta_p4 = vqaddq_s8(delta, kCst4);
+ const int8x16_t delta3 = vshrq_n_s8(delta_p3, 3);
+ const int8x16_t delta4 = vshrq_n_s8(delta_p4, 3);
+ const int8x16_t sp0 = vqaddq_s8(p0s, delta3);
+ const int8x16_t sq0 = vqsubq_s8(q0s, delta4);
+ *op0 = FlipSignBack(sp0);
+ *oq0 = FlipSignBack(sq0);
+}
+
+static void DoFilter2(const uint8x16_t p1, const uint8x16_t p0,
+ const uint8x16_t q0, const uint8x16_t q1,
+ const uint8x16_t mask,
+ uint8x16_t* const op0, uint8x16_t* const oq0) {
+ const int8x16_t p1s = FlipSign(p1);
+ const int8x16_t p0s = FlipSign(p0);
+ const int8x16_t q0s = FlipSign(q0);
+ const int8x16_t q1s = FlipSign(q1);
+ const int8x16_t delta0 = GetBaseDelta(p1s, p0s, q0s, q1s);
+ const int8x16_t delta1 = vandq_s8(delta0, vreinterpretq_s8_u8(mask));
+ ApplyFilter2(p0s, q0s, delta1, op0, oq0);
+}
+
+static void SimpleVFilter16(uint8_t* p, int stride, int thresh) {
+ uint8x16_t p1, p0, q0, q1, op0, oq0;
+ Load16x4(p, stride, &p1, &p0, &q0, &q1);
+ {
+ const uint8x16_t mask = NeedsFilter(p1, p0, q0, q1, thresh);
+ DoFilter2(p1, p0, q0, q1, mask, &op0, &oq0);
+ }
+ Store16x2(op0, oq0, p, stride);
+}
+
+static void SimpleHFilter16(uint8_t* p, int stride, int thresh) {
+ uint8x16_t p1, p0, q0, q1, oq0, op0;
+ Load4x16(p, stride, &p1, &p0, &q0, &q1);
+ {
+ const uint8x16_t mask = NeedsFilter(p1, p0, q0, q1, thresh);
+ DoFilter2(p1, p0, q0, q1, mask, &op0, &oq0);
+ }
+ Store2x16(op0, oq0, p, stride);
+}
+
+#else
+
+#define QRegs "q0", "q1", "q2", "q3", \
"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
#define FLIP_SIGN_BIT2(a, b, s) \
@@ -68,40 +592,16 @@ extern "C" {
DO_SIMPLE_FILTER(p0, q0, q9) /* apply filter */ \
FLIP_SIGN_BIT2(p0, q0, q10)
-// Load/Store vertical edge
-#define LOAD8x4(c1, c2, c3, c4, b1, b2, stride) \
- "vld4.8 {" #c1"[0], " #c2"[0], " #c3"[0], " #c4"[0]}," #b1 "," #stride"\n" \
- "vld4.8 {" #c1"[1], " #c2"[1], " #c3"[1], " #c4"[1]}," #b2 "," #stride"\n" \
- "vld4.8 {" #c1"[2], " #c2"[2], " #c3"[2], " #c4"[2]}," #b1 "," #stride"\n" \
- "vld4.8 {" #c1"[3], " #c2"[3], " #c3"[3], " #c4"[3]}," #b2 "," #stride"\n" \
- "vld4.8 {" #c1"[4], " #c2"[4], " #c3"[4], " #c4"[4]}," #b1 "," #stride"\n" \
- "vld4.8 {" #c1"[5], " #c2"[5], " #c3"[5], " #c4"[5]}," #b2 "," #stride"\n" \
- "vld4.8 {" #c1"[6], " #c2"[6], " #c3"[6], " #c4"[6]}," #b1 "," #stride"\n" \
- "vld4.8 {" #c1"[7], " #c2"[7], " #c3"[7], " #c4"[7]}," #b2 "," #stride"\n"
-
-#define STORE8x2(c1, c2, p,stride) \
- "vst2.8 {" #c1"[0], " #c2"[0]}," #p "," #stride " \n" \
- "vst2.8 {" #c1"[1], " #c2"[1]}," #p "," #stride " \n" \
- "vst2.8 {" #c1"[2], " #c2"[2]}," #p "," #stride " \n" \
- "vst2.8 {" #c1"[3], " #c2"[3]}," #p "," #stride " \n" \
- "vst2.8 {" #c1"[4], " #c2"[4]}," #p "," #stride " \n" \
- "vst2.8 {" #c1"[5], " #c2"[5]}," #p "," #stride " \n" \
- "vst2.8 {" #c1"[6], " #c2"[6]}," #p "," #stride " \n" \
- "vst2.8 {" #c1"[7], " #c2"[7]}," #p "," #stride " \n"
-
-//-----------------------------------------------------------------------------
-// Simple In-loop filtering (Paragraph 15.2)
-
-static void SimpleVFilter16NEON(uint8_t* p, int stride, int thresh) {
+static void SimpleVFilter16(uint8_t* p, int stride, int thresh) {
__asm__ volatile (
"sub %[p], %[p], %[stride], lsl #1 \n" // p -= 2 * stride
"vld1.u8 {q1}, [%[p]], %[stride] \n" // p1
"vld1.u8 {q2}, [%[p]], %[stride] \n" // p0
"vld1.u8 {q3}, [%[p]], %[stride] \n" // q0
- "vld1.u8 {q4}, [%[p]] \n" // q1
+ "vld1.u8 {q12}, [%[p]] \n" // q1
- DO_FILTER2(q1, q2, q3, q4, %[thresh])
+ DO_FILTER2(q1, q2, q3, q12, %[thresh])
"sub %[p], %[p], %[stride], lsl #1 \n" // p -= 2 * stride
@@ -113,25 +613,25 @@ static void SimpleVFilter16NEON(uint8_t* p, int stride, int thresh) {
);
}
-static void SimpleHFilter16NEON(uint8_t* p, int stride, int thresh) {
+static void SimpleHFilter16(uint8_t* p, int stride, int thresh) {
__asm__ volatile (
"sub r4, %[p], #2 \n" // base1 = p - 2
"lsl r6, %[stride], #1 \n" // r6 = 2 * stride
"add r5, r4, %[stride] \n" // base2 = base1 + stride
LOAD8x4(d2, d3, d4, d5, [r4], [r5], r6)
- LOAD8x4(d6, d7, d8, d9, [r4], [r5], r6)
- "vswp d3, d6 \n" // p1:q1 p0:q3
- "vswp d5, d8 \n" // q0:q2 q1:q4
- "vswp q2, q3 \n" // p1:q1 p0:q2 q0:q3 q1:q4
+ LOAD8x4(d24, d25, d26, d27, [r4], [r5], r6)
+ "vswp d3, d24 \n" // p1:q1 p0:q3
+ "vswp d5, d26 \n" // q0:q2 q1:q4
+ "vswp q2, q12 \n" // p1:q1 p0:q2 q0:q3 q1:q4
- DO_FILTER2(q1, q2, q3, q4, %[thresh])
+ DO_FILTER2(q1, q2, q12, q13, %[thresh])
"sub %[p], %[p], #1 \n" // p - 1
- "vswp d5, d6 \n"
+ "vswp d5, d24 \n"
STORE8x2(d4, d5, [%[p]], %[stride])
- STORE8x2(d6, d7, [%[p]], %[stride])
+ STORE8x2(d24, d25, [%[p]], %[stride])
: [p] "+r"(p)
: [stride] "r"(stride), [thresh] "r"(thresh)
@@ -139,44 +639,408 @@ static void SimpleHFilter16NEON(uint8_t* p, int stride, int thresh) {
);
}
-static void SimpleVFilter16iNEON(uint8_t* p, int stride, int thresh) {
- int k;
- for (k = 3; k > 0; --k) {
+#endif // WEBP_USE_INTRINSICS
+
+static void SimpleVFilter16i(uint8_t* p, int stride, int thresh) {
+ uint32_t k;
+ for (k = 3; k != 0; --k) {
p += 4 * stride;
- SimpleVFilter16NEON(p, stride, thresh);
+ SimpleVFilter16(p, stride, thresh);
}
}
-static void SimpleHFilter16iNEON(uint8_t* p, int stride, int thresh) {
- int k;
- for (k = 3; k > 0; --k) {
+static void SimpleHFilter16i(uint8_t* p, int stride, int thresh) {
+ uint32_t k;
+ for (k = 3; k != 0; --k) {
p += 4;
- SimpleHFilter16NEON(p, stride, thresh);
+ SimpleHFilter16(p, stride, thresh);
}
}
-static void TransformOneNEON(const int16_t *in, uint8_t *dst) {
- const int kBPS = BPS;
- const int16_t constants[] = {20091, 17734, 0, 0};
- /* kC1, kC2. Padded because vld1.16 loads 8 bytes
- * Technically these are unsigned but vqdmulh is only available in signed.
- * vqdmulh returns high half (effectively >> 16) but also doubles the value,
- * changing the >> 16 to >> 15 and requiring an additional >> 1.
- * We use this to our advantage with kC2. The canonical value is 35468.
- * However, the high bit is set so treating it as signed will give incorrect
- * results. We avoid this by down shifting by 1 here to clear the highest bit.
- * Combined with the doubling effect of vqdmulh we get >> 16.
- * This can not be applied to kC1 because the lowest bit is set. Down shifting
- * the constant would reduce precision.
- */
-
- /* libwebp uses a trick to avoid some extra addition that libvpx does.
- * Instead of:
- * temp2 = ip[12] + ((ip[12] * cospi8sqrt2minus1) >> 16);
- * libwebp adds 1 << 16 to cospi8sqrt2minus1 (kC1). However, this causes the
- * same issue with kC1 and vqdmulh that we work around by down shifting kC2
- */
+//------------------------------------------------------------------------------
+// Complex In-loop filtering (Paragraph 15.3)
+
+static uint8x16_t NeedsHev(const uint8x16_t p1, const uint8x16_t p0,
+ const uint8x16_t q0, const uint8x16_t q1,
+ int hev_thresh) {
+ const uint8x16_t hev_thresh_v = vdupq_n_u8((uint8_t)hev_thresh);
+ const uint8x16_t a_p1_p0 = vabdq_u8(p1, p0); // abs(p1 - p0)
+ const uint8x16_t a_q1_q0 = vabdq_u8(q1, q0); // abs(q1 - q0)
+ const uint8x16_t mask1 = vcgtq_u8(a_p1_p0, hev_thresh_v);
+ const uint8x16_t mask2 = vcgtq_u8(a_q1_q0, hev_thresh_v);
+ const uint8x16_t mask = vorrq_u8(mask1, mask2);
+ return mask;
+}
+
+static uint8x16_t NeedsFilter2(const uint8x16_t p3, const uint8x16_t p2,
+ const uint8x16_t p1, const uint8x16_t p0,
+ const uint8x16_t q0, const uint8x16_t q1,
+ const uint8x16_t q2, const uint8x16_t q3,
+ int ithresh, int thresh) {
+ const uint8x16_t ithresh_v = vdupq_n_u8((uint8_t)ithresh);
+ const uint8x16_t a_p3_p2 = vabdq_u8(p3, p2); // abs(p3 - p2)
+ const uint8x16_t a_p2_p1 = vabdq_u8(p2, p1); // abs(p2 - p1)
+ const uint8x16_t a_p1_p0 = vabdq_u8(p1, p0); // abs(p1 - p0)
+ const uint8x16_t a_q3_q2 = vabdq_u8(q3, q2); // abs(q3 - q2)
+ const uint8x16_t a_q2_q1 = vabdq_u8(q2, q1); // abs(q2 - q1)
+ const uint8x16_t a_q1_q0 = vabdq_u8(q1, q0); // abs(q1 - q0)
+ const uint8x16_t max1 = vmaxq_u8(a_p3_p2, a_p2_p1);
+ const uint8x16_t max2 = vmaxq_u8(a_p1_p0, a_q3_q2);
+ const uint8x16_t max3 = vmaxq_u8(a_q2_q1, a_q1_q0);
+ const uint8x16_t max12 = vmaxq_u8(max1, max2);
+ const uint8x16_t max123 = vmaxq_u8(max12, max3);
+ const uint8x16_t mask2 = vcgeq_u8(ithresh_v, max123);
+ const uint8x16_t mask1 = NeedsFilter(p1, p0, q0, q1, thresh);
+ const uint8x16_t mask = vandq_u8(mask1, mask2);
+ return mask;
+}
+
+// 4-points filter
+
+static void ApplyFilter4(
+ const int8x16_t p1, const int8x16_t p0,
+ const int8x16_t q0, const int8x16_t q1,
+ const int8x16_t delta0,
+ uint8x16_t* const op1, uint8x16_t* const op0,
+ uint8x16_t* const oq0, uint8x16_t* const oq1) {
+ const int8x16_t kCst3 = vdupq_n_s8(0x03);
+ const int8x16_t kCst4 = vdupq_n_s8(0x04);
+ const int8x16_t delta1 = vqaddq_s8(delta0, kCst4);
+ const int8x16_t delta2 = vqaddq_s8(delta0, kCst3);
+ const int8x16_t a1 = vshrq_n_s8(delta1, 3);
+ const int8x16_t a2 = vshrq_n_s8(delta2, 3);
+ const int8x16_t a3 = vrshrq_n_s8(a1, 1); // a3 = (a1 + 1) >> 1
+ *op0 = FlipSignBack(vqaddq_s8(p0, a2)); // clip(p0 + a2)
+ *oq0 = FlipSignBack(vqsubq_s8(q0, a1)); // clip(q0 - a1)
+ *op1 = FlipSignBack(vqaddq_s8(p1, a3)); // clip(p1 + a3)
+ *oq1 = FlipSignBack(vqsubq_s8(q1, a3)); // clip(q1 - a3)
+}
+
+static void DoFilter4(
+ const uint8x16_t p1, const uint8x16_t p0,
+ const uint8x16_t q0, const uint8x16_t q1,
+ const uint8x16_t mask, const uint8x16_t hev_mask,
+ uint8x16_t* const op1, uint8x16_t* const op0,
+ uint8x16_t* const oq0, uint8x16_t* const oq1) {
+ // This is a fused version of DoFilter2() calling ApplyFilter2 directly
+ const int8x16_t p1s = FlipSign(p1);
+ int8x16_t p0s = FlipSign(p0);
+ int8x16_t q0s = FlipSign(q0);
+ const int8x16_t q1s = FlipSign(q1);
+ const uint8x16_t simple_lf_mask = vandq_u8(mask, hev_mask);
+
+ // do_filter2 part (simple loopfilter on pixels with hev)
+ {
+ const int8x16_t delta = GetBaseDelta(p1s, p0s, q0s, q1s);
+ const int8x16_t simple_lf_delta =
+ vandq_s8(delta, vreinterpretq_s8_u8(simple_lf_mask));
+ ApplyFilter2NoFlip(p0s, q0s, simple_lf_delta, &p0s, &q0s);
+ }
+
+ // do_filter4 part (complex loopfilter on pixels without hev)
+ {
+ const int8x16_t delta0 = GetBaseDelta0(p0s, q0s);
+ // we use: (mask & hev_mask) ^ mask = mask & !hev_mask
+ const uint8x16_t complex_lf_mask = veorq_u8(simple_lf_mask, mask);
+ const int8x16_t complex_lf_delta =
+ vandq_s8(delta0, vreinterpretq_s8_u8(complex_lf_mask));
+ ApplyFilter4(p1s, p0s, q0s, q1s, complex_lf_delta, op1, op0, oq0, oq1);
+ }
+}
+
+// 6-points filter
+
+static void ApplyFilter6(
+ const int8x16_t p2, const int8x16_t p1, const int8x16_t p0,
+ const int8x16_t q0, const int8x16_t q1, const int8x16_t q2,
+ const int8x16_t delta,
+ uint8x16_t* const op2, uint8x16_t* const op1, uint8x16_t* const op0,
+ uint8x16_t* const oq0, uint8x16_t* const oq1, uint8x16_t* const oq2) {
+ const int16x8_t kCst63 = vdupq_n_s16(63);
+ const int8x8_t kCst27 = vdup_n_s8(27);
+ const int8x8_t kCst18 = vdup_n_s8(18);
+ const int8x8_t kCst9 = vdup_n_s8(9);
+ const int8x8_t delta_lo = vget_low_s8(delta);
+ const int8x8_t delta_hi = vget_high_s8(delta);
+ const int16x8_t s1_lo = vmlal_s8(kCst63, kCst27, delta_lo); // 63 + 27 * a
+ const int16x8_t s1_hi = vmlal_s8(kCst63, kCst27, delta_hi); // 63 + 27 * a
+ const int16x8_t s2_lo = vmlal_s8(kCst63, kCst18, delta_lo); // 63 + 18 * a
+ const int16x8_t s2_hi = vmlal_s8(kCst63, kCst18, delta_hi); // 63 + 18 * a
+ const int16x8_t s3_lo = vmlal_s8(kCst63, kCst9, delta_lo); // 63 + 9 * a
+ const int16x8_t s3_hi = vmlal_s8(kCst63, kCst9, delta_hi); // 63 + 9 * a
+ const int8x8_t a1_lo = vqshrn_n_s16(s1_lo, 7);
+ const int8x8_t a1_hi = vqshrn_n_s16(s1_hi, 7);
+ const int8x8_t a2_lo = vqshrn_n_s16(s2_lo, 7);
+ const int8x8_t a2_hi = vqshrn_n_s16(s2_hi, 7);
+ const int8x8_t a3_lo = vqshrn_n_s16(s3_lo, 7);
+ const int8x8_t a3_hi = vqshrn_n_s16(s3_hi, 7);
+ const int8x16_t a1 = vcombine_s8(a1_lo, a1_hi);
+ const int8x16_t a2 = vcombine_s8(a2_lo, a2_hi);
+ const int8x16_t a3 = vcombine_s8(a3_lo, a3_hi);
+
+ *op0 = FlipSignBack(vqaddq_s8(p0, a1)); // clip(p0 + a1)
+  *oq0 = FlipSignBack(vqsubq_s8(q0, a1));  // clip(q0 - a1)
+ *oq1 = FlipSignBack(vqsubq_s8(q1, a2)); // clip(q1 - a2)
+ *op1 = FlipSignBack(vqaddq_s8(p1, a2)); // clip(p1 + a2)
+ *oq2 = FlipSignBack(vqsubq_s8(q2, a3)); // clip(q2 - a3)
+ *op2 = FlipSignBack(vqaddq_s8(p2, a3)); // clip(p2 + a3)
+}
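For reference, a scalar sketch (not part of the commit) of one strong-filter tap as evaluated above: vmlal_s8 with the 63 bias followed by vqshrn_n_s16(., 7) computes (w * delta + 63) >> 7 per lane, for w in {27, 18, 9}. The helper name is hypothetical and the shift assumes arithmetic behaviour on negatives, as the intrinsics guarantee.

static WEBP_INLINE int8_t StrongFilterTapModel(int w, int8_t delta) {
  const int v = (w * delta + 63) >> 7;                        // w is 27, 18 or 9
  return (int8_t)((v < -128) ? -128 : (v > 127) ? 127 : v);   // vqshrn clamp
}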
+
+static void DoFilter6(
+ const uint8x16_t p2, const uint8x16_t p1, const uint8x16_t p0,
+ const uint8x16_t q0, const uint8x16_t q1, const uint8x16_t q2,
+ const uint8x16_t mask, const uint8x16_t hev_mask,
+ uint8x16_t* const op2, uint8x16_t* const op1, uint8x16_t* const op0,
+ uint8x16_t* const oq0, uint8x16_t* const oq1, uint8x16_t* const oq2) {
+ // This is a fused version of DoFilter2() calling ApplyFilter2 directly
+ const int8x16_t p2s = FlipSign(p2);
+ const int8x16_t p1s = FlipSign(p1);
+ int8x16_t p0s = FlipSign(p0);
+ int8x16_t q0s = FlipSign(q0);
+ const int8x16_t q1s = FlipSign(q1);
+ const int8x16_t q2s = FlipSign(q2);
+ const uint8x16_t simple_lf_mask = vandq_u8(mask, hev_mask);
+ const int8x16_t delta0 = GetBaseDelta(p1s, p0s, q0s, q1s);
+
+ // do_filter2 part (simple loopfilter on pixels with hev)
+ {
+ const int8x16_t simple_lf_delta =
+ vandq_s8(delta0, vreinterpretq_s8_u8(simple_lf_mask));
+ ApplyFilter2NoFlip(p0s, q0s, simple_lf_delta, &p0s, &q0s);
+ }
+
+ // do_filter6 part (complex loopfilter on pixels without hev)
+ {
+ // we use: (mask & hev_mask) ^ mask = mask & !hev_mask
+ const uint8x16_t complex_lf_mask = veorq_u8(simple_lf_mask, mask);
+ const int8x16_t complex_lf_delta =
+ vandq_s8(delta0, vreinterpretq_s8_u8(complex_lf_mask));
+ ApplyFilter6(p2s, p1s, p0s, q0s, q1s, q2s, complex_lf_delta,
+ op2, op1, op0, oq0, oq1, oq2);
+ }
+}
+
+// on macroblock edges
+
+static void VFilter16(uint8_t* p, int stride,
+ int thresh, int ithresh, int hev_thresh) {
+ uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3;
+ Load16x8(p, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
+ {
+ const uint8x16_t mask = NeedsFilter2(p3, p2, p1, p0, q0, q1, q2, q3,
+ ithresh, thresh);
+ const uint8x16_t hev_mask = NeedsHev(p1, p0, q0, q1, hev_thresh);
+ uint8x16_t op2, op1, op0, oq0, oq1, oq2;
+ DoFilter6(p2, p1, p0, q0, q1, q2, mask, hev_mask,
+ &op2, &op1, &op0, &oq0, &oq1, &oq2);
+ Store16x2(op2, op1, p - 2 * stride, stride);
+ Store16x2(op0, oq0, p + 0 * stride, stride);
+ Store16x2(oq1, oq2, p + 2 * stride, stride);
+ }
+}
+
+static void HFilter16(uint8_t* p, int stride,
+ int thresh, int ithresh, int hev_thresh) {
+ uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3;
+ Load8x16(p, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
+ {
+ const uint8x16_t mask = NeedsFilter2(p3, p2, p1, p0, q0, q1, q2, q3,
+ ithresh, thresh);
+ const uint8x16_t hev_mask = NeedsHev(p1, p0, q0, q1, hev_thresh);
+ uint8x16_t op2, op1, op0, oq0, oq1, oq2;
+ DoFilter6(p2, p1, p0, q0, q1, q2, mask, hev_mask,
+ &op2, &op1, &op0, &oq0, &oq1, &oq2);
+ Store2x16(op2, op1, p - 2, stride);
+ Store2x16(op0, oq0, p + 0, stride);
+ Store2x16(oq1, oq2, p + 2, stride);
+ }
+}
+
+// on three inner edges
+static void VFilter16i(uint8_t* p, int stride,
+ int thresh, int ithresh, int hev_thresh) {
+ uint32_t k;
+ uint8x16_t p3, p2, p1, p0;
+ Load16x4(p + 2 * stride, stride, &p3, &p2, &p1, &p0);
+ for (k = 3; k != 0; --k) {
+ uint8x16_t q0, q1, q2, q3;
+ p += 4 * stride;
+ Load16x4(p + 2 * stride, stride, &q0, &q1, &q2, &q3);
+ {
+ const uint8x16_t mask =
+ NeedsFilter2(p3, p2, p1, p0, q0, q1, q2, q3, ithresh, thresh);
+ const uint8x16_t hev_mask = NeedsHev(p1, p0, q0, q1, hev_thresh);
+      // p3 and p2 are not just temporary variables here: they will be
+      // re-used for the next span, and q2/q3 will become the next p1/p0.
+ DoFilter4(p1, p0, q0, q1, mask, hev_mask, &p1, &p0, &p3, &p2);
+ Store16x4(p1, p0, p3, p2, p, stride);
+ p1 = q2;
+ p0 = q3;
+ }
+ }
+}
+
+#if !defined(WORK_AROUND_GCC)
+static void HFilter16i(uint8_t* p, int stride,
+ int thresh, int ithresh, int hev_thresh) {
+ uint32_t k;
+ uint8x16_t p3, p2, p1, p0;
+ Load4x16(p + 2, stride, &p3, &p2, &p1, &p0);
+ for (k = 3; k != 0; --k) {
+ uint8x16_t q0, q1, q2, q3;
+ p += 4;
+ Load4x16(p + 2, stride, &q0, &q1, &q2, &q3);
+ {
+ const uint8x16_t mask =
+ NeedsFilter2(p3, p2, p1, p0, q0, q1, q2, q3, ithresh, thresh);
+ const uint8x16_t hev_mask = NeedsHev(p1, p0, q0, q1, hev_thresh);
+ DoFilter4(p1, p0, q0, q1, mask, hev_mask, &p1, &p0, &p3, &p2);
+ Store4x16(p1, p0, p3, p2, p, stride);
+ p1 = q2;
+ p0 = q3;
+ }
+ }
+}
+#endif // !WORK_AROUND_GCC
+
+// 8-pixels wide variant, for chroma filtering
+static void VFilter8(uint8_t* u, uint8_t* v, int stride,
+ int thresh, int ithresh, int hev_thresh) {
+ uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3;
+ Load8x8x2(u, v, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
+ {
+ const uint8x16_t mask = NeedsFilter2(p3, p2, p1, p0, q0, q1, q2, q3,
+ ithresh, thresh);
+ const uint8x16_t hev_mask = NeedsHev(p1, p0, q0, q1, hev_thresh);
+ uint8x16_t op2, op1, op0, oq0, oq1, oq2;
+ DoFilter6(p2, p1, p0, q0, q1, q2, mask, hev_mask,
+ &op2, &op1, &op0, &oq0, &oq1, &oq2);
+ Store8x2x2(op2, op1, u - 2 * stride, v - 2 * stride, stride);
+ Store8x2x2(op0, oq0, u + 0 * stride, v + 0 * stride, stride);
+ Store8x2x2(oq1, oq2, u + 2 * stride, v + 2 * stride, stride);
+ }
+}
+static void VFilter8i(uint8_t* u, uint8_t* v, int stride,
+ int thresh, int ithresh, int hev_thresh) {
+ uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3;
+ u += 4 * stride;
+ v += 4 * stride;
+ Load8x8x2(u, v, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
+ {
+ const uint8x16_t mask = NeedsFilter2(p3, p2, p1, p0, q0, q1, q2, q3,
+ ithresh, thresh);
+ const uint8x16_t hev_mask = NeedsHev(p1, p0, q0, q1, hev_thresh);
+ uint8x16_t op1, op0, oq0, oq1;
+ DoFilter4(p1, p0, q0, q1, mask, hev_mask, &op1, &op0, &oq0, &oq1);
+ Store8x4x2(op1, op0, oq0, oq1, u, v, stride);
+ }
+}
+
+#if !defined(WORK_AROUND_GCC)
+static void HFilter8(uint8_t* u, uint8_t* v, int stride,
+ int thresh, int ithresh, int hev_thresh) {
+ uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3;
+ Load8x8x2T(u, v, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
+ {
+ const uint8x16_t mask = NeedsFilter2(p3, p2, p1, p0, q0, q1, q2, q3,
+ ithresh, thresh);
+ const uint8x16_t hev_mask = NeedsHev(p1, p0, q0, q1, hev_thresh);
+ uint8x16_t op2, op1, op0, oq0, oq1, oq2;
+ DoFilter6(p2, p1, p0, q0, q1, q2, mask, hev_mask,
+ &op2, &op1, &op0, &oq0, &oq1, &oq2);
+ Store6x8x2(op2, op1, op0, oq0, oq1, oq2, u, v, stride);
+ }
+}
+
+static void HFilter8i(uint8_t* u, uint8_t* v, int stride,
+ int thresh, int ithresh, int hev_thresh) {
+ uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3;
+ u += 4;
+ v += 4;
+ Load8x8x2T(u, v, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
+ {
+ const uint8x16_t mask = NeedsFilter2(p3, p2, p1, p0, q0, q1, q2, q3,
+ ithresh, thresh);
+ const uint8x16_t hev_mask = NeedsHev(p1, p0, q0, q1, hev_thresh);
+ uint8x16_t op1, op0, oq0, oq1;
+ DoFilter4(p1, p0, q0, q1, mask, hev_mask, &op1, &op0, &oq0, &oq1);
+ Store4x8x2(op1, op0, oq0, oq1, u, v, stride);
+ }
+}
+#endif // !WORK_AROUND_GCC
+
+//-----------------------------------------------------------------------------
+// Inverse transforms (Paragraph 14.4)
+
+// Technically these are unsigned but vqdmulh is only available in signed.
+// vqdmulh returns high half (effectively >> 16) but also doubles the value,
+// changing the >> 16 to >> 15 and requiring an additional >> 1.
+// We use this to our advantage with kC2. The canonical value is 35468.
+// However, the high bit is set so treating it as signed will give incorrect
+// results. We avoid this by down shifting by 1 here to clear the highest bit.
+// Combined with the doubling effect of vqdmulh we get >> 16.
+// This can not be applied to kC1 because the lowest bit is set. Down shifting
+// the constant would reduce precision.
+
+// libwebp uses a trick to avoid some extra addition that libvpx does.
+// Instead of:
+// temp2 = ip[12] + ((ip[12] * cospi8sqrt2minus1) >> 16);
+// libwebp adds 1 << 16 to cospi8sqrt2minus1 (kC1). However, this causes the
+// same issue with kC1 and vqdmulh that we work around by down shifting kC2
+static const int16_t kC1 = 20091;
+static const int16_t kC2 = 17734; // half of kC2, actually. See comment above.
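The arithmetic behind the halved constant, as a scalar sketch (illustrative only; helper names are hypothetical): vqdmulhq_n_s16(x, k) computes (2 * x * k) >> 16 with saturation, so passing 35468 / 2 = 17734 restores the plain (x * 35468) >> 16 product, while the vsraq_n_s16(x, vqdmulhq_n_s16(x, kC1), 1) form used below adds x + ((x * 20091) >> 16), i.e. the libvpx "kC1 + (1 << 16)" multiply.

static WEBP_INLINE int MulKC2Model(int x) {    // models vqdmulhq_n_s16(x, kC2)
  return (2 * x * 17734) >> 16;                // == (x * 35468) >> 16
}
static WEBP_INLINE int MulKC1Model(int x) {    // models vsraq + vqdmulh on kC1
  const int doubled = (2 * x * 20091) >> 16;   // vqdmulhq_n_s16(x, kC1)
  return x + (doubled >> 1);                   // == x + ((x * 20091) >> 16)
}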
+
+#if defined(WEBP_USE_INTRINSICS)
+static WEBP_INLINE void Transpose8x2(const int16x8_t in0, const int16x8_t in1,
+ int16x8x2_t* const out) {
+ // a0 a1 a2 a3 | b0 b1 b2 b3 => a0 b0 c0 d0 | a1 b1 c1 d1
+ // c0 c1 c2 c3 | d0 d1 d2 d3 a2 b2 c2 d2 | a3 b3 c3 d3
+ const int16x8x2_t tmp0 = vzipq_s16(in0, in1); // a0 c0 a1 c1 a2 c2 ...
+ // b0 d0 b1 d1 b2 d2 ...
+ *out = vzipq_s16(tmp0.val[0], tmp0.val[1]);
+}
+
+static WEBP_INLINE void TransformPass(int16x8x2_t* const rows) {
+ // {rows} = in0 | in4
+ // in8 | in12
+ // B1 = in4 | in12
+ const int16x8_t B1 =
+ vcombine_s16(vget_high_s16(rows->val[0]), vget_high_s16(rows->val[1]));
+ // C0 = kC1 * in4 | kC1 * in12
+ // C1 = kC2 * in4 | kC2 * in12
+ const int16x8_t C0 = vsraq_n_s16(B1, vqdmulhq_n_s16(B1, kC1), 1);
+ const int16x8_t C1 = vqdmulhq_n_s16(B1, kC2);
+ const int16x4_t a = vqadd_s16(vget_low_s16(rows->val[0]),
+ vget_low_s16(rows->val[1])); // in0 + in8
+ const int16x4_t b = vqsub_s16(vget_low_s16(rows->val[0]),
+ vget_low_s16(rows->val[1])); // in0 - in8
+ // c = kC2 * in4 - kC1 * in12
+ // d = kC1 * in4 + kC2 * in12
+ const int16x4_t c = vqsub_s16(vget_low_s16(C1), vget_high_s16(C0));
+ const int16x4_t d = vqadd_s16(vget_low_s16(C0), vget_high_s16(C1));
+ const int16x8_t D0 = vcombine_s16(a, b); // D0 = a | b
+ const int16x8_t D1 = vcombine_s16(d, c); // D1 = d | c
+ const int16x8_t E0 = vqaddq_s16(D0, D1); // a+d | b+c
+ const int16x8_t E_tmp = vqsubq_s16(D0, D1); // a-d | b-c
+ const int16x8_t E1 = vcombine_s16(vget_high_s16(E_tmp), vget_low_s16(E_tmp));
+ Transpose8x2(E0, E1, rows);
+}
+
+static void TransformOne(const int16_t* in, uint8_t* dst) {
+ int16x8x2_t rows;
+ INIT_VECTOR2(rows, vld1q_s16(in + 0), vld1q_s16(in + 8));
+ TransformPass(&rows);
+ TransformPass(&rows);
+ Add4x4(rows.val[0], rows.val[1], dst);
+}
+
+#else
+
+static void TransformOne(const int16_t* in, uint8_t* dst) {
+ const int kBPS = BPS;
+ // kC1, kC2. Padded because vld1.16 loads 8 bytes
+ const int16_t constants[4] = { kC1, kC2, 0, 0 };
/* Adapted from libvpx: vp8/common/arm/neon/shortidct4x4llm_neon.asm */
__asm__ volatile (
"vld1.16 {q1, q2}, [%[in]] \n"
@@ -304,26 +1168,472 @@ static void TransformOneNEON(const int16_t *in, uint8_t *dst) {
);
}
-static void TransformTwoNEON(const int16_t* in, uint8_t* dst, int do_two) {
- TransformOneNEON(in, dst);
+#endif // WEBP_USE_INTRINSICS
+
+static void TransformTwo(const int16_t* in, uint8_t* dst, int do_two) {
+ TransformOne(in, dst);
if (do_two) {
- TransformOneNEON(in + 16, dst + 4);
+ TransformOne(in + 16, dst + 4);
}
}
-extern void VP8DspInitNEON(void);
+static void TransformDC(const int16_t* in, uint8_t* dst) {
+ const int16x8_t DC = vdupq_n_s16(in[0]);
+ Add4x4(DC, DC, dst);
+}
+
+//------------------------------------------------------------------------------
+
+#define STORE_WHT(dst, col, rows) do { \
+ *dst = vgetq_lane_s32(rows.val[0], col); (dst) += 16; \
+ *dst = vgetq_lane_s32(rows.val[1], col); (dst) += 16; \
+ *dst = vgetq_lane_s32(rows.val[2], col); (dst) += 16; \
+ *dst = vgetq_lane_s32(rows.val[3], col); (dst) += 16; \
+} while (0)
+
+static void TransformWHT(const int16_t* in, int16_t* out) {
+ int32x4x4_t tmp;
-void VP8DspInitNEON(void) {
- VP8Transform = TransformTwoNEON;
+ {
+ // Load the source.
+ const int16x4_t in00_03 = vld1_s16(in + 0);
+ const int16x4_t in04_07 = vld1_s16(in + 4);
+ const int16x4_t in08_11 = vld1_s16(in + 8);
+ const int16x4_t in12_15 = vld1_s16(in + 12);
+ const int32x4_t a0 = vaddl_s16(in00_03, in12_15); // in[0..3] + in[12..15]
+ const int32x4_t a1 = vaddl_s16(in04_07, in08_11); // in[4..7] + in[8..11]
+ const int32x4_t a2 = vsubl_s16(in04_07, in08_11); // in[4..7] - in[8..11]
+ const int32x4_t a3 = vsubl_s16(in00_03, in12_15); // in[0..3] - in[12..15]
+ tmp.val[0] = vaddq_s32(a0, a1);
+ tmp.val[1] = vaddq_s32(a3, a2);
+ tmp.val[2] = vsubq_s32(a0, a1);
+ tmp.val[3] = vsubq_s32(a3, a2);
+ // Arrange the temporary results column-wise.
+ tmp = Transpose4x4(tmp);
+ }
+
+ {
+ const int32x4_t kCst3 = vdupq_n_s32(3);
+ const int32x4_t dc = vaddq_s32(tmp.val[0], kCst3); // add rounder
+ const int32x4_t a0 = vaddq_s32(dc, tmp.val[3]);
+ const int32x4_t a1 = vaddq_s32(tmp.val[1], tmp.val[2]);
+ const int32x4_t a2 = vsubq_s32(tmp.val[1], tmp.val[2]);
+ const int32x4_t a3 = vsubq_s32(dc, tmp.val[3]);
+
+ tmp.val[0] = vaddq_s32(a0, a1);
+ tmp.val[1] = vaddq_s32(a3, a2);
+ tmp.val[2] = vsubq_s32(a0, a1);
+ tmp.val[3] = vsubq_s32(a3, a2);
+
+ // right shift the results by 3.
+ tmp.val[0] = vshrq_n_s32(tmp.val[0], 3);
+ tmp.val[1] = vshrq_n_s32(tmp.val[1], 3);
+ tmp.val[2] = vshrq_n_s32(tmp.val[2], 3);
+ tmp.val[3] = vshrq_n_s32(tmp.val[3], 3);
- VP8SimpleVFilter16 = SimpleVFilter16NEON;
- VP8SimpleHFilter16 = SimpleHFilter16NEON;
- VP8SimpleVFilter16i = SimpleVFilter16iNEON;
- VP8SimpleHFilter16i = SimpleHFilter16iNEON;
+ STORE_WHT(out, 0, tmp);
+ STORE_WHT(out, 1, tmp);
+ STORE_WHT(out, 2, tmp);
+ STORE_WHT(out, 3, tmp);
+ }
}
-#if defined(__cplusplus) || defined(c_plusplus)
-} // extern "C"
+#undef STORE_WHT
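As a sketch of the memory layout this relies on (an assumption about the caller, not spelled out in this hunk): the 16 WHT outputs are the DC values of the macroblock's 16 luma 4x4 blocks, whose 16-coefficient arrays sit back to back, which is why STORE_WHT advances the destination by 16 between writes.

static void ScatterWHTModel(const int16_t dc[16], int16_t coeffs[16 * 16]) {
  int i;
  for (i = 0; i < 16; ++i) {
    coeffs[16 * i] = dc[i];   // one DC per 4x4 luma block, 16 int16_t apart
  }
}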
+
+//------------------------------------------------------------------------------
+
+#define MUL(a, b) (((a) * (b)) >> 16)
+static void TransformAC3(const int16_t* in, uint8_t* dst) {
+ static const int kC1_full = 20091 + (1 << 16);
+ static const int kC2_full = 35468;
+ const int16x4_t A = vld1_dup_s16(in);
+ const int16x4_t c4 = vdup_n_s16(MUL(in[4], kC2_full));
+ const int16x4_t d4 = vdup_n_s16(MUL(in[4], kC1_full));
+ const int c1 = MUL(in[1], kC2_full);
+ const int d1 = MUL(in[1], kC1_full);
+ const uint64_t cd = (uint64_t)( d1 & 0xffff) << 0 |
+ (uint64_t)( c1 & 0xffff) << 16 |
+ (uint64_t)(-c1 & 0xffff) << 32 |
+ (uint64_t)(-d1 & 0xffff) << 48;
+ const int16x4_t CD = vcreate_s16(cd);
+ const int16x4_t B = vqadd_s16(A, CD);
+ const int16x8_t m0_m1 = vcombine_s16(vqadd_s16(B, d4), vqadd_s16(B, c4));
+ const int16x8_t m2_m3 = vcombine_s16(vqsub_s16(B, c4), vqsub_s16(B, d4));
+ Add4x4(m0_m1, m2_m3, dst);
+}
+#undef MUL
+
+//------------------------------------------------------------------------------
+// 4x4
+
+static void DC4(uint8_t* dst) { // DC
+ const uint8x8_t A = vld1_u8(dst - BPS); // top row
+ const uint16x4_t p0 = vpaddl_u8(A); // cascading summation of the top
+ const uint16x4_t p1 = vpadd_u16(p0, p0);
+ const uint16x8_t L0 = vmovl_u8(vld1_u8(dst + 0 * BPS - 1));
+ const uint16x8_t L1 = vmovl_u8(vld1_u8(dst + 1 * BPS - 1));
+ const uint16x8_t L2 = vmovl_u8(vld1_u8(dst + 2 * BPS - 1));
+ const uint16x8_t L3 = vmovl_u8(vld1_u8(dst + 3 * BPS - 1));
+ const uint16x8_t s0 = vaddq_u16(L0, L1);
+ const uint16x8_t s1 = vaddq_u16(L2, L3);
+ const uint16x8_t s01 = vaddq_u16(s0, s1);
+ const uint16x8_t sum = vaddq_u16(s01, vcombine_u16(p1, p1));
+ const uint8x8_t dc0 = vrshrn_n_u16(sum, 3); // (sum + 4) >> 3
+ const uint8x8_t dc = vdup_lane_u8(dc0, 0);
+ int i;
+ for (i = 0; i < 4; ++i) {
+ vst1_lane_u32((uint32_t*)(dst + i * BPS), vreinterpret_u32_u8(dc), 0);
+ }
+}
+
+// TrueMotion (4x4 + 8x8)
+static WEBP_INLINE void TrueMotion(uint8_t* dst, int size) {
+ const uint8x8_t TL = vld1_dup_u8(dst - BPS - 1); // top-left pixel 'A[-1]'
+ const uint8x8_t T = vld1_u8(dst - BPS); // top row 'A[0..3]'
+ const int16x8_t d = vreinterpretq_s16_u16(vsubl_u8(T, TL)); // A[c] - A[-1]
+ int y;
+ for (y = 0; y < size; y += 4) {
+ // left edge
+ const int16x8_t L0 = ConvertU8ToS16(vld1_dup_u8(dst + 0 * BPS - 1));
+ const int16x8_t L1 = ConvertU8ToS16(vld1_dup_u8(dst + 1 * BPS - 1));
+ const int16x8_t L2 = ConvertU8ToS16(vld1_dup_u8(dst + 2 * BPS - 1));
+ const int16x8_t L3 = ConvertU8ToS16(vld1_dup_u8(dst + 3 * BPS - 1));
+ const int16x8_t r0 = vaddq_s16(L0, d); // L[r] + A[c] - A[-1]
+ const int16x8_t r1 = vaddq_s16(L1, d);
+ const int16x8_t r2 = vaddq_s16(L2, d);
+ const int16x8_t r3 = vaddq_s16(L3, d);
+ // Saturate and store the result.
+ const uint32x2_t r0_u32 = vreinterpret_u32_u8(vqmovun_s16(r0));
+ const uint32x2_t r1_u32 = vreinterpret_u32_u8(vqmovun_s16(r1));
+ const uint32x2_t r2_u32 = vreinterpret_u32_u8(vqmovun_s16(r2));
+ const uint32x2_t r3_u32 = vreinterpret_u32_u8(vqmovun_s16(r3));
+ if (size == 4) {
+ vst1_lane_u32((uint32_t*)(dst + 0 * BPS), r0_u32, 0);
+ vst1_lane_u32((uint32_t*)(dst + 1 * BPS), r1_u32, 0);
+ vst1_lane_u32((uint32_t*)(dst + 2 * BPS), r2_u32, 0);
+ vst1_lane_u32((uint32_t*)(dst + 3 * BPS), r3_u32, 0);
+ } else {
+ vst1_u32((uint32_t*)(dst + 0 * BPS), r0_u32);
+ vst1_u32((uint32_t*)(dst + 1 * BPS), r1_u32);
+ vst1_u32((uint32_t*)(dst + 2 * BPS), r2_u32);
+ vst1_u32((uint32_t*)(dst + 3 * BPS), r3_u32);
+ }
+ dst += 4 * BPS;
+ }
+}
+
+static void TM4(uint8_t* dst) { TrueMotion(dst, 4); }
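The per-pixel rule the vectorized TrueMotion loop above implements, written out as a scalar sketch (helper name is illustrative):

static WEBP_INLINE uint8_t TrueMotionPixelModel(int left, int top,
                                                int top_left) {
  const int v = left + top - top_left;                   // L[y] + A[x] - A[-1]
  return (uint8_t)((v < 0) ? 0 : (v > 255) ? 255 : v);   // vqmovun_s16 clamp
}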
+
+static void VE4(uint8_t* dst) { // vertical
+ // NB: avoid vld1_u64 here as an alignment hint may be added -> SIGBUS.
+ const uint64x1_t A0 = vreinterpret_u64_u8(vld1_u8(dst - BPS - 1)); // top row
+ const uint64x1_t A1 = vshr_n_u64(A0, 8);
+ const uint64x1_t A2 = vshr_n_u64(A0, 16);
+ const uint8x8_t ABCDEFGH = vreinterpret_u8_u64(A0);
+ const uint8x8_t BCDEFGH0 = vreinterpret_u8_u64(A1);
+ const uint8x8_t CDEFGH00 = vreinterpret_u8_u64(A2);
+ const uint8x8_t b = vhadd_u8(ABCDEFGH, CDEFGH00);
+ const uint8x8_t avg = vrhadd_u8(b, BCDEFGH0);
+ int i;
+ for (i = 0; i < 4; ++i) {
+ vst1_lane_u32((uint32_t*)(dst + i * BPS), vreinterpret_u32_u8(avg), 0);
+ }
+}
+
+static void RD4(uint8_t* dst) { // Down-right
+ const uint8x8_t XABCD_u8 = vld1_u8(dst - BPS - 1);
+ const uint64x1_t XABCD = vreinterpret_u64_u8(XABCD_u8);
+ const uint64x1_t ____XABC = vshl_n_u64(XABCD, 32);
+ const uint32_t I = dst[-1 + 0 * BPS];
+ const uint32_t J = dst[-1 + 1 * BPS];
+ const uint32_t K = dst[-1 + 2 * BPS];
+ const uint32_t L = dst[-1 + 3 * BPS];
+ const uint64x1_t LKJI____ = vcreate_u64(L | (K << 8) | (J << 16) | (I << 24));
+ const uint64x1_t LKJIXABC = vorr_u64(LKJI____, ____XABC);
+ const uint8x8_t KJIXABC_ = vreinterpret_u8_u64(vshr_n_u64(LKJIXABC, 8));
+ const uint8x8_t JIXABC__ = vreinterpret_u8_u64(vshr_n_u64(LKJIXABC, 16));
+ const uint8_t D = vget_lane_u8(XABCD_u8, 4);
+ const uint8x8_t JIXABCD_ = vset_lane_u8(D, JIXABC__, 6);
+ const uint8x8_t LKJIXABC_u8 = vreinterpret_u8_u64(LKJIXABC);
+ const uint8x8_t avg1 = vhadd_u8(JIXABCD_, LKJIXABC_u8);
+ const uint8x8_t avg2 = vrhadd_u8(avg1, KJIXABC_);
+ const uint64x1_t avg2_u64 = vreinterpret_u64_u8(avg2);
+ const uint32x2_t r3 = vreinterpret_u32_u8(avg2);
+ const uint32x2_t r2 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 8));
+ const uint32x2_t r1 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 16));
+ const uint32x2_t r0 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 24));
+ vst1_lane_u32((uint32_t*)(dst + 0 * BPS), r0, 0);
+ vst1_lane_u32((uint32_t*)(dst + 1 * BPS), r1, 0);
+ vst1_lane_u32((uint32_t*)(dst + 2 * BPS), r2, 0);
+ vst1_lane_u32((uint32_t*)(dst + 3 * BPS), r3, 0);
+}
+
+static void LD4(uint8_t* dst) { // Down-left
+ // Note using the same shift trick as VE4() is slower here.
+ const uint8x8_t ABCDEFGH = vld1_u8(dst - BPS + 0);
+ const uint8x8_t BCDEFGH0 = vld1_u8(dst - BPS + 1);
+ const uint8x8_t CDEFGH00 = vld1_u8(dst - BPS + 2);
+ const uint8x8_t CDEFGHH0 = vset_lane_u8(dst[-BPS + 7], CDEFGH00, 6);
+ const uint8x8_t avg1 = vhadd_u8(ABCDEFGH, CDEFGHH0);
+ const uint8x8_t avg2 = vrhadd_u8(avg1, BCDEFGH0);
+ const uint64x1_t avg2_u64 = vreinterpret_u64_u8(avg2);
+ const uint32x2_t r0 = vreinterpret_u32_u8(avg2);
+ const uint32x2_t r1 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 8));
+ const uint32x2_t r2 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 16));
+ const uint32x2_t r3 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 24));
+ vst1_lane_u32((uint32_t*)(dst + 0 * BPS), r0, 0);
+ vst1_lane_u32((uint32_t*)(dst + 1 * BPS), r1, 0);
+ vst1_lane_u32((uint32_t*)(dst + 2 * BPS), r2, 0);
+ vst1_lane_u32((uint32_t*)(dst + 3 * BPS), r3, 0);
+}
+
+//------------------------------------------------------------------------------
+// Chroma
+
+static void VE8uv(uint8_t* dst) { // vertical
+ const uint8x8_t top = vld1_u8(dst - BPS);
+ int j;
+ for (j = 0; j < 8; ++j) {
+ vst1_u8(dst + j * BPS, top);
+ }
+}
+
+static void HE8uv(uint8_t* dst) { // horizontal
+ int j;
+ for (j = 0; j < 8; ++j) {
+ const uint8x8_t left = vld1_dup_u8(dst - 1);
+ vst1_u8(dst, left);
+ dst += BPS;
+ }
+}
+
+static WEBP_INLINE void DC8(uint8_t* dst, int do_top, int do_left) {
+ uint16x8_t sum_top;
+ uint16x8_t sum_left;
+ uint8x8_t dc0;
+
+ if (do_top) {
+ const uint8x8_t A = vld1_u8(dst - BPS); // top row
+ const uint16x4_t p0 = vpaddl_u8(A); // cascading summation of the top
+ const uint16x4_t p1 = vpadd_u16(p0, p0);
+ const uint16x4_t p2 = vpadd_u16(p1, p1);
+ sum_top = vcombine_u16(p2, p2);
+ }
+
+ if (do_left) {
+ const uint16x8_t L0 = vmovl_u8(vld1_u8(dst + 0 * BPS - 1));
+ const uint16x8_t L1 = vmovl_u8(vld1_u8(dst + 1 * BPS - 1));
+ const uint16x8_t L2 = vmovl_u8(vld1_u8(dst + 2 * BPS - 1));
+ const uint16x8_t L3 = vmovl_u8(vld1_u8(dst + 3 * BPS - 1));
+ const uint16x8_t L4 = vmovl_u8(vld1_u8(dst + 4 * BPS - 1));
+ const uint16x8_t L5 = vmovl_u8(vld1_u8(dst + 5 * BPS - 1));
+ const uint16x8_t L6 = vmovl_u8(vld1_u8(dst + 6 * BPS - 1));
+ const uint16x8_t L7 = vmovl_u8(vld1_u8(dst + 7 * BPS - 1));
+ const uint16x8_t s0 = vaddq_u16(L0, L1);
+ const uint16x8_t s1 = vaddq_u16(L2, L3);
+ const uint16x8_t s2 = vaddq_u16(L4, L5);
+ const uint16x8_t s3 = vaddq_u16(L6, L7);
+ const uint16x8_t s01 = vaddq_u16(s0, s1);
+ const uint16x8_t s23 = vaddq_u16(s2, s3);
+ sum_left = vaddq_u16(s01, s23);
+ }
+
+ if (do_top && do_left) {
+ const uint16x8_t sum = vaddq_u16(sum_left, sum_top);
+ dc0 = vrshrn_n_u16(sum, 4);
+ } else if (do_top) {
+ dc0 = vrshrn_n_u16(sum_top, 3);
+ } else if (do_left) {
+ dc0 = vrshrn_n_u16(sum_left, 3);
+ } else {
+ dc0 = vdup_n_u8(0x80);
+ }
+
+ {
+ const uint8x8_t dc = vdup_lane_u8(dc0, 0);
+ int i;
+ for (i = 0; i < 8; ++i) {
+ vst1_u32((uint32_t*)(dst + i * BPS), vreinterpret_u32_u8(dc));
+ }
+ }
+}
+
+static void DC8uv(uint8_t* dst) { DC8(dst, 1, 1); }
+static void DC8uvNoTop(uint8_t* dst) { DC8(dst, 0, 1); }
+static void DC8uvNoLeft(uint8_t* dst) { DC8(dst, 1, 0); }
+static void DC8uvNoTopLeft(uint8_t* dst) { DC8(dst, 0, 0); }
+
+static void TM8uv(uint8_t* dst) { TrueMotion(dst, 8); }
+
+//------------------------------------------------------------------------------
+// 16x16
+
+static void VE16(uint8_t* dst) { // vertical
+ const uint8x16_t top = vld1q_u8(dst - BPS);
+ int j;
+ for (j = 0; j < 16; ++j) {
+ vst1q_u8(dst + j * BPS, top);
+ }
+}
+
+static void HE16(uint8_t* dst) { // horizontal
+ int j;
+ for (j = 0; j < 16; ++j) {
+ const uint8x16_t left = vld1q_dup_u8(dst - 1);
+ vst1q_u8(dst, left);
+ dst += BPS;
+ }
+}
+
+static WEBP_INLINE void DC16(uint8_t* dst, int do_top, int do_left) {
+ uint16x8_t sum_top;
+ uint16x8_t sum_left;
+ uint8x8_t dc0;
+
+ if (do_top) {
+ const uint8x16_t A = vld1q_u8(dst - BPS); // top row
+ const uint16x8_t p0 = vpaddlq_u8(A); // cascading summation of the top
+ const uint16x4_t p1 = vadd_u16(vget_low_u16(p0), vget_high_u16(p0));
+ const uint16x4_t p2 = vpadd_u16(p1, p1);
+ const uint16x4_t p3 = vpadd_u16(p2, p2);
+ sum_top = vcombine_u16(p3, p3);
+ }
+
+ if (do_left) {
+ int i;
+ sum_left = vdupq_n_u16(0);
+ for (i = 0; i < 16; i += 8) {
+ const uint16x8_t L0 = vmovl_u8(vld1_u8(dst + (i + 0) * BPS - 1));
+ const uint16x8_t L1 = vmovl_u8(vld1_u8(dst + (i + 1) * BPS - 1));
+ const uint16x8_t L2 = vmovl_u8(vld1_u8(dst + (i + 2) * BPS - 1));
+ const uint16x8_t L3 = vmovl_u8(vld1_u8(dst + (i + 3) * BPS - 1));
+ const uint16x8_t L4 = vmovl_u8(vld1_u8(dst + (i + 4) * BPS - 1));
+ const uint16x8_t L5 = vmovl_u8(vld1_u8(dst + (i + 5) * BPS - 1));
+ const uint16x8_t L6 = vmovl_u8(vld1_u8(dst + (i + 6) * BPS - 1));
+ const uint16x8_t L7 = vmovl_u8(vld1_u8(dst + (i + 7) * BPS - 1));
+ const uint16x8_t s0 = vaddq_u16(L0, L1);
+ const uint16x8_t s1 = vaddq_u16(L2, L3);
+ const uint16x8_t s2 = vaddq_u16(L4, L5);
+ const uint16x8_t s3 = vaddq_u16(L6, L7);
+ const uint16x8_t s01 = vaddq_u16(s0, s1);
+ const uint16x8_t s23 = vaddq_u16(s2, s3);
+ const uint16x8_t sum = vaddq_u16(s01, s23);
+ sum_left = vaddq_u16(sum_left, sum);
+ }
+ }
+
+ if (do_top && do_left) {
+ const uint16x8_t sum = vaddq_u16(sum_left, sum_top);
+ dc0 = vrshrn_n_u16(sum, 5);
+ } else if (do_top) {
+ dc0 = vrshrn_n_u16(sum_top, 4);
+ } else if (do_left) {
+ dc0 = vrshrn_n_u16(sum_left, 4);
+ } else {
+ dc0 = vdup_n_u8(0x80);
+ }
+
+ {
+ const uint8x16_t dc = vdupq_lane_u8(dc0, 0);
+ int i;
+ for (i = 0; i < 16; ++i) {
+ vst1q_u8(dst + i * BPS, dc);
+ }
+ }
+}
+
+static void DC16TopLeft(uint8_t* dst) { DC16(dst, 1, 1); }
+static void DC16NoTop(uint8_t* dst) { DC16(dst, 0, 1); }
+static void DC16NoLeft(uint8_t* dst) { DC16(dst, 1, 0); }
+static void DC16NoTopLeft(uint8_t* dst) { DC16(dst, 0, 0); }
+
+static void TM16(uint8_t* dst) {
+ const uint8x8_t TL = vld1_dup_u8(dst - BPS - 1); // top-left pixel 'A[-1]'
+ const uint8x16_t T = vld1q_u8(dst - BPS); // top row 'A[0..15]'
+ // A[c] - A[-1]
+ const int16x8_t d_lo = vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(T), TL));
+ const int16x8_t d_hi = vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(T), TL));
+ int y;
+ for (y = 0; y < 16; y += 4) {
+ // left edge
+ const int16x8_t L0 = ConvertU8ToS16(vld1_dup_u8(dst + 0 * BPS - 1));
+ const int16x8_t L1 = ConvertU8ToS16(vld1_dup_u8(dst + 1 * BPS - 1));
+ const int16x8_t L2 = ConvertU8ToS16(vld1_dup_u8(dst + 2 * BPS - 1));
+ const int16x8_t L3 = ConvertU8ToS16(vld1_dup_u8(dst + 3 * BPS - 1));
+ const int16x8_t r0_lo = vaddq_s16(L0, d_lo); // L[r] + A[c] - A[-1]
+ const int16x8_t r1_lo = vaddq_s16(L1, d_lo);
+ const int16x8_t r2_lo = vaddq_s16(L2, d_lo);
+ const int16x8_t r3_lo = vaddq_s16(L3, d_lo);
+ const int16x8_t r0_hi = vaddq_s16(L0, d_hi);
+ const int16x8_t r1_hi = vaddq_s16(L1, d_hi);
+ const int16x8_t r2_hi = vaddq_s16(L2, d_hi);
+ const int16x8_t r3_hi = vaddq_s16(L3, d_hi);
+ // Saturate and store the result.
+ const uint8x16_t row0 = vcombine_u8(vqmovun_s16(r0_lo), vqmovun_s16(r0_hi));
+ const uint8x16_t row1 = vcombine_u8(vqmovun_s16(r1_lo), vqmovun_s16(r1_hi));
+ const uint8x16_t row2 = vcombine_u8(vqmovun_s16(r2_lo), vqmovun_s16(r2_hi));
+ const uint8x16_t row3 = vcombine_u8(vqmovun_s16(r3_lo), vqmovun_s16(r3_hi));
+ vst1q_u8(dst + 0 * BPS, row0);
+ vst1q_u8(dst + 1 * BPS, row1);
+ vst1q_u8(dst + 2 * BPS, row2);
+ vst1q_u8(dst + 3 * BPS, row3);
+ dst += 4 * BPS;
+ }
+}
+
+//------------------------------------------------------------------------------
+// Entry point
+
+extern void VP8DspInitNEON(void);
+
+WEBP_TSAN_IGNORE_FUNCTION void VP8DspInitNEON(void) {
+ VP8Transform = TransformTwo;
+ VP8TransformAC3 = TransformAC3;
+ VP8TransformDC = TransformDC;
+ VP8TransformWHT = TransformWHT;
+
+ VP8VFilter16 = VFilter16;
+ VP8VFilter16i = VFilter16i;
+ VP8HFilter16 = HFilter16;
+#if !defined(WORK_AROUND_GCC)
+ VP8HFilter16i = HFilter16i;
#endif
+ VP8VFilter8 = VFilter8;
+ VP8VFilter8i = VFilter8i;
+#if !defined(WORK_AROUND_GCC)
+ VP8HFilter8 = HFilter8;
+ VP8HFilter8i = HFilter8i;
+#endif
+ VP8SimpleVFilter16 = SimpleVFilter16;
+ VP8SimpleHFilter16 = SimpleHFilter16;
+ VP8SimpleVFilter16i = SimpleVFilter16i;
+ VP8SimpleHFilter16i = SimpleHFilter16i;
+
+ VP8PredLuma4[0] = DC4;
+ VP8PredLuma4[1] = TM4;
+ VP8PredLuma4[2] = VE4;
+ VP8PredLuma4[4] = RD4;
+ VP8PredLuma4[6] = LD4;
+
+ VP8PredLuma16[0] = DC16TopLeft;
+ VP8PredLuma16[1] = TM16;
+ VP8PredLuma16[2] = VE16;
+ VP8PredLuma16[3] = HE16;
+ VP8PredLuma16[4] = DC16NoTop;
+ VP8PredLuma16[5] = DC16NoLeft;
+ VP8PredLuma16[6] = DC16NoTopLeft;
+
+ VP8PredChroma8[0] = DC8uv;
+ VP8PredChroma8[1] = TM8uv;
+ VP8PredChroma8[2] = VE8uv;
+ VP8PredChroma8[3] = HE8uv;
+ VP8PredChroma8[4] = DC8uvNoTop;
+ VP8PredChroma8[5] = DC8uvNoLeft;
+ VP8PredChroma8[6] = DC8uvNoTopLeft;
+}
+
+#else // !WEBP_USE_NEON
+
+WEBP_DSP_INIT_STUB(VP8DspInitNEON)
-#endif // WEBP_USE_NEON
+#endif // WEBP_USE_NEON
diff --git a/drivers/webp/dsp/dec_sse2.c b/drivers/webp/dsp/dec_sse2.c
index 472b68ecb8..d4838b9210 100644
--- a/drivers/webp/dsp/dec_sse2.c
+++ b/drivers/webp/dsp/dec_sse2.c
@@ -1,8 +1,10 @@
// Copyright 2011 Google Inc. All Rights Reserved.
//
-// This code is licensed under the same terms as WebM:
-// Software License Agreement: http://www.webmproject.org/license/software/
-// Additional IP Rights Grant: http://www.webmproject.org/license/additional/
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// SSE2 version of some decoding functions (idct, loop filtering).
@@ -14,17 +16,17 @@
#if defined(WEBP_USE_SSE2)
+// The 3-coeff sparse transform in SSE2 does not appear to be faster than the
+// plain-C version, so it is disabled by default. Uncomment to enable it:
+// #define USE_TRANSFORM_AC3
+
#include <emmintrin.h>
#include "../dec/vp8i.h"
-#if defined(__cplusplus) || defined(c_plusplus)
-extern "C" {
-#endif
-
//------------------------------------------------------------------------------
// Transforms (Paragraph 14.4)
-static void TransformSSE2(const int16_t* in, uint8_t* dst, int do_two) {
+static void Transform(const int16_t* in, uint8_t* dst, int do_two) {
// This implementation makes use of 16-bit fixed point versions of two
// multiply constants:
// K1 = sqrt(2) * cos (pi/8) ~= 85627 / 2^16
@@ -50,19 +52,19 @@ static void TransformSSE2(const int16_t* in, uint8_t* dst, int do_two) {
// vectors will just contain random value we'll never use nor store.
__m128i in0, in1, in2, in3;
{
- in0 = _mm_loadl_epi64((__m128i*)&in[0]);
- in1 = _mm_loadl_epi64((__m128i*)&in[4]);
- in2 = _mm_loadl_epi64((__m128i*)&in[8]);
- in3 = _mm_loadl_epi64((__m128i*)&in[12]);
+ in0 = _mm_loadl_epi64((const __m128i*)&in[0]);
+ in1 = _mm_loadl_epi64((const __m128i*)&in[4]);
+ in2 = _mm_loadl_epi64((const __m128i*)&in[8]);
+ in3 = _mm_loadl_epi64((const __m128i*)&in[12]);
// a00 a10 a20 a30 x x x x
// a01 a11 a21 a31 x x x x
// a02 a12 a22 a32 x x x x
// a03 a13 a23 a33 x x x x
if (do_two) {
- const __m128i inB0 = _mm_loadl_epi64((__m128i*)&in[16]);
- const __m128i inB1 = _mm_loadl_epi64((__m128i*)&in[20]);
- const __m128i inB2 = _mm_loadl_epi64((__m128i*)&in[24]);
- const __m128i inB3 = _mm_loadl_epi64((__m128i*)&in[28]);
+ const __m128i inB0 = _mm_loadl_epi64((const __m128i*)&in[16]);
+ const __m128i inB1 = _mm_loadl_epi64((const __m128i*)&in[20]);
+ const __m128i inB2 = _mm_loadl_epi64((const __m128i*)&in[24]);
+ const __m128i inB3 = _mm_loadl_epi64((const __m128i*)&in[28]);
in0 = _mm_unpacklo_epi64(in0, inB0);
in1 = _mm_unpacklo_epi64(in1, inB1);
in2 = _mm_unpacklo_epi64(in2, inB2);
@@ -194,21 +196,21 @@ static void TransformSSE2(const int16_t* in, uint8_t* dst, int do_two) {
// Add inverse transform to 'dst' and store.
{
- const __m128i zero = _mm_set1_epi16(0);
+ const __m128i zero = _mm_setzero_si128();
// Load the reference(s).
__m128i dst0, dst1, dst2, dst3;
if (do_two) {
// Load eight bytes/pixels per line.
- dst0 = _mm_loadl_epi64((__m128i*)&dst[0 * BPS]);
- dst1 = _mm_loadl_epi64((__m128i*)&dst[1 * BPS]);
- dst2 = _mm_loadl_epi64((__m128i*)&dst[2 * BPS]);
- dst3 = _mm_loadl_epi64((__m128i*)&dst[3 * BPS]);
+ dst0 = _mm_loadl_epi64((__m128i*)(dst + 0 * BPS));
+ dst1 = _mm_loadl_epi64((__m128i*)(dst + 1 * BPS));
+ dst2 = _mm_loadl_epi64((__m128i*)(dst + 2 * BPS));
+ dst3 = _mm_loadl_epi64((__m128i*)(dst + 3 * BPS));
} else {
// Load four bytes/pixels per line.
- dst0 = _mm_cvtsi32_si128(*(int*)&dst[0 * BPS]);
- dst1 = _mm_cvtsi32_si128(*(int*)&dst[1 * BPS]);
- dst2 = _mm_cvtsi32_si128(*(int*)&dst[2 * BPS]);
- dst3 = _mm_cvtsi32_si128(*(int*)&dst[3 * BPS]);
+ dst0 = _mm_cvtsi32_si128(*(int*)(dst + 0 * BPS));
+ dst1 = _mm_cvtsi32_si128(*(int*)(dst + 1 * BPS));
+ dst2 = _mm_cvtsi32_si128(*(int*)(dst + 2 * BPS));
+ dst3 = _mm_cvtsi32_si128(*(int*)(dst + 3 * BPS));
}
// Convert to 16b.
dst0 = _mm_unpacklo_epi8(dst0, zero);
@@ -228,20 +230,66 @@ static void TransformSSE2(const int16_t* in, uint8_t* dst, int do_two) {
// Store the results.
if (do_two) {
// Store eight bytes/pixels per line.
- _mm_storel_epi64((__m128i*)&dst[0 * BPS], dst0);
- _mm_storel_epi64((__m128i*)&dst[1 * BPS], dst1);
- _mm_storel_epi64((__m128i*)&dst[2 * BPS], dst2);
- _mm_storel_epi64((__m128i*)&dst[3 * BPS], dst3);
+ _mm_storel_epi64((__m128i*)(dst + 0 * BPS), dst0);
+ _mm_storel_epi64((__m128i*)(dst + 1 * BPS), dst1);
+ _mm_storel_epi64((__m128i*)(dst + 2 * BPS), dst2);
+ _mm_storel_epi64((__m128i*)(dst + 3 * BPS), dst3);
} else {
// Store four bytes/pixels per line.
- *((int32_t *)&dst[0 * BPS]) = _mm_cvtsi128_si32(dst0);
- *((int32_t *)&dst[1 * BPS]) = _mm_cvtsi128_si32(dst1);
- *((int32_t *)&dst[2 * BPS]) = _mm_cvtsi128_si32(dst2);
- *((int32_t *)&dst[3 * BPS]) = _mm_cvtsi128_si32(dst3);
+ *(int*)(dst + 0 * BPS) = _mm_cvtsi128_si32(dst0);
+ *(int*)(dst + 1 * BPS) = _mm_cvtsi128_si32(dst1);
+ *(int*)(dst + 2 * BPS) = _mm_cvtsi128_si32(dst2);
+ *(int*)(dst + 3 * BPS) = _mm_cvtsi128_si32(dst3);
}
}
}
+#if defined(USE_TRANSFORM_AC3)
+#define MUL(a, b) (((a) * (b)) >> 16)
+static void TransformAC3(const int16_t* in, uint8_t* dst) {
+ static const int kC1 = 20091 + (1 << 16);
+ static const int kC2 = 35468;
+ const __m128i A = _mm_set1_epi16(in[0] + 4);
+ const __m128i c4 = _mm_set1_epi16(MUL(in[4], kC2));
+ const __m128i d4 = _mm_set1_epi16(MUL(in[4], kC1));
+ const int c1 = MUL(in[1], kC2);
+ const int d1 = MUL(in[1], kC1);
+ const __m128i CD = _mm_set_epi16(0, 0, 0, 0, -d1, -c1, c1, d1);
+ const __m128i B = _mm_adds_epi16(A, CD);
+ const __m128i m0 = _mm_adds_epi16(B, d4);
+ const __m128i m1 = _mm_adds_epi16(B, c4);
+ const __m128i m2 = _mm_subs_epi16(B, c4);
+ const __m128i m3 = _mm_subs_epi16(B, d4);
+ const __m128i zero = _mm_setzero_si128();
+ // Load the source pixels.
+ __m128i dst0 = _mm_cvtsi32_si128(*(int*)(dst + 0 * BPS));
+ __m128i dst1 = _mm_cvtsi32_si128(*(int*)(dst + 1 * BPS));
+ __m128i dst2 = _mm_cvtsi32_si128(*(int*)(dst + 2 * BPS));
+ __m128i dst3 = _mm_cvtsi32_si128(*(int*)(dst + 3 * BPS));
+ // Convert to 16b.
+ dst0 = _mm_unpacklo_epi8(dst0, zero);
+ dst1 = _mm_unpacklo_epi8(dst1, zero);
+ dst2 = _mm_unpacklo_epi8(dst2, zero);
+ dst3 = _mm_unpacklo_epi8(dst3, zero);
+ // Add the inverse transform.
+ dst0 = _mm_adds_epi16(dst0, _mm_srai_epi16(m0, 3));
+ dst1 = _mm_adds_epi16(dst1, _mm_srai_epi16(m1, 3));
+ dst2 = _mm_adds_epi16(dst2, _mm_srai_epi16(m2, 3));
+ dst3 = _mm_adds_epi16(dst3, _mm_srai_epi16(m3, 3));
+ // Unsigned saturate to 8b.
+ dst0 = _mm_packus_epi16(dst0, dst0);
+ dst1 = _mm_packus_epi16(dst1, dst1);
+ dst2 = _mm_packus_epi16(dst2, dst2);
+ dst3 = _mm_packus_epi16(dst3, dst3);
+ // Store the results.
+ *(int*)(dst + 0 * BPS) = _mm_cvtsi128_si32(dst0);
+ *(int*)(dst + 1 * BPS) = _mm_cvtsi128_si32(dst1);
+ *(int*)(dst + 2 * BPS) = _mm_cvtsi128_si32(dst2);
+ *(int*)(dst + 3 * BPS) = _mm_cvtsi128_si32(dst3);
+}
+#undef MUL
+#endif // USE_TRANSFORM_AC3
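A plain scalar sketch of the sparse AC3 transform above (illustrative only; it assumes, as the AC3 case implies, that only in[0], in[1] and in[4] are non-zero, and it writes the clamp explicitly instead of relying on saturating 16-bit adds):

static void TransformAC3Model(const int16_t* in, uint8_t* dst) {
  const int kC1 = 20091 + (1 << 16), kC2 = 35468;
  const int a = in[0] + 4;                               // DC plus rounder
  const int c4 = (in[4] * kC2) >> 16, d4 = (in[4] * kC1) >> 16;
  const int c1 = (in[1] * kC2) >> 16, d1 = (in[1] * kC1) >> 16;
  const int col[4] = { d1, c1, -c1, -d1 };               // horizontal terms
  const int row[4] = { d4, c4, -c4, -d4 };               // vertical terms
  int x, y;
  for (y = 0; y < 4; ++y) {
    for (x = 0; x < 4; ++x) {
      const int v = dst[x + y * BPS] + ((a + row[y] + col[x]) >> 3);
      dst[x + y * BPS] = (v < 0) ? 0 : (v > 255) ? 255 : (uint8_t)v;
    }
  }
}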
+
//------------------------------------------------------------------------------
// Loop Filter (Paragraph 15)
@@ -250,20 +298,14 @@ static void TransformSSE2(const int16_t* in, uint8_t* dst, int do_two) {
_mm_subs_epu8((q), (p)), \
_mm_subs_epu8((p), (q)))
-// Shift each byte of "a" by N bits while preserving by the sign bit.
-//
-// It first shifts the lower bytes of the words and then the upper bytes and
-// then merges the results together.
-#define SIGNED_SHIFT_N(a, N) { \
- __m128i t = a; \
- t = _mm_slli_epi16(t, 8); \
- t = _mm_srai_epi16(t, N); \
- t = _mm_srli_epi16(t, 8); \
- \
- a = _mm_srai_epi16(a, N + 8); \
- a = _mm_slli_epi16(a, 8); \
- \
- a = _mm_or_si128(t, a); \
+// Shift each byte of "x" by 3 bits while preserving the sign bit.
+static WEBP_INLINE void SignedShift8b(__m128i* const x) {
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i lo_0 = _mm_unpacklo_epi8(zero, *x);
+ const __m128i hi_0 = _mm_unpackhi_epi8(zero, *x);
+ const __m128i lo_1 = _mm_srai_epi16(lo_0, 3 + 8);
+ const __m128i hi_1 = _mm_srai_epi16(hi_0, 3 + 8);
+ *x = _mm_packs_epi16(lo_1, hi_1);
}
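What SignedShift8b() computes per byte, as a scalar sketch (the widening and narrowing mirror the unpack, srai and packs steps; an arithmetic right shift is assumed, as _mm_srai_epi16 guarantees):

static WEBP_INLINE int8_t SignedShift8bModel(int8_t v) {
  const int16_t widened = (int16_t)((uint8_t)v << 8);   // byte in the high half
  return (int8_t)(widened >> (3 + 8));                  // == v >> 3, sign kept
}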
#define FLIP_SIGN_BIT2(a, b) { \
@@ -276,103 +318,124 @@ static void TransformSSE2(const int16_t* in, uint8_t* dst, int do_two) {
FLIP_SIGN_BIT2(c, d); \
}
-#define GET_NOTHEV(p1, p0, q0, q1, hev_thresh, not_hev) { \
- const __m128i zero = _mm_setzero_si128(); \
- const __m128i t1 = MM_ABS(p1, p0); \
- const __m128i t2 = MM_ABS(q1, q0); \
- \
- const __m128i h = _mm_set1_epi8(hev_thresh); \
- const __m128i t3 = _mm_subs_epu8(t1, h); /* abs(p1 - p0) - hev_tresh */ \
- const __m128i t4 = _mm_subs_epu8(t2, h); /* abs(q1 - q0) - hev_tresh */ \
- \
- not_hev = _mm_or_si128(t3, t4); \
- not_hev = _mm_cmpeq_epi8(not_hev, zero); /* not_hev <= t1 && not_hev <= t2 */\
-}
-
-#define GET_BASE_DELTA(p1, p0, q0, q1, o) { \
- const __m128i qp0 = _mm_subs_epi8(q0, p0); /* q0 - p0 */ \
- o = _mm_subs_epi8(p1, q1); /* p1 - q1 */ \
- o = _mm_adds_epi8(o, qp0); /* p1 - q1 + 1 * (q0 - p0) */ \
- o = _mm_adds_epi8(o, qp0); /* p1 - q1 + 2 * (q0 - p0) */ \
- o = _mm_adds_epi8(o, qp0); /* p1 - q1 + 3 * (q0 - p0) */ \
-}
-
-#define DO_SIMPLE_FILTER(p0, q0, fl) { \
- const __m128i three = _mm_set1_epi8(3); \
- const __m128i four = _mm_set1_epi8(4); \
- __m128i v3 = _mm_adds_epi8(fl, three); \
- __m128i v4 = _mm_adds_epi8(fl, four); \
- \
- /* Do +4 side */ \
- SIGNED_SHIFT_N(v4, 3); /* v4 >> 3 */ \
- q0 = _mm_subs_epi8(q0, v4); /* q0 -= v4 */ \
- \
- /* Now do +3 side */ \
- SIGNED_SHIFT_N(v3, 3); /* v3 >> 3 */ \
- p0 = _mm_adds_epi8(p0, v3); /* p0 += v3 */ \
+// input/output is uint8_t
+static WEBP_INLINE void GetNotHEV(const __m128i* const p1,
+ const __m128i* const p0,
+ const __m128i* const q0,
+ const __m128i* const q1,
+ int hev_thresh, __m128i* const not_hev) {
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i t_1 = MM_ABS(*p1, *p0);
+ const __m128i t_2 = MM_ABS(*q1, *q0);
+
+ const __m128i h = _mm_set1_epi8(hev_thresh);
+ const __m128i t_max = _mm_max_epu8(t_1, t_2);
+
+ const __m128i t_max_h = _mm_subs_epu8(t_max, h);
+  *not_hev = _mm_cmpeq_epi8(t_max_h, zero);  // max(t_1, t_2) <= hev_thresh
}
-// Updates values of 2 pixels at MB edge during complex filtering.
-// Update operations:
-// q = q - a and p = p + a; where a = [(a_hi >> 7), (a_lo >> 7)]
-#define UPDATE_2PIXELS(pi, qi, a_lo, a_hi) { \
- const __m128i a_lo7 = _mm_srai_epi16(a_lo, 7); \
- const __m128i a_hi7 = _mm_srai_epi16(a_hi, 7); \
- const __m128i a = _mm_packs_epi16(a_lo7, a_hi7); \
- pi = _mm_adds_epi8(pi, a); \
- qi = _mm_subs_epi8(qi, a); \
+// input pixels are int8_t
+static WEBP_INLINE void GetBaseDelta(const __m128i* const p1,
+ const __m128i* const p0,
+ const __m128i* const q0,
+ const __m128i* const q1,
+ __m128i* const delta) {
+ // beware of addition order, for saturation!
+ const __m128i p1_q1 = _mm_subs_epi8(*p1, *q1); // p1 - q1
+ const __m128i q0_p0 = _mm_subs_epi8(*q0, *p0); // q0 - p0
+ const __m128i s1 = _mm_adds_epi8(p1_q1, q0_p0); // p1 - q1 + 1 * (q0 - p0)
+ const __m128i s2 = _mm_adds_epi8(q0_p0, s1); // p1 - q1 + 2 * (q0 - p0)
+ const __m128i s3 = _mm_adds_epi8(q0_p0, s2); // p1 - q1 + 3 * (q0 - p0)
+ *delta = s3;
}
-static void NeedsFilter(const __m128i* p1, const __m128i* p0, const __m128i* q0,
- const __m128i* q1, int thresh, __m128i *mask) {
- __m128i t1 = MM_ABS(*p1, *q1); // abs(p1 - q1)
- *mask = _mm_set1_epi8(0xFE);
- t1 = _mm_and_si128(t1, *mask); // set lsb of each byte to zero
- t1 = _mm_srli_epi16(t1, 1); // abs(p1 - q1) / 2
+// input and output are int8_t
+static WEBP_INLINE void DoSimpleFilter(__m128i* const p0, __m128i* const q0,
+ const __m128i* const fl) {
+ const __m128i k3 = _mm_set1_epi8(3);
+ const __m128i k4 = _mm_set1_epi8(4);
+ __m128i v3 = _mm_adds_epi8(*fl, k3);
+ __m128i v4 = _mm_adds_epi8(*fl, k4);
+
+ SignedShift8b(&v4); // v4 >> 3
+ SignedShift8b(&v3); // v3 >> 3
+ *q0 = _mm_subs_epi8(*q0, v4); // q0 -= v4
+ *p0 = _mm_adds_epi8(*p0, v3); // p0 += v3
+}
- *mask = MM_ABS(*p0, *q0); // abs(p0 - q0)
- *mask = _mm_adds_epu8(*mask, *mask); // abs(p0 - q0) * 2
- *mask = _mm_adds_epu8(*mask, t1); // abs(p0 - q0) * 2 + abs(p1 - q1) / 2
+// Updates values of 2 pixels at MB edge during complex filtering.
+// Update operations:
+// q = q - delta and p = p + delta; where delta = [(a_hi >> 7), (a_lo >> 7)]
+// Pixels 'pi' and 'qi' are int8_t on input, uint8_t on output (sign flip).
+static WEBP_INLINE void Update2Pixels(__m128i* const pi, __m128i* const qi,
+ const __m128i* const a0_lo,
+ const __m128i* const a0_hi) {
+ const __m128i a1_lo = _mm_srai_epi16(*a0_lo, 7);
+ const __m128i a1_hi = _mm_srai_epi16(*a0_hi, 7);
+ const __m128i delta = _mm_packs_epi16(a1_lo, a1_hi);
+ const __m128i sign_bit = _mm_set1_epi8(0x80);
+ *pi = _mm_adds_epi8(*pi, delta);
+ *qi = _mm_subs_epi8(*qi, delta);
+ FLIP_SIGN_BIT2(*pi, *qi);
+}
- t1 = _mm_set1_epi8(thresh);
- *mask = _mm_subs_epu8(*mask, t1); // mask <= thresh
- *mask = _mm_cmpeq_epi8(*mask, _mm_setzero_si128());
+// input pixels are uint8_t
+static WEBP_INLINE void NeedsFilter(const __m128i* const p1,
+ const __m128i* const p0,
+ const __m128i* const q0,
+ const __m128i* const q1,
+ int thresh, __m128i* const mask) {
+ const __m128i m_thresh = _mm_set1_epi8(thresh);
+ const __m128i t1 = MM_ABS(*p1, *q1); // abs(p1 - q1)
+ const __m128i kFE = _mm_set1_epi8(0xFE);
+ const __m128i t2 = _mm_and_si128(t1, kFE); // set lsb of each byte to zero
+ const __m128i t3 = _mm_srli_epi16(t2, 1); // abs(p1 - q1) / 2
+
+ const __m128i t4 = MM_ABS(*p0, *q0); // abs(p0 - q0)
+ const __m128i t5 = _mm_adds_epu8(t4, t4); // abs(p0 - q0) * 2
+ const __m128i t6 = _mm_adds_epu8(t5, t3); // abs(p0-q0)*2 + abs(p1-q1)/2
+
+ const __m128i t7 = _mm_subs_epu8(t6, m_thresh); // mask <= m_thresh
+ *mask = _mm_cmpeq_epi8(t7, _mm_setzero_si128());
}
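The predicate this encodes, stated as scalar arithmetic (a sketch; the vector path evaluates it in saturating uint8 steps, as the comments above trace):

static WEBP_INLINE int NeedsFilterModel(int p1, int p0, int q0, int q1,
                                        int thresh) {
  const int a_p0_q0 = (p0 > q0) ? p0 - q0 : q0 - p0;   // abs(p0 - q0)
  const int a_p1_q1 = (p1 > q1) ? p1 - q1 : q1 - p1;   // abs(p1 - q1)
  return (2 * a_p0_q0 + (a_p1_q1 >> 1)) <= thresh;     // 0xff lanes when true
}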
//------------------------------------------------------------------------------
// Edge filtering functions
// Applies filter on 2 pixels (p0 and q0)
-static WEBP_INLINE void DoFilter2(const __m128i* p1, __m128i* p0, __m128i* q0,
- const __m128i* q1, int thresh) {
+static WEBP_INLINE void DoFilter2(__m128i* const p1, __m128i* const p0,
+ __m128i* const q0, __m128i* const q1,
+ int thresh) {
__m128i a, mask;
const __m128i sign_bit = _mm_set1_epi8(0x80);
+ // convert p1/q1 to int8_t (for GetBaseDelta)
const __m128i p1s = _mm_xor_si128(*p1, sign_bit);
const __m128i q1s = _mm_xor_si128(*q1, sign_bit);
NeedsFilter(p1, p0, q0, q1, thresh, &mask);
- // convert to signed values
FLIP_SIGN_BIT2(*p0, *q0);
-
- GET_BASE_DELTA(p1s, *p0, *q0, q1s, a);
+ GetBaseDelta(&p1s, p0, q0, &q1s, &a);
a = _mm_and_si128(a, mask); // mask filter values we don't care about
- DO_SIMPLE_FILTER(*p0, *q0, a);
-
- // unoffset
+ DoSimpleFilter(p0, q0, &a);
FLIP_SIGN_BIT2(*p0, *q0);
}
// Applies filter on 4 pixels (p1, p0, q0 and q1)
-static WEBP_INLINE void DoFilter4(__m128i* p1, __m128i *p0,
- __m128i* q0, __m128i* q1,
- const __m128i* mask, int hev_thresh) {
+static WEBP_INLINE void DoFilter4(__m128i* const p1, __m128i* const p0,
+ __m128i* const q0, __m128i* const q1,
+ const __m128i* const mask, int hev_thresh) {
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i sign_bit = _mm_set1_epi8(0x80);
+ const __m128i k64 = _mm_set1_epi8(64);
+ const __m128i k3 = _mm_set1_epi8(3);
+ const __m128i k4 = _mm_set1_epi8(4);
__m128i not_hev;
__m128i t1, t2, t3;
- const __m128i sign_bit = _mm_set1_epi8(0x80);
// compute hev mask
- GET_NOTHEV(*p1, *p0, *q0, *q1, hev_thresh, not_hev);
+ GetNotHEV(p1, p0, q0, q1, hev_thresh, &not_hev);
// convert to signed values
FLIP_SIGN_BIT4(*p1, *p0, *q0, *q1);
@@ -385,135 +448,115 @@ static WEBP_INLINE void DoFilter4(__m128i* p1, __m128i *p0,
t1 = _mm_adds_epi8(t1, t2); // hev(p1 - q1) + 3 * (q0 - p0)
t1 = _mm_and_si128(t1, *mask); // mask filter values we don't care about
- // Do +4 side
- t2 = _mm_set1_epi8(4);
- t2 = _mm_adds_epi8(t1, t2); // 3 * (q0 - p0) + (p1 - q1) + 4
- SIGNED_SHIFT_N(t2, 3); // (3 * (q0 - p0) + hev(p1 - q1) + 4) >> 3
- t3 = t2; // save t2
- *q0 = _mm_subs_epi8(*q0, t2); // q0 -= t2
-
- // Now do +3 side
- t2 = _mm_set1_epi8(3);
- t2 = _mm_adds_epi8(t1, t2); // +3 instead of +4
- SIGNED_SHIFT_N(t2, 3); // (3 * (q0 - p0) + hev(p1 - q1) + 3) >> 3
+ t2 = _mm_adds_epi8(t1, k3); // 3 * (q0 - p0) + hev(p1 - q1) + 3
+ t3 = _mm_adds_epi8(t1, k4); // 3 * (q0 - p0) + hev(p1 - q1) + 4
+ SignedShift8b(&t2); // (3 * (q0 - p0) + hev(p1 - q1) + 3) >> 3
+ SignedShift8b(&t3); // (3 * (q0 - p0) + hev(p1 - q1) + 4) >> 3
*p0 = _mm_adds_epi8(*p0, t2); // p0 += t2
+ *q0 = _mm_subs_epi8(*q0, t3); // q0 -= t3
+ FLIP_SIGN_BIT2(*p0, *q0);
- t2 = _mm_set1_epi8(1);
- t3 = _mm_adds_epi8(t3, t2);
- SIGNED_SHIFT_N(t3, 1); // (3 * (q0 - p0) + hev(p1 - q1) + 4) >> 4
+ // this is equivalent to signed (a + 1) >> 1 calculation
+ t2 = _mm_add_epi8(t3, sign_bit);
+ t3 = _mm_avg_epu8(t2, zero);
+ t3 = _mm_sub_epi8(t3, k64);
t3 = _mm_and_si128(not_hev, t3); // if !hev
*q1 = _mm_subs_epi8(*q1, t3); // q1 -= t3
*p1 = _mm_adds_epi8(*p1, t3); // p1 += t3
-
- // unoffset
- FLIP_SIGN_BIT4(*p1, *p0, *q0, *q1);
+ FLIP_SIGN_BIT2(*p1, *q1);
}
// Applies filter on 6 pixels (p2, p1, p0, q0, q1 and q2)
-static WEBP_INLINE void DoFilter6(__m128i *p2, __m128i* p1, __m128i *p0,
- __m128i* q0, __m128i* q1, __m128i *q2,
- const __m128i* mask, int hev_thresh) {
- __m128i a, not_hev;
+static WEBP_INLINE void DoFilter6(__m128i* const p2, __m128i* const p1,
+ __m128i* const p0, __m128i* const q0,
+ __m128i* const q1, __m128i* const q2,
+ const __m128i* const mask, int hev_thresh) {
+ const __m128i zero = _mm_setzero_si128();
const __m128i sign_bit = _mm_set1_epi8(0x80);
+ __m128i a, not_hev;
// compute hev mask
- GET_NOTHEV(*p1, *p0, *q0, *q1, hev_thresh, not_hev);
+ GetNotHEV(p1, p0, q0, q1, hev_thresh, &not_hev);
- // convert to signed values
FLIP_SIGN_BIT4(*p1, *p0, *q0, *q1);
FLIP_SIGN_BIT2(*p2, *q2);
-
- GET_BASE_DELTA(*p1, *p0, *q0, *q1, a);
+ GetBaseDelta(p1, p0, q0, q1, &a);
{ // do simple filter on pixels with hev
const __m128i m = _mm_andnot_si128(not_hev, *mask);
const __m128i f = _mm_and_si128(a, m);
- DO_SIMPLE_FILTER(*p0, *q0, f);
+ DoSimpleFilter(p0, q0, &f);
}
+
{ // do strong filter on pixels with not hev
- const __m128i zero = _mm_setzero_si128();
- const __m128i nine = _mm_set1_epi16(0x0900);
- const __m128i sixty_three = _mm_set1_epi16(63);
+ const __m128i k9 = _mm_set1_epi16(0x0900);
+ const __m128i k63 = _mm_set1_epi16(63);
const __m128i m = _mm_and_si128(not_hev, *mask);
const __m128i f = _mm_and_si128(a, m);
+
const __m128i f_lo = _mm_unpacklo_epi8(zero, f);
const __m128i f_hi = _mm_unpackhi_epi8(zero, f);
- const __m128i f9_lo = _mm_mulhi_epi16(f_lo, nine); // Filter (lo) * 9
- const __m128i f9_hi = _mm_mulhi_epi16(f_hi, nine); // Filter (hi) * 9
- const __m128i f18_lo = _mm_add_epi16(f9_lo, f9_lo); // Filter (lo) * 18
- const __m128i f18_hi = _mm_add_epi16(f9_hi, f9_hi); // Filter (hi) * 18
+ const __m128i f9_lo = _mm_mulhi_epi16(f_lo, k9); // Filter (lo) * 9
+ const __m128i f9_hi = _mm_mulhi_epi16(f_hi, k9); // Filter (hi) * 9
- const __m128i a2_lo = _mm_add_epi16(f9_lo, sixty_three); // Filter * 9 + 63
- const __m128i a2_hi = _mm_add_epi16(f9_hi, sixty_three); // Filter * 9 + 63
+ const __m128i a2_lo = _mm_add_epi16(f9_lo, k63); // Filter * 9 + 63
+ const __m128i a2_hi = _mm_add_epi16(f9_hi, k63); // Filter * 9 + 63
- const __m128i a1_lo = _mm_add_epi16(f18_lo, sixty_three); // F... * 18 + 63
- const __m128i a1_hi = _mm_add_epi16(f18_hi, sixty_three); // F... * 18 + 63
+ const __m128i a1_lo = _mm_add_epi16(a2_lo, f9_lo); // Filter * 18 + 63
+ const __m128i a1_hi = _mm_add_epi16(a2_hi, f9_hi); // Filter * 18 + 63
- const __m128i a0_lo = _mm_add_epi16(f18_lo, a2_lo); // Filter * 27 + 63
- const __m128i a0_hi = _mm_add_epi16(f18_hi, a2_hi); // Filter * 27 + 63
+ const __m128i a0_lo = _mm_add_epi16(a1_lo, f9_lo); // Filter * 27 + 63
+ const __m128i a0_hi = _mm_add_epi16(a1_hi, f9_hi); // Filter * 27 + 63
- UPDATE_2PIXELS(*p2, *q2, a2_lo, a2_hi);
- UPDATE_2PIXELS(*p1, *q1, a1_lo, a1_hi);
- UPDATE_2PIXELS(*p0, *q0, a0_lo, a0_hi);
+ Update2Pixels(p2, q2, &a2_lo, &a2_hi);
+ Update2Pixels(p1, q1, &a1_lo, &a1_hi);
+ Update2Pixels(p0, q0, &a0_lo, &a0_hi);
}
+}
- // unoffset
- FLIP_SIGN_BIT4(*p1, *p0, *q0, *q1);
- FLIP_SIGN_BIT2(*p2, *q2);
+// memcpy() is the safe way of moving potentially unaligned 32b memory.
+static WEBP_INLINE uint32_t MemToUint32(const uint8_t* const ptr) {
+ uint32_t A;
+ memcpy(&A, (const int*)ptr, sizeof(A));
+ return A;
}
// reads 8 rows across a vertical edge.
-//
-// TODO(somnath): Investigate _mm_shuffle* also see if it can be broken into
-// two Load4x4() to avoid code duplication.
-static WEBP_INLINE void Load8x4(const uint8_t* b, int stride,
- __m128i* p, __m128i* q) {
- __m128i t1, t2;
-
- // Load 0th, 1st, 4th and 5th rows
- __m128i r0 = _mm_cvtsi32_si128(*((int*)&b[0 * stride])); // 03 02 01 00
- __m128i r1 = _mm_cvtsi32_si128(*((int*)&b[1 * stride])); // 13 12 11 10
- __m128i r4 = _mm_cvtsi32_si128(*((int*)&b[4 * stride])); // 43 42 41 40
- __m128i r5 = _mm_cvtsi32_si128(*((int*)&b[5 * stride])); // 53 52 51 50
-
- r0 = _mm_unpacklo_epi32(r0, r4); // 43 42 41 40 03 02 01 00
- r1 = _mm_unpacklo_epi32(r1, r5); // 53 52 51 50 13 12 11 10
-
- // t1 = 53 43 52 42 51 41 50 40 13 03 12 02 11 01 10 00
- t1 = _mm_unpacklo_epi8(r0, r1);
-
- // Load 2nd, 3rd, 6th and 7th rows
- r0 = _mm_cvtsi32_si128(*((int*)&b[2 * stride])); // 23 22 21 22
- r1 = _mm_cvtsi32_si128(*((int*)&b[3 * stride])); // 33 32 31 30
- r4 = _mm_cvtsi32_si128(*((int*)&b[6 * stride])); // 63 62 61 60
- r5 = _mm_cvtsi32_si128(*((int*)&b[7 * stride])); // 73 72 71 70
-
- r0 = _mm_unpacklo_epi32(r0, r4); // 63 62 61 60 23 22 21 20
- r1 = _mm_unpacklo_epi32(r1, r5); // 73 72 71 70 33 32 31 30
-
- // t2 = 73 63 72 62 71 61 70 60 33 23 32 22 31 21 30 20
- t2 = _mm_unpacklo_epi8(r0, r1);
-
- // t1 = 33 23 13 03 32 22 12 02 31 21 11 01 30 20 10 00
- // t2 = 73 63 53 43 72 62 52 42 71 61 51 41 70 60 50 40
- r0 = t1;
- t1 = _mm_unpacklo_epi16(t1, t2);
- t2 = _mm_unpackhi_epi16(r0, t2);
+static WEBP_INLINE void Load8x4(const uint8_t* const b, int stride,
+ __m128i* const p, __m128i* const q) {
+ // A0 = 63 62 61 60 23 22 21 20 43 42 41 40 03 02 01 00
+ // A1 = 73 72 71 70 33 32 31 30 53 52 51 50 13 12 11 10
+ const __m128i A0 = _mm_set_epi32(
+ MemToUint32(&b[6 * stride]), MemToUint32(&b[2 * stride]),
+ MemToUint32(&b[4 * stride]), MemToUint32(&b[0 * stride]));
+ const __m128i A1 = _mm_set_epi32(
+ MemToUint32(&b[7 * stride]), MemToUint32(&b[3 * stride]),
+ MemToUint32(&b[5 * stride]), MemToUint32(&b[1 * stride]));
+
+ // B0 = 53 43 52 42 51 41 50 40 13 03 12 02 11 01 10 00
+ // B1 = 73 63 72 62 71 61 70 60 33 23 32 22 31 21 30 20
+ const __m128i B0 = _mm_unpacklo_epi8(A0, A1);
+ const __m128i B1 = _mm_unpackhi_epi8(A0, A1);
+
+ // C0 = 33 23 13 03 32 22 12 02 31 21 11 01 30 20 10 00
+ // C1 = 73 63 53 43 72 62 52 42 71 61 51 41 70 60 50 40
+ const __m128i C0 = _mm_unpacklo_epi16(B0, B1);
+ const __m128i C1 = _mm_unpackhi_epi16(B0, B1);
// *p = 71 61 51 41 31 21 11 01 70 60 50 40 30 20 10 00
// *q = 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02
- *p = _mm_unpacklo_epi32(t1, t2);
- *q = _mm_unpackhi_epi32(t1, t2);
+ *p = _mm_unpacklo_epi32(C0, C1);
+ *q = _mm_unpackhi_epi32(C0, C1);
}
-static WEBP_INLINE void Load16x4(const uint8_t* r0, const uint8_t* r8,
+static WEBP_INLINE void Load16x4(const uint8_t* const r0,
+ const uint8_t* const r8,
int stride,
- __m128i* p1, __m128i* p0,
- __m128i* q0, __m128i* q1) {
- __m128i t1, t2;
+ __m128i* const p1, __m128i* const p0,
+ __m128i* const q0, __m128i* const q1) {
// Assume the pixels around the edge (|) are numbered as follows
// 00 01 | 02 03
// 10 11 | 12 13
@@ -532,19 +575,21 @@ static WEBP_INLINE void Load16x4(const uint8_t* r0, const uint8_t* r8,
Load8x4(r0, stride, p1, q0);
Load8x4(r8, stride, p0, q1);
- t1 = *p1;
- t2 = *q0;
- // p1 = f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00
- // p0 = f1 e1 d1 c1 b1 a1 91 81 71 61 51 41 31 21 11 01
- // q0 = f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
- // q1 = f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03
- *p1 = _mm_unpacklo_epi64(t1, *p0);
- *p0 = _mm_unpackhi_epi64(t1, *p0);
- *q0 = _mm_unpacklo_epi64(t2, *q1);
- *q1 = _mm_unpackhi_epi64(t2, *q1);
+ {
+ // p1 = f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00
+ // p0 = f1 e1 d1 c1 b1 a1 91 81 71 61 51 41 31 21 11 01
+ // q0 = f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
+ // q1 = f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03
+ const __m128i t1 = *p1;
+ const __m128i t2 = *q0;
+ *p1 = _mm_unpacklo_epi64(t1, *p0);
+ *p0 = _mm_unpackhi_epi64(t1, *p0);
+ *q0 = _mm_unpacklo_epi64(t2, *q1);
+ *q1 = _mm_unpackhi_epi64(t2, *q1);
+ }
}
-static WEBP_INLINE void Store4x4(__m128i* x, uint8_t* dst, int stride) {
+static WEBP_INLINE void Store4x4(__m128i* const x, uint8_t* dst, int stride) {
int i;
for (i = 0; i < 4; ++i, dst += stride) {
*((int32_t*)dst) = _mm_cvtsi128_si32(*x);
@@ -553,48 +598,51 @@ static WEBP_INLINE void Store4x4(__m128i* x, uint8_t* dst, int stride) {
}
// Transpose back and store
-static WEBP_INLINE void Store16x4(uint8_t* r0, uint8_t* r8, int stride,
- __m128i* p1, __m128i* p0,
- __m128i* q0, __m128i* q1) {
- __m128i t1;
+static WEBP_INLINE void Store16x4(const __m128i* const p1,
+ const __m128i* const p0,
+ const __m128i* const q0,
+ const __m128i* const q1,
+ uint8_t* r0, uint8_t* r8,
+ int stride) {
+ __m128i t1, p1_s, p0_s, q0_s, q1_s;
// p0 = 71 70 61 60 51 50 41 40 31 30 21 20 11 10 01 00
// p1 = f1 f0 e1 e0 d1 d0 c1 c0 b1 b0 a1 a0 91 90 81 80
t1 = *p0;
- *p0 = _mm_unpacklo_epi8(*p1, t1);
- *p1 = _mm_unpackhi_epi8(*p1, t1);
+ p0_s = _mm_unpacklo_epi8(*p1, t1);
+ p1_s = _mm_unpackhi_epi8(*p1, t1);
// q0 = 73 72 63 62 53 52 43 42 33 32 23 22 13 12 03 02
// q1 = f3 f2 e3 e2 d3 d2 c3 c2 b3 b2 a3 a2 93 92 83 82
t1 = *q0;
- *q0 = _mm_unpacklo_epi8(t1, *q1);
- *q1 = _mm_unpackhi_epi8(t1, *q1);
+ q0_s = _mm_unpacklo_epi8(t1, *q1);
+ q1_s = _mm_unpackhi_epi8(t1, *q1);
// p0 = 33 32 31 30 23 22 21 20 13 12 11 10 03 02 01 00
// q0 = 73 72 71 70 63 62 61 60 53 52 51 50 43 42 41 40
- t1 = *p0;
- *p0 = _mm_unpacklo_epi16(t1, *q0);
- *q0 = _mm_unpackhi_epi16(t1, *q0);
+ t1 = p0_s;
+ p0_s = _mm_unpacklo_epi16(t1, q0_s);
+ q0_s = _mm_unpackhi_epi16(t1, q0_s);
// p1 = b3 b2 b1 b0 a3 a2 a1 a0 93 92 91 90 83 82 81 80
// q1 = f3 f2 f1 f0 e3 e2 e1 e0 d3 d2 d1 d0 c3 c2 c1 c0
- t1 = *p1;
- *p1 = _mm_unpacklo_epi16(t1, *q1);
- *q1 = _mm_unpackhi_epi16(t1, *q1);
+ t1 = p1_s;
+ p1_s = _mm_unpacklo_epi16(t1, q1_s);
+ q1_s = _mm_unpackhi_epi16(t1, q1_s);
- Store4x4(p0, r0, stride);
+ Store4x4(&p0_s, r0, stride);
r0 += 4 * stride;
- Store4x4(q0, r0, stride);
+ Store4x4(&q0_s, r0, stride);
- Store4x4(p1, r8, stride);
+ Store4x4(&p1_s, r8, stride);
r8 += 4 * stride;
- Store4x4(q1, r8, stride);
+ Store4x4(&q1_s, r8, stride);
}
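Editorial aside, not part of the patch: Store16x4() above is the inverse scatter of Load16x4() -- each of the four transposed registers holds one column for 16 rows, and four bytes are written back per row. A scalar sketch with illustrative names, assuming byte i of each input array is row i's value:

#include <stdint.h>

static void Store16x4_sketch(const uint8_t p1[16], const uint8_t p0[16],
                             const uint8_t q0[16], const uint8_t q1[16],
                             uint8_t* r0, uint8_t* r8, int stride) {
  int i;
  for (i = 0; i < 8; ++i) {
    uint8_t* const top = r0 + i * stride;   /* rows 0..7  */
    uint8_t* const bot = r8 + i * stride;   /* rows 8..15 */
    top[0] = p1[i];     bot[0] = p1[8 + i];
    top[1] = p0[i];     bot[1] = p0[8 + i];
    top[2] = q0[i];     bot[2] = q0[8 + i];
    top[3] = q1[i];     bot[3] = q1[8 + i];
  }
}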
//------------------------------------------------------------------------------
// Simple In-loop filtering (Paragraph 15.2)
-static void SimpleVFilter16SSE2(uint8_t* p, int stride, int thresh) {
+static void SimpleVFilter16(uint8_t* p, int stride, int thresh) {
// Load
__m128i p1 = _mm_loadu_si128((__m128i*)&p[-2 * stride]);
__m128i p0 = _mm_loadu_si128((__m128i*)&p[-stride]);
@@ -605,49 +653,49 @@ static void SimpleVFilter16SSE2(uint8_t* p, int stride, int thresh) {
// Store
_mm_storeu_si128((__m128i*)&p[-stride], p0);
- _mm_storeu_si128((__m128i*)p, q0);
+ _mm_storeu_si128((__m128i*)&p[0], q0);
}
-static void SimpleHFilter16SSE2(uint8_t* p, int stride, int thresh) {
+static void SimpleHFilter16(uint8_t* p, int stride, int thresh) {
__m128i p1, p0, q0, q1;
p -= 2; // beginning of p1
- Load16x4(p, p + 8 * stride, stride, &p1, &p0, &q0, &q1);
+ Load16x4(p, p + 8 * stride, stride, &p1, &p0, &q0, &q1);
DoFilter2(&p1, &p0, &q0, &q1, thresh);
- Store16x4(p, p + 8 * stride, stride, &p1, &p0, &q0, &q1);
+ Store16x4(&p1, &p0, &q0, &q1, p, p + 8 * stride, stride);
}
-static void SimpleVFilter16iSSE2(uint8_t* p, int stride, int thresh) {
+static void SimpleVFilter16i(uint8_t* p, int stride, int thresh) {
int k;
for (k = 3; k > 0; --k) {
p += 4 * stride;
- SimpleVFilter16SSE2(p, stride, thresh);
+ SimpleVFilter16(p, stride, thresh);
}
}
-static void SimpleHFilter16iSSE2(uint8_t* p, int stride, int thresh) {
+static void SimpleHFilter16i(uint8_t* p, int stride, int thresh) {
int k;
for (k = 3; k > 0; --k) {
p += 4;
- SimpleHFilter16SSE2(p, stride, thresh);
+ SimpleHFilter16(p, stride, thresh);
}
}
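Editorial aside, not part of the patch: the SSE2 simple filters above apply the same per-column update as the VP8 spec, 16 columns at a time, via DoFilter2() (defined earlier in this file). A minimal scalar sketch of that update, using explicit clamp helpers in place of the VP8ksclip1/VP8ksclip2/VP8kclip1 tables; names ending in _sketch are illustrative and rounding follows the spec:

#include <stdint.h>
#include <stdlib.h>

static int sclip1_sketch(int v)  { return v < -128 ? -128 : v > 127 ? 127 : v; }
static int sclip2_sketch(int v)  { return v < -16  ? -16  : v > 15  ? 15  : v; }
static int clip255_sketch(int v) { return v < 0 ? 0 : v > 255 ? 255 : v; }

// 'p' points at q0; 'step' is 1 for a vertical edge, the row stride for a
// horizontal one. Only p0 and q0 are modified by the simple filter.
static void SimpleFilterPixel_sketch(uint8_t* p, int step, int thresh) {
  const int p1 = p[-2 * step], p0 = p[-step], q0 = p[0], q1 = p[step];
  if (2 * abs(p0 - q0) + abs(p1 - q1) / 2 <= thresh) {   /* edge activity test */
    const int a  = 3 * (q0 - p0) + sclip1_sketch(p1 - q1);
    const int F1 = sclip2_sketch((a + 4) >> 3);
    const int F2 = sclip2_sketch((a + 3) >> 3);
    p[-step] = (uint8_t)clip255_sketch(p0 + F2);
    p[0]     = (uint8_t)clip255_sketch(q0 - F1);
  }
}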
//------------------------------------------------------------------------------
// Complex In-loop filtering (Paragraph 15.3)
-#define MAX_DIFF1(p3, p2, p1, p0, m) { \
- m = MM_ABS(p3, p2); \
+#define MAX_DIFF1(p3, p2, p1, p0, m) do { \
+ m = MM_ABS(p1, p0); \
+ m = _mm_max_epu8(m, MM_ABS(p3, p2)); \
m = _mm_max_epu8(m, MM_ABS(p2, p1)); \
- m = _mm_max_epu8(m, MM_ABS(p1, p0)); \
-}
+} while (0)
-#define MAX_DIFF2(p3, p2, p1, p0, m) { \
+#define MAX_DIFF2(p3, p2, p1, p0, m) do { \
+ m = _mm_max_epu8(m, MM_ABS(p1, p0)); \
m = _mm_max_epu8(m, MM_ABS(p3, p2)); \
m = _mm_max_epu8(m, MM_ABS(p2, p1)); \
- m = _mm_max_epu8(m, MM_ABS(p1, p0)); \
-}
+} while (0)
#define LOAD_H_EDGES4(p, stride, e1, e2, e3, e4) { \
e1 = _mm_loadu_si128((__m128i*)&(p)[0 * stride]); \
@@ -656,10 +704,11 @@ static void SimpleHFilter16iSSE2(uint8_t* p, int stride, int thresh) {
e4 = _mm_loadu_si128((__m128i*)&(p)[3 * stride]); \
}
-#define LOADUV_H_EDGE(p, u, v, stride) { \
- p = _mm_loadl_epi64((__m128i*)&(u)[(stride)]); \
- p = _mm_unpacklo_epi64(p, _mm_loadl_epi64((__m128i*)&(v)[(stride)])); \
-}
+#define LOADUV_H_EDGE(p, u, v, stride) do { \
+ const __m128i U = _mm_loadl_epi64((__m128i*)&(u)[(stride)]); \
+ const __m128i V = _mm_loadl_epi64((__m128i*)&(v)[(stride)]); \
+ p = _mm_unpacklo_epi64(U, V); \
+} while (0)
#define LOADUV_H_EDGES4(u, v, stride, e1, e2, e3, e4) { \
LOADUV_H_EDGE(e1, u, v, 0 * stride); \
@@ -674,18 +723,23 @@ static void SimpleHFilter16iSSE2(uint8_t* p, int stride, int thresh) {
_mm_storel_epi64((__m128i*)&v[(stride)], p); \
}
-#define COMPLEX_FL_MASK(p1, p0, q0, q1, thresh, ithresh, mask) { \
- __m128i fl_yes; \
- const __m128i it = _mm_set1_epi8(ithresh); \
- mask = _mm_subs_epu8(mask, it); \
- mask = _mm_cmpeq_epi8(mask, _mm_setzero_si128()); \
- NeedsFilter(&p1, &p0, &q0, &q1, thresh, &fl_yes); \
- mask = _mm_and_si128(mask, fl_yes); \
+static WEBP_INLINE void ComplexMask(const __m128i* const p1,
+ const __m128i* const p0,
+ const __m128i* const q0,
+ const __m128i* const q1,
+ int thresh, int ithresh,
+ __m128i* const mask) {
+ const __m128i it = _mm_set1_epi8(ithresh);
+ const __m128i diff = _mm_subs_epu8(*mask, it);
+ const __m128i thresh_mask = _mm_cmpeq_epi8(diff, _mm_setzero_si128());
+ __m128i filter_mask;
+ NeedsFilter(p1, p0, q0, q1, thresh, &filter_mask);
+ *mask = _mm_and_si128(thresh_mask, filter_mask);
}
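Editorial aside, not part of the patch: per pixel, ComplexMask() combines two conditions -- every one of the six inner differences gathered by MAX_DIFF1/MAX_DIFF2 must stay within ithresh, and the NeedsFilter() edge test must pass. A scalar sketch of the combined predicate, assuming NeedsFilter() implements the usual 2*|p0-q0| + |p1-q1|/2 <= thresh test (names are illustrative):

#include <stdlib.h>

static int NeedsComplexFilter_sketch(int p3, int p2, int p1, int p0,
                                     int q0, int q1, int q2, int q3,
                                     int thresh, int ithresh) {
  int m = abs(p1 - p0);                       /* MAX_DIFF1, p side */
  if (abs(p3 - p2) > m) m = abs(p3 - p2);
  if (abs(p2 - p1) > m) m = abs(p2 - p1);
  if (abs(q1 - q0) > m) m = abs(q1 - q0);     /* MAX_DIFF2, q side */
  if (abs(q3 - q2) > m) m = abs(q3 - q2);
  if (abs(q2 - q1) > m) m = abs(q2 - q1);
  return (m <= ithresh) &&
         (2 * abs(p0 - q0) + abs(p1 - q1) / 2 <= thresh);
}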
// on macroblock edges
-static void VFilter16SSE2(uint8_t* p, int stride,
- int thresh, int ithresh, int hev_thresh) {
+static void VFilter16(uint8_t* p, int stride,
+ int thresh, int ithresh, int hev_thresh) {
__m128i t1;
__m128i mask;
__m128i p2, p1, p0, q0, q1, q2;
@@ -698,20 +752,20 @@ static void VFilter16SSE2(uint8_t* p, int stride,
LOAD_H_EDGES4(p, stride, q0, q1, q2, t1);
MAX_DIFF2(t1, q2, q1, q0, mask);
- COMPLEX_FL_MASK(p1, p0, q0, q1, thresh, ithresh, mask);
+ ComplexMask(&p1, &p0, &q0, &q1, thresh, ithresh, &mask);
DoFilter6(&p2, &p1, &p0, &q0, &q1, &q2, &mask, hev_thresh);
// Store
_mm_storeu_si128((__m128i*)&p[-3 * stride], p2);
_mm_storeu_si128((__m128i*)&p[-2 * stride], p1);
_mm_storeu_si128((__m128i*)&p[-1 * stride], p0);
- _mm_storeu_si128((__m128i*)&p[0 * stride], q0);
- _mm_storeu_si128((__m128i*)&p[1 * stride], q1);
- _mm_storeu_si128((__m128i*)&p[2 * stride], q2);
+ _mm_storeu_si128((__m128i*)&p[+0 * stride], q0);
+ _mm_storeu_si128((__m128i*)&p[+1 * stride], q1);
+ _mm_storeu_si128((__m128i*)&p[+2 * stride], q2);
}
-static void HFilter16SSE2(uint8_t* p, int stride,
- int thresh, int ithresh, int hev_thresh) {
+static void HFilter16(uint8_t* p, int stride,
+ int thresh, int ithresh, int hev_thresh) {
__m128i mask;
__m128i p3, p2, p1, p0, q0, q1, q2, q3;
@@ -722,71 +776,78 @@ static void HFilter16SSE2(uint8_t* p, int stride,
Load16x4(p, p + 8 * stride, stride, &q0, &q1, &q2, &q3); // q0, q1, q2, q3
MAX_DIFF2(q3, q2, q1, q0, mask);
- COMPLEX_FL_MASK(p1, p0, q0, q1, thresh, ithresh, mask);
+ ComplexMask(&p1, &p0, &q0, &q1, thresh, ithresh, &mask);
DoFilter6(&p2, &p1, &p0, &q0, &q1, &q2, &mask, hev_thresh);
- Store16x4(b, b + 8 * stride, stride, &p3, &p2, &p1, &p0);
- Store16x4(p, p + 8 * stride, stride, &q0, &q1, &q2, &q3);
+ Store16x4(&p3, &p2, &p1, &p0, b, b + 8 * stride, stride);
+ Store16x4(&q0, &q1, &q2, &q3, p, p + 8 * stride, stride);
}
// on three inner edges
-static void VFilter16iSSE2(uint8_t* p, int stride,
- int thresh, int ithresh, int hev_thresh) {
+static void VFilter16i(uint8_t* p, int stride,
+ int thresh, int ithresh, int hev_thresh) {
int k;
- __m128i mask;
- __m128i t1, t2, p1, p0, q0, q1;
+ __m128i p3, p2, p1, p0; // loop invariants
- for (k = 3; k > 0; --k) {
- // Load p3, p2, p1, p0
- LOAD_H_EDGES4(p, stride, t2, t1, p1, p0);
- MAX_DIFF1(t2, t1, p1, p0, mask);
+ LOAD_H_EDGES4(p, stride, p3, p2, p1, p0); // prologue
+ for (k = 3; k > 0; --k) {
+ __m128i mask, tmp1, tmp2;
+ uint8_t* const b = p + 2 * stride; // beginning of p1
p += 4 * stride;
- // Load q0, q1, q2, q3
- LOAD_H_EDGES4(p, stride, q0, q1, t1, t2);
- MAX_DIFF2(t2, t1, q1, q0, mask);
+ MAX_DIFF1(p3, p2, p1, p0, mask); // compute partial mask
+ LOAD_H_EDGES4(p, stride, p3, p2, tmp1, tmp2);
+ MAX_DIFF2(p3, p2, tmp1, tmp2, mask);
- COMPLEX_FL_MASK(p1, p0, q0, q1, thresh, ithresh, mask);
- DoFilter4(&p1, &p0, &q0, &q1, &mask, hev_thresh);
+ // p3 and p2 are not just temporary variables here: they will be
+ // re-used for next span. And q2/q3 will become p1/p0 accordingly.
+ ComplexMask(&p1, &p0, &p3, &p2, thresh, ithresh, &mask);
+ DoFilter4(&p1, &p0, &p3, &p2, &mask, hev_thresh);
// Store
- _mm_storeu_si128((__m128i*)&p[-2 * stride], p1);
- _mm_storeu_si128((__m128i*)&p[-1 * stride], p0);
- _mm_storeu_si128((__m128i*)&p[0 * stride], q0);
- _mm_storeu_si128((__m128i*)&p[1 * stride], q1);
+ _mm_storeu_si128((__m128i*)&b[0 * stride], p1);
+ _mm_storeu_si128((__m128i*)&b[1 * stride], p0);
+ _mm_storeu_si128((__m128i*)&b[2 * stride], p3);
+ _mm_storeu_si128((__m128i*)&b[3 * stride], p2);
+
+ // rotate samples
+ p1 = tmp1;
+ p0 = tmp2;
}
}
-static void HFilter16iSSE2(uint8_t* p, int stride,
- int thresh, int ithresh, int hev_thresh) {
+static void HFilter16i(uint8_t* p, int stride,
+ int thresh, int ithresh, int hev_thresh) {
int k;
- uint8_t* b;
- __m128i mask;
- __m128i t1, t2, p1, p0, q0, q1;
+ __m128i p3, p2, p1, p0; // loop invariants
+
+ Load16x4(p, p + 8 * stride, stride, &p3, &p2, &p1, &p0); // prologue
for (k = 3; k > 0; --k) {
- b = p;
- Load16x4(b, b + 8 * stride, stride, &t2, &t1, &p1, &p0); // p3, p2, p1, p0
- MAX_DIFF1(t2, t1, p1, p0, mask);
+ __m128i mask, tmp1, tmp2;
+ uint8_t* const b = p + 2; // beginning of p1
- b += 4; // beginning of q0
- Load16x4(b, b + 8 * stride, stride, &q0, &q1, &t1, &t2); // q0, q1, q2, q3
- MAX_DIFF2(t2, t1, q1, q0, mask);
+ p += 4; // beginning of q0 (and next span)
- COMPLEX_FL_MASK(p1, p0, q0, q1, thresh, ithresh, mask);
- DoFilter4(&p1, &p0, &q0, &q1, &mask, hev_thresh);
+ MAX_DIFF1(p3, p2, p1, p0, mask); // compute partial mask
+ Load16x4(p, p + 8 * stride, stride, &p3, &p2, &tmp1, &tmp2);
+ MAX_DIFF2(p3, p2, tmp1, tmp2, mask);
- b -= 2; // beginning of p1
- Store16x4(b, b + 8 * stride, stride, &p1, &p0, &q0, &q1);
+ ComplexMask(&p1, &p0, &p3, &p2, thresh, ithresh, &mask);
+ DoFilter4(&p1, &p0, &p3, &p2, &mask, hev_thresh);
- p += 4;
+ Store16x4(&p1, &p0, &p3, &p2, b, b + 8 * stride, stride);
+
+ // rotate samples
+ p1 = tmp1;
+ p0 = tmp2;
}
}
// 8-pixels wide variant, for chroma filtering
-static void VFilter8SSE2(uint8_t* u, uint8_t* v, int stride,
- int thresh, int ithresh, int hev_thresh) {
+static void VFilter8(uint8_t* u, uint8_t* v, int stride,
+ int thresh, int ithresh, int hev_thresh) {
__m128i mask;
__m128i t1, p2, p1, p0, q0, q1, q2;
@@ -798,7 +859,7 @@ static void VFilter8SSE2(uint8_t* u, uint8_t* v, int stride,
LOADUV_H_EDGES4(u, v, stride, q0, q1, q2, t1);
MAX_DIFF2(t1, q2, q1, q0, mask);
- COMPLEX_FL_MASK(p1, p0, q0, q1, thresh, ithresh, mask);
+ ComplexMask(&p1, &p0, &q0, &q1, thresh, ithresh, &mask);
DoFilter6(&p2, &p1, &p0, &q0, &q1, &q2, &mask, hev_thresh);
// Store
@@ -810,8 +871,8 @@ static void VFilter8SSE2(uint8_t* u, uint8_t* v, int stride,
STOREUV(q2, u, v, 2 * stride);
}
-static void HFilter8SSE2(uint8_t* u, uint8_t* v, int stride,
- int thresh, int ithresh, int hev_thresh) {
+static void HFilter8(uint8_t* u, uint8_t* v, int stride,
+ int thresh, int ithresh, int hev_thresh) {
__m128i mask;
__m128i p3, p2, p1, p0, q0, q1, q2, q3;
@@ -823,15 +884,15 @@ static void HFilter8SSE2(uint8_t* u, uint8_t* v, int stride,
Load16x4(u, v, stride, &q0, &q1, &q2, &q3); // q0, q1, q2, q3
MAX_DIFF2(q3, q2, q1, q0, mask);
- COMPLEX_FL_MASK(p1, p0, q0, q1, thresh, ithresh, mask);
+ ComplexMask(&p1, &p0, &q0, &q1, thresh, ithresh, &mask);
DoFilter6(&p2, &p1, &p0, &q0, &q1, &q2, &mask, hev_thresh);
- Store16x4(tu, tv, stride, &p3, &p2, &p1, &p0);
- Store16x4(u, v, stride, &q0, &q1, &q2, &q3);
+ Store16x4(&p3, &p2, &p1, &p0, tu, tv, stride);
+ Store16x4(&q0, &q1, &q2, &q3, u, v, stride);
}
-static void VFilter8iSSE2(uint8_t* u, uint8_t* v, int stride,
- int thresh, int ithresh, int hev_thresh) {
+static void VFilter8i(uint8_t* u, uint8_t* v, int stride,
+ int thresh, int ithresh, int hev_thresh) {
__m128i mask;
__m128i t1, t2, p1, p0, q0, q1;
@@ -846,7 +907,7 @@ static void VFilter8iSSE2(uint8_t* u, uint8_t* v, int stride,
LOADUV_H_EDGES4(u, v, stride, q0, q1, t1, t2);
MAX_DIFF2(t2, t1, q1, q0, mask);
- COMPLEX_FL_MASK(p1, p0, q0, q1, thresh, ithresh, mask);
+ ComplexMask(&p1, &p0, &q0, &q1, thresh, ithresh, &mask);
DoFilter4(&p1, &p0, &q0, &q1, &mask, hev_thresh);
// Store
@@ -856,8 +917,8 @@ static void VFilter8iSSE2(uint8_t* u, uint8_t* v, int stride,
STOREUV(q1, u, v, 1 * stride);
}
-static void HFilter8iSSE2(uint8_t* u, uint8_t* v, int stride,
- int thresh, int ithresh, int hev_thresh) {
+static void HFilter8i(uint8_t* u, uint8_t* v, int stride,
+ int thresh, int ithresh, int hev_thresh) {
__m128i mask;
__m128i t1, t2, p1, p0, q0, q1;
Load16x4(u, v, stride, &t2, &t1, &p1, &p0); // p3, p2, p1, p0
@@ -868,36 +929,361 @@ static void HFilter8iSSE2(uint8_t* u, uint8_t* v, int stride,
Load16x4(u, v, stride, &q0, &q1, &t1, &t2); // q0, q1, q2, q3
MAX_DIFF2(t2, t1, q1, q0, mask);
- COMPLEX_FL_MASK(p1, p0, q0, q1, thresh, ithresh, mask);
+ ComplexMask(&p1, &p0, &q0, &q1, thresh, ithresh, &mask);
DoFilter4(&p1, &p0, &q0, &q1, &mask, hev_thresh);
u -= 2; // beginning of p1
v -= 2;
- Store16x4(u, v, stride, &p1, &p0, &q0, &q1);
+ Store16x4(&p1, &p0, &q0, &q1, u, v, stride);
}
-extern void VP8DspInitSSE2(void);
+//------------------------------------------------------------------------------
+// 4x4 predictions
+
+#define DST(x, y) dst[(x) + (y) * BPS]
+#define AVG3(a, b, c) (((a) + 2 * (b) + (c) + 2) >> 2)
+
+// We use the following 8b-arithmetic tricks:
+// (a + 2 * b + c + 2) >> 2 = (AC + b + 1) >> 1
+// where: AC = (a + c) >> 1 = [(a + c + 1) >> 1] - [(a^c) & 1]
+// and:
+// (a + 2 * b + c + 2) >> 2 = (AB + BC + 1) >> 1 - (ab|bc)&lsb
+//   where: AB = (a + b + 1) >> 1,   BC = (b + c + 1) >> 1
+//   and ab = a ^ b, bc = b ^ c, lsb = (AB^BC)&1
+
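Editorial aside, not part of the patch: both 8-bit identities above can be checked exhaustively. A small self-contained program (names are illustrative):

#include <stdio.h>

int main(void) {
  int a, b, c;
  for (a = 0; a < 256; ++a) {
    for (b = 0; b < 256; ++b) {
      for (c = 0; c < 256; ++c) {
        const int ref = (a + 2 * b + c + 2) >> 2;            /* AVG3 */
        /* identity 1: AC = (a + c) >> 1, built from one rounding average */
        const int AC = ((a + c + 1) >> 1) - ((a ^ c) & 1);
        const int v1 = (AC + b + 1) >> 1;
        /* identity 2: two rounding averages plus an lsb correction */
        const int AB  = (a + b + 1) >> 1;
        const int BC  = (b + c + 1) >> 1;
        const int lsb = (AB ^ BC) & 1;
        const int v2  = ((AB + BC + 1) >> 1) - (((a ^ b) | (b ^ c)) & lsb);
        if (ref != v1 || ref != v2) {
          printf("mismatch at a=%d b=%d c=%d\n", a, b, c);
          return 1;
        }
      }
    }
  }
  printf("both identities hold for all 8-bit a, b, c\n");
  return 0;
}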
+static void VE4(uint8_t* dst) { // vertical
+ const __m128i one = _mm_set1_epi8(1);
+ const __m128i ABCDEFGH = _mm_loadl_epi64((__m128i*)(dst - BPS - 1));
+ const __m128i BCDEFGH0 = _mm_srli_si128(ABCDEFGH, 1);
+ const __m128i CDEFGH00 = _mm_srli_si128(ABCDEFGH, 2);
+ const __m128i a = _mm_avg_epu8(ABCDEFGH, CDEFGH00);
+ const __m128i lsb = _mm_and_si128(_mm_xor_si128(ABCDEFGH, CDEFGH00), one);
+ const __m128i b = _mm_subs_epu8(a, lsb);
+ const __m128i avg = _mm_avg_epu8(b, BCDEFGH0);
+ const uint32_t vals = _mm_cvtsi128_si32(avg);
+ int i;
+ for (i = 0; i < 4; ++i) {
+ *(uint32_t*)(dst + i * BPS) = vals;
+ }
+}
+
+static void LD4(uint8_t* dst) { // Down-Left
+ const __m128i one = _mm_set1_epi8(1);
+ const __m128i ABCDEFGH = _mm_loadl_epi64((__m128i*)(dst - BPS));
+ const __m128i BCDEFGH0 = _mm_srli_si128(ABCDEFGH, 1);
+ const __m128i CDEFGH00 = _mm_srli_si128(ABCDEFGH, 2);
+ const __m128i CDEFGHH0 = _mm_insert_epi16(CDEFGH00, dst[-BPS + 7], 3);
+ const __m128i avg1 = _mm_avg_epu8(ABCDEFGH, CDEFGHH0);
+ const __m128i lsb = _mm_and_si128(_mm_xor_si128(ABCDEFGH, CDEFGHH0), one);
+ const __m128i avg2 = _mm_subs_epu8(avg1, lsb);
+ const __m128i abcdefg = _mm_avg_epu8(avg2, BCDEFGH0);
+ *(uint32_t*)(dst + 0 * BPS) = _mm_cvtsi128_si32( abcdefg );
+ *(uint32_t*)(dst + 1 * BPS) = _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 1));
+ *(uint32_t*)(dst + 2 * BPS) = _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 2));
+ *(uint32_t*)(dst + 3 * BPS) = _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 3));
+}
+
+static void VR4(uint8_t* dst) { // Vertical-Right
+ const __m128i one = _mm_set1_epi8(1);
+ const int I = dst[-1 + 0 * BPS];
+ const int J = dst[-1 + 1 * BPS];
+ const int K = dst[-1 + 2 * BPS];
+ const int X = dst[-1 - BPS];
+ const __m128i XABCD = _mm_loadl_epi64((__m128i*)(dst - BPS - 1));
+ const __m128i ABCD0 = _mm_srli_si128(XABCD, 1);
+ const __m128i abcd = _mm_avg_epu8(XABCD, ABCD0);
+ const __m128i _XABCD = _mm_slli_si128(XABCD, 1);
+ const __m128i IXABCD = _mm_insert_epi16(_XABCD, I | (X << 8), 0);
+ const __m128i avg1 = _mm_avg_epu8(IXABCD, ABCD0);
+ const __m128i lsb = _mm_and_si128(_mm_xor_si128(IXABCD, ABCD0), one);
+ const __m128i avg2 = _mm_subs_epu8(avg1, lsb);
+ const __m128i efgh = _mm_avg_epu8(avg2, XABCD);
+ *(uint32_t*)(dst + 0 * BPS) = _mm_cvtsi128_si32( abcd );
+ *(uint32_t*)(dst + 1 * BPS) = _mm_cvtsi128_si32( efgh );
+ *(uint32_t*)(dst + 2 * BPS) = _mm_cvtsi128_si32(_mm_slli_si128(abcd, 1));
+ *(uint32_t*)(dst + 3 * BPS) = _mm_cvtsi128_si32(_mm_slli_si128(efgh, 1));
+
+ // these two are hard to implement in SSE2, so we keep the C-version:
+ DST(0, 2) = AVG3(J, I, X);
+ DST(0, 3) = AVG3(K, J, I);
+}
-void VP8DspInitSSE2(void) {
- VP8Transform = TransformSSE2;
+static void VL4(uint8_t* dst) { // Vertical-Left
+ const __m128i one = _mm_set1_epi8(1);
+ const __m128i ABCDEFGH = _mm_loadl_epi64((__m128i*)(dst - BPS));
+ const __m128i BCDEFGH_ = _mm_srli_si128(ABCDEFGH, 1);
+ const __m128i CDEFGH__ = _mm_srli_si128(ABCDEFGH, 2);
+ const __m128i avg1 = _mm_avg_epu8(ABCDEFGH, BCDEFGH_);
+ const __m128i avg2 = _mm_avg_epu8(CDEFGH__, BCDEFGH_);
+ const __m128i avg3 = _mm_avg_epu8(avg1, avg2);
+ const __m128i lsb1 = _mm_and_si128(_mm_xor_si128(avg1, avg2), one);
+ const __m128i ab = _mm_xor_si128(ABCDEFGH, BCDEFGH_);
+ const __m128i bc = _mm_xor_si128(CDEFGH__, BCDEFGH_);
+ const __m128i abbc = _mm_or_si128(ab, bc);
+ const __m128i lsb2 = _mm_and_si128(abbc, lsb1);
+ const __m128i avg4 = _mm_subs_epu8(avg3, lsb2);
+ const uint32_t extra_out = _mm_cvtsi128_si32(_mm_srli_si128(avg4, 4));
+ *(uint32_t*)(dst + 0 * BPS) = _mm_cvtsi128_si32( avg1 );
+ *(uint32_t*)(dst + 1 * BPS) = _mm_cvtsi128_si32( avg4 );
+ *(uint32_t*)(dst + 2 * BPS) = _mm_cvtsi128_si32(_mm_srli_si128(avg1, 1));
+ *(uint32_t*)(dst + 3 * BPS) = _mm_cvtsi128_si32(_mm_srli_si128(avg4, 1));
+
+ // these two are hard to get and irregular
+ DST(3, 2) = (extra_out >> 0) & 0xff;
+ DST(3, 3) = (extra_out >> 8) & 0xff;
+}
+
+static void RD4(uint8_t* dst) { // Down-right
+ const __m128i one = _mm_set1_epi8(1);
+ const __m128i XABCD = _mm_loadl_epi64((__m128i*)(dst - BPS - 1));
+ const __m128i ____XABCD = _mm_slli_si128(XABCD, 4);
+ const uint32_t I = dst[-1 + 0 * BPS];
+ const uint32_t J = dst[-1 + 1 * BPS];
+ const uint32_t K = dst[-1 + 2 * BPS];
+ const uint32_t L = dst[-1 + 3 * BPS];
+ const __m128i LKJI_____ =
+ _mm_cvtsi32_si128(L | (K << 8) | (J << 16) | (I << 24));
+ const __m128i LKJIXABCD = _mm_or_si128(LKJI_____, ____XABCD);
+ const __m128i KJIXABCD_ = _mm_srli_si128(LKJIXABCD, 1);
+ const __m128i JIXABCD__ = _mm_srli_si128(LKJIXABCD, 2);
+ const __m128i avg1 = _mm_avg_epu8(JIXABCD__, LKJIXABCD);
+ const __m128i lsb = _mm_and_si128(_mm_xor_si128(JIXABCD__, LKJIXABCD), one);
+ const __m128i avg2 = _mm_subs_epu8(avg1, lsb);
+ const __m128i abcdefg = _mm_avg_epu8(avg2, KJIXABCD_);
+ *(uint32_t*)(dst + 3 * BPS) = _mm_cvtsi128_si32( abcdefg );
+ *(uint32_t*)(dst + 2 * BPS) = _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 1));
+ *(uint32_t*)(dst + 1 * BPS) = _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 2));
+ *(uint32_t*)(dst + 0 * BPS) = _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 3));
+}
+
+#undef DST
+#undef AVG3
+
+//------------------------------------------------------------------------------
+// Luma 16x16
+
+static WEBP_INLINE void TrueMotion(uint8_t* dst, int size) {
+ const uint8_t* top = dst - BPS;
+ const __m128i zero = _mm_setzero_si128();
+ int y;
+ if (size == 4) {
+ const __m128i top_values = _mm_cvtsi32_si128(MemToUint32(top));
+ const __m128i top_base = _mm_unpacklo_epi8(top_values, zero);
+ for (y = 0; y < 4; ++y, dst += BPS) {
+ const int val = dst[-1] - top[-1];
+ const __m128i base = _mm_set1_epi16(val);
+ const __m128i out = _mm_packus_epi16(_mm_add_epi16(base, top_base), zero);
+ *(int*)dst = _mm_cvtsi128_si32(out);
+ }
+ } else if (size == 8) {
+ const __m128i top_values = _mm_loadl_epi64((const __m128i*)top);
+ const __m128i top_base = _mm_unpacklo_epi8(top_values, zero);
+ for (y = 0; y < 8; ++y, dst += BPS) {
+ const int val = dst[-1] - top[-1];
+ const __m128i base = _mm_set1_epi16(val);
+ const __m128i out = _mm_packus_epi16(_mm_add_epi16(base, top_base), zero);
+ _mm_storel_epi64((__m128i*)dst, out);
+ }
+ } else {
+ const __m128i top_values = _mm_loadu_si128((const __m128i*)top);
+ const __m128i top_base_0 = _mm_unpacklo_epi8(top_values, zero);
+ const __m128i top_base_1 = _mm_unpackhi_epi8(top_values, zero);
+ for (y = 0; y < 16; ++y, dst += BPS) {
+ const int val = dst[-1] - top[-1];
+ const __m128i base = _mm_set1_epi16(val);
+ const __m128i out_0 = _mm_add_epi16(base, top_base_0);
+ const __m128i out_1 = _mm_add_epi16(base, top_base_1);
+ const __m128i out = _mm_packus_epi16(out_0, out_1);
+ _mm_storeu_si128((__m128i*)dst, out);
+ }
+ }
+}
+
+static void TM4(uint8_t* dst) { TrueMotion(dst, 4); }
+static void TM8uv(uint8_t* dst) { TrueMotion(dst, 8); }
+static void TM16(uint8_t* dst) { TrueMotion(dst, 16); }
+
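Editorial aside, not part of the patch: the three wrappers above all apply the TrueMotion rule dst = left + top - top_left, clipped to [0,255], which the SSE2 code vectorizes per row. A scalar sketch (BPS, the row stride, is 32 as defined in dsp.h; names are illustrative):

#include <stdint.h>
#define BPS 32

static void TrueMotion_sketch(uint8_t* dst, int size) {
  const uint8_t* const top = dst - BPS;
  int x, y;
  for (y = 0; y < size; ++y, dst += BPS) {
    const int base = dst[-1] - top[-1];          /* left[y] - top_left */
    for (x = 0; x < size; ++x) {
      const int v = base + top[x];
      dst[x] = (uint8_t)(v < 0 ? 0 : v > 255 ? 255 : v);
    }
  }
}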
+static void VE16(uint8_t* dst) {
+ const __m128i top = _mm_loadu_si128((const __m128i*)(dst - BPS));
+ int j;
+ for (j = 0; j < 16; ++j) {
+ _mm_storeu_si128((__m128i*)(dst + j * BPS), top);
+ }
+}
+
+static void HE16(uint8_t* dst) { // horizontal
+ int j;
+ for (j = 16; j > 0; --j) {
+ const __m128i values = _mm_set1_epi8(dst[-1]);
+ _mm_storeu_si128((__m128i*)dst, values);
+ dst += BPS;
+ }
+}
+
+static WEBP_INLINE void Put16(uint8_t v, uint8_t* dst) {
+ int j;
+ const __m128i values = _mm_set1_epi8(v);
+ for (j = 0; j < 16; ++j) {
+ _mm_storeu_si128((__m128i*)(dst + j * BPS), values);
+ }
+}
+
+static void DC16(uint8_t* dst) { // DC
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i top = _mm_loadu_si128((const __m128i*)(dst - BPS));
+ const __m128i sad8x2 = _mm_sad_epu8(top, zero);
+ // sum the two sads: sad8x2[0:1] + sad8x2[8:9]
+ const __m128i sum = _mm_add_epi16(sad8x2, _mm_shuffle_epi32(sad8x2, 2));
+ int left = 0;
+ int j;
+ for (j = 0; j < 16; ++j) {
+ left += dst[-1 + j * BPS];
+ }
+ {
+ const int DC = _mm_cvtsi128_si32(sum) + left + 16;
+ Put16(DC >> 5, dst);
+ }
+}
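Editorial aside, not part of the patch: a scalar equivalent of DC16() above. The SSE2 version gets the top-row sum in one _mm_sad_epu8 against zero, while the left column is summed with a scalar loop in both versions (BPS = 32 is the stride from dsp.h; names are illustrative):

#include <stdint.h>
#include <string.h>
#define BPS 32

static void DC16_sketch(uint8_t* dst) {
  int dc = 16, j;                          /* +16 -> rounding before >> 5 */
  for (j = 0; j < 16; ++j) {
    dc += dst[-1 + j * BPS];               /* left column */
    dc += dst[j - BPS];                    /* top row     */
  }
  {
    const uint8_t v = (uint8_t)(dc >> 5);  /* average of 32 samples */
    for (j = 0; j < 16; ++j) memset(dst + j * BPS, v, 16);
  }
}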
+
+static void DC16NoTop(uint8_t* dst) { // DC with top samples not available
+ int DC = 8;
+ int j;
+ for (j = 0; j < 16; ++j) {
+ DC += dst[-1 + j * BPS];
+ }
+ Put16(DC >> 4, dst);
+}
+
+static void DC16NoLeft(uint8_t* dst) { // DC with left samples not available
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i top = _mm_loadu_si128((const __m128i*)(dst - BPS));
+ const __m128i sad8x2 = _mm_sad_epu8(top, zero);
+ // sum the two sads: sad8x2[0:1] + sad8x2[8:9]
+ const __m128i sum = _mm_add_epi16(sad8x2, _mm_shuffle_epi32(sad8x2, 2));
+ const int DC = _mm_cvtsi128_si32(sum) + 8;
+ Put16(DC >> 4, dst);
+}
- VP8VFilter16 = VFilter16SSE2;
- VP8HFilter16 = HFilter16SSE2;
- VP8VFilter8 = VFilter8SSE2;
- VP8HFilter8 = HFilter8SSE2;
- VP8VFilter16i = VFilter16iSSE2;
- VP8HFilter16i = HFilter16iSSE2;
- VP8VFilter8i = VFilter8iSSE2;
- VP8HFilter8i = HFilter8iSSE2;
+static void DC16NoTopLeft(uint8_t* dst) { // DC with no top and left samples
+ Put16(0x80, dst);
+}
+
+//------------------------------------------------------------------------------
+// Chroma
+
+static void VE8uv(uint8_t* dst) { // vertical
+ int j;
+ const __m128i top = _mm_loadl_epi64((const __m128i*)(dst - BPS));
+ for (j = 0; j < 8; ++j) {
+ _mm_storel_epi64((__m128i*)(dst + j * BPS), top);
+ }
+}
+
+static void HE8uv(uint8_t* dst) { // horizontal
+ int j;
+ for (j = 0; j < 8; ++j) {
+ const __m128i values = _mm_set1_epi8(dst[-1]);
+ _mm_storel_epi64((__m128i*)dst, values);
+ dst += BPS;
+ }
+}
+
+// helper for chroma-DC predictions
+static WEBP_INLINE void Put8x8uv(uint8_t v, uint8_t* dst) {
+ int j;
+ const __m128i values = _mm_set1_epi8(v);
+ for (j = 0; j < 8; ++j) {
+ _mm_storel_epi64((__m128i*)(dst + j * BPS), values);
+ }
+}
- VP8SimpleVFilter16 = SimpleVFilter16SSE2;
- VP8SimpleHFilter16 = SimpleHFilter16SSE2;
- VP8SimpleVFilter16i = SimpleVFilter16iSSE2;
- VP8SimpleHFilter16i = SimpleHFilter16iSSE2;
+static void DC8uv(uint8_t* dst) { // DC
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i top = _mm_loadl_epi64((const __m128i*)(dst - BPS));
+ const __m128i sum = _mm_sad_epu8(top, zero);
+ int left = 0;
+ int j;
+ for (j = 0; j < 8; ++j) {
+ left += dst[-1 + j * BPS];
+ }
+ {
+ const int DC = _mm_cvtsi128_si32(sum) + left + 8;
+ Put8x8uv(DC >> 4, dst);
+ }
}
-#if defined(__cplusplus) || defined(c_plusplus)
-} // extern "C"
+static void DC8uvNoLeft(uint8_t* dst) { // DC with no left samples
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i top = _mm_loadl_epi64((const __m128i*)(dst - BPS));
+ const __m128i sum = _mm_sad_epu8(top, zero);
+ const int DC = _mm_cvtsi128_si32(sum) + 4;
+ Put8x8uv(DC >> 3, dst);
+}
+
+static void DC8uvNoTop(uint8_t* dst) { // DC with no top samples
+ int dc0 = 4;
+ int i;
+ for (i = 0; i < 8; ++i) {
+ dc0 += dst[-1 + i * BPS];
+ }
+ Put8x8uv(dc0 >> 3, dst);
+}
+
+static void DC8uvNoTopLeft(uint8_t* dst) { // DC with nothing
+ Put8x8uv(0x80, dst);
+}
+
+//------------------------------------------------------------------------------
+// Entry point
+
+extern void VP8DspInitSSE2(void);
+
+WEBP_TSAN_IGNORE_FUNCTION void VP8DspInitSSE2(void) {
+ VP8Transform = Transform;
+#if defined(USE_TRANSFORM_AC3)
+ VP8TransformAC3 = TransformAC3;
#endif
-#endif // WEBP_USE_SSE2
+ VP8VFilter16 = VFilter16;
+ VP8HFilter16 = HFilter16;
+ VP8VFilter8 = VFilter8;
+ VP8HFilter8 = HFilter8;
+ VP8VFilter16i = VFilter16i;
+ VP8HFilter16i = HFilter16i;
+ VP8VFilter8i = VFilter8i;
+ VP8HFilter8i = HFilter8i;
+
+ VP8SimpleVFilter16 = SimpleVFilter16;
+ VP8SimpleHFilter16 = SimpleHFilter16;
+ VP8SimpleVFilter16i = SimpleVFilter16i;
+ VP8SimpleHFilter16i = SimpleHFilter16i;
+
+ VP8PredLuma4[1] = TM4;
+ VP8PredLuma4[2] = VE4;
+ VP8PredLuma4[4] = RD4;
+ VP8PredLuma4[5] = VR4;
+ VP8PredLuma4[6] = LD4;
+ VP8PredLuma4[7] = VL4;
+
+ VP8PredLuma16[0] = DC16;
+ VP8PredLuma16[1] = TM16;
+ VP8PredLuma16[2] = VE16;
+ VP8PredLuma16[3] = HE16;
+ VP8PredLuma16[4] = DC16NoTop;
+ VP8PredLuma16[5] = DC16NoLeft;
+ VP8PredLuma16[6] = DC16NoTopLeft;
+
+ VP8PredChroma8[0] = DC8uv;
+ VP8PredChroma8[1] = TM8uv;
+ VP8PredChroma8[2] = VE8uv;
+ VP8PredChroma8[3] = HE8uv;
+ VP8PredChroma8[4] = DC8uvNoTop;
+ VP8PredChroma8[5] = DC8uvNoLeft;
+ VP8PredChroma8[6] = DC8uvNoTopLeft;
+}
+
+#else // !WEBP_USE_SSE2
+
+WEBP_DSP_INIT_STUB(VP8DspInitSSE2)
+
+#endif // WEBP_USE_SSE2
diff --git a/drivers/webp/dsp/dsp.h b/drivers/webp/dsp/dsp.h
index fd686a8532..8395df40e4 100644
--- a/drivers/webp/dsp/dsp.h
+++ b/drivers/webp/dsp/dsp.h
@@ -1,8 +1,10 @@
// Copyright 2011 Google Inc. All Rights Reserved.
//
-// This code is licensed under the same terms as WebM:
-// Software License Agreement: http://www.webmproject.org/license/software/
-// Additional IP Rights Grant: http://www.webmproject.org/license/additional/
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// Speed-critical functions.
@@ -12,44 +14,121 @@
#ifndef WEBP_DSP_DSP_H_
#define WEBP_DSP_DSP_H_
-#include "../types.h"
+#ifdef HAVE_CONFIG_H
+#include "../webp/config.h"
+#endif
+
+#include "../webp/types.h"
-#if defined(__cplusplus) || defined(c_plusplus)
+#ifdef __cplusplus
extern "C" {
#endif
+#define BPS 32 // this is the common stride for enc/dec
+
//------------------------------------------------------------------------------
// CPU detection
-#if defined(_MSC_VER) && (defined(_M_X64) || defined(_M_IX86))
+#if defined(__GNUC__)
+# define LOCAL_GCC_VERSION ((__GNUC__ << 8) | __GNUC_MINOR__)
+# define LOCAL_GCC_PREREQ(maj, min) \
+ (LOCAL_GCC_VERSION >= (((maj) << 8) | (min)))
+#else
+# define LOCAL_GCC_VERSION 0
+# define LOCAL_GCC_PREREQ(maj, min) 0
+#endif
+
+#ifndef __has_builtin
+# define __has_builtin(x) 0
+#endif
+
+#if defined(_MSC_VER) && _MSC_VER > 1310 && \
+ (defined(_M_X64) || defined(_M_IX86))
#define WEBP_MSC_SSE2 // Visual C++ SSE2 targets
#endif
-#if defined(__SSE2__) || defined(WEBP_MSC_SSE2)
+#if defined(_MSC_VER) && _MSC_VER >= 1500 && \
+ (defined(_M_X64) || defined(_M_IX86))
+#define WEBP_MSC_SSE41 // Visual C++ SSE4.1 targets
+#endif
+
+// WEBP_HAVE_* are used to indicate the presence of the instruction set in dsp
+// files without intrinsics, allowing the corresponding Init() to be called.
+// Files containing intrinsics will need to be built targeting the instruction
+// set so should succeed on one of the earlier tests.
+#if defined(__SSE2__) || defined(WEBP_MSC_SSE2) || defined(WEBP_HAVE_SSE2)
#define WEBP_USE_SSE2
#endif
-#if defined(__ANDROID__) && defined(__ARM_ARCH_7A__) && defined(__ARM_NEON__)
+#if defined(__SSE4_1__) || defined(WEBP_MSC_SSE41) || defined(WEBP_HAVE_SSE41)
+#define WEBP_USE_SSE41
+#endif
+
+#if defined(__AVX2__) || defined(WEBP_HAVE_AVX2)
+#define WEBP_USE_AVX2
+#endif
+
+#if defined(__ANDROID__) && defined(__ARM_ARCH_7A__)
#define WEBP_ANDROID_NEON // Android targets that might support NEON
#endif
-#if ( (defined(__ARM_NEON__) && !defined(__aarch64__)) || defined(WEBP_ANDROID_NEON)) && !defined(PSP2_ENABLED)
+// The intrinsics currently cause compiler errors with arm-nacl-gcc and the
+// inline assembly would need to be modified for use with Native Client.
+#if (defined(__ARM_NEON__) || defined(WEBP_ANDROID_NEON) || \
+ defined(__aarch64__)) && !defined(__native_client__)
+#define WEBP_USE_NEON
+#endif
+
+#if defined(_MSC_VER) && _MSC_VER >= 1700 && defined(_M_ARM)
#define WEBP_USE_NEON
+#define WEBP_USE_INTRINSICS
+#endif
+
+#if defined(__mips__) && !defined(__mips64) && \
+ defined(__mips_isa_rev) && (__mips_isa_rev >= 1) && (__mips_isa_rev < 6)
+#define WEBP_USE_MIPS32
+#if (__mips_isa_rev >= 2)
+#define WEBP_USE_MIPS32_R2
+#if defined(__mips_dspr2) || (__mips_dsp_rev >= 2)
+#define WEBP_USE_MIPS_DSP_R2
+#endif
+#endif
+#endif
+
+// This macro prevents thread_sanitizer from reporting known concurrent writes.
+#define WEBP_TSAN_IGNORE_FUNCTION
+#if defined(__has_feature)
+#if __has_feature(thread_sanitizer)
+#undef WEBP_TSAN_IGNORE_FUNCTION
+#define WEBP_TSAN_IGNORE_FUNCTION __attribute__((no_sanitize_thread))
+#endif
#endif
typedef enum {
kSSE2,
kSSE3,
- kNEON
+ kSSE4_1,
+ kAVX,
+ kAVX2,
+ kNEON,
+ kMIPS32,
+ kMIPSdspR2
} CPUFeature;
// returns true if the CPU supports the feature.
typedef int (*VP8CPUInfo)(CPUFeature feature);
-extern VP8CPUInfo VP8GetCPUInfo;
+WEBP_EXTERN(VP8CPUInfo) VP8GetCPUInfo;
//------------------------------------------------------------------------------
-// Encoding
+// Init stub generator
-int VP8GetAlpha(const int histo[]);
+// Defines an init function stub to ensure each module exposes a symbol,
+// avoiding a compiler warning.
+#define WEBP_DSP_INIT_STUB(func) \
+ extern void func(void); \
+ WEBP_TSAN_IGNORE_FUNCTION void func(void) {}
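As a concrete example (editorial aside), the stub used at the bottom of dec_sse2.c above, WEBP_DSP_INIT_STUB(VP8DspInitSSE2), expands to:

extern void VP8DspInitSSE2(void);
WEBP_TSAN_IGNORE_FUNCTION void VP8DspInitSSE2(void) {}

so the symbol still exists, and calling it is a harmless no-op, in builds where WEBP_USE_SSE2 is not defined.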
+
+//------------------------------------------------------------------------------
+// Encoding
// Transforms
// VP8Idct: Does one of two inverse transforms. If do_two is set, the transforms
@@ -60,7 +139,7 @@ typedef void (*VP8Fdct)(const uint8_t* src, const uint8_t* ref, int16_t* out);
typedef void (*VP8WHT)(const int16_t* in, int16_t* out);
extern VP8Idct VP8ITransform;
extern VP8Fdct VP8FTransform;
-extern VP8WHT VP8ITransformWHT;
+extern VP8Fdct VP8FTransform2; // performs two transforms at a time
extern VP8WHT VP8FTransformWHT;
// Predictions
// *dst is the destination block. *top and *left can be NULL.
@@ -79,20 +158,63 @@ extern VP8WMetric VP8TDisto4x4, VP8TDisto16x16;
typedef void (*VP8BlockCopy)(const uint8_t* src, uint8_t* dst);
extern VP8BlockCopy VP8Copy4x4;
+extern VP8BlockCopy VP8Copy16x8;
// Quantization
struct VP8Matrix; // forward declaration
typedef int (*VP8QuantizeBlock)(int16_t in[16], int16_t out[16],
- int n, const struct VP8Matrix* const mtx);
+ const struct VP8Matrix* const mtx);
+// Same as VP8QuantizeBlock, but quantizes two consecutive blocks.
+typedef int (*VP8Quantize2Blocks)(int16_t in[32], int16_t out[32],
+ const struct VP8Matrix* const mtx);
+
extern VP8QuantizeBlock VP8EncQuantizeBlock;
+extern VP8Quantize2Blocks VP8EncQuantize2Blocks;
+
+// specific to 2nd transform:
+typedef int (*VP8QuantizeBlockWHT)(int16_t in[16], int16_t out[16],
+ const struct VP8Matrix* const mtx);
+extern VP8QuantizeBlockWHT VP8EncQuantizeBlockWHT;
-// Compute susceptibility based on DCT-coeff histograms:
-// the higher, the "easier" the macroblock is to compress.
-typedef int (*VP8CHisto)(const uint8_t* ref, const uint8_t* pred,
- int start_block, int end_block);
extern const int VP8DspScan[16 + 4 + 4];
+
+// Collect histogram for susceptibility calculation.
+#define MAX_COEFF_THRESH 31 // size of histogram used by CollectHistogram.
+typedef struct {
+ // We only need to store max_value and last_non_zero, not the distribution.
+ int max_value;
+ int last_non_zero;
+} VP8Histogram;
+typedef void (*VP8CHisto)(const uint8_t* ref, const uint8_t* pred,
+ int start_block, int end_block,
+ VP8Histogram* const histo);
extern VP8CHisto VP8CollectHistogram;
+// General-purpose util function to help VP8CollectHistogram().
+void VP8SetHistogramData(const int distribution[MAX_COEFF_THRESH + 1],
+ VP8Histogram* const histo);
+
+// must be called before using any of the above
+void VP8EncDspInit(void);
+
+//------------------------------------------------------------------------------
+// cost functions (encoding)
+
+extern const uint16_t VP8EntropyCost[256]; // 8bit fixed-point log(p)
+// approximate cost per level:
+extern const uint16_t VP8LevelFixedCosts[2047 /*MAX_LEVEL*/ + 1];
+extern const uint8_t VP8EncBands[16 + 1];
+
+struct VP8Residual;
+typedef void (*VP8SetResidualCoeffsFunc)(const int16_t* const coeffs,
+ struct VP8Residual* const res);
+extern VP8SetResidualCoeffsFunc VP8SetResidualCoeffs;
-void VP8EncDspInit(void); // must be called before using any of the above
+// Cost calculation function.
+typedef int (*VP8GetResidualCostFunc)(int ctx0,
+ const struct VP8Residual* const res);
+extern VP8GetResidualCostFunc VP8GetResidualCost;
+
+// must be called before anything using the above
+void VP8EncDspCostInit(void);
//------------------------------------------------------------------------------
// Decoding
@@ -101,17 +223,26 @@ typedef void (*VP8DecIdct)(const int16_t* coeffs, uint8_t* dst);
// when doing two transforms, coeffs is actually int16_t[2][16].
typedef void (*VP8DecIdct2)(const int16_t* coeffs, uint8_t* dst, int do_two);
extern VP8DecIdct2 VP8Transform;
+extern VP8DecIdct VP8TransformAC3;
extern VP8DecIdct VP8TransformUV;
extern VP8DecIdct VP8TransformDC;
extern VP8DecIdct VP8TransformDCUV;
-extern void (*VP8TransformWHT)(const int16_t* in, int16_t* out);
+extern VP8WHT VP8TransformWHT;
// *dst is the destination block, with stride BPS. Boundary samples are
// assumed accessible when needed.
typedef void (*VP8PredFunc)(uint8_t* dst);
-extern const VP8PredFunc VP8PredLuma16[/* NUM_B_DC_MODES */];
-extern const VP8PredFunc VP8PredChroma8[/* NUM_B_DC_MODES */];
-extern const VP8PredFunc VP8PredLuma4[/* NUM_BMODES */];
+extern VP8PredFunc VP8PredLuma16[/* NUM_B_DC_MODES */];
+extern VP8PredFunc VP8PredChroma8[/* NUM_B_DC_MODES */];
+extern VP8PredFunc VP8PredLuma4[/* NUM_BMODES */];
+
+// clipping tables (for filtering)
+extern const int8_t* const VP8ksclip1; // clips [-1020, 1020] to [-128, 127]
+extern const int8_t* const VP8ksclip2; // clips [-112, 112] to [-16, 15]
+extern const uint8_t* const VP8kclip1; // clips [-255,511] to [0,255]
+extern const uint8_t* const VP8kabs0; // abs(x) for x in [-255,255]
+// must be called first
+void VP8InitClipTables(void);
// simple filter (only for luma)
typedef void (*VP8SimpleFilterFunc)(uint8_t* p, int stride, int thresh);
@@ -145,6 +276,8 @@ void VP8DspInit(void);
#define FANCY_UPSAMPLING // undefined to remove fancy upsampling support
+// Convert a pair of y/u/v lines together to the output rgb/a colorspace.
+// bottom_y can be NULL if only one line of output is needed (at top/bottom).
typedef void (*WebPUpsampleLinePairFunc)(
const uint8_t* top_y, const uint8_t* bottom_y,
const uint8_t* top_u, const uint8_t* top_v,
@@ -156,18 +289,20 @@ typedef void (*WebPUpsampleLinePairFunc)(
// Fancy upsampling functions to convert YUV to RGB(A) modes
extern WebPUpsampleLinePairFunc WebPUpsamplers[/* MODE_LAST */];
-// Initializes SSE2 version of the fancy upsamplers.
-void WebPInitUpsamplersSSE2(void);
-
#endif // FANCY_UPSAMPLING
-// Point-sampling methods.
-typedef void (*WebPSampleLinePairFunc)(
- const uint8_t* top_y, const uint8_t* bottom_y,
- const uint8_t* u, const uint8_t* v,
- uint8_t* top_dst, uint8_t* bottom_dst, int len);
+// Per-row point-sampling methods.
+typedef void (*WebPSamplerRowFunc)(const uint8_t* y,
+ const uint8_t* u, const uint8_t* v,
+ uint8_t* dst, int len);
+// Generic function to apply 'WebPSamplerRowFunc' to the whole plane:
+void WebPSamplerProcessPlane(const uint8_t* y, int y_stride,
+ const uint8_t* u, const uint8_t* v, int uv_stride,
+ uint8_t* dst, int dst_stride,
+ int width, int height, WebPSamplerRowFunc func);
-extern const WebPSampleLinePairFunc WebPSamplers[/* MODE_LAST */];
+// Sampling functions to convert rows of YUV to RGB(A)
+extern WebPSamplerRowFunc WebPSamplers[/* MODE_LAST */];
// General function for converting two lines of ARGB or RGBA.
// 'alpha_is_last' should be true if 0xff000000 is stored in memory as
@@ -179,13 +314,84 @@ typedef void (*WebPYUV444Converter)(const uint8_t* y,
const uint8_t* u, const uint8_t* v,
uint8_t* dst, int len);
-extern const WebPYUV444Converter WebPYUV444Converters[/* MODE_LAST */];
+extern WebPYUV444Converter WebPYUV444Converters[/* MODE_LAST */];
-// Main function to be called
+// Must be called before using the WebPUpsamplers[] (and for premultiplied
+// colorspaces like rgbA, rgbA4444, etc)
void WebPInitUpsamplers(void);
+// Must be called before using WebPSamplers[]
+void WebPInitSamplers(void);
+// Must be called before using WebPYUV444Converters[]
+void WebPInitYUV444Converters(void);
//------------------------------------------------------------------------------
-// Pre-multiply planes with alpha values
+// ARGB -> YUV converters
+
+// Convert ARGB samples to luma Y.
+extern void (*WebPConvertARGBToY)(const uint32_t* argb, uint8_t* y, int width);
+// Convert ARGB samples to U/V with downsampling. do_store should be '1' for
+// even lines and '0' for odd ones. 'src_width' is the original width, not
+// the U/V one.
+extern void (*WebPConvertARGBToUV)(const uint32_t* argb, uint8_t* u, uint8_t* v,
+ int src_width, int do_store);
+
+// Convert a row of accumulated rgba32 values (sums of four samples) to U/V
+extern void (*WebPConvertRGBA32ToUV)(const uint16_t* rgb,
+ uint8_t* u, uint8_t* v, int width);
+
+// Convert RGB or BGR to Y
+extern void (*WebPConvertRGB24ToY)(const uint8_t* rgb, uint8_t* y, int width);
+extern void (*WebPConvertBGR24ToY)(const uint8_t* bgr, uint8_t* y, int width);
+
+// used for plain-C fallback.
+extern void WebPConvertARGBToUV_C(const uint32_t* argb, uint8_t* u, uint8_t* v,
+ int src_width, int do_store);
+extern void WebPConvertRGBA32ToUV_C(const uint16_t* rgb,
+ uint8_t* u, uint8_t* v, int width);
+
+// Must be called before using the above.
+void WebPInitConvertARGBToYUV(void);
+
+//------------------------------------------------------------------------------
+// Rescaler
+
+struct WebPRescaler;
+
+// Import a row of data and save its contribution in the rescaler.
+// 'channel' denotes the channel number to be imported. 'Expand' corresponds to
+// the wrk->x_expand case. Otherwise, 'Shrink' is to be used.
+typedef void (*WebPRescalerImportRowFunc)(struct WebPRescaler* const wrk,
+ const uint8_t* src);
+
+extern WebPRescalerImportRowFunc WebPRescalerImportRowExpand;
+extern WebPRescalerImportRowFunc WebPRescalerImportRowShrink;
+
+// Export one row (starting at x_out position) from rescaler.
+// 'Expand' corresponds to the wrk->y_expand case.
+// Otherwise 'Shrink' is to be used
+typedef void (*WebPRescalerExportRowFunc)(struct WebPRescaler* const wrk);
+extern WebPRescalerExportRowFunc WebPRescalerExportRowExpand;
+extern WebPRescalerExportRowFunc WebPRescalerExportRowShrink;
+
+// Plain-C implementation, as fall-back.
+extern void WebPRescalerImportRowExpandC(struct WebPRescaler* const wrk,
+ const uint8_t* src);
+extern void WebPRescalerImportRowShrinkC(struct WebPRescaler* const wrk,
+ const uint8_t* src);
+extern void WebPRescalerExportRowExpandC(struct WebPRescaler* const wrk);
+extern void WebPRescalerExportRowShrinkC(struct WebPRescaler* const wrk);
+
+// Main entry calls:
+extern void WebPRescalerImportRow(struct WebPRescaler* const wrk,
+ const uint8_t* src);
+// Export one row (starting at x_out position) from rescaler.
+extern void WebPRescalerExportRow(struct WebPRescaler* const wrk);
+
+// Must be called first before using the above.
+void WebPRescalerDspInit(void);
+
+//------------------------------------------------------------------------------
+// Utilities for processing transparent channel.
// Apply alpha pre-multiply on an rgba, bgra or argb plane of size w * h.
// alpha_first should be 0 for argb, 1 for rgba or bgra (where alpha is last).
@@ -196,14 +402,98 @@ extern void (*WebPApplyAlphaMultiply)(
extern void (*WebPApplyAlphaMultiply4444)(
uint8_t* rgba4444, int w, int h, int stride);
+// Dispatch the values from alpha[] plane to the ARGB destination 'dst'.
+// Returns true if alpha[] plane has non-trivial values different from 0xff.
+extern int (*WebPDispatchAlpha)(const uint8_t* alpha, int alpha_stride,
+ int width, int height,
+ uint8_t* dst, int dst_stride);
+
+// Transfer packed 8b alpha[] values to green channel in dst[], zero'ing the
+// A/R/B values. 'dst_stride' is the stride for dst[] in uint32_t units.
+extern void (*WebPDispatchAlphaToGreen)(const uint8_t* alpha, int alpha_stride,
+ int width, int height,
+ uint32_t* dst, int dst_stride);
+
+// Extract the alpha values from 32b values in argb[] and pack them into alpha[]
+// (this is the opposite of WebPDispatchAlpha).
+// Returns true if there's only trivial 0xff alpha values.
+extern int (*WebPExtractAlpha)(const uint8_t* argb, int argb_stride,
+ int width, int height,
+ uint8_t* alpha, int alpha_stride);
+
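Editorial aside, not part of the patch: a hedged plain-C sketch of the WebPExtractAlpha contract described above. The alpha-first pixel layout and the byte-based argb_stride are assumptions for illustration; the dispatched implementations live elsewhere.

#include <stdint.h>

static int ExtractAlpha_sketch(const uint8_t* argb, int argb_stride,
                               int width, int height,
                               uint8_t* alpha, int alpha_stride) {
  uint8_t alpha_mask = 0xff;
  int i, j;
  for (j = 0; j < height; ++j) {
    for (i = 0; i < width; ++i) {
      const uint8_t a = argb[4 * i];     /* assumed alpha-first layout */
      alpha[i] = a;
      alpha_mask &= a;
    }
    argb += argb_stride;
    alpha += alpha_stride;
  }
  return (alpha_mask == 0xff);           /* true iff every alpha was 0xff */
}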
+// Pre-Multiply operation transforms x into x * A / 255 (where x=Y,R,G or B).
+// Un-Multiply operation transforms x into x * 255 / A.
+
+// Pre-Multiply or Un-Multiply (if 'inverse' is true) argb values in a row.
+extern void (*WebPMultARGBRow)(uint32_t* const ptr, int width, int inverse);
+
+// Same as WebPMultARGBRow(), but for several rows.
+void WebPMultARGBRows(uint8_t* ptr, int stride, int width, int num_rows,
+ int inverse);
+
+// Same for a row of single values, with side alpha values.
+extern void (*WebPMultRow)(uint8_t* const ptr, const uint8_t* const alpha,
+ int width, int inverse);
+
+// Same as WebPMultRow(), but for several 'num_rows' rows.
+void WebPMultRows(uint8_t* ptr, int stride,
+ const uint8_t* alpha, int alpha_stride,
+ int width, int num_rows, int inverse);
+
+// Plain-C versions, used as fallback by some implementations.
+void WebPMultRowC(uint8_t* const ptr, const uint8_t* const alpha,
+ int width, int inverse);
+void WebPMultARGBRowC(uint32_t* const ptr, int width, int inverse);
+
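Editorial aside, not part of the patch: a sketch of the per-pixel math described above for WebPMultRow(). The rounding choices and the zero-alpha handling are illustrative assumptions, not the library's exact behavior; names are illustrative.

#include <stdint.h>

static void MultRow_sketch(uint8_t* const ptr, const uint8_t* const alpha,
                           int width, int inverse) {
  int x;
  for (x = 0; x < width; ++x) {
    const uint32_t a = alpha[x];
    if (a != 255) {
      uint32_t v;
      if (!inverse) {
        v = (ptr[x] * a + 127) / 255;          /* pre-multiply: x * A / 255 */
      } else if (a == 0) {
        v = 0;                                 /* nothing to recover */
      } else {
        v = (ptr[x] * 255u + a / 2) / a;       /* un-multiply: x * 255 / A */
        if (v > 255) v = 255;
      }
      ptr[x] = (uint8_t)v;
    }
  }
}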
// To be called first before using the above.
-void WebPInitPremultiply(void);
+void WebPInitAlphaProcessing(void);
+
+// ARGB packing function: a/r/g/b input is rgba or bgra order.
+extern void (*VP8PackARGB)(const uint8_t* a, const uint8_t* r,
+ const uint8_t* g, const uint8_t* b, int len,
+ uint32_t* out);
+
+// RGB packing function. 'step' can be 3 or 4. r/g/b input is rgb or bgr order.
+extern void (*VP8PackRGB)(const uint8_t* r, const uint8_t* g, const uint8_t* b,
+ int len, int step, uint32_t* out);
-void WebPInitPremultiplySSE2(void); // should not be called directly.
+// To be called first before using the above.
+void VP8EncDspARGBInit(void);
//------------------------------------------------------------------------------
+// Filter functions
+
+typedef enum { // Filter types.
+ WEBP_FILTER_NONE = 0,
+ WEBP_FILTER_HORIZONTAL,
+ WEBP_FILTER_VERTICAL,
+ WEBP_FILTER_GRADIENT,
+ WEBP_FILTER_LAST = WEBP_FILTER_GRADIENT + 1, // end marker
+ WEBP_FILTER_BEST, // meta-types
+ WEBP_FILTER_FAST
+} WEBP_FILTER_TYPE;
+
+typedef void (*WebPFilterFunc)(const uint8_t* in, int width, int height,
+ int stride, uint8_t* out);
+typedef void (*WebPUnfilterFunc)(int width, int height, int stride,
+ int row, int num_rows, uint8_t* data);
+
+// Filter the given data using the given predictor.
+// 'in' corresponds to a 2-dimensional pixel array of size (stride * height)
+// in raster order.
+// 'stride' is number of bytes per scan line (with possible padding).
+// 'out' should be pre-allocated.
+extern WebPFilterFunc WebPFilters[WEBP_FILTER_LAST];
+
+// In-place reconstruct the original data from the given filtered data.
+// The reconstruction will be done for 'num_rows' rows starting from 'row'
+// (assuming rows up to 'row - 1' are already reconstructed).
+extern WebPUnfilterFunc WebPUnfilters[WEBP_FILTER_LAST];
+
+// To be called first before using the above.
+void VP8FiltersInit(void);
-#if defined(__cplusplus) || defined(c_plusplus)
+#ifdef __cplusplus
} // extern "C"
#endif
diff --git a/drivers/webp/dsp/enc.c b/drivers/webp/dsp/enc.c
index 02234564be..95e63f89ab 100644
--- a/drivers/webp/dsp/enc.c
+++ b/drivers/webp/dsp/enc.c
@@ -1,47 +1,34 @@
// Copyright 2011 Google Inc. All Rights Reserved.
//
-// This code is licensed under the same terms as WebM:
-// Software License Agreement: http://www.webmproject.org/license/software/
-// Additional IP Rights Grant: http://www.webmproject.org/license/additional/
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// Speed-critical encoding functions.
//
// Author: Skal (pascal.massimino@gmail.com)
+#include <assert.h>
#include <stdlib.h> // for abs()
+
#include "./dsp.h"
#include "../enc/vp8enci.h"
-#if defined(__cplusplus) || defined(c_plusplus)
-extern "C" {
-#endif
+static WEBP_INLINE uint8_t clip_8b(int v) {
+ return (!(v & ~0xff)) ? v : (v < 0) ? 0 : 255;
+}
+
+static WEBP_INLINE int clip_max(int v, int max) {
+ return (v > max) ? max : v;
+}
//------------------------------------------------------------------------------
// Compute susceptibility based on DCT-coeff histograms:
// the higher, the "easier" the macroblock is to compress.
-static int ClipAlpha(int alpha) {
- return alpha < 0 ? 0 : alpha > 255 ? 255 : alpha;
-}
-
-int VP8GetAlpha(const int histo[MAX_COEFF_THRESH + 1]) {
- int num = 0, den = 0, val = 0;
- int k;
- int alpha;
- // note: changing this loop to avoid the numerous "k + 1" slows things down.
- for (k = 0; k < MAX_COEFF_THRESH; ++k) {
- if (histo[k + 1]) {
- val += histo[k + 1];
- num += val * (k + 1);
- den += (k + 1) * (k + 1);
- }
- }
- // we scale the value to a usable [0..255] range
- alpha = den ? 10 * num / den - 5 : 0;
- return ClipAlpha(alpha);
-}
-
const int VP8DspScan[16 + 4 + 4] = {
// Luma
0 + 0 * BPS, 4 + 0 * BPS, 8 + 0 * BPS, 12 + 0 * BPS,
@@ -53,27 +40,41 @@ const int VP8DspScan[16 + 4 + 4] = {
8 + 0 * BPS, 12 + 0 * BPS, 8 + 4 * BPS, 12 + 4 * BPS // V
};
-static int CollectHistogram(const uint8_t* ref, const uint8_t* pred,
- int start_block, int end_block) {
- int histo[MAX_COEFF_THRESH + 1] = { 0 };
- int16_t out[16];
- int j, k;
+// general-purpose util function
+void VP8SetHistogramData(const int distribution[MAX_COEFF_THRESH + 1],
+ VP8Histogram* const histo) {
+ int max_value = 0, last_non_zero = 1;
+ int k;
+ for (k = 0; k <= MAX_COEFF_THRESH; ++k) {
+ const int value = distribution[k];
+ if (value > 0) {
+ if (value > max_value) max_value = value;
+ last_non_zero = k;
+ }
+ }
+ histo->max_value = max_value;
+ histo->last_non_zero = last_non_zero;
+}
+
+static void CollectHistogram(const uint8_t* ref, const uint8_t* pred,
+ int start_block, int end_block,
+ VP8Histogram* const histo) {
+ int j;
+ int distribution[MAX_COEFF_THRESH + 1] = { 0 };
for (j = start_block; j < end_block; ++j) {
- VP8FTransform(ref + VP8DspScan[j], pred + VP8DspScan[j], out);
+ int k;
+ int16_t out[16];
- // Convert coefficients to bin (within out[]).
- for (k = 0; k < 16; ++k) {
- const int v = abs(out[k]) >> 2;
- out[k] = (v > MAX_COEFF_THRESH) ? MAX_COEFF_THRESH : v;
- }
+ VP8FTransform(ref + VP8DspScan[j], pred + VP8DspScan[j], out);
- // Use bin to update histogram.
+ // Convert coefficients to bin.
for (k = 0; k < 16; ++k) {
- histo[out[k]]++;
+ const int v = abs(out[k]) >> 3; // TODO(skal): add rounding?
+ const int clipped_value = clip_max(v, MAX_COEFF_THRESH);
+ ++distribution[clipped_value];
}
}
-
- return VP8GetAlpha(histo);
+ VP8SetHistogramData(distribution, histo);
}
//------------------------------------------------------------------------------
@@ -85,19 +86,16 @@ static uint8_t clip1[255 + 510 + 1]; // clips [-255,510] to [0,255]
// and make sure it's set to true _last_ (so as to be thread-safe)
static volatile int tables_ok = 0;
-static void InitTables(void) {
+static WEBP_TSAN_IGNORE_FUNCTION void InitTables(void) {
if (!tables_ok) {
int i;
for (i = -255; i <= 255 + 255; ++i) {
- clip1[255 + i] = (i < 0) ? 0 : (i > 255) ? 255 : i;
+ clip1[255 + i] = clip_8b(i);
}
tables_ok = 1;
}
}
-static WEBP_INLINE uint8_t clip_8b(int v) {
- return (!(v & ~0xff)) ? v : v < 0 ? 0 : 255;
-}
//------------------------------------------------------------------------------
// Transforms (Paragraph 14.4)
@@ -154,84 +152,63 @@ static void FTransform(const uint8_t* src, const uint8_t* ref, int16_t* out) {
int i;
int tmp[16];
for (i = 0; i < 4; ++i, src += BPS, ref += BPS) {
- const int d0 = src[0] - ref[0];
+ const int d0 = src[0] - ref[0]; // 9bit dynamic range ([-255,255])
const int d1 = src[1] - ref[1];
const int d2 = src[2] - ref[2];
const int d3 = src[3] - ref[3];
- const int a0 = (d0 + d3) << 3;
- const int a1 = (d1 + d2) << 3;
- const int a2 = (d1 - d2) << 3;
- const int a3 = (d0 - d3) << 3;
- tmp[0 + i * 4] = (a0 + a1);
- tmp[1 + i * 4] = (a2 * 2217 + a3 * 5352 + 14500) >> 12;
- tmp[2 + i * 4] = (a0 - a1);
- tmp[3 + i * 4] = (a3 * 2217 - a2 * 5352 + 7500) >> 12;
+ const int a0 = (d0 + d3); // 10b [-510,510]
+ const int a1 = (d1 + d2);
+ const int a2 = (d1 - d2);
+ const int a3 = (d0 - d3);
+ tmp[0 + i * 4] = (a0 + a1) * 8; // 14b [-8160,8160]
+ tmp[1 + i * 4] = (a2 * 2217 + a3 * 5352 + 1812) >> 9; // [-7536,7542]
+ tmp[2 + i * 4] = (a0 - a1) * 8;
+ tmp[3 + i * 4] = (a3 * 2217 - a2 * 5352 + 937) >> 9;
}
for (i = 0; i < 4; ++i) {
- const int a0 = (tmp[0 + i] + tmp[12 + i]);
+ const int a0 = (tmp[0 + i] + tmp[12 + i]); // 15b
const int a1 = (tmp[4 + i] + tmp[ 8 + i]);
const int a2 = (tmp[4 + i] - tmp[ 8 + i]);
const int a3 = (tmp[0 + i] - tmp[12 + i]);
- out[0 + i] = (a0 + a1 + 7) >> 4;
+ out[0 + i] = (a0 + a1 + 7) >> 4; // 12b
out[4 + i] = ((a2 * 2217 + a3 * 5352 + 12000) >> 16) + (a3 != 0);
out[8 + i] = (a0 - a1 + 7) >> 4;
out[12+ i] = ((a3 * 2217 - a2 * 5352 + 51000) >> 16);
}
}
-static void ITransformWHT(const int16_t* in, int16_t* out) {
- int tmp[16];
- int i;
- for (i = 0; i < 4; ++i) {
- const int a0 = in[0 + i] + in[12 + i];
- const int a1 = in[4 + i] + in[ 8 + i];
- const int a2 = in[4 + i] - in[ 8 + i];
- const int a3 = in[0 + i] - in[12 + i];
- tmp[0 + i] = a0 + a1;
- tmp[8 + i] = a0 - a1;
- tmp[4 + i] = a3 + a2;
- tmp[12 + i] = a3 - a2;
- }
- for (i = 0; i < 4; ++i) {
- const int dc = tmp[0 + i * 4] + 3; // w/ rounder
- const int a0 = dc + tmp[3 + i * 4];
- const int a1 = tmp[1 + i * 4] + tmp[2 + i * 4];
- const int a2 = tmp[1 + i * 4] - tmp[2 + i * 4];
- const int a3 = dc - tmp[3 + i * 4];
- out[ 0] = (a0 + a1) >> 3;
- out[16] = (a3 + a2) >> 3;
- out[32] = (a0 - a1) >> 3;
- out[48] = (a3 - a2) >> 3;
- out += 64;
- }
+static void FTransform2(const uint8_t* src, const uint8_t* ref, int16_t* out) {
+ VP8FTransform(src, ref, out);
+ VP8FTransform(src + 4, ref + 4, out + 16);
}
static void FTransformWHT(const int16_t* in, int16_t* out) {
- int tmp[16];
+ // input is 12b signed
+ int32_t tmp[16];
int i;
for (i = 0; i < 4; ++i, in += 64) {
- const int a0 = (in[0 * 16] + in[2 * 16]) << 2;
- const int a1 = (in[1 * 16] + in[3 * 16]) << 2;
- const int a2 = (in[1 * 16] - in[3 * 16]) << 2;
- const int a3 = (in[0 * 16] - in[2 * 16]) << 2;
- tmp[0 + i * 4] = (a0 + a1) + (a0 != 0);
+ const int a0 = (in[0 * 16] + in[2 * 16]); // 13b
+ const int a1 = (in[1 * 16] + in[3 * 16]);
+ const int a2 = (in[1 * 16] - in[3 * 16]);
+ const int a3 = (in[0 * 16] - in[2 * 16]);
+ tmp[0 + i * 4] = a0 + a1; // 14b
tmp[1 + i * 4] = a3 + a2;
tmp[2 + i * 4] = a3 - a2;
tmp[3 + i * 4] = a0 - a1;
}
for (i = 0; i < 4; ++i) {
- const int a0 = (tmp[0 + i] + tmp[8 + i]);
+ const int a0 = (tmp[0 + i] + tmp[8 + i]); // 15b
const int a1 = (tmp[4 + i] + tmp[12+ i]);
const int a2 = (tmp[4 + i] - tmp[12+ i]);
const int a3 = (tmp[0 + i] - tmp[8 + i]);
- const int b0 = a0 + a1;
+ const int b0 = a0 + a1; // 16b
const int b1 = a3 + a2;
const int b2 = a3 - a2;
const int b3 = a0 - a1;
- out[ 0 + i] = (b0 + (b0 > 0) + 3) >> 3;
- out[ 4 + i] = (b1 + (b1 > 0) + 3) >> 3;
- out[ 8 + i] = (b2 + (b2 > 0) + 3) >> 3;
- out[12 + i] = (b3 + (b3 > 0) + 3) >> 3;
+ out[ 0 + i] = b0 >> 1; // 15b
+ out[ 4 + i] = b1 >> 1;
+ out[ 8 + i] = b2 >> 1;
+ out[12 + i] = b3 >> 1;
}
}
@@ -241,8 +218,6 @@ static void FTransformWHT(const int16_t* in, int16_t* out) {
//------------------------------------------------------------------------------
// Intra predictions
-#define DST(x, y) dst[(x) + (y) * BPS]
-
static WEBP_INLINE void Fill(uint8_t* dst, int value, int size) {
int j;
for (j = 0; j < size; ++j) {
@@ -253,7 +228,7 @@ static WEBP_INLINE void Fill(uint8_t* dst, int value, int size) {
static WEBP_INLINE void VerticalPred(uint8_t* dst,
const uint8_t* top, int size) {
int j;
- if (top) {
+ if (top != NULL) {
for (j = 0; j < size; ++j) memcpy(dst + j * BPS, top, size);
} else {
Fill(dst, 127, size);
@@ -262,7 +237,7 @@ static WEBP_INLINE void VerticalPred(uint8_t* dst,
static WEBP_INLINE void HorizontalPred(uint8_t* dst,
const uint8_t* left, int size) {
- if (left) {
+ if (left != NULL) {
int j;
for (j = 0; j < size; ++j) {
memset(dst + j * BPS, left[j], size);
@@ -275,8 +250,8 @@ static WEBP_INLINE void HorizontalPred(uint8_t* dst,
static WEBP_INLINE void TrueMotion(uint8_t* dst, const uint8_t* left,
const uint8_t* top, int size) {
int y;
- if (left) {
- if (top) {
+ if (left != NULL) {
+ if (top != NULL) {
const uint8_t* const clip = clip1 + 255 - left[-1];
for (y = 0; y < size; ++y) {
const uint8_t* const clip_table = clip + left[y];
@@ -294,7 +269,7 @@ static WEBP_INLINE void TrueMotion(uint8_t* dst, const uint8_t* left,
// is equivalent to VE prediction where you just copy the top samples.
// Note that if top samples are not available, the default value is
// then 129, and not 127 as in the VerticalPred case.
- if (top) {
+ if (top != NULL) {
VerticalPred(dst, top, size);
} else {
Fill(dst, 129, size);
@@ -307,15 +282,15 @@ static WEBP_INLINE void DCMode(uint8_t* dst, const uint8_t* left,
int size, int round, int shift) {
int DC = 0;
int j;
- if (top) {
+ if (top != NULL) {
for (j = 0; j < size; ++j) DC += top[j];
- if (left) { // top and left present
+ if (left != NULL) { // top and left present
for (j = 0; j < size; ++j) DC += left[j];
} else { // top, but no left
DC += DC;
}
DC = (DC + round) >> shift;
- } else if (left) { // left but no top
+ } else if (left != NULL) { // left but no top
for (j = 0; j < size; ++j) DC += left[j];
DC += DC;
DC = (DC + round) >> shift;
@@ -337,8 +312,8 @@ static void IntraChromaPreds(uint8_t* dst, const uint8_t* left,
TrueMotion(C8TM8 + dst, left, top, 8);
// V block
dst += 8;
- if (top) top += 8;
- if (left) left += 16;
+ if (top != NULL) top += 8;
+ if (left != NULL) left += 16;
DCMode(C8DC8 + dst, left, top, 8, 8, 4);
VerticalPred(C8VE8 + dst, top, 8);
HorizontalPred(C8HE8 + dst, left, 8);
@@ -359,6 +334,7 @@ static void Intra16Preds(uint8_t* dst,
//------------------------------------------------------------------------------
// luma 4x4 prediction
+#define DST(x, y) dst[(x) + (y) * BPS]
#define AVG3(a, b, c) (((a) + 2 * (b) + (c) + 2) >> 2)
#define AVG2(a, b) (((a) + (b) + 1) >> 1)
@@ -589,30 +565,30 @@ static int TTransform(const uint8_t* in, const uint16_t* w) {
int i;
// horizontal pass
for (i = 0; i < 4; ++i, in += BPS) {
- const int a0 = (in[0] + in[2]) << 2;
- const int a1 = (in[1] + in[3]) << 2;
- const int a2 = (in[1] - in[3]) << 2;
- const int a3 = (in[0] - in[2]) << 2;
- tmp[0 + i * 4] = a0 + a1 + (a0 != 0);
+ const int a0 = in[0] + in[2];
+ const int a1 = in[1] + in[3];
+ const int a2 = in[1] - in[3];
+ const int a3 = in[0] - in[2];
+ tmp[0 + i * 4] = a0 + a1;
tmp[1 + i * 4] = a3 + a2;
tmp[2 + i * 4] = a3 - a2;
tmp[3 + i * 4] = a0 - a1;
}
// vertical pass
for (i = 0; i < 4; ++i, ++w) {
- const int a0 = (tmp[0 + i] + tmp[8 + i]);
- const int a1 = (tmp[4 + i] + tmp[12+ i]);
- const int a2 = (tmp[4 + i] - tmp[12+ i]);
- const int a3 = (tmp[0 + i] - tmp[8 + i]);
+ const int a0 = tmp[0 + i] + tmp[8 + i];
+ const int a1 = tmp[4 + i] + tmp[12+ i];
+ const int a2 = tmp[4 + i] - tmp[12+ i];
+ const int a3 = tmp[0 + i] - tmp[8 + i];
const int b0 = a0 + a1;
const int b1 = a3 + a2;
const int b2 = a3 - a2;
const int b3 = a0 - a1;
- // abs((b + (b<0) + 3) >> 3) = (abs(b) + 3) >> 3
- sum += w[ 0] * ((abs(b0) + 3) >> 3);
- sum += w[ 4] * ((abs(b1) + 3) >> 3);
- sum += w[ 8] * ((abs(b2) + 3) >> 3);
- sum += w[12] * ((abs(b3) + 3) >> 3);
+
+ sum += w[ 0] * abs(b0);
+ sum += w[ 4] * abs(b1);
+ sum += w[ 8] * abs(b2);
+ sum += w[12] * abs(b3);
}
return sum;
}
@@ -621,7 +597,7 @@ static int Disto4x4(const uint8_t* const a, const uint8_t* const b,
const uint16_t* const w) {
const int sum1 = TTransform(a, w);
const int sum2 = TTransform(b, w);
- return (abs(sum2 - sum1) + 8) >> 4;
+ return abs(sum2 - sum1) >> 5;
}
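The >> 5 preserves the old scaling up to rounding: the previous TTransform pre-scaled its inputs by << 2 and divided every coefficient via (|b| + 3) >> 3 (roughly |b| / 2 in total), after which Disto4x4 shifted the difference right by 4; the new code sums the raw |b| values, which are twice as large, and shifts right by 5, so both come out near the same weighted distortion divided by 32.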
static int Disto16x16(const uint8_t* const a, const uint8_t* const b,
@@ -646,21 +622,57 @@ static const uint8_t kZigzag[16] = {
// Simple quantization
static int QuantizeBlock(int16_t in[16], int16_t out[16],
- int n, const VP8Matrix* const mtx) {
+ const VP8Matrix* const mtx) {
int last = -1;
- for (; n < 16; ++n) {
+ int n;
+ for (n = 0; n < 16; ++n) {
+ const int j = kZigzag[n];
+ const int sign = (in[j] < 0);
+ const uint32_t coeff = (sign ? -in[j] : in[j]) + mtx->sharpen_[j];
+ if (coeff > mtx->zthresh_[j]) {
+ const uint32_t Q = mtx->q_[j];
+ const uint32_t iQ = mtx->iq_[j];
+ const uint32_t B = mtx->bias_[j];
+ int level = QUANTDIV(coeff, iQ, B);
+ if (level > MAX_LEVEL) level = MAX_LEVEL;
+ if (sign) level = -level;
+ in[j] = level * Q;
+ out[n] = level;
+ if (level) last = n;
+ } else {
+ out[n] = 0;
+ in[j] = 0;
+ }
+ }
+ return (last >= 0);
+}
+
+static int Quantize2Blocks(int16_t in[32], int16_t out[32],
+ const VP8Matrix* const mtx) {
+ int nz;
+ nz = VP8EncQuantizeBlock(in + 0 * 16, out + 0 * 16, mtx) << 0;
+ nz |= VP8EncQuantizeBlock(in + 1 * 16, out + 1 * 16, mtx) << 1;
+ return nz;
+}
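Quantize2Blocks goes through the VP8EncQuantizeBlock function pointer rather than calling QuantizeBlock directly, so a SIMD override installed by one of the platform init routines is picked up automatically. A minimal usage sketch (coeffs, levels and mtx are illustrative names, not from this file):

    int16_t coeffs[32], levels[32];   // two 4x4 blocks of transform coefficients
    // ... fill coeffs[] and prepare a VP8Matrix mtx elsewhere ...
    const int nz = VP8EncQuantize2Blocks(coeffs, levels, &mtx);
    // bit 0 of nz: first block has non-zero levels; bit 1: second block does.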
+
+static int QuantizeBlockWHT(int16_t in[16], int16_t out[16],
+ const VP8Matrix* const mtx) {
+ int n, last = -1;
+ for (n = 0; n < 16; ++n) {
const int j = kZigzag[n];
const int sign = (in[j] < 0);
- int coeff = (sign ? -in[j] : in[j]) + mtx->sharpen_[j];
- if (coeff > 2047) coeff = 2047;
+ const uint32_t coeff = sign ? -in[j] : in[j];
+ assert(mtx->sharpen_[j] == 0);
if (coeff > mtx->zthresh_[j]) {
- const int Q = mtx->q_[j];
- const int iQ = mtx->iq_[j];
- const int B = mtx->bias_[j];
- out[n] = QUANTDIV(coeff, iQ, B);
- if (sign) out[n] = -out[n];
- in[j] = out[n] * Q;
- if (out[n]) last = n;
+ const uint32_t Q = mtx->q_[j];
+ const uint32_t iQ = mtx->iq_[j];
+ const uint32_t B = mtx->bias_[j];
+ int level = QUANTDIV(coeff, iQ, B);
+ if (level > MAX_LEVEL) level = MAX_LEVEL;
+ if (sign) level = -level;
+ in[j] = level * Q;
+ out[n] = level;
+ if (level) last = n;
} else {
out[n] = 0;
in[j] = 0;
@@ -672,16 +684,22 @@ static int QuantizeBlock(int16_t in[16], int16_t out[16],
//------------------------------------------------------------------------------
// Block copy
-static WEBP_INLINE void Copy(const uint8_t* src, uint8_t* dst, int size) {
+static WEBP_INLINE void Copy(const uint8_t* src, uint8_t* dst, int w, int h) {
int y;
- for (y = 0; y < size; ++y) {
- memcpy(dst, src, size);
+ for (y = 0; y < h; ++y) {
+ memcpy(dst, src, w);
src += BPS;
dst += BPS;
}
}
-static void Copy4x4(const uint8_t* src, uint8_t* dst) { Copy(src, dst, 4); }
+static void Copy4x4(const uint8_t* src, uint8_t* dst) {
+ Copy(src, dst, 4, 4);
+}
+
+static void Copy16x8(const uint8_t* src, uint8_t* dst) {
+ Copy(src, dst, 16, 8);
+}
//------------------------------------------------------------------------------
// Initialization
@@ -691,7 +709,7 @@ static void Copy4x4(const uint8_t* src, uint8_t* dst) { Copy(src, dst, 4); }
VP8CHisto VP8CollectHistogram;
VP8Idct VP8ITransform;
VP8Fdct VP8FTransform;
-VP8WHT VP8ITransformWHT;
+VP8Fdct VP8FTransform2;
VP8WHT VP8FTransformWHT;
VP8Intra4Preds VP8EncPredLuma4;
VP8IntraPreds VP8EncPredLuma16;
@@ -703,18 +721,32 @@ VP8Metric VP8SSE4x4;
VP8WMetric VP8TDisto4x4;
VP8WMetric VP8TDisto16x16;
VP8QuantizeBlock VP8EncQuantizeBlock;
+VP8Quantize2Blocks VP8EncQuantize2Blocks;
+VP8QuantizeBlockWHT VP8EncQuantizeBlockWHT;
VP8BlockCopy VP8Copy4x4;
+VP8BlockCopy VP8Copy16x8;
extern void VP8EncDspInitSSE2(void);
+extern void VP8EncDspInitSSE41(void);
+extern void VP8EncDspInitAVX2(void);
+extern void VP8EncDspInitNEON(void);
+extern void VP8EncDspInitMIPS32(void);
+extern void VP8EncDspInitMIPSdspR2(void);
+
+static volatile VP8CPUInfo enc_last_cpuinfo_used =
+ (VP8CPUInfo)&enc_last_cpuinfo_used;
-void VP8EncDspInit(void) {
+WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInit(void) {
+ if (enc_last_cpuinfo_used == VP8GetCPUInfo) return;
+
+ VP8DspInit(); // common inverse transforms
InitTables();
// default C implementations
VP8CollectHistogram = CollectHistogram;
VP8ITransform = ITransform;
VP8FTransform = FTransform;
- VP8ITransformWHT = ITransformWHT;
+ VP8FTransform2 = FTransform2;
VP8FTransformWHT = FTransformWHT;
VP8EncPredLuma4 = Intra4Preds;
VP8EncPredLuma16 = Intra16Preds;
@@ -726,18 +758,43 @@ void VP8EncDspInit(void) {
VP8TDisto4x4 = Disto4x4;
VP8TDisto16x16 = Disto16x16;
VP8EncQuantizeBlock = QuantizeBlock;
+ VP8EncQuantize2Blocks = Quantize2Blocks;
+ VP8EncQuantizeBlockWHT = QuantizeBlockWHT;
VP8Copy4x4 = Copy4x4;
+ VP8Copy16x8 = Copy16x8;
// If defined, use CPUInfo() to overwrite some pointers with faster versions.
- if (VP8GetCPUInfo) {
+ if (VP8GetCPUInfo != NULL) {
#if defined(WEBP_USE_SSE2)
if (VP8GetCPUInfo(kSSE2)) {
VP8EncDspInitSSE2();
+#if defined(WEBP_USE_SSE41)
+ if (VP8GetCPUInfo(kSSE4_1)) {
+ VP8EncDspInitSSE41();
+ }
+#endif
+ }
+#endif
+#if defined(WEBP_USE_AVX2)
+ if (VP8GetCPUInfo(kAVX2)) {
+ VP8EncDspInitAVX2();
+ }
+#endif
+#if defined(WEBP_USE_NEON)
+ if (VP8GetCPUInfo(kNEON)) {
+ VP8EncDspInitNEON();
+ }
+#endif
+#if defined(WEBP_USE_MIPS32)
+ if (VP8GetCPUInfo(kMIPS32)) {
+ VP8EncDspInitMIPS32();
+ }
+#endif
+#if defined(WEBP_USE_MIPS_DSP_R2)
+ if (VP8GetCPUInfo(kMIPSdspR2)) {
+ VP8EncDspInitMIPSdspR2();
}
#endif
}
+ enc_last_cpuinfo_used = VP8GetCPUInfo;
}
-
-#if defined(__cplusplus) || defined(c_plusplus)
-} // extern "C"
-#endif
diff --git a/drivers/webp/dsp/enc_sse2.c b/drivers/webp/dsp/enc_sse2.c
index b046761dc1..63d9cecd85 100644
--- a/drivers/webp/dsp/enc_sse2.c
+++ b/drivers/webp/dsp/enc_sse2.c
@@ -1,8 +1,10 @@
// Copyright 2011 Google Inc. All Rights Reserved.
//
-// This code is licensed under the same terms as WebM:
-// Software License Agreement: http://www.webmproject.org/license/software/
-// Additional IP Rights Grant: http://www.webmproject.org/license/additional/
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// SSE2 version of speed-critical encoding functions.
@@ -15,64 +17,55 @@
#include <stdlib.h> // for abs()
#include <emmintrin.h>
+#include "../enc/cost.h"
#include "../enc/vp8enci.h"
+#include "../utils/utils.h"
-#if defined(__cplusplus) || defined(c_plusplus)
-extern "C" {
+//------------------------------------------------------------------------------
+// Quite useful helper for debugging. Left here for convenience.
+
+#if 0
+#include <stdio.h>
+static void PrintReg(const __m128i r, const char* const name, int size) {
+ int n;
+ union {
+ __m128i r;
+ uint8_t i8[16];
+ uint16_t i16[8];
+ uint32_t i32[4];
+ uint64_t i64[2];
+ } tmp;
+ tmp.r = r;
+ fprintf(stderr, "%s\t: ", name);
+ if (size == 8) {
+ for (n = 0; n < 16; ++n) fprintf(stderr, "%.2x ", tmp.i8[n]);
+ } else if (size == 16) {
+ for (n = 0; n < 8; ++n) fprintf(stderr, "%.4x ", tmp.i16[n]);
+ } else if (size == 32) {
+ for (n = 0; n < 4; ++n) fprintf(stderr, "%.8x ", tmp.i32[n]);
+ } else {
+ for (n = 0; n < 2; ++n) fprintf(stderr, "%.16lx ", tmp.i64[n]);
+ }
+ fprintf(stderr, "\n");
+}
#endif
//------------------------------------------------------------------------------
-// Compute susceptibility based on DCT-coeff histograms:
-// the higher, the "easier" the macroblock is to compress.
-
-static int CollectHistogramSSE2(const uint8_t* ref, const uint8_t* pred,
- int start_block, int end_block) {
- int histo[MAX_COEFF_THRESH + 1] = { 0 };
- int16_t out[16];
- int j, k;
- const __m128i max_coeff_thresh = _mm_set1_epi16(MAX_COEFF_THRESH);
- for (j = start_block; j < end_block; ++j) {
- VP8FTransform(ref + VP8DspScan[j], pred + VP8DspScan[j], out);
-
- // Convert coefficients to bin (within out[]).
- {
- // Load.
- const __m128i out0 = _mm_loadu_si128((__m128i*)&out[0]);
- const __m128i out1 = _mm_loadu_si128((__m128i*)&out[8]);
- // sign(out) = out >> 15 (0x0000 if positive, 0xffff if negative)
- const __m128i sign0 = _mm_srai_epi16(out0, 15);
- const __m128i sign1 = _mm_srai_epi16(out1, 15);
- // abs(out) = (out ^ sign) - sign
- const __m128i xor0 = _mm_xor_si128(out0, sign0);
- const __m128i xor1 = _mm_xor_si128(out1, sign1);
- const __m128i abs0 = _mm_sub_epi16(xor0, sign0);
- const __m128i abs1 = _mm_sub_epi16(xor1, sign1);
- // v = abs(out) >> 2
- const __m128i v0 = _mm_srai_epi16(abs0, 2);
- const __m128i v1 = _mm_srai_epi16(abs1, 2);
- // bin = min(v, MAX_COEFF_THRESH)
- const __m128i bin0 = _mm_min_epi16(v0, max_coeff_thresh);
- const __m128i bin1 = _mm_min_epi16(v1, max_coeff_thresh);
- // Store.
- _mm_storeu_si128((__m128i*)&out[0], bin0);
- _mm_storeu_si128((__m128i*)&out[8], bin1);
- }
+// util for unaligned loads.
- // Use bin to update histogram.
- for (k = 0; k < 16; ++k) {
- histo[out[k]]++;
- }
- }
-
- return VP8GetAlpha(histo);
+// memcpy() is the safe way of moving potentially unaligned 32b memory.
+static WEBP_INLINE uint32_t MemToUint32(const uint8_t* const ptr) {
+ uint32_t A;
+ memcpy(&A, (const int*)ptr, sizeof(A));
+ return A;
}
//------------------------------------------------------------------------------
// Transforms (Paragraph 14.4)
// Does one or two inverse transforms.
-static void ITransformSSE2(const uint8_t* ref, const int16_t* in, uint8_t* dst,
- int do_two) {
+static void ITransform(const uint8_t* ref, const int16_t* in, uint8_t* dst,
+ int do_two) {
// This implementation makes use of 16-bit fixed point versions of two
// multiply constants:
// K1 = sqrt(2) * cos (pi/8) ~= 85627 / 2^16
@@ -99,19 +92,19 @@ static void ITransformSSE2(const uint8_t* ref, const int16_t* in, uint8_t* dst,
// use nor store.
__m128i in0, in1, in2, in3;
{
- in0 = _mm_loadl_epi64((__m128i*)&in[0]);
- in1 = _mm_loadl_epi64((__m128i*)&in[4]);
- in2 = _mm_loadl_epi64((__m128i*)&in[8]);
- in3 = _mm_loadl_epi64((__m128i*)&in[12]);
+ in0 = _mm_loadl_epi64((const __m128i*)&in[0]);
+ in1 = _mm_loadl_epi64((const __m128i*)&in[4]);
+ in2 = _mm_loadl_epi64((const __m128i*)&in[8]);
+ in3 = _mm_loadl_epi64((const __m128i*)&in[12]);
// a00 a10 a20 a30 x x x x
// a01 a11 a21 a31 x x x x
// a02 a12 a22 a32 x x x x
// a03 a13 a23 a33 x x x x
if (do_two) {
- const __m128i inB0 = _mm_loadl_epi64((__m128i*)&in[16]);
- const __m128i inB1 = _mm_loadl_epi64((__m128i*)&in[20]);
- const __m128i inB2 = _mm_loadl_epi64((__m128i*)&in[24]);
- const __m128i inB3 = _mm_loadl_epi64((__m128i*)&in[28]);
+ const __m128i inB0 = _mm_loadl_epi64((const __m128i*)&in[16]);
+ const __m128i inB1 = _mm_loadl_epi64((const __m128i*)&in[20]);
+ const __m128i inB2 = _mm_loadl_epi64((const __m128i*)&in[24]);
+ const __m128i inB3 = _mm_loadl_epi64((const __m128i*)&in[28]);
in0 = _mm_unpacklo_epi64(in0, inB0);
in1 = _mm_unpacklo_epi64(in1, inB1);
in2 = _mm_unpacklo_epi64(in2, inB2);
@@ -243,21 +236,21 @@ static void ITransformSSE2(const uint8_t* ref, const int16_t* in, uint8_t* dst,
// Add inverse transform to 'ref' and store.
{
- const __m128i zero = _mm_set1_epi16(0);
+ const __m128i zero = _mm_setzero_si128();
// Load the reference(s).
__m128i ref0, ref1, ref2, ref3;
if (do_two) {
// Load eight bytes/pixels per line.
- ref0 = _mm_loadl_epi64((__m128i*)&ref[0 * BPS]);
- ref1 = _mm_loadl_epi64((__m128i*)&ref[1 * BPS]);
- ref2 = _mm_loadl_epi64((__m128i*)&ref[2 * BPS]);
- ref3 = _mm_loadl_epi64((__m128i*)&ref[3 * BPS]);
+ ref0 = _mm_loadl_epi64((const __m128i*)&ref[0 * BPS]);
+ ref1 = _mm_loadl_epi64((const __m128i*)&ref[1 * BPS]);
+ ref2 = _mm_loadl_epi64((const __m128i*)&ref[2 * BPS]);
+ ref3 = _mm_loadl_epi64((const __m128i*)&ref[3 * BPS]);
} else {
// Load four bytes/pixels per line.
- ref0 = _mm_cvtsi32_si128(*(int*)&ref[0 * BPS]);
- ref1 = _mm_cvtsi32_si128(*(int*)&ref[1 * BPS]);
- ref2 = _mm_cvtsi32_si128(*(int*)&ref[2 * BPS]);
- ref3 = _mm_cvtsi32_si128(*(int*)&ref[3 * BPS]);
+ ref0 = _mm_cvtsi32_si128(MemToUint32(&ref[0 * BPS]));
+ ref1 = _mm_cvtsi32_si128(MemToUint32(&ref[1 * BPS]));
+ ref2 = _mm_cvtsi32_si128(MemToUint32(&ref[2 * BPS]));
+ ref3 = _mm_cvtsi32_si128(MemToUint32(&ref[3 * BPS]));
}
// Convert to 16b.
ref0 = _mm_unpacklo_epi8(ref0, zero);
@@ -291,200 +284,865 @@ static void ITransformSSE2(const uint8_t* ref, const int16_t* in, uint8_t* dst,
}
}
-static void FTransformSSE2(const uint8_t* src, const uint8_t* ref,
- int16_t* out) {
+static void FTransformPass1(const __m128i* const in01,
+ const __m128i* const in23,
+ __m128i* const out01,
+ __m128i* const out32) {
+ const __m128i k937 = _mm_set1_epi32(937);
+ const __m128i k1812 = _mm_set1_epi32(1812);
+
+ const __m128i k88p = _mm_set_epi16(8, 8, 8, 8, 8, 8, 8, 8);
+ const __m128i k88m = _mm_set_epi16(-8, 8, -8, 8, -8, 8, -8, 8);
+ const __m128i k5352_2217p = _mm_set_epi16(2217, 5352, 2217, 5352,
+ 2217, 5352, 2217, 5352);
+ const __m128i k5352_2217m = _mm_set_epi16(-5352, 2217, -5352, 2217,
+ -5352, 2217, -5352, 2217);
+
+ // *in01 = 00 01 10 11 02 03 12 13
+ // *in23 = 20 21 30 31 22 23 32 33
+ const __m128i shuf01_p = _mm_shufflehi_epi16(*in01, _MM_SHUFFLE(2, 3, 0, 1));
+ const __m128i shuf23_p = _mm_shufflehi_epi16(*in23, _MM_SHUFFLE(2, 3, 0, 1));
+ // 00 01 10 11 03 02 13 12
+ // 20 21 30 31 23 22 33 32
+ const __m128i s01 = _mm_unpacklo_epi64(shuf01_p, shuf23_p);
+ const __m128i s32 = _mm_unpackhi_epi64(shuf01_p, shuf23_p);
+ // 00 01 10 11 20 21 30 31
+ // 03 02 13 12 23 22 33 32
+ const __m128i a01 = _mm_add_epi16(s01, s32);
+ const __m128i a32 = _mm_sub_epi16(s01, s32);
+ // [d0 + d3 | d1 + d2 | ...] = [a0 a1 | a0' a1' | ... ]
+ // [d0 - d3 | d1 - d2 | ...] = [a3 a2 | a3' a2' | ... ]
+
+ const __m128i tmp0 = _mm_madd_epi16(a01, k88p); // [ (a0 + a1) << 3, ... ]
+ const __m128i tmp2 = _mm_madd_epi16(a01, k88m); // [ (a0 - a1) << 3, ... ]
+ const __m128i tmp1_1 = _mm_madd_epi16(a32, k5352_2217p);
+ const __m128i tmp3_1 = _mm_madd_epi16(a32, k5352_2217m);
+ const __m128i tmp1_2 = _mm_add_epi32(tmp1_1, k1812);
+ const __m128i tmp3_2 = _mm_add_epi32(tmp3_1, k937);
+ const __m128i tmp1 = _mm_srai_epi32(tmp1_2, 9);
+ const __m128i tmp3 = _mm_srai_epi32(tmp3_2, 9);
+ const __m128i s03 = _mm_packs_epi32(tmp0, tmp2);
+ const __m128i s12 = _mm_packs_epi32(tmp1, tmp3);
+ const __m128i s_lo = _mm_unpacklo_epi16(s03, s12); // 0 1 0 1 0 1...
+ const __m128i s_hi = _mm_unpackhi_epi16(s03, s12); // 2 3 2 3 2 3
+ const __m128i v23 = _mm_unpackhi_epi32(s_lo, s_hi);
+ *out01 = _mm_unpacklo_epi32(s_lo, s_hi);
+ *out32 = _mm_shuffle_epi32(v23, _MM_SHUFFLE(1, 0, 3, 2)); // 3 2 3 2 3 2..
+}
+
+static void FTransformPass2(const __m128i* const v01, const __m128i* const v32,
+ int16_t* out) {
const __m128i zero = _mm_setzero_si128();
const __m128i seven = _mm_set1_epi16(7);
- const __m128i k7500 = _mm_set1_epi32(7500);
- const __m128i k14500 = _mm_set1_epi32(14500);
- const __m128i k51000 = _mm_set1_epi32(51000);
- const __m128i k12000_plus_one = _mm_set1_epi32(12000 + (1 << 16));
const __m128i k5352_2217 = _mm_set_epi16(5352, 2217, 5352, 2217,
5352, 2217, 5352, 2217);
const __m128i k2217_5352 = _mm_set_epi16(2217, -5352, 2217, -5352,
2217, -5352, 2217, -5352);
+ const __m128i k12000_plus_one = _mm_set1_epi32(12000 + (1 << 16));
+ const __m128i k51000 = _mm_set1_epi32(51000);
+
+ // Same operations are done on the (0,3) and (1,2) pairs.
+ // a0 = v0 + v3
+ // a1 = v1 + v2
+ // a3 = v0 - v3
+ // a2 = v1 - v2
+ const __m128i a01 = _mm_add_epi16(*v01, *v32);
+ const __m128i a32 = _mm_sub_epi16(*v01, *v32);
+ const __m128i a11 = _mm_unpackhi_epi64(a01, a01);
+ const __m128i a22 = _mm_unpackhi_epi64(a32, a32);
+ const __m128i a01_plus_7 = _mm_add_epi16(a01, seven);
+
+ // d0 = (a0 + a1 + 7) >> 4;
+ // d2 = (a0 - a1 + 7) >> 4;
+ const __m128i c0 = _mm_add_epi16(a01_plus_7, a11);
+ const __m128i c2 = _mm_sub_epi16(a01_plus_7, a11);
+ const __m128i d0 = _mm_srai_epi16(c0, 4);
+ const __m128i d2 = _mm_srai_epi16(c2, 4);
+
+ // f1 = ((b3 * 5352 + b2 * 2217 + 12000) >> 16)
+ // f3 = ((b3 * 2217 - b2 * 5352 + 51000) >> 16)
+ const __m128i b23 = _mm_unpacklo_epi16(a22, a32);
+ const __m128i c1 = _mm_madd_epi16(b23, k5352_2217);
+ const __m128i c3 = _mm_madd_epi16(b23, k2217_5352);
+ const __m128i d1 = _mm_add_epi32(c1, k12000_plus_one);
+ const __m128i d3 = _mm_add_epi32(c3, k51000);
+ const __m128i e1 = _mm_srai_epi32(d1, 16);
+ const __m128i e3 = _mm_srai_epi32(d3, 16);
+ const __m128i f1 = _mm_packs_epi32(e1, e1);
+ const __m128i f3 = _mm_packs_epi32(e3, e3);
+ // f1 = f1 + (a3 != 0);
+ // The compare will return (0xffff, 0) for (==0, !=0). To turn that into the
+ // desired (0, 1), we add one earlier through k12000_plus_one.
+ // -> f1 = f1 + 1 - (a3 == 0)
+ const __m128i g1 = _mm_add_epi16(f1, _mm_cmpeq_epi16(a32, zero));
+
+ const __m128i d0_g1 = _mm_unpacklo_epi64(d0, g1);
+ const __m128i d2_f3 = _mm_unpacklo_epi64(d2, f3);
+ _mm_storeu_si128((__m128i*)&out[0], d0_g1);
+ _mm_storeu_si128((__m128i*)&out[8], d2_f3);
+}
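Spelled out, the correction works like this (f1 already carries the extra +1 folded into k12000_plus_one before the >> 16):

    a3 == 0:  _mm_cmpeq_epi16 gives 0xffff (-1)  ->  (f1' + 1) - 1 = f1'
    a3 != 0:  _mm_cmpeq_epi16 gives 0            ->  (f1' + 1)     = f1' + 1

which is exactly f1' + (a3 != 0).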
+
+static void FTransform(const uint8_t* src, const uint8_t* ref, int16_t* out) {
+ const __m128i zero = _mm_setzero_si128();
+ // Load src and convert to 16b.
+ const __m128i src0 = _mm_loadl_epi64((const __m128i*)&src[0 * BPS]);
+ const __m128i src1 = _mm_loadl_epi64((const __m128i*)&src[1 * BPS]);
+ const __m128i src2 = _mm_loadl_epi64((const __m128i*)&src[2 * BPS]);
+ const __m128i src3 = _mm_loadl_epi64((const __m128i*)&src[3 * BPS]);
+ const __m128i src_0 = _mm_unpacklo_epi8(src0, zero);
+ const __m128i src_1 = _mm_unpacklo_epi8(src1, zero);
+ const __m128i src_2 = _mm_unpacklo_epi8(src2, zero);
+ const __m128i src_3 = _mm_unpacklo_epi8(src3, zero);
+ // Load ref and convert to 16b.
+ const __m128i ref0 = _mm_loadl_epi64((const __m128i*)&ref[0 * BPS]);
+ const __m128i ref1 = _mm_loadl_epi64((const __m128i*)&ref[1 * BPS]);
+ const __m128i ref2 = _mm_loadl_epi64((const __m128i*)&ref[2 * BPS]);
+ const __m128i ref3 = _mm_loadl_epi64((const __m128i*)&ref[3 * BPS]);
+ const __m128i ref_0 = _mm_unpacklo_epi8(ref0, zero);
+ const __m128i ref_1 = _mm_unpacklo_epi8(ref1, zero);
+ const __m128i ref_2 = _mm_unpacklo_epi8(ref2, zero);
+ const __m128i ref_3 = _mm_unpacklo_epi8(ref3, zero);
+ // Compute difference. -> 00 01 02 03 00 00 00 00
+ const __m128i diff0 = _mm_sub_epi16(src_0, ref_0);
+ const __m128i diff1 = _mm_sub_epi16(src_1, ref_1);
+ const __m128i diff2 = _mm_sub_epi16(src_2, ref_2);
+ const __m128i diff3 = _mm_sub_epi16(src_3, ref_3);
+
+ // Unpack and shuffle
+ // 00 01 02 03 0 0 0 0
+ // 10 11 12 13 0 0 0 0
+ // 20 21 22 23 0 0 0 0
+ // 30 31 32 33 0 0 0 0
+ const __m128i shuf01 = _mm_unpacklo_epi32(diff0, diff1);
+ const __m128i shuf23 = _mm_unpacklo_epi32(diff2, diff3);
__m128i v01, v32;
- // Difference between src and ref and initial transpose.
- {
- // Load src and convert to 16b.
- const __m128i src0 = _mm_loadl_epi64((__m128i*)&src[0 * BPS]);
- const __m128i src1 = _mm_loadl_epi64((__m128i*)&src[1 * BPS]);
- const __m128i src2 = _mm_loadl_epi64((__m128i*)&src[2 * BPS]);
- const __m128i src3 = _mm_loadl_epi64((__m128i*)&src[3 * BPS]);
- const __m128i src_0 = _mm_unpacklo_epi8(src0, zero);
- const __m128i src_1 = _mm_unpacklo_epi8(src1, zero);
- const __m128i src_2 = _mm_unpacklo_epi8(src2, zero);
- const __m128i src_3 = _mm_unpacklo_epi8(src3, zero);
- // Load ref and convert to 16b.
- const __m128i ref0 = _mm_loadl_epi64((__m128i*)&ref[0 * BPS]);
- const __m128i ref1 = _mm_loadl_epi64((__m128i*)&ref[1 * BPS]);
- const __m128i ref2 = _mm_loadl_epi64((__m128i*)&ref[2 * BPS]);
- const __m128i ref3 = _mm_loadl_epi64((__m128i*)&ref[3 * BPS]);
- const __m128i ref_0 = _mm_unpacklo_epi8(ref0, zero);
- const __m128i ref_1 = _mm_unpacklo_epi8(ref1, zero);
- const __m128i ref_2 = _mm_unpacklo_epi8(ref2, zero);
- const __m128i ref_3 = _mm_unpacklo_epi8(ref3, zero);
- // Compute difference.
- const __m128i diff0 = _mm_sub_epi16(src_0, ref_0);
- const __m128i diff1 = _mm_sub_epi16(src_1, ref_1);
- const __m128i diff2 = _mm_sub_epi16(src_2, ref_2);
- const __m128i diff3 = _mm_sub_epi16(src_3, ref_3);
-
- // Transpose.
- // 00 01 02 03 0 0 0 0
- // 10 11 12 13 0 0 0 0
- // 20 21 22 23 0 0 0 0
- // 30 31 32 33 0 0 0 0
- const __m128i transpose0_0 = _mm_unpacklo_epi16(diff0, diff1);
- const __m128i transpose0_1 = _mm_unpacklo_epi16(diff2, diff3);
- // 00 10 01 11 02 12 03 13
- // 20 30 21 31 22 32 23 33
- const __m128i v23 = _mm_unpackhi_epi32(transpose0_0, transpose0_1);
- v01 = _mm_unpacklo_epi32(transpose0_0, transpose0_1);
- v32 = _mm_shuffle_epi32(v23, _MM_SHUFFLE(1, 0, 3, 2));
- // a02 a12 a22 a32 a03 a13 a23 a33
- // a00 a10 a20 a30 a01 a11 a21 a31
- // a03 a13 a23 a33 a02 a12 a22 a32
- }
-
- // First pass and subsequent transpose.
- {
- // Same operations are done on the (0,3) and (1,2) pairs.
- // b0 = (a0 + a3) << 3
- // b1 = (a1 + a2) << 3
- // b3 = (a0 - a3) << 3
- // b2 = (a1 - a2) << 3
- const __m128i a01 = _mm_add_epi16(v01, v32);
- const __m128i a32 = _mm_sub_epi16(v01, v32);
- const __m128i b01 = _mm_slli_epi16(a01, 3);
- const __m128i b32 = _mm_slli_epi16(a32, 3);
- const __m128i b11 = _mm_unpackhi_epi64(b01, b01);
- const __m128i b22 = _mm_unpackhi_epi64(b32, b32);
-
- // e0 = b0 + b1
- // e2 = b0 - b1
- const __m128i e0 = _mm_add_epi16(b01, b11);
- const __m128i e2 = _mm_sub_epi16(b01, b11);
- const __m128i e02 = _mm_unpacklo_epi64(e0, e2);
-
- // e1 = (b3 * 5352 + b2 * 2217 + 14500) >> 12
- // e3 = (b3 * 2217 - b2 * 5352 + 7500) >> 12
- const __m128i b23 = _mm_unpacklo_epi16(b22, b32);
- const __m128i c1 = _mm_madd_epi16(b23, k5352_2217);
- const __m128i c3 = _mm_madd_epi16(b23, k2217_5352);
- const __m128i d1 = _mm_add_epi32(c1, k14500);
- const __m128i d3 = _mm_add_epi32(c3, k7500);
- const __m128i e1 = _mm_srai_epi32(d1, 12);
- const __m128i e3 = _mm_srai_epi32(d3, 12);
- const __m128i e13 = _mm_packs_epi32(e1, e3);
-
- // Transpose.
- // 00 01 02 03 20 21 22 23
- // 10 11 12 13 30 31 32 33
- const __m128i transpose0_0 = _mm_unpacklo_epi16(e02, e13);
- const __m128i transpose0_1 = _mm_unpackhi_epi16(e02, e13);
- // 00 10 01 11 02 12 03 13
- // 20 30 21 31 22 32 23 33
- const __m128i v23 = _mm_unpackhi_epi32(transpose0_0, transpose0_1);
- v01 = _mm_unpacklo_epi32(transpose0_0, transpose0_1);
- v32 = _mm_shuffle_epi32(v23, _MM_SHUFFLE(1, 0, 3, 2));
- // 02 12 22 32 03 13 23 33
- // 00 10 20 30 01 11 21 31
- // 03 13 23 33 02 12 22 32
- }
+ // First pass
+ FTransformPass1(&shuf01, &shuf23, &v01, &v32);
// Second pass
+ FTransformPass2(&v01, &v32, out);
+}
+
+static void FTransform2(const uint8_t* src, const uint8_t* ref, int16_t* out) {
+ const __m128i zero = _mm_setzero_si128();
+
+ // Load src and convert to 16b.
+ const __m128i src0 = _mm_loadl_epi64((const __m128i*)&src[0 * BPS]);
+ const __m128i src1 = _mm_loadl_epi64((const __m128i*)&src[1 * BPS]);
+ const __m128i src2 = _mm_loadl_epi64((const __m128i*)&src[2 * BPS]);
+ const __m128i src3 = _mm_loadl_epi64((const __m128i*)&src[3 * BPS]);
+ const __m128i src_0 = _mm_unpacklo_epi8(src0, zero);
+ const __m128i src_1 = _mm_unpacklo_epi8(src1, zero);
+ const __m128i src_2 = _mm_unpacklo_epi8(src2, zero);
+ const __m128i src_3 = _mm_unpacklo_epi8(src3, zero);
+ // Load ref and convert to 16b.
+ const __m128i ref0 = _mm_loadl_epi64((const __m128i*)&ref[0 * BPS]);
+ const __m128i ref1 = _mm_loadl_epi64((const __m128i*)&ref[1 * BPS]);
+ const __m128i ref2 = _mm_loadl_epi64((const __m128i*)&ref[2 * BPS]);
+ const __m128i ref3 = _mm_loadl_epi64((const __m128i*)&ref[3 * BPS]);
+ const __m128i ref_0 = _mm_unpacklo_epi8(ref0, zero);
+ const __m128i ref_1 = _mm_unpacklo_epi8(ref1, zero);
+ const __m128i ref_2 = _mm_unpacklo_epi8(ref2, zero);
+ const __m128i ref_3 = _mm_unpacklo_epi8(ref3, zero);
+ // Compute difference. -> 00 01 02 03 00' 01' 02' 03'
+ const __m128i diff0 = _mm_sub_epi16(src_0, ref_0);
+ const __m128i diff1 = _mm_sub_epi16(src_1, ref_1);
+ const __m128i diff2 = _mm_sub_epi16(src_2, ref_2);
+ const __m128i diff3 = _mm_sub_epi16(src_3, ref_3);
+
+ // Unpack and shuffle
+ // 00 01 02 03 0 0 0 0
+ // 10 11 12 13 0 0 0 0
+ // 20 21 22 23 0 0 0 0
+ // 30 31 32 33 0 0 0 0
+ const __m128i shuf01l = _mm_unpacklo_epi32(diff0, diff1);
+ const __m128i shuf23l = _mm_unpacklo_epi32(diff2, diff3);
+ const __m128i shuf01h = _mm_unpackhi_epi32(diff0, diff1);
+ const __m128i shuf23h = _mm_unpackhi_epi32(diff2, diff3);
+ __m128i v01l, v32l;
+ __m128i v01h, v32h;
+
+ // First pass
+ FTransformPass1(&shuf01l, &shuf23l, &v01l, &v32l);
+ FTransformPass1(&shuf01h, &shuf23h, &v01h, &v32h);
+
+ // Second pass
+ FTransformPass2(&v01l, &v32l, out + 0);
+ FTransformPass2(&v01h, &v32h, out + 16);
+}
+
+static void FTransformWHTRow(const int16_t* const in, __m128i* const out) {
+ const __m128i kMult1 = _mm_set_epi16(0, 0, 0, 0, 1, 1, 1, 1);
+ const __m128i kMult2 = _mm_set_epi16(0, 0, 0, 0, -1, 1, -1, 1);
+ const __m128i src0 = _mm_loadl_epi64((__m128i*)&in[0 * 16]);
+ const __m128i src1 = _mm_loadl_epi64((__m128i*)&in[1 * 16]);
+ const __m128i src2 = _mm_loadl_epi64((__m128i*)&in[2 * 16]);
+ const __m128i src3 = _mm_loadl_epi64((__m128i*)&in[3 * 16]);
+ const __m128i A01 = _mm_unpacklo_epi16(src0, src1); // A0 A1 | ...
+ const __m128i A23 = _mm_unpacklo_epi16(src2, src3); // A2 A3 | ...
+ const __m128i B0 = _mm_adds_epi16(A01, A23); // a0 | a1 | ...
+ const __m128i B1 = _mm_subs_epi16(A01, A23); // a3 | a2 | ...
+ const __m128i C0 = _mm_unpacklo_epi32(B0, B1); // a0 | a1 | a3 | a2
+ const __m128i C1 = _mm_unpacklo_epi32(B1, B0); // a3 | a2 | a0 | a1
+ const __m128i D0 = _mm_madd_epi16(C0, kMult1); // out0, out1
+ const __m128i D1 = _mm_madd_epi16(C1, kMult2); // out2, out3
+ *out = _mm_unpacklo_epi64(D0, D1);
+}
+
+static void FTransformWHT(const int16_t* in, int16_t* out) {
+ __m128i row0, row1, row2, row3;
+ FTransformWHTRow(in + 0 * 64, &row0);
+ FTransformWHTRow(in + 1 * 64, &row1);
+ FTransformWHTRow(in + 2 * 64, &row2);
+ FTransformWHTRow(in + 3 * 64, &row3);
+
{
- // Same operations are done on the (0,3) and (1,2) pairs.
- // a0 = v0 + v3
- // a1 = v1 + v2
- // a3 = v0 - v3
- // a2 = v1 - v2
- const __m128i a01 = _mm_add_epi16(v01, v32);
- const __m128i a32 = _mm_sub_epi16(v01, v32);
- const __m128i a11 = _mm_unpackhi_epi64(a01, a01);
- const __m128i a22 = _mm_unpackhi_epi64(a32, a32);
-
- // d0 = (a0 + a1 + 7) >> 4;
- // d2 = (a0 - a1 + 7) >> 4;
- const __m128i b0 = _mm_add_epi16(a01, a11);
- const __m128i b2 = _mm_sub_epi16(a01, a11);
- const __m128i c0 = _mm_add_epi16(b0, seven);
- const __m128i c2 = _mm_add_epi16(b2, seven);
- const __m128i d0 = _mm_srai_epi16(c0, 4);
- const __m128i d2 = _mm_srai_epi16(c2, 4);
-
- // f1 = ((b3 * 5352 + b2 * 2217 + 12000) >> 16)
- // f3 = ((b3 * 2217 - b2 * 5352 + 51000) >> 16)
- const __m128i b23 = _mm_unpacklo_epi16(a22, a32);
- const __m128i c1 = _mm_madd_epi16(b23, k5352_2217);
- const __m128i c3 = _mm_madd_epi16(b23, k2217_5352);
- const __m128i d1 = _mm_add_epi32(c1, k12000_plus_one);
- const __m128i d3 = _mm_add_epi32(c3, k51000);
- const __m128i e1 = _mm_srai_epi32(d1, 16);
- const __m128i e3 = _mm_srai_epi32(d3, 16);
- const __m128i f1 = _mm_packs_epi32(e1, e1);
- const __m128i f3 = _mm_packs_epi32(e3, e3);
- // f1 = f1 + (a3 != 0);
- // The compare will return (0xffff, 0) for (==0, !=0). To turn that into the
- // desired (0, 1), we add one earlier through k12000_plus_one.
- const __m128i g1 = _mm_add_epi16(f1, _mm_cmpeq_epi16(a32, zero));
-
- _mm_storel_epi64((__m128i*)&out[ 0], d0);
- _mm_storel_epi64((__m128i*)&out[ 4], g1);
- _mm_storel_epi64((__m128i*)&out[ 8], d2);
- _mm_storel_epi64((__m128i*)&out[12], f3);
+ const __m128i a0 = _mm_add_epi32(row0, row2);
+ const __m128i a1 = _mm_add_epi32(row1, row3);
+ const __m128i a2 = _mm_sub_epi32(row1, row3);
+ const __m128i a3 = _mm_sub_epi32(row0, row2);
+ const __m128i b0 = _mm_srai_epi32(_mm_add_epi32(a0, a1), 1);
+ const __m128i b1 = _mm_srai_epi32(_mm_add_epi32(a3, a2), 1);
+ const __m128i b2 = _mm_srai_epi32(_mm_sub_epi32(a3, a2), 1);
+ const __m128i b3 = _mm_srai_epi32(_mm_sub_epi32(a0, a1), 1);
+ const __m128i out0 = _mm_packs_epi32(b0, b1);
+ const __m128i out1 = _mm_packs_epi32(b2, b3);
+ _mm_storeu_si128((__m128i*)&out[0], out0);
+ _mm_storeu_si128((__m128i*)&out[8], out1);
+ }
+}
+
+//------------------------------------------------------------------------------
+// Compute susceptibility based on DCT-coeff histograms:
+// the higher, the "easier" the macroblock is to compress.
+
+static void CollectHistogram(const uint8_t* ref, const uint8_t* pred,
+ int start_block, int end_block,
+ VP8Histogram* const histo) {
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i max_coeff_thresh = _mm_set1_epi16(MAX_COEFF_THRESH);
+ int j;
+ int distribution[MAX_COEFF_THRESH + 1] = { 0 };
+ for (j = start_block; j < end_block; ++j) {
+ int16_t out[16];
+ int k;
+
+ FTransform(ref + VP8DspScan[j], pred + VP8DspScan[j], out);
+
+ // Convert coefficients to bin (within out[]).
+ {
+ // Load.
+ const __m128i out0 = _mm_loadu_si128((__m128i*)&out[0]);
+ const __m128i out1 = _mm_loadu_si128((__m128i*)&out[8]);
+ const __m128i d0 = _mm_sub_epi16(zero, out0);
+ const __m128i d1 = _mm_sub_epi16(zero, out1);
+ const __m128i abs0 = _mm_max_epi16(out0, d0); // abs(v), 16b
+ const __m128i abs1 = _mm_max_epi16(out1, d1);
+ // v = abs(out) >> 3
+ const __m128i v0 = _mm_srai_epi16(abs0, 3);
+ const __m128i v1 = _mm_srai_epi16(abs1, 3);
+ // bin = min(v, MAX_COEFF_THRESH)
+ const __m128i bin0 = _mm_min_epi16(v0, max_coeff_thresh);
+ const __m128i bin1 = _mm_min_epi16(v1, max_coeff_thresh);
+ // Store.
+ _mm_storeu_si128((__m128i*)&out[0], bin0);
+ _mm_storeu_si128((__m128i*)&out[8], bin1);
+ }
+
+ // Use bin to update the distribution.
+ for (k = 0; k < 16; ++k) {
+ ++distribution[out[k]];
+ }
}
+ VP8SetHistogramData(distribution, histo);
+}
+
+//------------------------------------------------------------------------------
+// Intra predictions
+
+// helper for chroma-DC predictions
+static WEBP_INLINE void Put8x8uv(uint8_t v, uint8_t* dst) {
+ int j;
+ const __m128i values = _mm_set1_epi8(v);
+ for (j = 0; j < 8; ++j) {
+ _mm_storel_epi64((__m128i*)(dst + j * BPS), values);
+ }
+}
+
+static WEBP_INLINE void Put16(uint8_t v, uint8_t* dst) {
+ int j;
+ const __m128i values = _mm_set1_epi8(v);
+ for (j = 0; j < 16; ++j) {
+ _mm_store_si128((__m128i*)(dst + j * BPS), values);
+ }
+}
+
+static WEBP_INLINE void Fill(uint8_t* dst, int value, int size) {
+ if (size == 4) {
+ int j;
+ for (j = 0; j < 4; ++j) {
+ memset(dst + j * BPS, value, 4);
+ }
+ } else if (size == 8) {
+ Put8x8uv(value, dst);
+ } else {
+ Put16(value, dst);
+ }
+}
+
+static WEBP_INLINE void VE8uv(uint8_t* dst, const uint8_t* top) {
+ int j;
+ const __m128i top_values = _mm_loadl_epi64((const __m128i*)top);
+ for (j = 0; j < 8; ++j) {
+ _mm_storel_epi64((__m128i*)(dst + j * BPS), top_values);
+ }
+}
+
+static WEBP_INLINE void VE16(uint8_t* dst, const uint8_t* top) {
+ const __m128i top_values = _mm_load_si128((const __m128i*)top);
+ int j;
+ for (j = 0; j < 16; ++j) {
+ _mm_store_si128((__m128i*)(dst + j * BPS), top_values);
+ }
+}
+
+static WEBP_INLINE void VerticalPred(uint8_t* dst,
+ const uint8_t* top, int size) {
+ if (top != NULL) {
+ if (size == 8) {
+ VE8uv(dst, top);
+ } else {
+ VE16(dst, top);
+ }
+ } else {
+ Fill(dst, 127, size);
+ }
+}
+
+static WEBP_INLINE void HE8uv(uint8_t* dst, const uint8_t* left) {
+ int j;
+ for (j = 0; j < 8; ++j) {
+ const __m128i values = _mm_set1_epi8(left[j]);
+ _mm_storel_epi64((__m128i*)dst, values);
+ dst += BPS;
+ }
+}
+
+static WEBP_INLINE void HE16(uint8_t* dst, const uint8_t* left) {
+ int j;
+ for (j = 0; j < 16; ++j) {
+ const __m128i values = _mm_set1_epi8(left[j]);
+ _mm_store_si128((__m128i*)dst, values);
+ dst += BPS;
+ }
+}
+
+static WEBP_INLINE void HorizontalPred(uint8_t* dst,
+ const uint8_t* left, int size) {
+ if (left != NULL) {
+ if (size == 8) {
+ HE8uv(dst, left);
+ } else {
+ HE16(dst, left);
+ }
+ } else {
+ Fill(dst, 129, size);
+ }
+}
+
+static WEBP_INLINE void TM(uint8_t* dst, const uint8_t* left,
+ const uint8_t* top, int size) {
+ const __m128i zero = _mm_setzero_si128();
+ int y;
+ if (size == 8) {
+ const __m128i top_values = _mm_loadl_epi64((const __m128i*)top);
+ const __m128i top_base = _mm_unpacklo_epi8(top_values, zero);
+ for (y = 0; y < 8; ++y, dst += BPS) {
+ const int val = left[y] - left[-1];
+ const __m128i base = _mm_set1_epi16(val);
+ const __m128i out = _mm_packus_epi16(_mm_add_epi16(base, top_base), zero);
+ _mm_storel_epi64((__m128i*)dst, out);
+ }
+ } else {
+ const __m128i top_values = _mm_load_si128((const __m128i*)top);
+ const __m128i top_base_0 = _mm_unpacklo_epi8(top_values, zero);
+ const __m128i top_base_1 = _mm_unpackhi_epi8(top_values, zero);
+ for (y = 0; y < 16; ++y, dst += BPS) {
+ const int val = left[y] - left[-1];
+ const __m128i base = _mm_set1_epi16(val);
+ const __m128i out_0 = _mm_add_epi16(base, top_base_0);
+ const __m128i out_1 = _mm_add_epi16(base, top_base_1);
+ const __m128i out = _mm_packus_epi16(out_0, out_1);
+ _mm_store_si128((__m128i*)dst, out);
+ }
+ }
+}
+
+static WEBP_INLINE void TrueMotion(uint8_t* dst, const uint8_t* left,
+ const uint8_t* top, int size) {
+ if (left != NULL) {
+ if (top != NULL) {
+ TM(dst, left, top, size);
+ } else {
+ HorizontalPred(dst, left, size);
+ }
+ } else {
+ // true motion without left samples (hence: with default 129 value)
+ // is equivalent to VE prediction where you just copy the top samples.
+ // Note that if top samples are not available, the default value is
+ // then 129, and not 127 as in the VerticalPred case.
+ if (top != NULL) {
+ VerticalPred(dst, top, size);
+ } else {
+ Fill(dst, 129, size);
+ }
+ }
+}
+
+static WEBP_INLINE void DC8uv(uint8_t* dst, const uint8_t* left,
+ const uint8_t* top) {
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i top_values = _mm_loadl_epi64((const __m128i*)top);
+ const __m128i left_values = _mm_loadl_epi64((const __m128i*)left);
+ const __m128i sum_top = _mm_sad_epu8(top_values, zero);
+ const __m128i sum_left = _mm_sad_epu8(left_values, zero);
+ const int DC = _mm_cvtsi128_si32(sum_top) + _mm_cvtsi128_si32(sum_left) + 8;
+ Put8x8uv(DC >> 4, dst);
+}
+
+static WEBP_INLINE void DC8uvNoLeft(uint8_t* dst, const uint8_t* top) {
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i top_values = _mm_loadl_epi64((const __m128i*)top);
+ const __m128i sum = _mm_sad_epu8(top_values, zero);
+ const int DC = _mm_cvtsi128_si32(sum) + 4;
+ Put8x8uv(DC >> 3, dst);
+}
+
+static WEBP_INLINE void DC8uvNoTop(uint8_t* dst, const uint8_t* left) {
+ // 'left' is contiguous so we can reuse the top summation.
+ DC8uvNoLeft(dst, left);
+}
+
+static WEBP_INLINE void DC8uvNoTopLeft(uint8_t* dst) {
+ Put8x8uv(0x80, dst);
+}
+
+static WEBP_INLINE void DC8uvMode(uint8_t* dst, const uint8_t* left,
+ const uint8_t* top) {
+ if (top != NULL) {
+ if (left != NULL) { // top and left present
+ DC8uv(dst, left, top);
+ } else { // top, but no left
+ DC8uvNoLeft(dst, top);
+ }
+ } else if (left != NULL) { // left but no top
+ DC8uvNoTop(dst, left);
+ } else { // no top, no left, nothing.
+ DC8uvNoTopLeft(dst);
+ }
+}
+
+static WEBP_INLINE void DC16(uint8_t* dst, const uint8_t* left,
+ const uint8_t* top) {
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i top_row = _mm_load_si128((const __m128i*)top);
+ const __m128i left_row = _mm_load_si128((const __m128i*)left);
+ const __m128i sad8x2 = _mm_sad_epu8(top_row, zero);
+ // sum the two sads: sad8x2[0:1] + sad8x2[8:9]
+ const __m128i sum_top = _mm_add_epi16(sad8x2, _mm_shuffle_epi32(sad8x2, 2));
+ const __m128i sad8x2_left = _mm_sad_epu8(left_row, zero);
+ // sum the two sads: sad8x2[0:1] + sad8x2[8:9]
+ const __m128i sum_left =
+ _mm_add_epi16(sad8x2_left, _mm_shuffle_epi32(sad8x2_left, 2));
+ const int DC = _mm_cvtsi128_si32(sum_top) + _mm_cvtsi128_si32(sum_left) + 16;
+ Put16(DC >> 5, dst);
+}
+
+static WEBP_INLINE void DC16NoLeft(uint8_t* dst, const uint8_t* top) {
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i top_row = _mm_load_si128((const __m128i*)top);
+ const __m128i sad8x2 = _mm_sad_epu8(top_row, zero);
+ // sum the two sads: sad8x2[0:1] + sad8x2[8:9]
+ const __m128i sum = _mm_add_epi16(sad8x2, _mm_shuffle_epi32(sad8x2, 2));
+ const int DC = _mm_cvtsi128_si32(sum) + 8;
+ Put16(DC >> 4, dst);
+}
+
+static WEBP_INLINE void DC16NoTop(uint8_t* dst, const uint8_t* left) {
+ // 'left' is contiguous so we can reuse the top summation.
+ DC16NoLeft(dst, left);
+}
+
+static WEBP_INLINE void DC16NoTopLeft(uint8_t* dst) {
+ Put16(0x80, dst);
+}
+
+static WEBP_INLINE void DC16Mode(uint8_t* dst, const uint8_t* left,
+ const uint8_t* top) {
+ if (top != NULL) {
+ if (left != NULL) { // top and left present
+ DC16(dst, left, top);
+ } else { // top, but no left
+ DC16NoLeft(dst, top);
+ }
+ } else if (left != NULL) { // left but no top
+ DC16NoTop(dst, left);
+ } else { // no top, no left, nothing.
+ DC16NoTopLeft(dst);
+ }
+}
+
+//------------------------------------------------------------------------------
+// 4x4 predictions
+
+#define DST(x, y) dst[(x) + (y) * BPS]
+#define AVG3(a, b, c) (((a) + 2 * (b) + (c) + 2) >> 2)
+#define AVG2(a, b) (((a) + (b) + 1) >> 1)
+
+// We use the following 8b-arithmetic tricks:
+// (a + 2 * b + c + 2) >> 2 = (AC + b + 1) >> 1
+// where: AC = (a + c) >> 1 = [(a + c + 1) >> 1] - [(a^c) & 1]
+// and:
+// (a + 2 * b + c + 2) >> 2 = (AB + BC + 1) >> 1 - (ab|bc)&lsb
+// where: AB = (a + b + 1) >> 1, BC = (b + c + 1) >> 1
+// and ab = a ^ b, bc = b ^ c, lsb = (AB^BC)&1
+
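A quick numeric check of the first identity, taking a = 1, b = 2, c = 4 (this is what VE4 below does: _mm_avg_epu8 computes the rounded-up average (x + y + 1) >> 1, which the lsb term corrects back down to the floor average):

    (a + 2*b + c + 2) >> 2                   = 11 >> 2          = 2
    AC = ((a + c + 1) >> 1) - ((a ^ c) & 1)  = 3 - 1            = 2   // == (a + c) >> 1
    (AC + b + 1) >> 1                        = (2 + 2 + 1) >> 1 = 2   // same result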
+static WEBP_INLINE void VE4(uint8_t* dst, const uint8_t* top) { // vertical
+ const __m128i one = _mm_set1_epi8(1);
+ const __m128i ABCDEFGH = _mm_loadl_epi64((__m128i*)(top - 1));
+ const __m128i BCDEFGH0 = _mm_srli_si128(ABCDEFGH, 1);
+ const __m128i CDEFGH00 = _mm_srli_si128(ABCDEFGH, 2);
+ const __m128i a = _mm_avg_epu8(ABCDEFGH, CDEFGH00);
+ const __m128i lsb = _mm_and_si128(_mm_xor_si128(ABCDEFGH, CDEFGH00), one);
+ const __m128i b = _mm_subs_epu8(a, lsb);
+ const __m128i avg = _mm_avg_epu8(b, BCDEFGH0);
+ const uint32_t vals = _mm_cvtsi128_si32(avg);
+ int i;
+ for (i = 0; i < 4; ++i) {
+ *(uint32_t*)(dst + i * BPS) = vals;
+ }
+}
+
+static WEBP_INLINE void HE4(uint8_t* dst, const uint8_t* top) { // horizontal
+ const int X = top[-1];
+ const int I = top[-2];
+ const int J = top[-3];
+ const int K = top[-4];
+ const int L = top[-5];
+ *(uint32_t*)(dst + 0 * BPS) = 0x01010101U * AVG3(X, I, J);
+ *(uint32_t*)(dst + 1 * BPS) = 0x01010101U * AVG3(I, J, K);
+ *(uint32_t*)(dst + 2 * BPS) = 0x01010101U * AVG3(J, K, L);
+ *(uint32_t*)(dst + 3 * BPS) = 0x01010101U * AVG3(K, L, L);
+}
+
+static WEBP_INLINE void DC4(uint8_t* dst, const uint8_t* top) {
+ uint32_t dc = 4;
+ int i;
+ for (i = 0; i < 4; ++i) dc += top[i] + top[-5 + i];
+ Fill(dst, dc >> 3, 4);
+}
+
+static WEBP_INLINE void LD4(uint8_t* dst, const uint8_t* top) { // Down-Left
+ const __m128i one = _mm_set1_epi8(1);
+ const __m128i ABCDEFGH = _mm_loadl_epi64((const __m128i*)top);
+ const __m128i BCDEFGH0 = _mm_srli_si128(ABCDEFGH, 1);
+ const __m128i CDEFGH00 = _mm_srli_si128(ABCDEFGH, 2);
+ const __m128i CDEFGHH0 = _mm_insert_epi16(CDEFGH00, top[7], 3);
+ const __m128i avg1 = _mm_avg_epu8(ABCDEFGH, CDEFGHH0);
+ const __m128i lsb = _mm_and_si128(_mm_xor_si128(ABCDEFGH, CDEFGHH0), one);
+ const __m128i avg2 = _mm_subs_epu8(avg1, lsb);
+ const __m128i abcdefg = _mm_avg_epu8(avg2, BCDEFGH0);
+ *(uint32_t*)(dst + 0 * BPS) = _mm_cvtsi128_si32( abcdefg );
+ *(uint32_t*)(dst + 1 * BPS) = _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 1));
+ *(uint32_t*)(dst + 2 * BPS) = _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 2));
+ *(uint32_t*)(dst + 3 * BPS) = _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 3));
+}
+
+static WEBP_INLINE void VR4(uint8_t* dst,
+ const uint8_t* top) { // Vertical-Right
+ const __m128i one = _mm_set1_epi8(1);
+ const int I = top[-2];
+ const int J = top[-3];
+ const int K = top[-4];
+ const int X = top[-1];
+ const __m128i XABCD = _mm_loadl_epi64((const __m128i*)(top - 1));
+ const __m128i ABCD0 = _mm_srli_si128(XABCD, 1);
+ const __m128i abcd = _mm_avg_epu8(XABCD, ABCD0);
+ const __m128i _XABCD = _mm_slli_si128(XABCD, 1);
+ const __m128i IXABCD = _mm_insert_epi16(_XABCD, I | (X << 8), 0);
+ const __m128i avg1 = _mm_avg_epu8(IXABCD, ABCD0);
+ const __m128i lsb = _mm_and_si128(_mm_xor_si128(IXABCD, ABCD0), one);
+ const __m128i avg2 = _mm_subs_epu8(avg1, lsb);
+ const __m128i efgh = _mm_avg_epu8(avg2, XABCD);
+ *(uint32_t*)(dst + 0 * BPS) = _mm_cvtsi128_si32( abcd );
+ *(uint32_t*)(dst + 1 * BPS) = _mm_cvtsi128_si32( efgh );
+ *(uint32_t*)(dst + 2 * BPS) = _mm_cvtsi128_si32(_mm_slli_si128(abcd, 1));
+ *(uint32_t*)(dst + 3 * BPS) = _mm_cvtsi128_si32(_mm_slli_si128(efgh, 1));
+
+ // these two are hard to implement in SSE2, so we keep the C-version:
+ DST(0, 2) = AVG3(J, I, X);
+ DST(0, 3) = AVG3(K, J, I);
+}
+
+static WEBP_INLINE void VL4(uint8_t* dst,
+ const uint8_t* top) { // Vertical-Left
+ const __m128i one = _mm_set1_epi8(1);
+ const __m128i ABCDEFGH = _mm_loadl_epi64((const __m128i*)top);
+ const __m128i BCDEFGH_ = _mm_srli_si128(ABCDEFGH, 1);
+ const __m128i CDEFGH__ = _mm_srli_si128(ABCDEFGH, 2);
+ const __m128i avg1 = _mm_avg_epu8(ABCDEFGH, BCDEFGH_);
+ const __m128i avg2 = _mm_avg_epu8(CDEFGH__, BCDEFGH_);
+ const __m128i avg3 = _mm_avg_epu8(avg1, avg2);
+ const __m128i lsb1 = _mm_and_si128(_mm_xor_si128(avg1, avg2), one);
+ const __m128i ab = _mm_xor_si128(ABCDEFGH, BCDEFGH_);
+ const __m128i bc = _mm_xor_si128(CDEFGH__, BCDEFGH_);
+ const __m128i abbc = _mm_or_si128(ab, bc);
+ const __m128i lsb2 = _mm_and_si128(abbc, lsb1);
+ const __m128i avg4 = _mm_subs_epu8(avg3, lsb2);
+ const uint32_t extra_out = _mm_cvtsi128_si32(_mm_srli_si128(avg4, 4));
+ *(uint32_t*)(dst + 0 * BPS) = _mm_cvtsi128_si32( avg1 );
+ *(uint32_t*)(dst + 1 * BPS) = _mm_cvtsi128_si32( avg4 );
+ *(uint32_t*)(dst + 2 * BPS) = _mm_cvtsi128_si32(_mm_srli_si128(avg1, 1));
+ *(uint32_t*)(dst + 3 * BPS) = _mm_cvtsi128_si32(_mm_srli_si128(avg4, 1));
+
+ // these two are hard to get and irregular
+ DST(3, 2) = (extra_out >> 0) & 0xff;
+ DST(3, 3) = (extra_out >> 8) & 0xff;
+}
+
+static WEBP_INLINE void RD4(uint8_t* dst, const uint8_t* top) { // Down-right
+ const __m128i one = _mm_set1_epi8(1);
+ const __m128i LKJIXABC = _mm_loadl_epi64((const __m128i*)(top - 5));
+ const __m128i LKJIXABCD = _mm_insert_epi16(LKJIXABC, top[3], 4);
+ const __m128i KJIXABCD_ = _mm_srli_si128(LKJIXABCD, 1);
+ const __m128i JIXABCD__ = _mm_srli_si128(LKJIXABCD, 2);
+ const __m128i avg1 = _mm_avg_epu8(JIXABCD__, LKJIXABCD);
+ const __m128i lsb = _mm_and_si128(_mm_xor_si128(JIXABCD__, LKJIXABCD), one);
+ const __m128i avg2 = _mm_subs_epu8(avg1, lsb);
+ const __m128i abcdefg = _mm_avg_epu8(avg2, KJIXABCD_);
+ *(uint32_t*)(dst + 3 * BPS) = _mm_cvtsi128_si32( abcdefg );
+ *(uint32_t*)(dst + 2 * BPS) = _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 1));
+ *(uint32_t*)(dst + 1 * BPS) = _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 2));
+ *(uint32_t*)(dst + 0 * BPS) = _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 3));
+}
+
+static WEBP_INLINE void HU4(uint8_t* dst, const uint8_t* top) {
+ const int I = top[-2];
+ const int J = top[-3];
+ const int K = top[-4];
+ const int L = top[-5];
+ DST(0, 0) = AVG2(I, J);
+ DST(2, 0) = DST(0, 1) = AVG2(J, K);
+ DST(2, 1) = DST(0, 2) = AVG2(K, L);
+ DST(1, 0) = AVG3(I, J, K);
+ DST(3, 0) = DST(1, 1) = AVG3(J, K, L);
+ DST(3, 1) = DST(1, 2) = AVG3(K, L, L);
+ DST(3, 2) = DST(2, 2) =
+ DST(0, 3) = DST(1, 3) = DST(2, 3) = DST(3, 3) = L;
+}
+
+static WEBP_INLINE void HD4(uint8_t* dst, const uint8_t* top) {
+ const int X = top[-1];
+ const int I = top[-2];
+ const int J = top[-3];
+ const int K = top[-4];
+ const int L = top[-5];
+ const int A = top[0];
+ const int B = top[1];
+ const int C = top[2];
+
+ DST(0, 0) = DST(2, 1) = AVG2(I, X);
+ DST(0, 1) = DST(2, 2) = AVG2(J, I);
+ DST(0, 2) = DST(2, 3) = AVG2(K, J);
+ DST(0, 3) = AVG2(L, K);
+
+ DST(3, 0) = AVG3(A, B, C);
+ DST(2, 0) = AVG3(X, A, B);
+ DST(1, 0) = DST(3, 1) = AVG3(I, X, A);
+ DST(1, 1) = DST(3, 2) = AVG3(J, I, X);
+ DST(1, 2) = DST(3, 3) = AVG3(K, J, I);
+ DST(1, 3) = AVG3(L, K, J);
+}
+
+static WEBP_INLINE void TM4(uint8_t* dst, const uint8_t* top) {
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i top_values = _mm_cvtsi32_si128(MemToUint32(top));
+ const __m128i top_base = _mm_unpacklo_epi8(top_values, zero);
+ int y;
+ for (y = 0; y < 4; ++y, dst += BPS) {
+ const int val = top[-2 - y] - top[-1];
+ const __m128i base = _mm_set1_epi16(val);
+ const __m128i out = _mm_packus_epi16(_mm_add_epi16(base, top_base), zero);
+ *(int*)dst = _mm_cvtsi128_si32(out);
+ }
+}
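A scalar sketch of what TM4 computes (clip255 is a hypothetical clamp to [0, 255]; the SSE2 code gets the clamp for free from _mm_packus_epi16). Per the layout note before Intra4Preds below, the left column lives at top[-2 - y] and the top-left pixel at top[-1]:

    for (y = 0; y < 4; ++y) {
      for (x = 0; x < 4; ++x) {
        dst[x + y * BPS] = clip255(top[x] + top[-2 - y] - top[-1]);
      }
    }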
+
+#undef DST
+#undef AVG3
+#undef AVG2
+
+//------------------------------------------------------------------------------
+// luma 4x4 prediction
+
+// Left samples are top[-5 .. -2], top_left is top[-1], top are
+// located at top[0..3], and top right is top[4..7]
+static void Intra4Preds(uint8_t* dst, const uint8_t* top) {
+ DC4(I4DC4 + dst, top);
+ TM4(I4TM4 + dst, top);
+ VE4(I4VE4 + dst, top);
+ HE4(I4HE4 + dst, top);
+ RD4(I4RD4 + dst, top);
+ VR4(I4VR4 + dst, top);
+ LD4(I4LD4 + dst, top);
+ VL4(I4VL4 + dst, top);
+ HD4(I4HD4 + dst, top);
+ HU4(I4HU4 + dst, top);
+}
+
+//------------------------------------------------------------------------------
+// Chroma 8x8 prediction (paragraph 12.2)
+
+static void IntraChromaPreds(uint8_t* dst, const uint8_t* left,
+ const uint8_t* top) {
+ // U block
+ DC8uvMode(C8DC8 + dst, left, top);
+ VerticalPred(C8VE8 + dst, top, 8);
+ HorizontalPred(C8HE8 + dst, left, 8);
+ TrueMotion(C8TM8 + dst, left, top, 8);
+ // V block
+ dst += 8;
+ if (top != NULL) top += 8;
+ if (left != NULL) left += 16;
+ DC8uvMode(C8DC8 + dst, left, top);
+ VerticalPred(C8VE8 + dst, top, 8);
+ HorizontalPred(C8HE8 + dst, left, 8);
+ TrueMotion(C8TM8 + dst, left, top, 8);
+}
+
+//------------------------------------------------------------------------------
+// luma 16x16 prediction (paragraph 12.3)
+
+static void Intra16Preds(uint8_t* dst,
+ const uint8_t* left, const uint8_t* top) {
+ DC16Mode(I16DC16 + dst, left, top);
+ VerticalPred(I16VE16 + dst, top, 16);
+ HorizontalPred(I16HE16 + dst, left, 16);
+ TrueMotion(I16TM16 + dst, left, top, 16);
}
//------------------------------------------------------------------------------
// Metric
-static int SSE4x4SSE2(const uint8_t* a, const uint8_t* b) {
- const __m128i zero = _mm_set1_epi16(0);
+static WEBP_INLINE void SubtractAndAccumulate(const __m128i a, const __m128i b,
+ __m128i* const sum) {
+ // take abs(a-b) in 8b
+ const __m128i a_b = _mm_subs_epu8(a, b);
+ const __m128i b_a = _mm_subs_epu8(b, a);
+ const __m128i abs_a_b = _mm_or_si128(a_b, b_a);
+ // zero-extend to 16b
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i C0 = _mm_unpacklo_epi8(abs_a_b, zero);
+ const __m128i C1 = _mm_unpackhi_epi8(abs_a_b, zero);
+ // multiply with self
+ const __m128i sum1 = _mm_madd_epi16(C0, C0);
+ const __m128i sum2 = _mm_madd_epi16(C1, C1);
+ *sum = _mm_add_epi32(sum1, sum2);
+}
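The pair of saturating subtractions is the usual trick for |a - b| on unsigned bytes, since whichever direction underflows saturates to zero; e.g. for bytes 3 and 10:

    _mm_subs_epu8(3, 10) = 0,  _mm_subs_epu8(10, 3) = 7,  OR of the two = 7 == |3 - 10|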
- // Load values.
- const __m128i a0 = _mm_loadl_epi64((__m128i*)&a[BPS * 0]);
- const __m128i a1 = _mm_loadl_epi64((__m128i*)&a[BPS * 1]);
- const __m128i a2 = _mm_loadl_epi64((__m128i*)&a[BPS * 2]);
- const __m128i a3 = _mm_loadl_epi64((__m128i*)&a[BPS * 3]);
- const __m128i b0 = _mm_loadl_epi64((__m128i*)&b[BPS * 0]);
- const __m128i b1 = _mm_loadl_epi64((__m128i*)&b[BPS * 1]);
- const __m128i b2 = _mm_loadl_epi64((__m128i*)&b[BPS * 2]);
- const __m128i b3 = _mm_loadl_epi64((__m128i*)&b[BPS * 3]);
+static WEBP_INLINE int SSE_16xN(const uint8_t* a, const uint8_t* b,
+ int num_pairs) {
+ __m128i sum = _mm_setzero_si128();
+ int32_t tmp[4];
+ int i;
+
+ for (i = 0; i < num_pairs; ++i) {
+ const __m128i a0 = _mm_loadu_si128((const __m128i*)&a[BPS * 0]);
+ const __m128i b0 = _mm_loadu_si128((const __m128i*)&b[BPS * 0]);
+ const __m128i a1 = _mm_loadu_si128((const __m128i*)&a[BPS * 1]);
+ const __m128i b1 = _mm_loadu_si128((const __m128i*)&b[BPS * 1]);
+ __m128i sum1, sum2;
+ SubtractAndAccumulate(a0, b0, &sum1);
+ SubtractAndAccumulate(a1, b1, &sum2);
+ sum = _mm_add_epi32(sum, _mm_add_epi32(sum1, sum2));
+ a += 2 * BPS;
+ b += 2 * BPS;
+ }
+ _mm_storeu_si128((__m128i*)tmp, sum);
+ return (tmp[3] + tmp[2] + tmp[1] + tmp[0]);
+}
+
+static int SSE16x16(const uint8_t* a, const uint8_t* b) {
+ return SSE_16xN(a, b, 8);
+}
- // Combine pair of lines and convert to 16b.
+static int SSE16x8(const uint8_t* a, const uint8_t* b) {
+ return SSE_16xN(a, b, 4);
+}
+
+#define LOAD_8x16b(ptr) \
+ _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(ptr)), zero)
+
+static int SSE8x8(const uint8_t* a, const uint8_t* b) {
+ const __m128i zero = _mm_setzero_si128();
+ int num_pairs = 4;
+ __m128i sum = zero;
+ int32_t tmp[4];
+ while (num_pairs-- > 0) {
+ const __m128i a0 = LOAD_8x16b(&a[BPS * 0]);
+ const __m128i a1 = LOAD_8x16b(&a[BPS * 1]);
+ const __m128i b0 = LOAD_8x16b(&b[BPS * 0]);
+ const __m128i b1 = LOAD_8x16b(&b[BPS * 1]);
+ // subtract
+ const __m128i c0 = _mm_subs_epi16(a0, b0);
+ const __m128i c1 = _mm_subs_epi16(a1, b1);
+ // multiply/accumulate with self
+ const __m128i d0 = _mm_madd_epi16(c0, c0);
+ const __m128i d1 = _mm_madd_epi16(c1, c1);
+ // collect
+ const __m128i sum01 = _mm_add_epi32(d0, d1);
+ sum = _mm_add_epi32(sum, sum01);
+ a += 2 * BPS;
+ b += 2 * BPS;
+ }
+ _mm_storeu_si128((__m128i*)tmp, sum);
+ return (tmp[3] + tmp[2] + tmp[1] + tmp[0]);
+}
+#undef LOAD_8x16b
+
+static int SSE4x4(const uint8_t* a, const uint8_t* b) {
+ const __m128i zero = _mm_setzero_si128();
+
+ // Load values. Note that we read 8 pixels instead of 4,
+ // but the a/b buffers are over-allocated to that effect.
+ const __m128i a0 = _mm_loadl_epi64((const __m128i*)&a[BPS * 0]);
+ const __m128i a1 = _mm_loadl_epi64((const __m128i*)&a[BPS * 1]);
+ const __m128i a2 = _mm_loadl_epi64((const __m128i*)&a[BPS * 2]);
+ const __m128i a3 = _mm_loadl_epi64((const __m128i*)&a[BPS * 3]);
+ const __m128i b0 = _mm_loadl_epi64((const __m128i*)&b[BPS * 0]);
+ const __m128i b1 = _mm_loadl_epi64((const __m128i*)&b[BPS * 1]);
+ const __m128i b2 = _mm_loadl_epi64((const __m128i*)&b[BPS * 2]);
+ const __m128i b3 = _mm_loadl_epi64((const __m128i*)&b[BPS * 3]);
+ // Combine pair of lines.
const __m128i a01 = _mm_unpacklo_epi32(a0, a1);
const __m128i a23 = _mm_unpacklo_epi32(a2, a3);
const __m128i b01 = _mm_unpacklo_epi32(b0, b1);
const __m128i b23 = _mm_unpacklo_epi32(b2, b3);
+ // Convert to 16b.
const __m128i a01s = _mm_unpacklo_epi8(a01, zero);
const __m128i a23s = _mm_unpacklo_epi8(a23, zero);
const __m128i b01s = _mm_unpacklo_epi8(b01, zero);
const __m128i b23s = _mm_unpacklo_epi8(b23, zero);
+ // subtract, square and accumulate
+ const __m128i d0 = _mm_subs_epi16(a01s, b01s);
+ const __m128i d1 = _mm_subs_epi16(a23s, b23s);
+ const __m128i e0 = _mm_madd_epi16(d0, d0);
+ const __m128i e1 = _mm_madd_epi16(d1, d1);
+ const __m128i sum = _mm_add_epi32(e0, e1);
- // Compute differences; (a-b)^2 = (abs(a-b))^2 = (sat8(a-b) + sat8(b-a))^2
- // TODO(cduvivier): Dissassemble and figure out why this is fastest. We don't
- // need absolute values, there is no need to do calculation
- // in 8bit as we are already in 16bit, ... Yet this is what
- // benchmarks the fastest!
- const __m128i d0 = _mm_subs_epu8(a01s, b01s);
- const __m128i d1 = _mm_subs_epu8(b01s, a01s);
- const __m128i d2 = _mm_subs_epu8(a23s, b23s);
- const __m128i d3 = _mm_subs_epu8(b23s, a23s);
-
- // Square and add them all together.
- const __m128i madd0 = _mm_madd_epi16(d0, d0);
- const __m128i madd1 = _mm_madd_epi16(d1, d1);
- const __m128i madd2 = _mm_madd_epi16(d2, d2);
- const __m128i madd3 = _mm_madd_epi16(d3, d3);
- const __m128i sum0 = _mm_add_epi32(madd0, madd1);
- const __m128i sum1 = _mm_add_epi32(madd2, madd3);
- const __m128i sum2 = _mm_add_epi32(sum0, sum1);
int32_t tmp[4];
- _mm_storeu_si128((__m128i*)tmp, sum2);
+ _mm_storeu_si128((__m128i*)tmp, sum);
return (tmp[3] + tmp[2] + tmp[1] + tmp[0]);
}
@@ -497,24 +1155,22 @@ static int SSE4x4SSE2(const uint8_t* a, const uint8_t* b) {
// Hadamard transform
// Returns the difference between the weighted sum of the absolute value of
// transformed coefficients.
-static int TTransformSSE2(const uint8_t* inA, const uint8_t* inB,
- const uint16_t* const w) {
+static int TTransform(const uint8_t* inA, const uint8_t* inB,
+ const uint16_t* const w) {
int32_t sum[4];
__m128i tmp_0, tmp_1, tmp_2, tmp_3;
const __m128i zero = _mm_setzero_si128();
- const __m128i one = _mm_set1_epi16(1);
- const __m128i three = _mm_set1_epi16(3);
- // Load, combine and tranpose inputs.
+ // Load, combine and transpose inputs.
{
- const __m128i inA_0 = _mm_loadl_epi64((__m128i*)&inA[BPS * 0]);
- const __m128i inA_1 = _mm_loadl_epi64((__m128i*)&inA[BPS * 1]);
- const __m128i inA_2 = _mm_loadl_epi64((__m128i*)&inA[BPS * 2]);
- const __m128i inA_3 = _mm_loadl_epi64((__m128i*)&inA[BPS * 3]);
- const __m128i inB_0 = _mm_loadl_epi64((__m128i*)&inB[BPS * 0]);
- const __m128i inB_1 = _mm_loadl_epi64((__m128i*)&inB[BPS * 1]);
- const __m128i inB_2 = _mm_loadl_epi64((__m128i*)&inB[BPS * 2]);
- const __m128i inB_3 = _mm_loadl_epi64((__m128i*)&inB[BPS * 3]);
+ const __m128i inA_0 = _mm_loadl_epi64((const __m128i*)&inA[BPS * 0]);
+ const __m128i inA_1 = _mm_loadl_epi64((const __m128i*)&inA[BPS * 1]);
+ const __m128i inA_2 = _mm_loadl_epi64((const __m128i*)&inA[BPS * 2]);
+ const __m128i inA_3 = _mm_loadl_epi64((const __m128i*)&inA[BPS * 3]);
+ const __m128i inB_0 = _mm_loadl_epi64((const __m128i*)&inB[BPS * 0]);
+ const __m128i inB_1 = _mm_loadl_epi64((const __m128i*)&inB[BPS * 1]);
+ const __m128i inB_2 = _mm_loadl_epi64((const __m128i*)&inB[BPS * 2]);
+ const __m128i inB_3 = _mm_loadl_epi64((const __m128i*)&inB[BPS * 3]);
// Combine inA and inB (we'll do two transforms in parallel).
const __m128i inAB_0 = _mm_unpacklo_epi8(inA_0, inB_0);
@@ -550,17 +1206,14 @@ static int TTransformSSE2(const uint8_t* inA, const uint8_t* inB,
// Horizontal pass and subsequent transpose.
{
// Calculate a and b (two 4x4 at once).
- const __m128i a0 = _mm_slli_epi16(_mm_add_epi16(tmp_0, tmp_2), 2);
- const __m128i a1 = _mm_slli_epi16(_mm_add_epi16(tmp_1, tmp_3), 2);
- const __m128i a2 = _mm_slli_epi16(_mm_sub_epi16(tmp_1, tmp_3), 2);
- const __m128i a3 = _mm_slli_epi16(_mm_sub_epi16(tmp_0, tmp_2), 2);
- // b0_extra = (a0 != 0);
- const __m128i b0_extra = _mm_andnot_si128(_mm_cmpeq_epi16 (a0, zero), one);
- const __m128i b0_base = _mm_add_epi16(a0, a1);
+ const __m128i a0 = _mm_add_epi16(tmp_0, tmp_2);
+ const __m128i a1 = _mm_add_epi16(tmp_1, tmp_3);
+ const __m128i a2 = _mm_sub_epi16(tmp_1, tmp_3);
+ const __m128i a3 = _mm_sub_epi16(tmp_0, tmp_2);
+ const __m128i b0 = _mm_add_epi16(a0, a1);
const __m128i b1 = _mm_add_epi16(a3, a2);
const __m128i b2 = _mm_sub_epi16(a3, a2);
const __m128i b3 = _mm_sub_epi16(a0, a1);
- const __m128i b0 = _mm_add_epi16(b0_base, b0_extra);
// a00 a01 a02 a03 b00 b01 b02 b03
// a10 a11 a12 a13 b10 b11 b12 b13
// a20 a21 a22 a23 b20 b21 b22 b23
@@ -598,8 +1251,8 @@ static int TTransformSSE2(const uint8_t* inA, const uint8_t* inB,
// Load all inputs.
// TODO(cduvivier): Make variable declarations and allocations aligned so
// we can use _mm_load_si128 instead of _mm_loadu_si128.
- const __m128i w_0 = _mm_loadu_si128((__m128i*)&w[0]);
- const __m128i w_8 = _mm_loadu_si128((__m128i*)&w[8]);
+ const __m128i w_0 = _mm_loadu_si128((const __m128i*)&w[0]);
+ const __m128i w_8 = _mm_loadu_si128((const __m128i*)&w[8]);
// Calculate a and b (two 4x4 at once).
const __m128i a0 = _mm_add_epi16(tmp_0, tmp_2);
@@ -618,36 +1271,16 @@ static int TTransformSSE2(const uint8_t* inA, const uint8_t* inB,
__m128i B_b2 = _mm_unpackhi_epi64(b2, b3);
{
- // sign(b) = b >> 15 (0x0000 if positive, 0xffff if negative)
- const __m128i sign_A_b0 = _mm_srai_epi16(A_b0, 15);
- const __m128i sign_A_b2 = _mm_srai_epi16(A_b2, 15);
- const __m128i sign_B_b0 = _mm_srai_epi16(B_b0, 15);
- const __m128i sign_B_b2 = _mm_srai_epi16(B_b2, 15);
-
- // b = abs(b) = (b ^ sign) - sign
- A_b0 = _mm_xor_si128(A_b0, sign_A_b0);
- A_b2 = _mm_xor_si128(A_b2, sign_A_b2);
- B_b0 = _mm_xor_si128(B_b0, sign_B_b0);
- B_b2 = _mm_xor_si128(B_b2, sign_B_b2);
- A_b0 = _mm_sub_epi16(A_b0, sign_A_b0);
- A_b2 = _mm_sub_epi16(A_b2, sign_A_b2);
- B_b0 = _mm_sub_epi16(B_b0, sign_B_b0);
- B_b2 = _mm_sub_epi16(B_b2, sign_B_b2);
+ const __m128i d0 = _mm_sub_epi16(zero, A_b0);
+ const __m128i d1 = _mm_sub_epi16(zero, A_b2);
+ const __m128i d2 = _mm_sub_epi16(zero, B_b0);
+ const __m128i d3 = _mm_sub_epi16(zero, B_b2);
+ A_b0 = _mm_max_epi16(A_b0, d0); // abs(v), 16b
+ A_b2 = _mm_max_epi16(A_b2, d1);
+ B_b0 = _mm_max_epi16(B_b0, d2);
+ B_b2 = _mm_max_epi16(B_b2, d3);
}
- // b = abs(b) + 3
- A_b0 = _mm_add_epi16(A_b0, three);
- A_b2 = _mm_add_epi16(A_b2, three);
- B_b0 = _mm_add_epi16(B_b0, three);
- B_b2 = _mm_add_epi16(B_b2, three);
-
- // abs((b + (b<0) + 3) >> 3) = (abs(b) + 3) >> 3
- // b = (abs(b) + 3) >> 3
- A_b0 = _mm_srai_epi16(A_b0, 3);
- A_b2 = _mm_srai_epi16(A_b2, 3);
- B_b0 = _mm_srai_epi16(B_b0, 3);
- B_b2 = _mm_srai_epi16(B_b2, 3);
-
// weighted sums
A_b0 = _mm_madd_epi16(A_b0, w_0);
A_b2 = _mm_madd_epi16(A_b2, w_8);
@@ -663,35 +1296,33 @@ static int TTransformSSE2(const uint8_t* inA, const uint8_t* inB,
return sum[0] + sum[1] + sum[2] + sum[3];
}
-static int Disto4x4SSE2(const uint8_t* const a, const uint8_t* const b,
- const uint16_t* const w) {
- const int diff_sum = TTransformSSE2(a, b, w);
- return (abs(diff_sum) + 8) >> 4;
+static int Disto4x4(const uint8_t* const a, const uint8_t* const b,
+ const uint16_t* const w) {
+ const int diff_sum = TTransform(a, b, w);
+ return abs(diff_sum) >> 5;
}
-static int Disto16x16SSE2(const uint8_t* const a, const uint8_t* const b,
- const uint16_t* const w) {
+static int Disto16x16(const uint8_t* const a, const uint8_t* const b,
+ const uint16_t* const w) {
int D = 0;
int x, y;
for (y = 0; y < 16 * BPS; y += 4 * BPS) {
for (x = 0; x < 16; x += 4) {
- D += Disto4x4SSE2(a + x + y, b + x + y, w);
+ D += Disto4x4(a + x + y, b + x + y, w);
}
}
return D;
}
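Disto4x4()/Disto16x16() above measure the difference in weighted Walsh-Hadamard energy between blocks a and b. A rough scalar equivalent, hedged as a sketch (TTransformOne is an illustrative name; BPS is the usual row stride and abs() comes from <stdlib.h>):

static int TTransformOne(const uint8_t* in, const uint16_t* const w) {
  int sum = 0;
  int tmp[16];
  int i;
  for (i = 0; i < 4; ++i, in += BPS) {    // horizontal pass
    const int a0 = in[0] + in[2];
    const int a1 = in[1] + in[3];
    const int a2 = in[1] - in[3];
    const int a3 = in[0] - in[2];
    tmp[0 + i * 4] = a0 + a1;
    tmp[1 + i * 4] = a3 + a2;
    tmp[2 + i * 4] = a3 - a2;
    tmp[3 + i * 4] = a0 - a1;
  }
  for (i = 0; i < 4; ++i, ++w) {          // vertical pass + weighted abs-sum
    const int a0 = tmp[0 + i] + tmp[8 + i];
    const int a1 = tmp[4 + i] + tmp[12 + i];
    const int a2 = tmp[4 + i] - tmp[12 + i];
    const int a3 = tmp[0 + i] - tmp[8 + i];
    sum += w[0] * abs(a0 + a1) + w[4] * abs(a3 + a2) +
           w[8] * abs(a3 - a2) + w[12] * abs(a0 - a1);
  }
  return sum;
}

Disto4x4(a, b, w) is then roughly abs(TTransformOne(b, w) - TTransformOne(a, w)) >> 5; the SSE2 TTransform computes both weighted sums in parallel and returns their difference directly.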
-
//------------------------------------------------------------------------------
// Quantization
//
-// Simple quantization
-static int QuantizeBlockSSE2(int16_t in[16], int16_t out[16],
- int n, const VP8Matrix* const mtx) {
- const __m128i max_coeff_2047 = _mm_set1_epi16(2047);
- const __m128i zero = _mm_set1_epi16(0);
- __m128i sign0, sign8;
+static WEBP_INLINE int DoQuantizeBlock(int16_t in[16], int16_t out[16],
+ const uint16_t* const sharpen,
+ const VP8Matrix* const mtx) {
+ const __m128i max_coeff_2047 = _mm_set1_epi16(MAX_LEVEL);
+ const __m128i zero = _mm_setzero_si128();
__m128i coeff0, coeff8;
__m128i out0, out8;
__m128i packed_out;
@@ -701,20 +1332,14 @@ static int QuantizeBlockSSE2(int16_t in[16], int16_t out[16],
// we can use _mm_load_si128 instead of _mm_loadu_si128.
__m128i in0 = _mm_loadu_si128((__m128i*)&in[0]);
__m128i in8 = _mm_loadu_si128((__m128i*)&in[8]);
- const __m128i sharpen0 = _mm_loadu_si128((__m128i*)&mtx->sharpen_[0]);
- const __m128i sharpen8 = _mm_loadu_si128((__m128i*)&mtx->sharpen_[8]);
- const __m128i iq0 = _mm_loadu_si128((__m128i*)&mtx->iq_[0]);
- const __m128i iq8 = _mm_loadu_si128((__m128i*)&mtx->iq_[8]);
- const __m128i bias0 = _mm_loadu_si128((__m128i*)&mtx->bias_[0]);
- const __m128i bias8 = _mm_loadu_si128((__m128i*)&mtx->bias_[8]);
- const __m128i q0 = _mm_loadu_si128((__m128i*)&mtx->q_[0]);
- const __m128i q8 = _mm_loadu_si128((__m128i*)&mtx->q_[8]);
- const __m128i zthresh0 = _mm_loadu_si128((__m128i*)&mtx->zthresh_[0]);
- const __m128i zthresh8 = _mm_loadu_si128((__m128i*)&mtx->zthresh_[8]);
-
- // sign(in) = in >> 15 (0x0000 if positive, 0xffff if negative)
- sign0 = _mm_srai_epi16(in0, 15);
- sign8 = _mm_srai_epi16(in8, 15);
+ const __m128i iq0 = _mm_loadu_si128((const __m128i*)&mtx->iq_[0]);
+ const __m128i iq8 = _mm_loadu_si128((const __m128i*)&mtx->iq_[8]);
+ const __m128i q0 = _mm_loadu_si128((const __m128i*)&mtx->q_[0]);
+ const __m128i q8 = _mm_loadu_si128((const __m128i*)&mtx->q_[8]);
+
+ // extract sign(in) (0x0000 if positive, 0xffff if negative)
+ const __m128i sign0 = _mm_cmpgt_epi16(zero, in0);
+ const __m128i sign8 = _mm_cmpgt_epi16(zero, in8);
// coeff = abs(in) = (in ^ sign) - sign
coeff0 = _mm_xor_si128(in0, sign0);
@@ -723,43 +1348,47 @@ static int QuantizeBlockSSE2(int16_t in[16], int16_t out[16],
coeff8 = _mm_sub_epi16(coeff8, sign8);
// coeff = abs(in) + sharpen
- coeff0 = _mm_add_epi16(coeff0, sharpen0);
- coeff8 = _mm_add_epi16(coeff8, sharpen8);
-
- // if (coeff > 2047) coeff = 2047
- coeff0 = _mm_min_epi16(coeff0, max_coeff_2047);
- coeff8 = _mm_min_epi16(coeff8, max_coeff_2047);
+ if (sharpen != NULL) {
+ const __m128i sharpen0 = _mm_loadu_si128((const __m128i*)&sharpen[0]);
+ const __m128i sharpen8 = _mm_loadu_si128((const __m128i*)&sharpen[8]);
+ coeff0 = _mm_add_epi16(coeff0, sharpen0);
+ coeff8 = _mm_add_epi16(coeff8, sharpen8);
+ }
- // out = (coeff * iQ + B) >> QFIX;
+ // out = (coeff * iQ + B) >> QFIX
{
// doing calculations with 32b precision (QFIX=17)
// out = (coeff * iQ)
- __m128i coeff_iQ0H = _mm_mulhi_epu16(coeff0, iq0);
- __m128i coeff_iQ0L = _mm_mullo_epi16(coeff0, iq0);
- __m128i coeff_iQ8H = _mm_mulhi_epu16(coeff8, iq8);
- __m128i coeff_iQ8L = _mm_mullo_epi16(coeff8, iq8);
+ const __m128i coeff_iQ0H = _mm_mulhi_epu16(coeff0, iq0);
+ const __m128i coeff_iQ0L = _mm_mullo_epi16(coeff0, iq0);
+ const __m128i coeff_iQ8H = _mm_mulhi_epu16(coeff8, iq8);
+ const __m128i coeff_iQ8L = _mm_mullo_epi16(coeff8, iq8);
__m128i out_00 = _mm_unpacklo_epi16(coeff_iQ0L, coeff_iQ0H);
__m128i out_04 = _mm_unpackhi_epi16(coeff_iQ0L, coeff_iQ0H);
__m128i out_08 = _mm_unpacklo_epi16(coeff_iQ8L, coeff_iQ8H);
__m128i out_12 = _mm_unpackhi_epi16(coeff_iQ8L, coeff_iQ8H);
- // expand bias from 16b to 32b
- __m128i bias_00 = _mm_unpacklo_epi16(bias0, zero);
- __m128i bias_04 = _mm_unpackhi_epi16(bias0, zero);
- __m128i bias_08 = _mm_unpacklo_epi16(bias8, zero);
- __m128i bias_12 = _mm_unpackhi_epi16(bias8, zero);
// out = (coeff * iQ + B)
+ const __m128i bias_00 = _mm_loadu_si128((const __m128i*)&mtx->bias_[0]);
+ const __m128i bias_04 = _mm_loadu_si128((const __m128i*)&mtx->bias_[4]);
+ const __m128i bias_08 = _mm_loadu_si128((const __m128i*)&mtx->bias_[8]);
+ const __m128i bias_12 = _mm_loadu_si128((const __m128i*)&mtx->bias_[12]);
out_00 = _mm_add_epi32(out_00, bias_00);
out_04 = _mm_add_epi32(out_04, bias_04);
out_08 = _mm_add_epi32(out_08, bias_08);
out_12 = _mm_add_epi32(out_12, bias_12);
- // out = (coeff * iQ + B) >> QFIX;
+ // out = QUANTDIV(coeff, iQ, B, QFIX)
out_00 = _mm_srai_epi32(out_00, QFIX);
out_04 = _mm_srai_epi32(out_04, QFIX);
out_08 = _mm_srai_epi32(out_08, QFIX);
out_12 = _mm_srai_epi32(out_12, QFIX);
+
// pack result as 16b
out0 = _mm_packs_epi32(out_00, out_04);
out8 = _mm_packs_epi32(out_08, out_12);
+
+ // if (coeff > 2047) coeff = 2047
+ out0 = _mm_min_epi16(out0, max_coeff_2047);
+ out8 = _mm_min_epi16(out8, max_coeff_2047);
}
// get sign back (if (sign[j]) out_n = -out_n)
@@ -772,17 +1401,8 @@ static int QuantizeBlockSSE2(int16_t in[16], int16_t out[16],
in0 = _mm_mullo_epi16(out0, q0);
in8 = _mm_mullo_epi16(out8, q8);
- // if (coeff <= mtx->zthresh_) {in=0; out=0;}
- {
- __m128i cmp0 = _mm_cmpgt_epi16(coeff0, zthresh0);
- __m128i cmp8 = _mm_cmpgt_epi16(coeff8, zthresh8);
- in0 = _mm_and_si128(in0, cmp0);
- in8 = _mm_and_si128(in8, cmp8);
- _mm_storeu_si128((__m128i*)&in[0], in0);
- _mm_storeu_si128((__m128i*)&in[8], in8);
- out0 = _mm_and_si128(out0, cmp0);
- out8 = _mm_and_si128(out8, cmp8);
- }
+ _mm_storeu_si128((__m128i*)&in[0], in0);
+ _mm_storeu_si128((__m128i*)&in[8], in8);
// zigzag the output before storing it.
//
@@ -809,29 +1429,55 @@ static int QuantizeBlockSSE2(int16_t in[16], int16_t out[16],
}
// detect if all 'out' values are zeroes or not
- {
- int32_t tmp[4];
- _mm_storeu_si128((__m128i*)tmp, packed_out);
- if (n) {
- tmp[0] &= ~0xff;
- }
- return (tmp[3] || tmp[2] || tmp[1] || tmp[0]);
- }
+ return (_mm_movemask_epi8(_mm_cmpeq_epi8(packed_out, zero)) != 0xffff);
+}
+
+static int QuantizeBlock(int16_t in[16], int16_t out[16],
+ const VP8Matrix* const mtx) {
+ return DoQuantizeBlock(in, out, &mtx->sharpen_[0], mtx);
+}
+
+static int QuantizeBlockWHT(int16_t in[16], int16_t out[16],
+ const VP8Matrix* const mtx) {
+ return DoQuantizeBlock(in, out, NULL, mtx);
+}
+
+static int Quantize2Blocks(int16_t in[32], int16_t out[32],
+ const VP8Matrix* const mtx) {
+ int nz;
+ const uint16_t* const sharpen = &mtx->sharpen_[0];
+ nz = DoQuantizeBlock(in + 0 * 16, out + 0 * 16, sharpen, mtx) << 0;
+ nz |= DoQuantizeBlock(in + 1 * 16, out + 1 * 16, sharpen, mtx) << 1;
+ return nz;
}
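Per coefficient j, DoQuantizeBlock() above computes level = min(((|in[j]| + sharpen[j]) * iq[j] + bias[j]) >> QFIX, MAX_LEVEL), restores the sign, writes the dequantized value level * q[j] back into in[], zigzags the levels into out[], and reports whether any level is non-zero. A hedged scalar sketch of a single coefficient (the helper name is illustrative, field names follow VP8Matrix, and the zigzag step is omitted):

static int QuantizeCoeff(int16_t in[16], int16_t out[16], int j,
                         const uint16_t* const sharpen,
                         const VP8Matrix* const mtx) {
  const int sign = (in[j] < 0);
  uint32_t coeff = sign ? -in[j] : in[j];            // abs(in[j])
  int level;
  if (sharpen != NULL) coeff += sharpen[j];          // optional sharpening bias
  level = (int)((coeff * mtx->iq_[j] + mtx->bias_[j]) >> QFIX);  // QFIX == 17
  if (level > MAX_LEVEL) level = MAX_LEVEL;          // clamp to 2047
  if (sign) level = -level;
  in[j] = level * mtx->q_[j];                        // dequantized reconstruction
  out[j] = level;                                    // zigzag reordering omitted
  return (level != 0);                               // feeds the non-zero flag
}

Note that, unlike the old QuantizeBlockSSE2, the clamp is applied to the quantized level after the shift, and the zthresh_ test is gone entirely.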
+//------------------------------------------------------------------------------
+// Entry point
+
extern void VP8EncDspInitSSE2(void);
-void VP8EncDspInitSSE2(void) {
- VP8CollectHistogram = CollectHistogramSSE2;
- VP8EncQuantizeBlock = QuantizeBlockSSE2;
- VP8ITransform = ITransformSSE2;
- VP8FTransform = FTransformSSE2;
- VP8SSE4x4 = SSE4x4SSE2;
- VP8TDisto4x4 = Disto4x4SSE2;
- VP8TDisto16x16 = Disto16x16SSE2;
-}
-
-#if defined(__cplusplus) || defined(c_plusplus)
-} // extern "C"
-#endif
-#endif // WEBP_USE_SSE2
+WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInitSSE2(void) {
+ VP8CollectHistogram = CollectHistogram;
+ VP8EncPredLuma16 = Intra16Preds;
+ VP8EncPredChroma8 = IntraChromaPreds;
+ VP8EncPredLuma4 = Intra4Preds;
+ VP8EncQuantizeBlock = QuantizeBlock;
+ VP8EncQuantize2Blocks = Quantize2Blocks;
+ VP8EncQuantizeBlockWHT = QuantizeBlockWHT;
+ VP8ITransform = ITransform;
+ VP8FTransform = FTransform;
+ VP8FTransform2 = FTransform2;
+ VP8FTransformWHT = FTransformWHT;
+ VP8SSE16x16 = SSE16x16;
+ VP8SSE16x8 = SSE16x8;
+ VP8SSE8x8 = SSE8x8;
+ VP8SSE4x4 = SSE4x4;
+ VP8TDisto4x4 = Disto4x4;
+ VP8TDisto16x16 = Disto16x16;
+}
+
+#else // !WEBP_USE_SSE2
+
+WEBP_DSP_INIT_STUB(VP8EncDspInitSSE2)
+
+#endif // WEBP_USE_SSE2
diff --git a/drivers/webp/dsp/lossless.c b/drivers/webp/dsp/lossless.c
index 62a6b7b15a..5702eb3b17 100644
--- a/drivers/webp/dsp/lossless.c
+++ b/drivers/webp/dsp/lossless.c
@@ -1,8 +1,10 @@
// Copyright 2012 Google Inc. All Rights Reserved.
//
-// This code is licensed under the same terms as WebM:
-// Software License Agreement: http://www.webmproject.org/license/software/
-// Additional IP Rights Grant: http://www.webmproject.org/license/additional/
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// Image transforms and color space conversion methods for lossless decoder.
@@ -11,170 +13,16 @@
// Jyrki Alakuijala (jyrki@google.com)
// Urvang Joshi (urvang@google.com)
-#if defined(__cplusplus) || defined(c_plusplus)
-extern "C" {
-#endif
+#include "./dsp.h"
#include <math.h>
#include <stdlib.h>
-#include "./lossless.h"
#include "../dec/vp8li.h"
-#include "../dsp/yuv.h"
-#include "../dsp/dsp.h"
-#include "../enc/histogram.h"
+#include "../utils/endian_inl.h"
+#include "./lossless.h"
#define MAX_DIFF_COST (1e30f)
-// lookup table for small values of log2(int)
-#define APPROX_LOG_MAX 4096
-#define LOG_2_RECIPROCAL 1.44269504088896338700465094007086
-#define LOG_LOOKUP_IDX_MAX 256
-static const float kLog2Table[LOG_LOOKUP_IDX_MAX] = {
- 0.0000000000000000f, 0.0000000000000000f,
- 1.0000000000000000f, 1.5849625007211560f,
- 2.0000000000000000f, 2.3219280948873621f,
- 2.5849625007211560f, 2.8073549220576041f,
- 3.0000000000000000f, 3.1699250014423121f,
- 3.3219280948873621f, 3.4594316186372973f,
- 3.5849625007211560f, 3.7004397181410921f,
- 3.8073549220576041f, 3.9068905956085187f,
- 4.0000000000000000f, 4.0874628412503390f,
- 4.1699250014423121f, 4.2479275134435852f,
- 4.3219280948873626f, 4.3923174227787606f,
- 4.4594316186372973f, 4.5235619560570130f,
- 4.5849625007211560f, 4.6438561897747243f,
- 4.7004397181410917f, 4.7548875021634682f,
- 4.8073549220576037f, 4.8579809951275718f,
- 4.9068905956085187f, 4.9541963103868749f,
- 5.0000000000000000f, 5.0443941193584533f,
- 5.0874628412503390f, 5.1292830169449663f,
- 5.1699250014423121f, 5.2094533656289501f,
- 5.2479275134435852f, 5.2854022188622487f,
- 5.3219280948873626f, 5.3575520046180837f,
- 5.3923174227787606f, 5.4262647547020979f,
- 5.4594316186372973f, 5.4918530963296747f,
- 5.5235619560570130f, 5.5545888516776376f,
- 5.5849625007211560f, 5.6147098441152083f,
- 5.6438561897747243f, 5.6724253419714951f,
- 5.7004397181410917f, 5.7279204545631987f,
- 5.7548875021634682f, 5.7813597135246599f,
- 5.8073549220576037f, 5.8328900141647412f,
- 5.8579809951275718f, 5.8826430493618415f,
- 5.9068905956085187f, 5.9307373375628866f,
- 5.9541963103868749f, 5.9772799234999167f,
- 6.0000000000000000f, 6.0223678130284543f,
- 6.0443941193584533f, 6.0660891904577720f,
- 6.0874628412503390f, 6.1085244567781691f,
- 6.1292830169449663f, 6.1497471195046822f,
- 6.1699250014423121f, 6.1898245588800175f,
- 6.2094533656289501f, 6.2288186904958804f,
- 6.2479275134435852f, 6.2667865406949010f,
- 6.2854022188622487f, 6.3037807481771030f,
- 6.3219280948873626f, 6.3398500028846243f,
- 6.3575520046180837f, 6.3750394313469245f,
- 6.3923174227787606f, 6.4093909361377017f,
- 6.4262647547020979f, 6.4429434958487279f,
- 6.4594316186372973f, 6.4757334309663976f,
- 6.4918530963296747f, 6.5077946401986963f,
- 6.5235619560570130f, 6.5391588111080309f,
- 6.5545888516776376f, 6.5698556083309478f,
- 6.5849625007211560f, 6.5999128421871278f,
- 6.6147098441152083f, 6.6293566200796094f,
- 6.6438561897747243f, 6.6582114827517946f,
- 6.6724253419714951f, 6.6865005271832185f,
- 6.7004397181410917f, 6.7142455176661224f,
- 6.7279204545631987f, 6.7414669864011464f,
- 6.7548875021634682f, 6.7681843247769259f,
- 6.7813597135246599f, 6.7944158663501061f,
- 6.8073549220576037f, 6.8201789624151878f,
- 6.8328900141647412f, 6.8454900509443747f,
- 6.8579809951275718f, 6.8703647195834047f,
- 6.8826430493618415f, 6.8948177633079437f,
- 6.9068905956085187f, 6.9188632372745946f,
- 6.9307373375628866f, 6.9425145053392398f,
- 6.9541963103868749f, 6.9657842846620869f,
- 6.9772799234999167f, 6.9886846867721654f,
- 7.0000000000000000f, 7.0112272554232539f,
- 7.0223678130284543f, 7.0334230015374501f,
- 7.0443941193584533f, 7.0552824355011898f,
- 7.0660891904577720f, 7.0768155970508308f,
- 7.0874628412503390f, 7.0980320829605263f,
- 7.1085244567781691f, 7.1189410727235076f,
- 7.1292830169449663f, 7.1395513523987936f,
- 7.1497471195046822f, 7.1598713367783890f,
- 7.1699250014423121f, 7.1799090900149344f,
- 7.1898245588800175f, 7.1996723448363644f,
- 7.2094533656289501f, 7.2191685204621611f,
- 7.2288186904958804f, 7.2384047393250785f,
- 7.2479275134435852f, 7.2573878426926521f,
- 7.2667865406949010f, 7.2761244052742375f,
- 7.2854022188622487f, 7.2946207488916270f,
- 7.3037807481771030f, 7.3128829552843557f,
- 7.3219280948873626f, 7.3309168781146167f,
- 7.3398500028846243f, 7.3487281542310771f,
- 7.3575520046180837f, 7.3663222142458160f,
- 7.3750394313469245f, 7.3837042924740519f,
- 7.3923174227787606f, 7.4008794362821843f,
- 7.4093909361377017f, 7.4178525148858982f,
- 7.4262647547020979f, 7.4346282276367245f,
- 7.4429434958487279f, 7.4512111118323289f,
- 7.4594316186372973f, 7.4676055500829976f,
- 7.4757334309663976f, 7.4838157772642563f,
- 7.4918530963296747f, 7.4998458870832056f,
- 7.5077946401986963f, 7.5156998382840427f,
- 7.5235619560570130f, 7.5313814605163118f,
- 7.5391588111080309f, 7.5468944598876364f,
- 7.5545888516776376f, 7.5622424242210728f,
- 7.5698556083309478f, 7.5774288280357486f,
- 7.5849625007211560f, 7.5924570372680806f,
- 7.5999128421871278f, 7.6073303137496104f,
- 7.6147098441152083f, 7.6220518194563764f,
- 7.6293566200796094f, 7.6366246205436487f,
- 7.6438561897747243f, 7.6510516911789281f,
- 7.6582114827517946f, 7.6653359171851764f,
- 7.6724253419714951f, 7.6794800995054464f,
- 7.6865005271832185f, 7.6934869574993252f,
- 7.7004397181410917f, 7.7073591320808825f,
- 7.7142455176661224f, 7.7210991887071855f,
- 7.7279204545631987f, 7.7347096202258383f,
- 7.7414669864011464f, 7.7481928495894605f,
- 7.7548875021634682f, 7.7615512324444795f,
- 7.7681843247769259f, 7.7747870596011736f,
- 7.7813597135246599f, 7.7879025593914317f,
- 7.7944158663501061f, 7.8008998999203047f,
- 7.8073549220576037f, 7.8137811912170374f,
- 7.8201789624151878f, 7.8265484872909150f,
- 7.8328900141647412f, 7.8392037880969436f,
- 7.8454900509443747f, 7.8517490414160571f,
- 7.8579809951275718f, 7.8641861446542797f,
- 7.8703647195834047f, 7.8765169465649993f,
- 7.8826430493618415f, 7.8887432488982591f,
- 7.8948177633079437f, 7.9008668079807486f,
- 7.9068905956085187f, 7.9128893362299619f,
- 7.9188632372745946f, 7.9248125036057812f,
- 7.9307373375628866f, 7.9366379390025709f,
- 7.9425145053392398f, 7.9483672315846778f,
- 7.9541963103868749f, 7.9600019320680805f,
- 7.9657842846620869f, 7.9715435539507719f,
- 7.9772799234999167f, 7.9829935746943103f,
- 7.9886846867721654f, 7.9943534368588577f
-};
-
-float VP8LFastLog2(int v) {
- if (v < LOG_LOOKUP_IDX_MAX) {
- return kLog2Table[v];
- } else if (v < APPROX_LOG_MAX) {
- int log_cnt = 0;
- while (v >= LOG_LOOKUP_IDX_MAX) {
- ++log_cnt;
- v = v >> 1;
- }
- return kLog2Table[v] + (float)log_cnt;
- } else {
- return (float)(LOG_2_RECIPROCAL * log((double)v));
- }
-}
-
//------------------------------------------------------------------------------
// Image transforms.
@@ -186,7 +34,7 @@ static WEBP_INLINE void AddPixelsEq(uint32_t* a, uint32_t b) {
}
static WEBP_INLINE uint32_t Average2(uint32_t a0, uint32_t a1) {
- return (((a0 ^ a1) & 0xfefefefeL) >> 1) + (a0 & a1);
+ return (((a0 ^ a1) & 0xfefefefeu) >> 1) + (a0 & a1);
}
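Average2() above averages two packed ARGB pixels channel by channel without letting a carry spill into the neighbouring byte: per byte, avg(a, b) = (a & b) + ((a ^ b) >> 1), and the 0xfefefefe mask clears the low bit of every byte of (a0 ^ a1) before the shift so nothing leaks across a channel boundary. Worked example for one channel, a = 0xff and b = 0x01: (a & b) = 0x01, ((a ^ b) & 0xfe) >> 1 = 0x7f, and their sum 0x80 equals (0xff + 0x01) / 2.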
static WEBP_INLINE uint32_t Average3(uint32_t a0, uint32_t a1, uint32_t a2) {
@@ -221,7 +69,7 @@ static WEBP_INLINE uint32_t ClampedAddSubtractFull(uint32_t c0, uint32_t c1,
(c1 >> 8) & 0xff,
(c2 >> 8) & 0xff);
const int b = AddSubtractComponentFull(c0 & 0xff, c1 & 0xff, c2 & 0xff);
- return (a << 24) | (r << 16) | (g << 8) | b;
+ return ((uint32_t)a << 24) | (r << 16) | (g << 8) | b;
}
static WEBP_INLINE int AddSubtractComponentHalf(int a, int b) {
@@ -235,22 +83,30 @@ static WEBP_INLINE uint32_t ClampedAddSubtractHalf(uint32_t c0, uint32_t c1,
const int r = AddSubtractComponentHalf((ave >> 16) & 0xff, (c2 >> 16) & 0xff);
const int g = AddSubtractComponentHalf((ave >> 8) & 0xff, (c2 >> 8) & 0xff);
const int b = AddSubtractComponentHalf((ave >> 0) & 0xff, (c2 >> 0) & 0xff);
- return (a << 24) | (r << 16) | (g << 8) | b;
+ return ((uint32_t)a << 24) | (r << 16) | (g << 8) | b;
}
-static WEBP_INLINE int Sub3(int a, int b, int c) {
- const int pa = b - c;
- const int pb = a - c;
- return abs(pa) - abs(pb);
+// gcc-4.9 on ARM generates incorrect code in Select() when Sub3() is inlined.
+#if defined(__arm__) && LOCAL_GCC_VERSION == 0x409
+# define LOCAL_INLINE __attribute__ ((noinline))
+#else
+# define LOCAL_INLINE WEBP_INLINE
+#endif
+
+static LOCAL_INLINE int Sub3(int a, int b, int c) {
+ const int pb = b - c;
+ const int pa = a - c;
+ return abs(pb) - abs(pa);
}
+#undef LOCAL_INLINE
+
static WEBP_INLINE uint32_t Select(uint32_t a, uint32_t b, uint32_t c) {
const int pa_minus_pb =
Sub3((a >> 24) , (b >> 24) , (c >> 24) ) +
Sub3((a >> 16) & 0xff, (b >> 16) & 0xff, (c >> 16) & 0xff) +
Sub3((a >> 8) & 0xff, (b >> 8) & 0xff, (c >> 8) & 0xff) +
Sub3((a ) & 0xff, (b ) & 0xff, (c ) & 0xff);
-
return (pa_minus_pb <= 0) ? a : b;
}
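Select() above picks whichever of a and b is closer, summed over the four 8-bit channels, to the gradient estimate a + b - c. For a single channel, |a - (a + b - c)| = |b - c| and |b - (a + b - c)| = |a - c|, so Sub3(a, b, c) = |b - c| - |a - c| is exactly dist(a) - dist(b); returning a when the channel-summed difference is <= 0 therefore chooses the neighbour the gradient predicts best. The pa/pb renaming keeps the returned value identical to the old code; the functional change is only the noinline attribute that dodges the gcc 4.9 ARM miscompilation noted above.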
@@ -317,208 +173,7 @@ static uint32_t Predictor13(uint32_t left, const uint32_t* const top) {
return pred;
}
-typedef uint32_t (*PredictorFunc)(uint32_t left, const uint32_t* const top);
-static const PredictorFunc kPredictors[16] = {
- Predictor0, Predictor1, Predictor2, Predictor3,
- Predictor4, Predictor5, Predictor6, Predictor7,
- Predictor8, Predictor9, Predictor10, Predictor11,
- Predictor12, Predictor13,
- Predictor0, Predictor0 // <- padding security sentinels
-};
-
-// TODO(vikasa): Replace 256 etc with defines.
-static float PredictionCostSpatial(const int* counts,
- int weight_0, double exp_val) {
- const int significant_symbols = 16;
- const double exp_decay_factor = 0.6;
- double bits = weight_0 * counts[0];
- int i;
- for (i = 1; i < significant_symbols; ++i) {
- bits += exp_val * (counts[i] + counts[256 - i]);
- exp_val *= exp_decay_factor;
- }
- return (float)(-0.1 * bits);
-}
-
-// Compute Shannon's entropy: Sum(p*log2(p))
-static float ShannonEntropy(const int* const array, int n) {
- int i;
- float retval = 0.f;
- int sum = 0;
- for (i = 0; i < n; ++i) {
- if (array[i] != 0) {
- sum += array[i];
- retval -= VP8LFastSLog2(array[i]);
- }
- }
- retval += VP8LFastSLog2(sum);
- return retval;
-}
-
-static float PredictionCostSpatialHistogram(int accumulated[4][256],
- int tile[4][256]) {
- int i;
- int k;
- int combo[256];
- double retval = 0;
- for (i = 0; i < 4; ++i) {
- const double exp_val = 0.94;
- retval += PredictionCostSpatial(&tile[i][0], 1, exp_val);
- retval += ShannonEntropy(&tile[i][0], 256);
- for (k = 0; k < 256; ++k) {
- combo[k] = accumulated[i][k] + tile[i][k];
- }
- retval += ShannonEntropy(&combo[0], 256);
- }
- return (float)retval;
-}
-
-static int GetBestPredictorForTile(int width, int height,
- int tile_x, int tile_y, int bits,
- int accumulated[4][256],
- const uint32_t* const argb_scratch) {
- const int kNumPredModes = 14;
- const int col_start = tile_x << bits;
- const int row_start = tile_y << bits;
- const int tile_size = 1 << bits;
- const int ymax = (tile_size <= height - row_start) ?
- tile_size : height - row_start;
- const int xmax = (tile_size <= width - col_start) ?
- tile_size : width - col_start;
- int histo[4][256];
- float best_diff = MAX_DIFF_COST;
- int best_mode = 0;
-
- int mode;
- for (mode = 0; mode < kNumPredModes; ++mode) {
- const uint32_t* current_row = argb_scratch;
- const PredictorFunc pred_func = kPredictors[mode];
- float cur_diff;
- int y;
- memset(&histo[0][0], 0, sizeof(histo));
- for (y = 0; y < ymax; ++y) {
- int x;
- const int row = row_start + y;
- const uint32_t* const upper_row = current_row;
- current_row = upper_row + width;
- for (x = 0; x < xmax; ++x) {
- const int col = col_start + x;
- uint32_t predict;
- uint32_t predict_diff;
- if (row == 0) {
- predict = (col == 0) ? ARGB_BLACK : current_row[col - 1]; // Left.
- } else if (col == 0) {
- predict = upper_row[col]; // Top.
- } else {
- predict = pred_func(current_row[col - 1], upper_row + col);
- }
- predict_diff = VP8LSubPixels(current_row[col], predict);
- ++histo[0][predict_diff >> 24];
- ++histo[1][((predict_diff >> 16) & 0xff)];
- ++histo[2][((predict_diff >> 8) & 0xff)];
- ++histo[3][(predict_diff & 0xff)];
- }
- }
- cur_diff = PredictionCostSpatialHistogram(accumulated, histo);
- if (cur_diff < best_diff) {
- best_diff = cur_diff;
- best_mode = mode;
- }
- }
-
- return best_mode;
-}
-
-static void CopyTileWithPrediction(int width, int height,
- int tile_x, int tile_y, int bits, int mode,
- const uint32_t* const argb_scratch,
- uint32_t* const argb) {
- const int col_start = tile_x << bits;
- const int row_start = tile_y << bits;
- const int tile_size = 1 << bits;
- const int ymax = (tile_size <= height - row_start) ?
- tile_size : height - row_start;
- const int xmax = (tile_size <= width - col_start) ?
- tile_size : width - col_start;
- const PredictorFunc pred_func = kPredictors[mode];
- const uint32_t* current_row = argb_scratch;
-
- int y;
- for (y = 0; y < ymax; ++y) {
- int x;
- const int row = row_start + y;
- const uint32_t* const upper_row = current_row;
- current_row = upper_row + width;
- for (x = 0; x < xmax; ++x) {
- const int col = col_start + x;
- const int pix = row * width + col;
- uint32_t predict;
- if (row == 0) {
- predict = (col == 0) ? ARGB_BLACK : current_row[col - 1]; // Left.
- } else if (col == 0) {
- predict = upper_row[col]; // Top.
- } else {
- predict = pred_func(current_row[col - 1], upper_row + col);
- }
- argb[pix] = VP8LSubPixels(current_row[col], predict);
- }
- }
-}
-
-void VP8LResidualImage(int width, int height, int bits,
- uint32_t* const argb, uint32_t* const argb_scratch,
- uint32_t* const image) {
- const int max_tile_size = 1 << bits;
- const int tiles_per_row = VP8LSubSampleSize(width, bits);
- const int tiles_per_col = VP8LSubSampleSize(height, bits);
- uint32_t* const upper_row = argb_scratch;
- uint32_t* const current_tile_rows = argb_scratch + width;
- int tile_y;
- int histo[4][256];
- memset(histo, 0, sizeof(histo));
- for (tile_y = 0; tile_y < tiles_per_col; ++tile_y) {
- const int tile_y_offset = tile_y * max_tile_size;
- const int this_tile_height =
- (tile_y < tiles_per_col - 1) ? max_tile_size : height - tile_y_offset;
- int tile_x;
- if (tile_y > 0) {
- memcpy(upper_row, current_tile_rows + (max_tile_size - 1) * width,
- width * sizeof(*upper_row));
- }
- memcpy(current_tile_rows, &argb[tile_y_offset * width],
- this_tile_height * width * sizeof(*current_tile_rows));
- for (tile_x = 0; tile_x < tiles_per_row; ++tile_x) {
- int pred;
- int y;
- const int tile_x_offset = tile_x * max_tile_size;
- int all_x_max = tile_x_offset + max_tile_size;
- if (all_x_max > width) {
- all_x_max = width;
- }
- pred = GetBestPredictorForTile(width, height, tile_x, tile_y, bits, histo,
- argb_scratch);
- image[tile_y * tiles_per_row + tile_x] = 0xff000000u | (pred << 8);
- CopyTileWithPrediction(width, height, tile_x, tile_y, bits, pred,
- argb_scratch, argb);
- for (y = 0; y < max_tile_size; ++y) {
- int ix;
- int all_x;
- int all_y = tile_y_offset + y;
- if (all_y >= height) {
- break;
- }
- ix = all_y * width + tile_x_offset;
- for (all_x = tile_x_offset; all_x < all_x_max; ++all_x, ++ix) {
- const uint32_t a = argb[ix];
- ++histo[0][a >> 24];
- ++histo[1][((a >> 16) & 0xff)];
- ++histo[2][((a >> 8) & 0xff)];
- ++histo[3][(a & 0xff)];
- }
- }
- }
- }
-}
+//------------------------------------------------------------------------------
// Inverse prediction.
static void PredictorInverseTransform(const VP8LTransform* const transform,
@@ -538,29 +193,36 @@ static void PredictorInverseTransform(const VP8LTransform* const transform,
{
int y = y_start;
- const int mask = (1 << transform->bits_) - 1;
+ const int tile_width = 1 << transform->bits_;
+ const int mask = tile_width - 1;
+ const int safe_width = width & ~mask;
const int tiles_per_row = VP8LSubSampleSize(width, transform->bits_);
const uint32_t* pred_mode_base =
transform->data_ + (y >> transform->bits_) * tiles_per_row;
while (y < y_end) {
- int x;
const uint32_t pred2 = Predictor2(data[-1], data - width);
const uint32_t* pred_mode_src = pred_mode_base;
- PredictorFunc pred_func;
-
+ VP8LPredictorFunc pred_func;
+ int x = 1;
+ int t = 1;
// First pixel follows the T (mode=2) mode.
AddPixelsEq(data, pred2);
-
// .. the rest:
- pred_func = kPredictors[((*pred_mode_src++) >> 8) & 0xf];
- for (x = 1; x < width; ++x) {
- uint32_t pred;
- if ((x & mask) == 0) { // start of tile. Read predictor function.
- pred_func = kPredictors[((*pred_mode_src++) >> 8) & 0xf];
+ while (x < safe_width) {
+ pred_func = VP8LPredictors[((*pred_mode_src++) >> 8) & 0xf];
+ for (; t < tile_width; ++t, ++x) {
+ const uint32_t pred = pred_func(data[x - 1], data + x - width);
+ AddPixelsEq(data + x, pred);
+ }
+ t = 0;
+ }
+ if (x < width) {
+ pred_func = VP8LPredictors[((*pred_mode_src++) >> 8) & 0xf];
+ for (; x < width; ++x) {
+ const uint32_t pred = pred_func(data[x - 1], data + x - width);
+ AddPixelsEq(data + x, pred);
}
- pred = pred_func(data[x - 1], data + x - width);
- AddPixelsEq(data + x, pred);
}
data += width;
++y;
@@ -571,326 +233,47 @@ static void PredictorInverseTransform(const VP8LTransform* const transform,
}
}
-void VP8LSubtractGreenFromBlueAndRed(uint32_t* argb_data, int num_pixs) {
- int i;
- for (i = 0; i < num_pixs; ++i) {
- const uint32_t argb = argb_data[i];
- const uint32_t green = (argb >> 8) & 0xff;
- const uint32_t new_r = (((argb >> 16) & 0xff) - green) & 0xff;
- const uint32_t new_b = ((argb & 0xff) - green) & 0xff;
- argb_data[i] = (argb & 0xff00ff00) | (new_r << 16) | new_b;
- }
-}
-
// Add green to blue and red channels (i.e. perform the inverse transform of
// 'subtract green').
-static void AddGreenToBlueAndRed(const VP8LTransform* const transform,
- int y_start, int y_end, uint32_t* data) {
- const int width = transform->xsize_;
- const uint32_t* const data_end = data + (y_end - y_start) * width;
- while (data < data_end) {
- const uint32_t argb = *data;
- // "* 0001001u" is equivalent to "(green << 16) + green)"
+void VP8LAddGreenToBlueAndRed_C(uint32_t* data, int num_pixels) {
+ int i;
+ for (i = 0; i < num_pixels; ++i) {
+ const uint32_t argb = data[i];
const uint32_t green = ((argb >> 8) & 0xff);
uint32_t red_blue = (argb & 0x00ff00ffu);
red_blue += (green << 16) | green;
red_blue &= 0x00ff00ffu;
- *data++ = (argb & 0xff00ff00u) | red_blue;
+ data[i] = (argb & 0xff00ff00u) | red_blue;
}
}
-typedef struct {
- // Note: the members are uint8_t, so that any negative values are
- // automatically converted to "mod 256" values.
- uint8_t green_to_red_;
- uint8_t green_to_blue_;
- uint8_t red_to_blue_;
-} Multipliers;
-
-static WEBP_INLINE void MultipliersClear(Multipliers* m) {
- m->green_to_red_ = 0;
- m->green_to_blue_ = 0;
- m->red_to_blue_ = 0;
-}
-
static WEBP_INLINE uint32_t ColorTransformDelta(int8_t color_pred,
int8_t color) {
return (uint32_t)((int)(color_pred) * color) >> 5;
}
static WEBP_INLINE void ColorCodeToMultipliers(uint32_t color_code,
- Multipliers* const m) {
+ VP8LMultipliers* const m) {
m->green_to_red_ = (color_code >> 0) & 0xff;
m->green_to_blue_ = (color_code >> 8) & 0xff;
m->red_to_blue_ = (color_code >> 16) & 0xff;
}
-static WEBP_INLINE uint32_t MultipliersToColorCode(Multipliers* const m) {
- return 0xff000000u |
- ((uint32_t)(m->red_to_blue_) << 16) |
- ((uint32_t)(m->green_to_blue_) << 8) |
- m->green_to_red_;
-}
-
-static WEBP_INLINE uint32_t TransformColor(const Multipliers* const m,
- uint32_t argb, int inverse) {
- const uint32_t green = argb >> 8;
- const uint32_t red = argb >> 16;
- uint32_t new_red = red;
- uint32_t new_blue = argb;
-
- if (inverse) {
+void VP8LTransformColorInverse_C(const VP8LMultipliers* const m, uint32_t* data,
+ int num_pixels) {
+ int i;
+ for (i = 0; i < num_pixels; ++i) {
+ const uint32_t argb = data[i];
+ const uint32_t green = argb >> 8;
+ const uint32_t red = argb >> 16;
+ uint32_t new_red = red;
+ uint32_t new_blue = argb;
new_red += ColorTransformDelta(m->green_to_red_, green);
new_red &= 0xff;
new_blue += ColorTransformDelta(m->green_to_blue_, green);
new_blue += ColorTransformDelta(m->red_to_blue_, new_red);
new_blue &= 0xff;
- } else {
- new_red -= ColorTransformDelta(m->green_to_red_, green);
- new_red &= 0xff;
- new_blue -= ColorTransformDelta(m->green_to_blue_, green);
- new_blue -= ColorTransformDelta(m->red_to_blue_, red);
- new_blue &= 0xff;
- }
- return (argb & 0xff00ff00u) | (new_red << 16) | (new_blue);
-}
-
-static WEBP_INLINE int SkipRepeatedPixels(const uint32_t* const argb,
- int ix, int xsize) {
- const uint32_t v = argb[ix];
- if (ix >= xsize + 3) {
- if (v == argb[ix - xsize] &&
- argb[ix - 1] == argb[ix - xsize - 1] &&
- argb[ix - 2] == argb[ix - xsize - 2] &&
- argb[ix - 3] == argb[ix - xsize - 3]) {
- return 1;
- }
- return v == argb[ix - 3] && v == argb[ix - 2] && v == argb[ix - 1];
- } else if (ix >= 3) {
- return v == argb[ix - 3] && v == argb[ix - 2] && v == argb[ix - 1];
- }
- return 0;
-}
-
-static float PredictionCostCrossColor(const int accumulated[256],
- const int counts[256]) {
- // Favor low entropy, locally and globally.
- int i;
- int combo[256];
- for (i = 0; i < 256; ++i) {
- combo[i] = accumulated[i] + counts[i];
- }
- return ShannonEntropy(combo, 256) +
- ShannonEntropy(counts, 256) +
- PredictionCostSpatial(counts, 3, 2.4); // Favor small absolute values.
-}
-
-static Multipliers GetBestColorTransformForTile(
- int tile_x, int tile_y, int bits,
- Multipliers prevX,
- Multipliers prevY,
- int step, int xsize, int ysize,
- int* accumulated_red_histo,
- int* accumulated_blue_histo,
- const uint32_t* const argb) {
- float best_diff = MAX_DIFF_COST;
- float cur_diff;
- const int halfstep = step / 2;
- const int max_tile_size = 1 << bits;
- const int tile_y_offset = tile_y * max_tile_size;
- const int tile_x_offset = tile_x * max_tile_size;
- int green_to_red;
- int green_to_blue;
- int red_to_blue;
- int all_x_max = tile_x_offset + max_tile_size;
- int all_y_max = tile_y_offset + max_tile_size;
- Multipliers best_tx;
- MultipliersClear(&best_tx);
- if (all_x_max > xsize) {
- all_x_max = xsize;
- }
- if (all_y_max > ysize) {
- all_y_max = ysize;
- }
- for (green_to_red = -64; green_to_red <= 64; green_to_red += halfstep) {
- int histo[256] = { 0 };
- int all_y;
- Multipliers tx;
- MultipliersClear(&tx);
- tx.green_to_red_ = green_to_red & 0xff;
-
- for (all_y = tile_y_offset; all_y < all_y_max; ++all_y) {
- uint32_t predict;
- int ix = all_y * xsize + tile_x_offset;
- int all_x;
- for (all_x = tile_x_offset; all_x < all_x_max; ++all_x, ++ix) {
- if (SkipRepeatedPixels(argb, ix, xsize)) {
- continue;
- }
- predict = TransformColor(&tx, argb[ix], 0);
- ++histo[(predict >> 16) & 0xff]; // red.
- }
- }
- cur_diff = PredictionCostCrossColor(&accumulated_red_histo[0], &histo[0]);
- if (tx.green_to_red_ == prevX.green_to_red_) {
- cur_diff -= 3; // favor keeping the areas locally similar
- }
- if (tx.green_to_red_ == prevY.green_to_red_) {
- cur_diff -= 3; // favor keeping the areas locally similar
- }
- if (tx.green_to_red_ == 0) {
- cur_diff -= 3;
- }
- if (cur_diff < best_diff) {
- best_diff = cur_diff;
- best_tx = tx;
- }
- }
- best_diff = MAX_DIFF_COST;
- green_to_red = best_tx.green_to_red_;
- for (green_to_blue = -32; green_to_blue <= 32; green_to_blue += step) {
- for (red_to_blue = -32; red_to_blue <= 32; red_to_blue += step) {
- int all_y;
- int histo[256] = { 0 };
- Multipliers tx;
- tx.green_to_red_ = green_to_red;
- tx.green_to_blue_ = green_to_blue;
- tx.red_to_blue_ = red_to_blue;
- for (all_y = tile_y_offset; all_y < all_y_max; ++all_y) {
- uint32_t predict;
- int all_x;
- int ix = all_y * xsize + tile_x_offset;
- for (all_x = tile_x_offset; all_x < all_x_max; ++all_x, ++ix) {
- if (SkipRepeatedPixels(argb, ix, xsize)) {
- continue;
- }
- predict = TransformColor(&tx, argb[ix], 0);
- ++histo[predict & 0xff]; // blue.
- }
- }
- cur_diff =
- PredictionCostCrossColor(&accumulated_blue_histo[0], &histo[0]);
- if (tx.green_to_blue_ == prevX.green_to_blue_) {
- cur_diff -= 3; // favor keeping the areas locally similar
- }
- if (tx.green_to_blue_ == prevY.green_to_blue_) {
- cur_diff -= 3; // favor keeping the areas locally similar
- }
- if (tx.red_to_blue_ == prevX.red_to_blue_) {
- cur_diff -= 3; // favor keeping the areas locally similar
- }
- if (tx.red_to_blue_ == prevY.red_to_blue_) {
- cur_diff -= 3; // favor keeping the areas locally similar
- }
- if (tx.green_to_blue_ == 0) {
- cur_diff -= 3;
- }
- if (tx.red_to_blue_ == 0) {
- cur_diff -= 3;
- }
- if (cur_diff < best_diff) {
- best_diff = cur_diff;
- best_tx = tx;
- }
- }
- }
- return best_tx;
-}
-
-static void CopyTileWithColorTransform(int xsize, int ysize,
- int tile_x, int tile_y, int bits,
- Multipliers color_transform,
- uint32_t* const argb) {
- int y;
- int xscan = 1 << bits;
- int yscan = 1 << bits;
- tile_x <<= bits;
- tile_y <<= bits;
- if (xscan > xsize - tile_x) {
- xscan = xsize - tile_x;
- }
- if (yscan > ysize - tile_y) {
- yscan = ysize - tile_y;
- }
- yscan += tile_y;
- for (y = tile_y; y < yscan; ++y) {
- int ix = y * xsize + tile_x;
- const int end_ix = ix + xscan;
- for (; ix < end_ix; ++ix) {
- argb[ix] = TransformColor(&color_transform, argb[ix], 0);
- }
- }
-}
-
-void VP8LColorSpaceTransform(int width, int height, int bits, int step,
- uint32_t* const argb, uint32_t* image) {
- const int max_tile_size = 1 << bits;
- int tile_xsize = VP8LSubSampleSize(width, bits);
- int tile_ysize = VP8LSubSampleSize(height, bits);
- int accumulated_red_histo[256] = { 0 };
- int accumulated_blue_histo[256] = { 0 };
- int tile_y;
- int tile_x;
- Multipliers prevX;
- Multipliers prevY;
- MultipliersClear(&prevY);
- MultipliersClear(&prevX);
- for (tile_y = 0; tile_y < tile_ysize; ++tile_y) {
- for (tile_x = 0; tile_x < tile_xsize; ++tile_x) {
- Multipliers color_transform;
- int all_x_max;
- int y;
- const int tile_y_offset = tile_y * max_tile_size;
- const int tile_x_offset = tile_x * max_tile_size;
- if (tile_y != 0) {
- ColorCodeToMultipliers(image[tile_y * tile_xsize + tile_x - 1], &prevX);
- ColorCodeToMultipliers(image[(tile_y - 1) * tile_xsize + tile_x],
- &prevY);
- } else if (tile_x != 0) {
- ColorCodeToMultipliers(image[tile_y * tile_xsize + tile_x - 1], &prevX);
- }
- color_transform =
- GetBestColorTransformForTile(tile_x, tile_y, bits,
- prevX, prevY,
- step, width, height,
- &accumulated_red_histo[0],
- &accumulated_blue_histo[0],
- argb);
- image[tile_y * tile_xsize + tile_x] =
- MultipliersToColorCode(&color_transform);
- CopyTileWithColorTransform(width, height, tile_x, tile_y, bits,
- color_transform, argb);
-
- // Gather accumulated histogram data.
- all_x_max = tile_x_offset + max_tile_size;
- if (all_x_max > width) {
- all_x_max = width;
- }
- for (y = 0; y < max_tile_size; ++y) {
- int ix;
- int all_x;
- int all_y = tile_y_offset + y;
- if (all_y >= height) {
- break;
- }
- ix = all_y * width + tile_x_offset;
- for (all_x = tile_x_offset; all_x < all_x_max; ++all_x, ++ix) {
- if (ix >= 2 &&
- argb[ix] == argb[ix - 2] &&
- argb[ix] == argb[ix - 1]) {
- continue; // repeated pixels are handled by backward references
- }
- if (ix >= width + 2 &&
- argb[ix - 2] == argb[ix - width - 2] &&
- argb[ix - 1] == argb[ix - width - 1] &&
- argb[ix] == argb[ix - width]) {
- continue; // repeated pixels are handled by backward references
- }
- ++accumulated_red_histo[(argb[ix] >> 16) & 0xff];
- ++accumulated_blue_histo[argb[ix] & 0xff];
- }
- }
- }
+ data[i] = (argb & 0xff00ff00u) | (new_red << 16) | (new_blue);
}
}
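ColorTransformDelta() treats the stored multiplier as a signed fixed-point value with 5 fractional bits: delta = ((int8_t)color_pred * (int8_t)color) >> 5, so each multiplier covers roughly -4.0..+4.0 in steps of 1/32. Worked example for the inverse transform above: with green_to_red_ = 16 (i.e. 0.5) and a green channel of 0x40, new_red gains 16 * 64 >> 5 = 32 before being masked back to 8 bits.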
@@ -898,7 +281,10 @@ void VP8LColorSpaceTransform(int width, int height, int bits, int step,
static void ColorSpaceInverseTransform(const VP8LTransform* const transform,
int y_start, int y_end, uint32_t* data) {
const int width = transform->xsize_;
- const int mask = (1 << transform->bits_) - 1;
+ const int tile_width = 1 << transform->bits_;
+ const int mask = tile_width - 1;
+ const int safe_width = width & ~mask;
+ const int remaining_width = width - safe_width;
const int tiles_per_row = VP8LSubSampleSize(width, transform->bits_);
int y = y_start;
const uint32_t* pred_row =
@@ -906,68 +292,89 @@ static void ColorSpaceInverseTransform(const VP8LTransform* const transform,
while (y < y_end) {
const uint32_t* pred = pred_row;
- Multipliers m = { 0, 0, 0 };
- int x;
-
- for (x = 0; x < width; ++x) {
- if ((x & mask) == 0) ColorCodeToMultipliers(*pred++, &m);
- data[x] = TransformColor(&m, data[x], 1);
+ VP8LMultipliers m = { 0, 0, 0 };
+ const uint32_t* const data_safe_end = data + safe_width;
+ const uint32_t* const data_end = data + width;
+ while (data < data_safe_end) {
+ ColorCodeToMultipliers(*pred++, &m);
+ VP8LTransformColorInverse(&m, data, tile_width);
+ data += tile_width;
+ }
+ if (data < data_end) { // Left-overs using C-version.
+ ColorCodeToMultipliers(*pred++, &m);
+ VP8LTransformColorInverse(&m, data, remaining_width);
+ data += remaining_width;
}
- data += width;
++y;
- if ((y & mask) == 0) pred_row += tiles_per_row;;
+ if ((y & mask) == 0) pred_row += tiles_per_row;
}
}
// Separate out pixels packed together using pixel-bundling.
-static void ColorIndexInverseTransform(
- const VP8LTransform* const transform,
- int y_start, int y_end, const uint32_t* src, uint32_t* dst) {
- int y;
- const int bits_per_pixel = 8 >> transform->bits_;
- const int width = transform->xsize_;
- const uint32_t* const color_map = transform->data_;
- if (bits_per_pixel < 8) {
- const int pixels_per_byte = 1 << transform->bits_;
- const int count_mask = pixels_per_byte - 1;
- const uint32_t bit_mask = (1 << bits_per_pixel) - 1;
- for (y = y_start; y < y_end; ++y) {
- uint32_t packed_pixels = 0;
- int x;
- for (x = 0; x < width; ++x) {
- // We need to load fresh 'packed_pixels' once every 'pixels_per_byte'
- // increments of x. Fortunately, pixels_per_byte is a power of 2, so
- // can just use a mask for that, instead of decrementing a counter.
- if ((x & count_mask) == 0) packed_pixels = ((*src++) >> 8) & 0xff;
- *dst++ = color_map[packed_pixels & bit_mask];
- packed_pixels >>= bits_per_pixel;
- }
- }
- } else {
- for (y = y_start; y < y_end; ++y) {
- int x;
- for (x = 0; x < width; ++x) {
- *dst++ = color_map[((*src++) >> 8) & 0xff];
- }
- }
- }
-}
+// We define two methods for ARGB data (uint32_t) and alpha-only data (uint8_t).
+#define COLOR_INDEX_INVERSE(FUNC_NAME, F_NAME, STATIC_DECL, TYPE, BIT_SUFFIX, \
+ GET_INDEX, GET_VALUE) \
+static void F_NAME(const TYPE* src, const uint32_t* const color_map, \
+ TYPE* dst, int y_start, int y_end, int width) { \
+ int y; \
+ for (y = y_start; y < y_end; ++y) { \
+ int x; \
+ for (x = 0; x < width; ++x) { \
+ *dst++ = GET_VALUE(color_map[GET_INDEX(*src++)]); \
+ } \
+ } \
+} \
+STATIC_DECL void FUNC_NAME(const VP8LTransform* const transform, \
+ int y_start, int y_end, const TYPE* src, \
+ TYPE* dst) { \
+ int y; \
+ const int bits_per_pixel = 8 >> transform->bits_; \
+ const int width = transform->xsize_; \
+ const uint32_t* const color_map = transform->data_; \
+ if (bits_per_pixel < 8) { \
+ const int pixels_per_byte = 1 << transform->bits_; \
+ const int count_mask = pixels_per_byte - 1; \
+ const uint32_t bit_mask = (1 << bits_per_pixel) - 1; \
+ for (y = y_start; y < y_end; ++y) { \
+ uint32_t packed_pixels = 0; \
+ int x; \
+ for (x = 0; x < width; ++x) { \
+ /* We need to load fresh 'packed_pixels' once every */ \
+ /* 'pixels_per_byte' increments of x. Fortunately, pixels_per_byte */ \
+ /* is a power of 2, so can just use a mask for that, instead of */ \
+ /* decrementing a counter. */ \
+ if ((x & count_mask) == 0) packed_pixels = GET_INDEX(*src++); \
+ *dst++ = GET_VALUE(color_map[packed_pixels & bit_mask]); \
+ packed_pixels >>= bits_per_pixel; \
+ } \
+ } \
+ } else { \
+ VP8LMapColor##BIT_SUFFIX(src, color_map, dst, y_start, y_end, width); \
+ } \
+}
+
+COLOR_INDEX_INVERSE(ColorIndexInverseTransform, MapARGB, static, uint32_t, 32b,
+ VP8GetARGBIndex, VP8GetARGBValue)
+COLOR_INDEX_INVERSE(VP8LColorIndexInverseTransformAlpha, MapAlpha, , uint8_t,
+ 8b, VP8GetAlphaIndex, VP8GetAlphaValue)
+
+#undef COLOR_INDEX_INVERSE
void VP8LInverseTransform(const VP8LTransform* const transform,
int row_start, int row_end,
const uint32_t* const in, uint32_t* const out) {
+ const int width = transform->xsize_;
assert(row_start < row_end);
assert(row_end <= transform->ysize_);
switch (transform->type_) {
case SUBTRACT_GREEN:
- AddGreenToBlueAndRed(transform, row_start, row_end, out);
+ VP8LAddGreenToBlueAndRed(out, (row_end - row_start) * width);
break;
case PREDICTOR_TRANSFORM:
PredictorInverseTransform(transform, row_start, row_end, out);
if (row_end != transform->ysize_) {
// The last predicted row in this iteration will be the top-pred row
// for the first row in next iteration.
- const int width = transform->xsize_;
memcpy(out - width, out + (row_end - row_start - 1) * width,
width * sizeof(*out));
}
@@ -982,7 +389,7 @@ void VP8LInverseTransform(const VP8LTransform* const transform,
// Also, note that this is the only transform that applies on
// the effective width of VP8LSubSampleSize(xsize_, bits_). All other
// transforms work on effective width of xsize_.
- const int out_stride = (row_end - row_start) * transform->xsize_;
+ const int out_stride = (row_end - row_start) * width;
const int in_stride = (row_end - row_start) *
VP8LSubSampleSize(transform->xsize_, transform->bits_);
uint32_t* const src = out + out_stride - in_stride;
@@ -1006,8 +413,8 @@ static int is_big_endian(void) {
return (tmp.b[0] != 1);
}
-static void ConvertBGRAToRGB(const uint32_t* src,
- int num_pixels, uint8_t* dst) {
+void VP8LConvertBGRAToRGB_C(const uint32_t* src,
+ int num_pixels, uint8_t* dst) {
const uint32_t* const src_end = src + num_pixels;
while (src < src_end) {
const uint32_t argb = *src++;
@@ -1017,8 +424,8 @@ static void ConvertBGRAToRGB(const uint32_t* src,
}
}
-static void ConvertBGRAToRGBA(const uint32_t* src,
- int num_pixels, uint8_t* dst) {
+void VP8LConvertBGRAToRGBA_C(const uint32_t* src,
+ int num_pixels, uint8_t* dst) {
const uint32_t* const src_end = src + num_pixels;
while (src < src_end) {
const uint32_t argb = *src++;
@@ -1029,28 +436,42 @@ static void ConvertBGRAToRGBA(const uint32_t* src,
}
}
-static void ConvertBGRAToRGBA4444(const uint32_t* src,
- int num_pixels, uint8_t* dst) {
+void VP8LConvertBGRAToRGBA4444_C(const uint32_t* src,
+ int num_pixels, uint8_t* dst) {
const uint32_t* const src_end = src + num_pixels;
while (src < src_end) {
const uint32_t argb = *src++;
- *dst++ = ((argb >> 16) & 0xf0) | ((argb >> 12) & 0xf);
- *dst++ = ((argb >> 0) & 0xf0) | ((argb >> 28) & 0xf);
+ const uint8_t rg = ((argb >> 16) & 0xf0) | ((argb >> 12) & 0xf);
+ const uint8_t ba = ((argb >> 0) & 0xf0) | ((argb >> 28) & 0xf);
+#ifdef WEBP_SWAP_16BIT_CSP
+ *dst++ = ba;
+ *dst++ = rg;
+#else
+ *dst++ = rg;
+ *dst++ = ba;
+#endif
}
}
-static void ConvertBGRAToRGB565(const uint32_t* src,
- int num_pixels, uint8_t* dst) {
+void VP8LConvertBGRAToRGB565_C(const uint32_t* src,
+ int num_pixels, uint8_t* dst) {
const uint32_t* const src_end = src + num_pixels;
while (src < src_end) {
const uint32_t argb = *src++;
- *dst++ = ((argb >> 16) & 0xf8) | ((argb >> 13) & 0x7);
- *dst++ = ((argb >> 5) & 0xe0) | ((argb >> 3) & 0x1f);
+ const uint8_t rg = ((argb >> 16) & 0xf8) | ((argb >> 13) & 0x7);
+ const uint8_t gb = ((argb >> 5) & 0xe0) | ((argb >> 3) & 0x1f);
+#ifdef WEBP_SWAP_16BIT_CSP
+ *dst++ = gb;
+ *dst++ = rg;
+#else
+ *dst++ = rg;
+ *dst++ = gb;
+#endif
}
}
-static void ConvertBGRAToBGR(const uint32_t* src,
- int num_pixels, uint8_t* dst) {
+void VP8LConvertBGRAToBGR_C(const uint32_t* src,
+ int num_pixels, uint8_t* dst) {
const uint32_t* const src_end = src + num_pixels;
while (src < src_end) {
const uint32_t argb = *src++;
@@ -1065,21 +486,24 @@ static void CopyOrSwap(const uint32_t* src, int num_pixels, uint8_t* dst,
if (is_big_endian() == swap_on_big_endian) {
const uint32_t* const src_end = src + num_pixels;
while (src < src_end) {
- uint32_t argb = *src++;
-#if !defined(__BIG_ENDIAN__) && (defined(__i386__) || defined(__x86_64__))
- __asm__ volatile("bswap %0" : "=r"(argb) : "0"(argb));
- *(uint32_t*)dst = argb;
- dst += sizeof(argb);
-#elif !defined(__BIG_ENDIAN__) && defined(_MSC_VER)
- argb = _byteswap_ulong(argb);
- *(uint32_t*)dst = argb;
- dst += sizeof(argb);
-#else
- *dst++ = (argb >> 24) & 0xff;
- *dst++ = (argb >> 16) & 0xff;
- *dst++ = (argb >> 8) & 0xff;
- *dst++ = (argb >> 0) & 0xff;
+ const uint32_t argb = *src++;
+
+#if !defined(WORDS_BIGENDIAN)
+#if !defined(WEBP_REFERENCE_IMPLEMENTATION)
+ *(uint32_t*)dst = BSwap32(argb);
+#else // WEBP_REFERENCE_IMPLEMENTATION
+ dst[0] = (argb >> 24) & 0xff;
+ dst[1] = (argb >> 16) & 0xff;
+ dst[2] = (argb >> 8) & 0xff;
+ dst[3] = (argb >> 0) & 0xff;
#endif
+#else // WORDS_BIGENDIAN
+ dst[0] = (argb >> 0) & 0xff;
+ dst[1] = (argb >> 8) & 0xff;
+ dst[2] = (argb >> 16) & 0xff;
+ dst[3] = (argb >> 24) & 0xff;
+#endif
+ dst += sizeof(argb);
}
} else {
memcpy(dst, src, num_pixels * sizeof(*src));
@@ -1090,17 +514,17 @@ void VP8LConvertFromBGRA(const uint32_t* const in_data, int num_pixels,
WEBP_CSP_MODE out_colorspace, uint8_t* const rgba) {
switch (out_colorspace) {
case MODE_RGB:
- ConvertBGRAToRGB(in_data, num_pixels, rgba);
+ VP8LConvertBGRAToRGB(in_data, num_pixels, rgba);
break;
case MODE_RGBA:
- ConvertBGRAToRGBA(in_data, num_pixels, rgba);
+ VP8LConvertBGRAToRGBA(in_data, num_pixels, rgba);
break;
case MODE_rgbA:
- ConvertBGRAToRGBA(in_data, num_pixels, rgba);
+ VP8LConvertBGRAToRGBA(in_data, num_pixels, rgba);
WebPApplyAlphaMultiply(rgba, 0, num_pixels, 1, 0);
break;
case MODE_BGR:
- ConvertBGRAToBGR(in_data, num_pixels, rgba);
+ VP8LConvertBGRAToBGR(in_data, num_pixels, rgba);
break;
case MODE_BGRA:
CopyOrSwap(in_data, num_pixels, rgba, 1);
@@ -1117,14 +541,14 @@ void VP8LConvertFromBGRA(const uint32_t* const in_data, int num_pixels,
WebPApplyAlphaMultiply(rgba, 1, num_pixels, 1, 0);
break;
case MODE_RGBA_4444:
- ConvertBGRAToRGBA4444(in_data, num_pixels, rgba);
+ VP8LConvertBGRAToRGBA4444(in_data, num_pixels, rgba);
break;
case MODE_rgbA_4444:
- ConvertBGRAToRGBA4444(in_data, num_pixels, rgba);
+ VP8LConvertBGRAToRGBA4444(in_data, num_pixels, rgba);
WebPApplyAlphaMultiply4444(rgba, num_pixels, 1, 0);
break;
case MODE_RGB_565:
- ConvertBGRAToRGB565(in_data, num_pixels, rgba);
+ VP8LConvertBGRAToRGB565(in_data, num_pixels, rgba);
break;
default:
assert(0); // Code flow should not reach here.
@@ -1133,6 +557,79 @@ void VP8LConvertFromBGRA(const uint32_t* const in_data, int num_pixels,
//------------------------------------------------------------------------------
-#if defined(__cplusplus) || defined(c_plusplus)
-} // extern "C"
+VP8LProcessBlueAndRedFunc VP8LAddGreenToBlueAndRed;
+VP8LPredictorFunc VP8LPredictors[16];
+
+VP8LTransformColorFunc VP8LTransformColorInverse;
+
+VP8LConvertFunc VP8LConvertBGRAToRGB;
+VP8LConvertFunc VP8LConvertBGRAToRGBA;
+VP8LConvertFunc VP8LConvertBGRAToRGBA4444;
+VP8LConvertFunc VP8LConvertBGRAToRGB565;
+VP8LConvertFunc VP8LConvertBGRAToBGR;
+
+VP8LMapARGBFunc VP8LMapColor32b;
+VP8LMapAlphaFunc VP8LMapColor8b;
+
+extern void VP8LDspInitSSE2(void);
+extern void VP8LDspInitNEON(void);
+extern void VP8LDspInitMIPSdspR2(void);
+
+static volatile VP8CPUInfo lossless_last_cpuinfo_used =
+ (VP8CPUInfo)&lossless_last_cpuinfo_used;
+
+WEBP_TSAN_IGNORE_FUNCTION void VP8LDspInit(void) {
+ if (lossless_last_cpuinfo_used == VP8GetCPUInfo) return;
+
+ VP8LPredictors[0] = Predictor0;
+ VP8LPredictors[1] = Predictor1;
+ VP8LPredictors[2] = Predictor2;
+ VP8LPredictors[3] = Predictor3;
+ VP8LPredictors[4] = Predictor4;
+ VP8LPredictors[5] = Predictor5;
+ VP8LPredictors[6] = Predictor6;
+ VP8LPredictors[7] = Predictor7;
+ VP8LPredictors[8] = Predictor8;
+ VP8LPredictors[9] = Predictor9;
+ VP8LPredictors[10] = Predictor10;
+ VP8LPredictors[11] = Predictor11;
+ VP8LPredictors[12] = Predictor12;
+ VP8LPredictors[13] = Predictor13;
+ VP8LPredictors[14] = Predictor0; // <- padding security sentinels
+ VP8LPredictors[15] = Predictor0;
+
+ VP8LAddGreenToBlueAndRed = VP8LAddGreenToBlueAndRed_C;
+
+ VP8LTransformColorInverse = VP8LTransformColorInverse_C;
+
+ VP8LConvertBGRAToRGB = VP8LConvertBGRAToRGB_C;
+ VP8LConvertBGRAToRGBA = VP8LConvertBGRAToRGBA_C;
+ VP8LConvertBGRAToRGBA4444 = VP8LConvertBGRAToRGBA4444_C;
+ VP8LConvertBGRAToRGB565 = VP8LConvertBGRAToRGB565_C;
+ VP8LConvertBGRAToBGR = VP8LConvertBGRAToBGR_C;
+
+ VP8LMapColor32b = MapARGB;
+ VP8LMapColor8b = MapAlpha;
+
+ // If defined, use CPUInfo() to overwrite some pointers with faster versions.
+ if (VP8GetCPUInfo != NULL) {
+#if defined(WEBP_USE_SSE2)
+ if (VP8GetCPUInfo(kSSE2)) {
+ VP8LDspInitSSE2();
+ }
+#endif
+#if defined(WEBP_USE_NEON)
+ if (VP8GetCPUInfo(kNEON)) {
+ VP8LDspInitNEON();
+ }
+#endif
+#if defined(WEBP_USE_MIPS_DSP_R2)
+ if (VP8GetCPUInfo(kMIPSdspR2)) {
+ VP8LDspInitMIPSdspR2();
+ }
#endif
+ }
+ lossless_last_cpuinfo_used = VP8GetCPUInfo;
+}
+
+//------------------------------------------------------------------------------
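A minimal usage sketch of the dispatch table initialized above (buffer names are hypothetical; the declarations come from lossless.h below): VP8LDspInit() first installs the plain-C fallbacks, then lets VP8GetCPUInfo upgrade individual pointers to the SSE2/NEON/MIPSdspR2 variants, and repeated calls return early thanks to the lossless_last_cpuinfo_used guard.

VP8LDspInit();                                        // must run before any call below
VP8LConvertBGRAToRGBA(argb_in, num_pixels, rgba_out); // argb_in / rgba_out: caller-provided buffers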
diff --git a/drivers/webp/dsp/lossless.h b/drivers/webp/dsp/lossless.h
index 7c7d5555ed..ee6771333f 100644
--- a/drivers/webp/dsp/lossless.h
+++ b/drivers/webp/dsp/lossless.h
@@ -1,8 +1,10 @@
// Copyright 2012 Google Inc. All Rights Reserved.
//
-// This code is licensed under the same terms as WebM:
-// Software License Agreement: http://www.webmproject.org/license/software/
-// Additional IP Rights Grant: http://www.webmproject.org/license/additional/
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// Image transforms and color space conversion methods for lossless decoder.
@@ -13,15 +15,42 @@
#ifndef WEBP_DSP_LOSSLESS_H_
#define WEBP_DSP_LOSSLESS_H_
-#include "../types.h"
-#include "../decode.h"
+#include "../webp/types.h"
+#include "../webp/decode.h"
-#if defined(__cplusplus) || defined(c_plusplus)
+#include "../enc/histogram.h"
+#include "../utils/utils.h"
+
+#ifdef __cplusplus
extern "C" {
#endif
+#ifdef WEBP_EXPERIMENTAL_FEATURES
+#include "../enc/delta_palettization.h"
+#endif // WEBP_EXPERIMENTAL_FEATURES
+
+// Not a trivial literal symbol.
+#define VP8L_NON_TRIVIAL_SYM (0xffffffff)
+
//------------------------------------------------------------------------------
-// Image transforms.
+// Decoding
+
+typedef uint32_t (*VP8LPredictorFunc)(uint32_t left, const uint32_t* const top);
+extern VP8LPredictorFunc VP8LPredictors[16];
+
+typedef void (*VP8LProcessBlueAndRedFunc)(uint32_t* argb_data, int num_pixels);
+extern VP8LProcessBlueAndRedFunc VP8LAddGreenToBlueAndRed;
+
+typedef struct {
+ // Note: the members are uint8_t, so that any negative values are
+ // automatically converted to "mod 256" values.
+ uint8_t green_to_red_;
+ uint8_t green_to_blue_;
+ uint8_t red_to_blue_;
+} VP8LMultipliers;
+typedef void (*VP8LTransformColorFunc)(const VP8LMultipliers* const m,
+ uint32_t* argb_data, int num_pixels);
+extern VP8LTransformColorFunc VP8LTransformColorInverse;
struct VP8LTransform; // Defined in dec/vp8li.h.
@@ -33,23 +62,110 @@ void VP8LInverseTransform(const struct VP8LTransform* const transform,
int row_start, int row_end,
const uint32_t* const in, uint32_t* const out);
-// Subtracts green from blue and red channels.
-void VP8LSubtractGreenFromBlueAndRed(uint32_t* argb_data, int num_pixs);
-
-void VP8LResidualImage(int width, int height, int bits,
- uint32_t* const argb, uint32_t* const argb_scratch,
- uint32_t* const image);
-
-void VP8LColorSpaceTransform(int width, int height, int bits, int step,
- uint32_t* const argb, uint32_t* image);
-
-//------------------------------------------------------------------------------
// Color space conversion.
+typedef void (*VP8LConvertFunc)(const uint32_t* src, int num_pixels,
+ uint8_t* dst);
+extern VP8LConvertFunc VP8LConvertBGRAToRGB;
+extern VP8LConvertFunc VP8LConvertBGRAToRGBA;
+extern VP8LConvertFunc VP8LConvertBGRAToRGBA4444;
+extern VP8LConvertFunc VP8LConvertBGRAToRGB565;
+extern VP8LConvertFunc VP8LConvertBGRAToBGR;
// Converts from BGRA to other color spaces.
void VP8LConvertFromBGRA(const uint32_t* const in_data, int num_pixels,
WEBP_CSP_MODE out_colorspace, uint8_t* const rgba);
+// color mapping related functions.
+static WEBP_INLINE uint32_t VP8GetARGBIndex(uint32_t idx) {
+ return (idx >> 8) & 0xff;
+}
+
+static WEBP_INLINE uint8_t VP8GetAlphaIndex(uint8_t idx) {
+ return idx;
+}
+
+static WEBP_INLINE uint32_t VP8GetARGBValue(uint32_t val) {
+ return val;
+}
+
+static WEBP_INLINE uint8_t VP8GetAlphaValue(uint32_t val) {
+ return (val >> 8) & 0xff;
+}
+
+typedef void (*VP8LMapARGBFunc)(const uint32_t* src,
+ const uint32_t* const color_map,
+ uint32_t* dst, int y_start,
+ int y_end, int width);
+typedef void (*VP8LMapAlphaFunc)(const uint8_t* src,
+ const uint32_t* const color_map,
+ uint8_t* dst, int y_start,
+ int y_end, int width);
+
+extern VP8LMapARGBFunc VP8LMapColor32b;
+extern VP8LMapAlphaFunc VP8LMapColor8b;
+
+// Similar to the static method ColorIndexInverseTransform() that is part of
+// lossless.c, but used only for alpha decoding. It takes uint8_t (rather than
+// uint32_t) arguments for 'src' and 'dst'.
+void VP8LColorIndexInverseTransformAlpha(
+ const struct VP8LTransform* const transform, int y_start, int y_end,
+ const uint8_t* src, uint8_t* dst);
+
+// Expose some C-only fallback functions
+void VP8LTransformColorInverse_C(const VP8LMultipliers* const m,
+ uint32_t* data, int num_pixels);
+
+void VP8LConvertBGRAToRGB_C(const uint32_t* src, int num_pixels, uint8_t* dst);
+void VP8LConvertBGRAToRGBA_C(const uint32_t* src, int num_pixels, uint8_t* dst);
+void VP8LConvertBGRAToRGBA4444_C(const uint32_t* src,
+ int num_pixels, uint8_t* dst);
+void VP8LConvertBGRAToRGB565_C(const uint32_t* src,
+ int num_pixels, uint8_t* dst);
+void VP8LConvertBGRAToBGR_C(const uint32_t* src, int num_pixels, uint8_t* dst);
+void VP8LAddGreenToBlueAndRed_C(uint32_t* data, int num_pixels);
+
+// Must be called before calling any of the above methods.
+void VP8LDspInit(void);
+
+//------------------------------------------------------------------------------
+// Encoding
+
+extern VP8LProcessBlueAndRedFunc VP8LSubtractGreenFromBlueAndRed;
+extern VP8LTransformColorFunc VP8LTransformColor;
+typedef void (*VP8LCollectColorBlueTransformsFunc)(
+ const uint32_t* argb, int stride,
+ int tile_width, int tile_height,
+ int green_to_blue, int red_to_blue, int histo[]);
+extern VP8LCollectColorBlueTransformsFunc VP8LCollectColorBlueTransforms;
+
+typedef void (*VP8LCollectColorRedTransformsFunc)(
+ const uint32_t* argb, int stride,
+ int tile_width, int tile_height,
+ int green_to_red, int histo[]);
+extern VP8LCollectColorRedTransformsFunc VP8LCollectColorRedTransforms;
+
+// Expose some C-only fallback functions
+void VP8LTransformColor_C(const VP8LMultipliers* const m,
+ uint32_t* data, int num_pixels);
+void VP8LSubtractGreenFromBlueAndRed_C(uint32_t* argb_data, int num_pixels);
+void VP8LCollectColorRedTransforms_C(const uint32_t* argb, int stride,
+ int tile_width, int tile_height,
+ int green_to_red, int histo[]);
+void VP8LCollectColorBlueTransforms_C(const uint32_t* argb, int stride,
+ int tile_width, int tile_height,
+ int green_to_blue, int red_to_blue,
+ int histo[]);
+
+//------------------------------------------------------------------------------
+// Image transforms.
+
+void VP8LResidualImage(int width, int height, int bits, int low_effort,
+ uint32_t* const argb, uint32_t* const argb_scratch,
+ uint32_t* const image);
+
+void VP8LColorSpaceTransform(int width, int height, int bits, int quality,
+ uint32_t* const argb, uint32_t* image);
+
//------------------------------------------------------------------------------
// Misc methods.
@@ -59,10 +175,136 @@ static WEBP_INLINE uint32_t VP8LSubSampleSize(uint32_t size,
return (size + (1 << sampling_bits) - 1) >> sampling_bits;
}
-// Faster logarithm for integers, with the property of log2(0) == 0.
-float VP8LFastLog2(int v);
+// -----------------------------------------------------------------------------
+// Faster logarithm for integers. Small values use a look-up table.
+#define LOG_LOOKUP_IDX_MAX 256
+extern const float kLog2Table[LOG_LOOKUP_IDX_MAX];
+extern const float kSLog2Table[LOG_LOOKUP_IDX_MAX];
+typedef float (*VP8LFastLog2SlowFunc)(uint32_t v);
+
+extern VP8LFastLog2SlowFunc VP8LFastLog2Slow;
+extern VP8LFastLog2SlowFunc VP8LFastSLog2Slow;
+
+static WEBP_INLINE float VP8LFastLog2(uint32_t v) {
+ return (v < LOG_LOOKUP_IDX_MAX) ? kLog2Table[v] : VP8LFastLog2Slow(v);
+}
// Fast calculation of v * log2(v) for integer input.
-static WEBP_INLINE float VP8LFastSLog2(int v) { return VP8LFastLog2(v) * v; }
+static WEBP_INLINE float VP8LFastSLog2(uint32_t v) {
+ return (v < LOG_LOOKUP_IDX_MAX) ? kSLog2Table[v] : VP8LFastSLog2Slow(v);
+}
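The two inline helpers above only dispatch: values below LOG_LOOKUP_IDX_MAX are served from the 256-entry tables, everything else goes through the pluggable slow functions. A rough standalone sketch of that pattern (using libm in place of the real kLog2Table/VP8LFastLog2Slow, and keeping the log2(0) == 0 convention):

#include <math.h>
#include <stdint.h>
#include <stdio.h>

#define LOOKUP_MAX 256
static float lut[LOOKUP_MAX];

static void InitLut(void) {
  int i;
  lut[0] = 0.f;                                      // convention: log2(0) == 0
  for (i = 1; i < LOOKUP_MAX; ++i) lut[i] = (float)log2((double)i);
}

static float FastLog2Sketch(uint32_t v) {
  return (v < LOOKUP_MAX) ? lut[v]                   // cheap table hit for small symbol counts
                          : (float)log2((double)v);  // stand-in for the slow-path function
}

int main(void) {
  InitLut();
  printf("%f %f\n", FastLog2Sketch(255), FastLog2Sketch(100000));
  return 0;
}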
+
+// -----------------------------------------------------------------------------
+// Huffman-cost related functions.
+
+typedef double (*VP8LCostFunc)(const uint32_t* population, int length);
+typedef double (*VP8LCostCombinedFunc)(const uint32_t* X, const uint32_t* Y,
+ int length);
+
+extern VP8LCostFunc VP8LExtraCost;
+extern VP8LCostCombinedFunc VP8LExtraCostCombined;
+
+typedef struct { // small struct to hold counters
+  int counts[2];                   // index: 0=zero streak, 1=non-zero streak
+ int streaks[2][2]; // [zero/non-zero][streak<3 / streak>=3]
+} VP8LStreaks;
+
+typedef VP8LStreaks (*VP8LCostCountFunc)(const uint32_t* population,
+ int length);
+typedef VP8LStreaks (*VP8LCostCombinedCountFunc)(const uint32_t* X,
+ const uint32_t* Y, int length);
+
+extern VP8LCostCountFunc VP8LHuffmanCostCount;
+extern VP8LCostCombinedCountFunc VP8LHuffmanCostCombinedCount;
+
+// Get the symbol entropy for the distribution 'population'.
+// Set 'trivial_sym', if there's only one symbol present in the distribution.
+double VP8LPopulationCost(const uint32_t* const population, int length,
+ uint32_t* const trivial_sym);
+
+// Get the combined symbol entropy for the distributions 'X' and 'Y'.
+double VP8LGetCombinedEntropy(const uint32_t* const X,
+ const uint32_t* const Y, int length);
+
+double VP8LBitsEntropy(const uint32_t* const array, int n,
+ uint32_t* const trivial_symbol);
+
+// Estimate how many bits the combined entropy of literals and distance
+// approximately maps to.
+double VP8LHistogramEstimateBits(const VP8LHistogram* const p);
+
+// This function estimates the cost in bits excluding the bits needed to
+// represent the entropy code itself.
+double VP8LHistogramEstimateBitsBulk(const VP8LHistogram* const p);
+
+typedef void (*VP8LHistogramAddFunc)(const VP8LHistogram* const a,
+ const VP8LHistogram* const b,
+ VP8LHistogram* const out);
+extern VP8LHistogramAddFunc VP8LHistogramAdd;
+
+// -----------------------------------------------------------------------------
+// PrefixEncode()
+
+static WEBP_INLINE int VP8LBitsLog2Ceiling(uint32_t n) {
+ const int log_floor = BitsLog2Floor(n);
+ if (n == (n & ~(n - 1))) // zero or a power of two.
+ return log_floor;
+ else
+ return log_floor + 1;
+}
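VP8LBitsLog2Ceiling combines BitsLog2Floor() (from the ../utils/utils.h include above) with the classic power-of-two test n == (n & ~(n - 1)). A standalone check of the same logic, with a local loop standing in for the floor-log2 helper:

#include <assert.h>
#include <stdint.h>

static int Log2FloorSketch(uint32_t n) {       // stand-in for BitsLog2Floor()
  int bits = 0;
  while (n >>= 1) ++bits;
  return bits;
}

static int Log2CeilingSketch(uint32_t n) {
  const int log_floor = Log2FloorSketch(n);
  return (n == (n & ~(n - 1))) ? log_floor     // zero or a power of two
                               : log_floor + 1;
}

int main(void) {
  assert(Log2CeilingSketch(64) == 6);   // exact power of two: floor == ceiling
  assert(Log2CeilingSketch(65) == 7);   // anything else rounds up
  assert(Log2CeilingSketch(1) == 0);
  return 0;
}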
+
+// Splitting of distance and length codes into prefixes and
+// extra bits. The prefixes are encoded with an entropy code
+// while the extra bits are stored just as normal bits.
+static WEBP_INLINE void VP8LPrefixEncodeBitsNoLUT(int distance, int* const code,
+ int* const extra_bits) {
+ const int highest_bit = BitsLog2Floor(--distance);
+ const int second_highest_bit = (distance >> (highest_bit - 1)) & 1;
+ *extra_bits = highest_bit - 1;
+ *code = 2 * highest_bit + second_highest_bit;
+}
+
+static WEBP_INLINE void VP8LPrefixEncodeNoLUT(int distance, int* const code,
+ int* const extra_bits,
+ int* const extra_bits_value) {
+ const int highest_bit = BitsLog2Floor(--distance);
+ const int second_highest_bit = (distance >> (highest_bit - 1)) & 1;
+ *extra_bits = highest_bit - 1;
+ *extra_bits_value = distance & ((1 << *extra_bits) - 1);
+ *code = 2 * highest_bit + second_highest_bit;
+}
+
+#define PREFIX_LOOKUP_IDX_MAX 512
+typedef struct {
+ int8_t code_;
+ int8_t extra_bits_;
+} VP8LPrefixCode;
+
+// These tables are derived using VP8LPrefixEncodeNoLUT.
+extern const VP8LPrefixCode kPrefixEncodeCode[PREFIX_LOOKUP_IDX_MAX];
+extern const uint8_t kPrefixEncodeExtraBitsValue[PREFIX_LOOKUP_IDX_MAX];
+static WEBP_INLINE void VP8LPrefixEncodeBits(int distance, int* const code,
+ int* const extra_bits) {
+ if (distance < PREFIX_LOOKUP_IDX_MAX) {
+ const VP8LPrefixCode prefix_code = kPrefixEncodeCode[distance];
+ *code = prefix_code.code_;
+ *extra_bits = prefix_code.extra_bits_;
+ } else {
+ VP8LPrefixEncodeBitsNoLUT(distance, code, extra_bits);
+ }
+}
+
+static WEBP_INLINE void VP8LPrefixEncode(int distance, int* const code,
+ int* const extra_bits,
+ int* const extra_bits_value) {
+ if (distance < PREFIX_LOOKUP_IDX_MAX) {
+ const VP8LPrefixCode prefix_code = kPrefixEncodeCode[distance];
+ *code = prefix_code.code_;
+ *extra_bits = prefix_code.extra_bits_;
+ *extra_bits_value = kPrefixEncodeExtraBitsValue[distance];
+ } else {
+ VP8LPrefixEncodeNoLUT(distance, code, extra_bits, extra_bits_value);
+ }
+}
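To make the prefix/extra-bits split concrete, here is a standalone walk-through of the NoLUT arithmetic above for distance 100 (a local loop stands in for BitsLog2Floor()); it produces prefix code 13 with 5 extra bits whose value is 3:

#include <assert.h>
#include <stdint.h>

static int Log2FloorSketch(uint32_t n) {   // stand-in for BitsLog2Floor()
  int bits = 0;
  while (n >>= 1) ++bits;
  return bits;
}

int main(void) {
  int distance = 100;
  int code, extra_bits, extra_bits_value;
  // Same steps as VP8LPrefixEncodeNoLUT():
  --distance;                                                          // 99
  {
    const int highest_bit = Log2FloorSketch(distance);                 // 6
    const int second_highest_bit = (distance >> (highest_bit - 1)) & 1;  // 1
    extra_bits = highest_bit - 1;                                      // 5
    extra_bits_value = distance & ((1 << extra_bits) - 1);             // 99 & 31 == 3
    code = 2 * highest_bit + second_highest_bit;                       // 13
  }
  assert(code == 13 && extra_bits == 5 && extra_bits_value == 3);
  return 0;
}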
// In-place difference of each component with mod 256.
static WEBP_INLINE uint32_t VP8LSubPixels(uint32_t a, uint32_t b) {
@@ -73,9 +315,15 @@ static WEBP_INLINE uint32_t VP8LSubPixels(uint32_t a, uint32_t b) {
return (alpha_and_green & 0xff00ff00u) | (red_and_blue & 0x00ff00ffu);
}
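The packed expression above is equivalent to an independent per-byte subtraction taken mod 256. A naive standalone reference spells out the intended result; for example 0x80304050 minus 0x01405060 gives 0x7ff0f0f0, with the three lower channels wrapping below zero:

#include <assert.h>
#include <stdint.h>

// Naive per-byte reference for the packed subtraction performed by VP8LSubPixels().
static uint32_t SubPixelsReference(uint32_t a, uint32_t b) {
  uint32_t out = 0;
  int shift;
  for (shift = 0; shift < 32; shift += 8) {
    const uint32_t byte_a = (a >> shift) & 0xff;
    const uint32_t byte_b = (b >> shift) & 0xff;
    out |= ((byte_a - byte_b) & 0xff) << shift;   // difference taken mod 256
  }
  return out;
}

int main(void) {
  assert(SubPixelsReference(0x80304050u, 0x01405060u) == 0x7ff0f0f0u);
  return 0;
}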
+void VP8LBundleColorMap(const uint8_t* const row, int width,
+ int xbits, uint32_t* const dst);
+
+// Must be called before calling any of the above methods.
+void VP8LEncDspInit(void);
+
//------------------------------------------------------------------------------
-#if defined(__cplusplus) || defined(c_plusplus)
+#ifdef __cplusplus
} // extern "C"
#endif
diff --git a/drivers/webp/dsp/upsampling.c b/drivers/webp/dsp/upsampling.c
index 4855eb1432..651274fcee 100644
--- a/drivers/webp/dsp/upsampling.c
+++ b/drivers/webp/dsp/upsampling.c
@@ -1,8 +1,10 @@
// Copyright 2011 Google Inc. All Rights Reserved.
//
-// This code is licensed under the same terms as WebM:
-// Software License Agreement: http://www.webmproject.org/license/software/
-// Additional IP Rights Grant: http://www.webmproject.org/license/additional/
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// YUV to RGB upsampling functions.
@@ -12,9 +14,7 @@
#include "./dsp.h"
#include "./yuv.h"
-#if defined(__cplusplus) || defined(c_plusplus)
-extern "C" {
-#endif
+#include <assert.h>
//------------------------------------------------------------------------------
// Fancy upsampler
@@ -32,7 +32,7 @@ WebPUpsampleLinePairFunc WebPUpsamplers[MODE_LAST];
// ([3*a + b + 9*c + 3*d a + 3*b + 3*c + 9*d] [8 8]) / 16
// We process u and v together stashed into 32bit (16bit each).
-#define LOAD_UV(u,v) ((u) | ((v) << 16))
+#define LOAD_UV(u, v) ((u) | ((v) << 16))
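As the comment above indicates, each pair of output pixels blends the 2x2 chroma neighbourhood a/b/c/d with permutations of the weights {9, 3, 3, 1}, a +8 rounding term and a division by 16. A plain scalar sketch of those four blends, independent of the packed macro below:

#include <stdio.h>

// The four (9,3,3,1)/16 blends of one 2x2 chroma block, rounded with +8.
static void FancyBlendsSketch(int a, int b, int c, int d, int out[4]) {
  out[0] = (9 * a + 3 * b + 3 * c + 1 * d + 8) >> 4;   // dominated by 'a'
  out[1] = (3 * a + 9 * b + 1 * c + 3 * d + 8) >> 4;   // dominated by 'b'
  out[2] = (3 * a + 1 * b + 9 * c + 3 * d + 8) >> 4;   // dominated by 'c'
  out[3] = (1 * a + 3 * b + 3 * c + 9 * d + 8) >> 4;   // dominated by 'd'
}

int main(void) {
  int uv[4];
  FancyBlendsSketch(40, 60, 80, 100, uv);
  printf("%d %d %d %d\n", uv[0], uv[1], uv[2], uv[3]);  // 55 65 75 85
  return 0;
}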
#define UPSAMPLE_FUNC(FUNC_NAME, FUNC, XSTEP) \
static void FUNC_NAME(const uint8_t* top_y, const uint8_t* bottom_y, \
@@ -43,11 +43,12 @@ static void FUNC_NAME(const uint8_t* top_y, const uint8_t* bottom_y, \
const int last_pixel_pair = (len - 1) >> 1; \
uint32_t tl_uv = LOAD_UV(top_u[0], top_v[0]); /* top-left sample */ \
uint32_t l_uv = LOAD_UV(cur_u[0], cur_v[0]); /* left-sample */ \
- if (top_y) { \
+ assert(top_y != NULL); \
+ { \
const uint32_t uv0 = (3 * tl_uv + l_uv + 0x00020002u) >> 2; \
FUNC(top_y[0], uv0 & 0xff, (uv0 >> 16), top_dst); \
} \
- if (bottom_y) { \
+ if (bottom_y != NULL) { \
const uint32_t uv0 = (3 * l_uv + tl_uv + 0x00020002u) >> 2; \
FUNC(bottom_y[0], uv0 & 0xff, (uv0 >> 16), bottom_dst); \
} \
@@ -58,7 +59,7 @@ static void FUNC_NAME(const uint8_t* top_y, const uint8_t* bottom_y, \
const uint32_t avg = tl_uv + t_uv + l_uv + uv + 0x00080008u; \
const uint32_t diag_12 = (avg + 2 * (t_uv + l_uv)) >> 3; \
const uint32_t diag_03 = (avg + 2 * (tl_uv + uv)) >> 3; \
- if (top_y) { \
+ { \
const uint32_t uv0 = (diag_12 + tl_uv) >> 1; \
const uint32_t uv1 = (diag_03 + t_uv) >> 1; \
FUNC(top_y[2 * x - 1], uv0 & 0xff, (uv0 >> 16), \
@@ -66,7 +67,7 @@ static void FUNC_NAME(const uint8_t* top_y, const uint8_t* bottom_y, \
FUNC(top_y[2 * x - 0], uv1 & 0xff, (uv1 >> 16), \
top_dst + (2 * x - 0) * XSTEP); \
} \
- if (bottom_y) { \
+ if (bottom_y != NULL) { \
const uint32_t uv0 = (diag_03 + l_uv) >> 1; \
const uint32_t uv1 = (diag_12 + uv) >> 1; \
FUNC(bottom_y[2 * x - 1], uv0 & 0xff, (uv0 >> 16), \
@@ -78,12 +79,12 @@ static void FUNC_NAME(const uint8_t* top_y, const uint8_t* bottom_y, \
l_uv = uv; \
} \
if (!(len & 1)) { \
- if (top_y) { \
+ { \
const uint32_t uv0 = (3 * tl_uv + l_uv + 0x00020002u) >> 2; \
FUNC(top_y[len - 1], uv0 & 0xff, (uv0 >> 16), \
top_dst + (len - 1) * XSTEP); \
} \
- if (bottom_y) { \
+ if (bottom_y != NULL) { \
const uint32_t uv0 = (3 * l_uv + tl_uv + 0x00020002u) >> 2; \
FUNC(bottom_y[len - 1], uv0 & 0xff, (uv0 >> 16), \
bottom_dst + (len - 1) * XSTEP); \
@@ -106,57 +107,6 @@ UPSAMPLE_FUNC(UpsampleRgb565LinePair, VP8YuvToRgb565, 2)
#endif // FANCY_UPSAMPLING
//------------------------------------------------------------------------------
-// simple point-sampling
-
-#define SAMPLE_FUNC(FUNC_NAME, FUNC, XSTEP) \
-static void FUNC_NAME(const uint8_t* top_y, const uint8_t* bottom_y, \
- const uint8_t* u, const uint8_t* v, \
- uint8_t* top_dst, uint8_t* bottom_dst, int len) { \
- int i; \
- for (i = 0; i < len - 1; i += 2) { \
- FUNC(top_y[0], u[0], v[0], top_dst); \
- FUNC(top_y[1], u[0], v[0], top_dst + XSTEP); \
- FUNC(bottom_y[0], u[0], v[0], bottom_dst); \
- FUNC(bottom_y[1], u[0], v[0], bottom_dst + XSTEP); \
- top_y += 2; \
- bottom_y += 2; \
- u++; \
- v++; \
- top_dst += 2 * XSTEP; \
- bottom_dst += 2 * XSTEP; \
- } \
- if (i == len - 1) { /* last one */ \
- FUNC(top_y[0], u[0], v[0], top_dst); \
- FUNC(bottom_y[0], u[0], v[0], bottom_dst); \
- } \
-}
-
-// All variants implemented.
-SAMPLE_FUNC(SampleRgbLinePair, VP8YuvToRgb, 3)
-SAMPLE_FUNC(SampleBgrLinePair, VP8YuvToBgr, 3)
-SAMPLE_FUNC(SampleRgbaLinePair, VP8YuvToRgba, 4)
-SAMPLE_FUNC(SampleBgraLinePair, VP8YuvToBgra, 4)
-SAMPLE_FUNC(SampleArgbLinePair, VP8YuvToArgb, 4)
-SAMPLE_FUNC(SampleRgba4444LinePair, VP8YuvToRgba4444, 2)
-SAMPLE_FUNC(SampleRgb565LinePair, VP8YuvToRgb565, 2)
-
-#undef SAMPLE_FUNC
-
-const WebPSampleLinePairFunc WebPSamplers[MODE_LAST] = {
- SampleRgbLinePair, // MODE_RGB
- SampleRgbaLinePair, // MODE_RGBA
- SampleBgrLinePair, // MODE_BGR
- SampleBgraLinePair, // MODE_BGRA
- SampleArgbLinePair, // MODE_ARGB
- SampleRgba4444LinePair, // MODE_RGBA_4444
- SampleRgb565LinePair, // MODE_RGB_565
- SampleRgbaLinePair, // MODE_rgbA
- SampleBgraLinePair, // MODE_bgrA
- SampleArgbLinePair, // MODE_Argb
- SampleRgba4444LinePair // MODE_rgbA_4444
-};
-
-//------------------------------------------------------------------------------
#if !defined(FANCY_UPSAMPLING)
#define DUAL_SAMPLE_FUNC(FUNC_NAME, FUNC) \
@@ -166,7 +116,8 @@ static void FUNC_NAME(const uint8_t* top_y, const uint8_t* bot_y, \
uint8_t* top_dst, uint8_t* bot_dst, int len) { \
const int half_len = len >> 1; \
int x; \
- if (top_dst != NULL) { \
+ assert(top_dst != NULL); \
+ { \
for (x = 0; x < half_len; ++x) { \
FUNC(top_y[2 * x + 0], top_u[x], top_v[x], top_dst + 8 * x + 0); \
FUNC(top_y[2 * x + 1], top_u[x], top_v[x], top_dst + 8 * x + 4); \
@@ -202,116 +153,75 @@ WebPUpsampleLinePairFunc WebPGetLinePairConverter(int alpha_is_last) {
// YUV444 converter
#define YUV444_FUNC(FUNC_NAME, FUNC, XSTEP) \
-static void FUNC_NAME(const uint8_t* y, const uint8_t* u, const uint8_t* v, \
- uint8_t* dst, int len) { \
+extern void FUNC_NAME(const uint8_t* y, const uint8_t* u, const uint8_t* v, \
+ uint8_t* dst, int len); \
+void FUNC_NAME(const uint8_t* y, const uint8_t* u, const uint8_t* v, \
+ uint8_t* dst, int len) { \
int i; \
for (i = 0; i < len; ++i) FUNC(y[i], u[i], v[i], &dst[i * XSTEP]); \
}
-YUV444_FUNC(Yuv444ToRgb, VP8YuvToRgb, 3)
-YUV444_FUNC(Yuv444ToBgr, VP8YuvToBgr, 3)
-YUV444_FUNC(Yuv444ToRgba, VP8YuvToRgba, 4)
-YUV444_FUNC(Yuv444ToBgra, VP8YuvToBgra, 4)
-YUV444_FUNC(Yuv444ToArgb, VP8YuvToArgb, 4)
-YUV444_FUNC(Yuv444ToRgba4444, VP8YuvToRgba4444, 2)
-YUV444_FUNC(Yuv444ToRgb565, VP8YuvToRgb565, 2)
+YUV444_FUNC(WebPYuv444ToRgbC, VP8YuvToRgb, 3)
+YUV444_FUNC(WebPYuv444ToBgrC, VP8YuvToBgr, 3)
+YUV444_FUNC(WebPYuv444ToRgbaC, VP8YuvToRgba, 4)
+YUV444_FUNC(WebPYuv444ToBgraC, VP8YuvToBgra, 4)
+YUV444_FUNC(WebPYuv444ToArgbC, VP8YuvToArgb, 4)
+YUV444_FUNC(WebPYuv444ToRgba4444C, VP8YuvToRgba4444, 2)
+YUV444_FUNC(WebPYuv444ToRgb565C, VP8YuvToRgb565, 2)
#undef YUV444_FUNC
-const WebPYUV444Converter WebPYUV444Converters[MODE_LAST] = {
- Yuv444ToRgb, // MODE_RGB
- Yuv444ToRgba, // MODE_RGBA
- Yuv444ToBgr, // MODE_BGR
- Yuv444ToBgra, // MODE_BGRA
- Yuv444ToArgb, // MODE_ARGB
- Yuv444ToRgba4444, // MODE_RGBA_4444
- Yuv444ToRgb565, // MODE_RGB_565
- Yuv444ToRgba, // MODE_rgbA
- Yuv444ToBgra, // MODE_bgrA
- Yuv444ToArgb, // MODE_Argb
- Yuv444ToRgba4444 // MODE_rgbA_4444
-};
-
-//------------------------------------------------------------------------------
-// Premultiplied modes
-
-// non dithered-modes
-
-// (x * a * 32897) >> 23 is bit-wise equivalent to (int)(x * a / 255.)
-// for all 8bit x or a. For bit-wise equivalence to (int)(x * a / 255. + .5),
-// one can use instead: (x * a * 65793 + (1 << 23)) >> 24
-#if 1 // (int)(x * a / 255.)
-#define MULTIPLIER(a) ((a) * 32897UL)
-#define PREMULTIPLY(x, m) (((x) * (m)) >> 23)
-#else // (int)(x * a / 255. + .5)
-#define MULTIPLIER(a) ((a) * 65793UL)
-#define PREMULTIPLY(x, m) (((x) * (m) + (1UL << 23)) >> 24)
-#endif
-
-static void ApplyAlphaMultiply(uint8_t* rgba, int alpha_first,
- int w, int h, int stride) {
- while (h-- > 0) {
- uint8_t* const rgb = rgba + (alpha_first ? 1 : 0);
- const uint8_t* const alpha = rgba + (alpha_first ? 0 : 3);
- int i;
- for (i = 0; i < w; ++i) {
- const uint32_t a = alpha[4 * i];
- if (a != 0xff) {
- const uint32_t mult = MULTIPLIER(a);
- rgb[4 * i + 0] = PREMULTIPLY(rgb[4 * i + 0], mult);
- rgb[4 * i + 1] = PREMULTIPLY(rgb[4 * i + 1], mult);
- rgb[4 * i + 2] = PREMULTIPLY(rgb[4 * i + 2], mult);
- }
- }
- rgba += stride;
- }
-}
-#undef MULTIPLIER
-#undef PREMULTIPLY
-
-// rgbA4444
+WebPYUV444Converter WebPYUV444Converters[MODE_LAST];
-#define MULTIPLIER(a) ((a) * 0x1111) // 0x1111 ~= (1 << 16) / 15
+extern void WebPInitYUV444ConvertersMIPSdspR2(void);
+extern void WebPInitYUV444ConvertersSSE2(void);
-static WEBP_INLINE uint8_t dither_hi(uint8_t x) {
- return (x & 0xf0) | (x >> 4);
-}
+static volatile VP8CPUInfo upsampling_last_cpuinfo_used1 =
+ (VP8CPUInfo)&upsampling_last_cpuinfo_used1;
-static WEBP_INLINE uint8_t dither_lo(uint8_t x) {
- return (x & 0x0f) | (x << 4);
-}
+WEBP_TSAN_IGNORE_FUNCTION void WebPInitYUV444Converters(void) {
+ if (upsampling_last_cpuinfo_used1 == VP8GetCPUInfo) return;
-static WEBP_INLINE uint8_t multiply(uint8_t x, uint32_t m) {
- return (x * m) >> 16;
-}
+ WebPYUV444Converters[MODE_RGB] = WebPYuv444ToRgbC;
+ WebPYUV444Converters[MODE_RGBA] = WebPYuv444ToRgbaC;
+ WebPYUV444Converters[MODE_BGR] = WebPYuv444ToBgrC;
+ WebPYUV444Converters[MODE_BGRA] = WebPYuv444ToBgraC;
+ WebPYUV444Converters[MODE_ARGB] = WebPYuv444ToArgbC;
+ WebPYUV444Converters[MODE_RGBA_4444] = WebPYuv444ToRgba4444C;
+ WebPYUV444Converters[MODE_RGB_565] = WebPYuv444ToRgb565C;
+ WebPYUV444Converters[MODE_rgbA] = WebPYuv444ToRgbaC;
+ WebPYUV444Converters[MODE_bgrA] = WebPYuv444ToBgraC;
+ WebPYUV444Converters[MODE_Argb] = WebPYuv444ToArgbC;
+ WebPYUV444Converters[MODE_rgbA_4444] = WebPYuv444ToRgba4444C;
-static void ApplyAlphaMultiply4444(uint8_t* rgba4444,
- int w, int h, int stride) {
- while (h-- > 0) {
- int i;
- for (i = 0; i < w; ++i) {
- const uint8_t a = (rgba4444[2 * i + 1] & 0x0f);
- const uint32_t mult = MULTIPLIER(a);
- const uint8_t r = multiply(dither_hi(rgba4444[2 * i + 0]), mult);
- const uint8_t g = multiply(dither_lo(rgba4444[2 * i + 0]), mult);
- const uint8_t b = multiply(dither_hi(rgba4444[2 * i + 1]), mult);
- rgba4444[2 * i + 0] = (r & 0xf0) | ((g >> 4) & 0x0f);
- rgba4444[2 * i + 1] = (b & 0xf0) | a;
+ if (VP8GetCPUInfo != NULL) {
+#if defined(WEBP_USE_SSE2)
+ if (VP8GetCPUInfo(kSSE2)) {
+ WebPInitYUV444ConvertersSSE2();
+ }
+#endif
+#if defined(WEBP_USE_MIPS_DSP_R2)
+ if (VP8GetCPUInfo(kMIPSdspR2)) {
+ WebPInitYUV444ConvertersMIPSdspR2();
}
- rgba4444 += stride;
+#endif
}
+ upsampling_last_cpuinfo_used1 = VP8GetCPUInfo;
}
-#undef MULTIPLIER
-
-void (*WebPApplyAlphaMultiply)(uint8_t*, int, int, int, int)
- = ApplyAlphaMultiply;
-void (*WebPApplyAlphaMultiply4444)(uint8_t*, int, int, int)
- = ApplyAlphaMultiply4444;
//------------------------------------------------------------------------------
-// Main call
+// Main calls
+
+extern void WebPInitUpsamplersSSE2(void);
+extern void WebPInitUpsamplersNEON(void);
+extern void WebPInitUpsamplersMIPSdspR2(void);
+
+static volatile VP8CPUInfo upsampling_last_cpuinfo_used2 =
+ (VP8CPUInfo)&upsampling_last_cpuinfo_used2;
+
+WEBP_TSAN_IGNORE_FUNCTION void WebPInitUpsamplers(void) {
+ if (upsampling_last_cpuinfo_used2 == VP8GetCPUInfo) return;
-void WebPInitUpsamplers(void) {
#ifdef FANCY_UPSAMPLING
WebPUpsamplers[MODE_RGB] = UpsampleRgbLinePair;
WebPUpsamplers[MODE_RGBA] = UpsampleRgbaLinePair;
@@ -320,38 +230,31 @@ void WebPInitUpsamplers(void) {
WebPUpsamplers[MODE_ARGB] = UpsampleArgbLinePair;
WebPUpsamplers[MODE_RGBA_4444] = UpsampleRgba4444LinePair;
WebPUpsamplers[MODE_RGB_565] = UpsampleRgb565LinePair;
-
- // If defined, use CPUInfo() to overwrite some pointers with faster versions.
- if (VP8GetCPUInfo != NULL) {
-#if defined(WEBP_USE_SSE2)
- if (VP8GetCPUInfo(kSSE2)) {
- WebPInitUpsamplersSSE2();
- }
-#endif
- }
-#endif // FANCY_UPSAMPLING
-}
-
-void WebPInitPremultiply(void) {
- WebPApplyAlphaMultiply = ApplyAlphaMultiply;
- WebPApplyAlphaMultiply4444 = ApplyAlphaMultiply4444;
-
-#ifdef FANCY_UPSAMPLING
WebPUpsamplers[MODE_rgbA] = UpsampleRgbaLinePair;
WebPUpsamplers[MODE_bgrA] = UpsampleBgraLinePair;
WebPUpsamplers[MODE_Argb] = UpsampleArgbLinePair;
WebPUpsamplers[MODE_rgbA_4444] = UpsampleRgba4444LinePair;
+ // If defined, use CPUInfo() to overwrite some pointers with faster versions.
if (VP8GetCPUInfo != NULL) {
#if defined(WEBP_USE_SSE2)
if (VP8GetCPUInfo(kSSE2)) {
- WebPInitPremultiplySSE2();
+ WebPInitUpsamplersSSE2();
+ }
+#endif
+#if defined(WEBP_USE_NEON)
+ if (VP8GetCPUInfo(kNEON)) {
+ WebPInitUpsamplersNEON();
+ }
+#endif
+#if defined(WEBP_USE_MIPS_DSP_R2)
+ if (VP8GetCPUInfo(kMIPSdspR2)) {
+ WebPInitUpsamplersMIPSdspR2();
}
#endif
}
#endif // FANCY_UPSAMPLING
+ upsampling_last_cpuinfo_used2 = VP8GetCPUInfo;
}
-#if defined(__cplusplus) || defined(c_plusplus)
-} // extern "C"
-#endif
+//------------------------------------------------------------------------------
diff --git a/drivers/webp/dsp/upsampling_sse2.c b/drivers/webp/dsp/upsampling_sse2.c
index 8cb275a02b..b85808e271 100644
--- a/drivers/webp/dsp/upsampling_sse2.c
+++ b/drivers/webp/dsp/upsampling_sse2.c
@@ -1,8 +1,10 @@
// Copyright 2011 Google Inc. All Rights Reserved.
//
-// This code is licensed under the same terms as WebM:
-// Software License Agreement: http://www.webmproject.org/license/software/
-// Additional IP Rights Grant: http://www.webmproject.org/license/additional/
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// SSE2 version of YUV to RGB upsampling functions.
@@ -18,10 +20,6 @@
#include <string.h>
#include "./yuv.h"
-#if defined(__cplusplus) || defined(c_plusplus)
-extern "C" {
-#endif
-
#ifdef FANCY_UPSAMPLING
// We compute (9*a + 3*b + 3*c + d + 8) / 16 as follows
@@ -49,23 +47,23 @@ extern "C" {
(out) = _mm_sub_epi8(tmp0, tmp4); /* (k + in + 1) / 2 - lsb_correction */ \
} while (0)
-// pack and store two alterning pixel rows
+// pack and store two alternating pixel rows
#define PACK_AND_STORE(a, b, da, db, out) do { \
- const __m128i ta = _mm_avg_epu8(a, da); /* (9a + 3b + 3c + d + 8) / 16 */ \
- const __m128i tb = _mm_avg_epu8(b, db); /* (3a + 9b + c + 3d + 8) / 16 */ \
- const __m128i t1 = _mm_unpacklo_epi8(ta, tb); \
- const __m128i t2 = _mm_unpackhi_epi8(ta, tb); \
- _mm_store_si128(((__m128i*)(out)) + 0, t1); \
- _mm_store_si128(((__m128i*)(out)) + 1, t2); \
+ const __m128i t_a = _mm_avg_epu8(a, da); /* (9a + 3b + 3c + d + 8) / 16 */ \
+ const __m128i t_b = _mm_avg_epu8(b, db); /* (3a + 9b + c + 3d + 8) / 16 */ \
+ const __m128i t_1 = _mm_unpacklo_epi8(t_a, t_b); \
+ const __m128i t_2 = _mm_unpackhi_epi8(t_a, t_b); \
+ _mm_store_si128(((__m128i*)(out)) + 0, t_1); \
+ _mm_store_si128(((__m128i*)(out)) + 1, t_2); \
} while (0)
// Loads 17 pixels each from rows r1 and r2 and generates 32 pixels.
#define UPSAMPLE_32PIXELS(r1, r2, out) { \
const __m128i one = _mm_set1_epi8(1); \
- const __m128i a = _mm_loadu_si128((__m128i*)&(r1)[0]); \
- const __m128i b = _mm_loadu_si128((__m128i*)&(r1)[1]); \
- const __m128i c = _mm_loadu_si128((__m128i*)&(r2)[0]); \
- const __m128i d = _mm_loadu_si128((__m128i*)&(r2)[1]); \
+ const __m128i a = _mm_loadu_si128((const __m128i*)&(r1)[0]); \
+ const __m128i b = _mm_loadu_si128((const __m128i*)&(r1)[1]); \
+ const __m128i c = _mm_loadu_si128((const __m128i*)&(r2)[0]); \
+ const __m128i d = _mm_loadu_si128((const __m128i*)&(r2)[1]); \
\
const __m128i s = _mm_avg_epu8(a, d); /* s = (a + d + 1) / 2 */ \
const __m128i t = _mm_avg_epu8(b, c); /* t = (b + c + 1) / 2 */ \
@@ -85,8 +83,8 @@ extern "C" {
GET_M(ad, s, diag2); /* diag2 = (3a + b + c + 3d) / 8 */ \
\
/* pack the alternate pixels */ \
- PACK_AND_STORE(a, b, diag1, diag2, &(out)[0 * 32]); \
- PACK_AND_STORE(c, d, diag2, diag1, &(out)[2 * 32]); \
+ PACK_AND_STORE(a, b, diag1, diag2, out + 0); /* store top */ \
+ PACK_AND_STORE(c, d, diag2, diag1, out + 2 * 32); /* store bottom */ \
}
// Turn the macro into a function for reducing code-size when non-critical
@@ -106,104 +104,140 @@ static void Upsample32Pixels(const uint8_t r1[], const uint8_t r2[],
Upsample32Pixels(r1, r2, out); \
}
-#define CONVERT2RGB(FUNC, XSTEP, top_y, bottom_y, uv, \
+#define CONVERT2RGB(FUNC, XSTEP, top_y, bottom_y, \
top_dst, bottom_dst, cur_x, num_pixels) { \
int n; \
- if (top_y) { \
- for (n = 0; n < (num_pixels); ++n) { \
- FUNC(top_y[(cur_x) + n], (uv)[n], (uv)[32 + n], \
- top_dst + ((cur_x) + n) * XSTEP); \
- } \
+ for (n = 0; n < (num_pixels); ++n) { \
+ FUNC(top_y[(cur_x) + n], r_u[n], r_v[n], \
+ top_dst + ((cur_x) + n) * XSTEP); \
} \
- if (bottom_y) { \
+ if (bottom_y != NULL) { \
for (n = 0; n < (num_pixels); ++n) { \
- FUNC(bottom_y[(cur_x) + n], (uv)[64 + n], (uv)[64 + 32 + n], \
+ FUNC(bottom_y[(cur_x) + n], r_u[64 + n], r_v[64 + n], \
bottom_dst + ((cur_x) + n) * XSTEP); \
} \
} \
}
+#define CONVERT2RGB_32(FUNC, XSTEP, top_y, bottom_y, \
+ top_dst, bottom_dst, cur_x) do { \
+ FUNC##32(top_y + (cur_x), r_u, r_v, top_dst + (cur_x) * XSTEP); \
+ if (bottom_y != NULL) { \
+ FUNC##32(bottom_y + (cur_x), r_u + 64, r_v + 64, \
+ bottom_dst + (cur_x) * XSTEP); \
+ } \
+} while (0)
+
#define SSE2_UPSAMPLE_FUNC(FUNC_NAME, FUNC, XSTEP) \
static void FUNC_NAME(const uint8_t* top_y, const uint8_t* bottom_y, \
const uint8_t* top_u, const uint8_t* top_v, \
const uint8_t* cur_u, const uint8_t* cur_v, \
uint8_t* top_dst, uint8_t* bottom_dst, int len) { \
- int b; \
- /* 16 byte aligned array to cache reconstructed u and v */ \
+ int uv_pos, pos; \
+  /* 16-byte-aligned array to cache reconstructed u and v */ \
uint8_t uv_buf[4 * 32 + 15]; \
- uint8_t* const r_uv = (uint8_t*)((uintptr_t)(uv_buf + 15) & ~15); \
- const int uv_len = (len + 1) >> 1; \
- /* 17 pixels must be read-able for each block */ \
- const int num_blocks = (uv_len - 1) >> 4; \
- const int leftover = uv_len - num_blocks * 16; \
- const int last_pos = 1 + 32 * num_blocks; \
- \
- const int u_diag = ((top_u[0] + cur_u[0]) >> 1) + 1; \
- const int v_diag = ((top_v[0] + cur_v[0]) >> 1) + 1; \
+ uint8_t* const r_u = (uint8_t*)((uintptr_t)(uv_buf + 15) & ~15); \
+ uint8_t* const r_v = r_u + 32; \
\
- assert(len > 0); \
- /* Treat the first pixel in regular way */ \
- if (top_y) { \
- const int u0 = (top_u[0] + u_diag) >> 1; \
- const int v0 = (top_v[0] + v_diag) >> 1; \
- FUNC(top_y[0], u0, v0, top_dst); \
+ assert(top_y != NULL); \
+ { /* Treat the first pixel in regular way */ \
+ const int u_diag = ((top_u[0] + cur_u[0]) >> 1) + 1; \
+ const int v_diag = ((top_v[0] + cur_v[0]) >> 1) + 1; \
+ const int u0_t = (top_u[0] + u_diag) >> 1; \
+ const int v0_t = (top_v[0] + v_diag) >> 1; \
+ FUNC(top_y[0], u0_t, v0_t, top_dst); \
+ if (bottom_y != NULL) { \
+ const int u0_b = (cur_u[0] + u_diag) >> 1; \
+ const int v0_b = (cur_v[0] + v_diag) >> 1; \
+ FUNC(bottom_y[0], u0_b, v0_b, bottom_dst); \
+ } \
} \
- if (bottom_y) { \
- const int u0 = (cur_u[0] + u_diag) >> 1; \
- const int v0 = (cur_v[0] + v_diag) >> 1; \
- FUNC(bottom_y[0], u0, v0, bottom_dst); \
+ /* For UPSAMPLE_32PIXELS, 17 u/v values must be read-able for each block */ \
+ for (pos = 1, uv_pos = 0; pos + 32 + 1 <= len; pos += 32, uv_pos += 16) { \
+ UPSAMPLE_32PIXELS(top_u + uv_pos, cur_u + uv_pos, r_u); \
+ UPSAMPLE_32PIXELS(top_v + uv_pos, cur_v + uv_pos, r_v); \
+ CONVERT2RGB_32(FUNC, XSTEP, top_y, bottom_y, top_dst, bottom_dst, pos); \
} \
- \
- for (b = 0; b < num_blocks; ++b) { \
- UPSAMPLE_32PIXELS(top_u, cur_u, r_uv + 0 * 32); \
- UPSAMPLE_32PIXELS(top_v, cur_v, r_uv + 1 * 32); \
- CONVERT2RGB(FUNC, XSTEP, top_y, bottom_y, r_uv, top_dst, bottom_dst, \
- 32 * b + 1, 32) \
- top_u += 16; \
- cur_u += 16; \
- top_v += 16; \
- cur_v += 16; \
+ if (len > 1) { \
+ const int left_over = ((len + 1) >> 1) - (pos >> 1); \
+ assert(left_over > 0); \
+ UPSAMPLE_LAST_BLOCK(top_u + uv_pos, cur_u + uv_pos, left_over, r_u); \
+ UPSAMPLE_LAST_BLOCK(top_v + uv_pos, cur_v + uv_pos, left_over, r_v); \
+ CONVERT2RGB(FUNC, XSTEP, top_y, bottom_y, top_dst, bottom_dst, \
+ pos, len - pos); \
} \
- \
- UPSAMPLE_LAST_BLOCK(top_u, cur_u, leftover, r_uv + 0 * 32); \
- UPSAMPLE_LAST_BLOCK(top_v, cur_v, leftover, r_uv + 1 * 32); \
- CONVERT2RGB(FUNC, XSTEP, top_y, bottom_y, r_uv, top_dst, bottom_dst, \
- last_pos, len - last_pos); \
}
// SSE2 variants of the fancy upsampler.
-SSE2_UPSAMPLE_FUNC(UpsampleRgbLinePairSSE2, VP8YuvToRgb, 3)
-SSE2_UPSAMPLE_FUNC(UpsampleBgrLinePairSSE2, VP8YuvToBgr, 3)
-SSE2_UPSAMPLE_FUNC(UpsampleRgbaLinePairSSE2, VP8YuvToRgba, 4)
-SSE2_UPSAMPLE_FUNC(UpsampleBgraLinePairSSE2, VP8YuvToBgra, 4)
+SSE2_UPSAMPLE_FUNC(UpsampleRgbLinePair, VP8YuvToRgb, 3)
+SSE2_UPSAMPLE_FUNC(UpsampleBgrLinePair, VP8YuvToBgr, 3)
+SSE2_UPSAMPLE_FUNC(UpsampleRgbaLinePair, VP8YuvToRgba, 4)
+SSE2_UPSAMPLE_FUNC(UpsampleBgraLinePair, VP8YuvToBgra, 4)
#undef GET_M
#undef PACK_AND_STORE
#undef UPSAMPLE_32PIXELS
#undef UPSAMPLE_LAST_BLOCK
#undef CONVERT2RGB
+#undef CONVERT2RGB_32
#undef SSE2_UPSAMPLE_FUNC
//------------------------------------------------------------------------------
+// Entry point
extern WebPUpsampleLinePairFunc WebPUpsamplers[/* MODE_LAST */];
-void WebPInitUpsamplersSSE2(void) {
- WebPUpsamplers[MODE_RGB] = UpsampleRgbLinePairSSE2;
- WebPUpsamplers[MODE_RGBA] = UpsampleRgbaLinePairSSE2;
- WebPUpsamplers[MODE_BGR] = UpsampleBgrLinePairSSE2;
- WebPUpsamplers[MODE_BGRA] = UpsampleBgraLinePairSSE2;
-}
+extern void WebPInitUpsamplersSSE2(void);
-void WebPInitPremultiplySSE2(void) {
- WebPUpsamplers[MODE_rgbA] = UpsampleRgbaLinePairSSE2;
- WebPUpsamplers[MODE_bgrA] = UpsampleBgraLinePairSSE2;
+WEBP_TSAN_IGNORE_FUNCTION void WebPInitUpsamplersSSE2(void) {
+ VP8YUVInitSSE2();
+ WebPUpsamplers[MODE_RGB] = UpsampleRgbLinePair;
+ WebPUpsamplers[MODE_RGBA] = UpsampleRgbaLinePair;
+ WebPUpsamplers[MODE_BGR] = UpsampleBgrLinePair;
+ WebPUpsamplers[MODE_BGRA] = UpsampleBgraLinePair;
+ WebPUpsamplers[MODE_rgbA] = UpsampleRgbaLinePair;
+ WebPUpsamplers[MODE_bgrA] = UpsampleBgraLinePair;
}
#endif // FANCY_UPSAMPLING
-#if defined(__cplusplus) || defined(c_plusplus)
-} // extern "C"
-#endif
+//------------------------------------------------------------------------------
+
+extern WebPYUV444Converter WebPYUV444Converters[/* MODE_LAST */];
+extern void WebPInitYUV444ConvertersSSE2(void);
+
+#define YUV444_FUNC(FUNC_NAME, CALL, XSTEP) \
+extern void WebP##FUNC_NAME##C(const uint8_t* y, const uint8_t* u, \
+ const uint8_t* v, uint8_t* dst, int len); \
+static void FUNC_NAME(const uint8_t* y, const uint8_t* u, const uint8_t* v, \
+ uint8_t* dst, int len) { \
+ int i; \
+ const int max_len = len & ~31; \
+ for (i = 0; i < max_len; i += 32) CALL(y + i, u + i, v + i, dst + i * XSTEP);\
+ if (i < len) { /* C-fallback */ \
+ WebP##FUNC_NAME##C(y + i, u + i, v + i, dst + i * XSTEP, len - i); \
+ } \
+}
-#endif // WEBP_USE_SSE2
+YUV444_FUNC(Yuv444ToRgba, VP8YuvToRgba32, 4);
+YUV444_FUNC(Yuv444ToBgra, VP8YuvToBgra32, 4);
+YUV444_FUNC(Yuv444ToRgb, VP8YuvToRgb32, 3);
+YUV444_FUNC(Yuv444ToBgr, VP8YuvToBgr32, 3);
+
+WEBP_TSAN_IGNORE_FUNCTION void WebPInitYUV444ConvertersSSE2(void) {
+ VP8YUVInitSSE2();
+ WebPYUV444Converters[MODE_RGBA] = Yuv444ToRgba;
+ WebPYUV444Converters[MODE_BGRA] = Yuv444ToBgra;
+ WebPYUV444Converters[MODE_RGB] = Yuv444ToRgb;
+ WebPYUV444Converters[MODE_BGR] = Yuv444ToBgr;
+}
+
+#else
+
+WEBP_DSP_INIT_STUB(WebPInitYUV444ConvertersSSE2)
+
+#endif // WEBP_USE_SSE2
+
+#if !(defined(FANCY_UPSAMPLING) && defined(WEBP_USE_SSE2))
+WEBP_DSP_INIT_STUB(WebPInitUpsamplersSSE2)
+#endif
diff --git a/drivers/webp/dsp/yuv.c b/drivers/webp/dsp/yuv.c
index 7f05f9a3aa..f50a253168 100644
--- a/drivers/webp/dsp/yuv.c
+++ b/drivers/webp/dsp/yuv.c
@@ -1,26 +1,19 @@
// Copyright 2010 Google Inc. All Rights Reserved.
//
-// This code is licensed under the same terms as WebM:
-// Software License Agreement: http://www.webmproject.org/license/software/
-// Additional IP Rights Grant: http://www.webmproject.org/license/additional/
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
-// YUV->RGB conversion function
+// YUV->RGB conversion functions
//
// Author: Skal (pascal.massimino@gmail.com)
#include "./yuv.h"
-#if defined(__cplusplus) || defined(c_plusplus)
-extern "C" {
-#endif
-
-enum { YUV_HALF = 1 << (YUV_FIX - 1) };
-
-int16_t VP8kVToR[256], VP8kUToB[256];
-int32_t VP8kVToG[256], VP8kUToG[256];
-uint8_t VP8kClip[YUV_RANGE_MAX - YUV_RANGE_MIN];
-uint8_t VP8kClip4Bits[YUV_RANGE_MAX - YUV_RANGE_MIN];
+#if defined(WEBP_YUV_USE_TABLE)
static int done = 0;
@@ -28,11 +21,17 @@ static WEBP_INLINE uint8_t clip(int v, int max_value) {
return v < 0 ? 0 : v > max_value ? max_value : v;
}
-void VP8YUVInit(void) {
+int16_t VP8kVToR[256], VP8kUToB[256];
+int32_t VP8kVToG[256], VP8kUToG[256];
+uint8_t VP8kClip[YUV_RANGE_MAX - YUV_RANGE_MIN];
+uint8_t VP8kClip4Bits[YUV_RANGE_MAX - YUV_RANGE_MIN];
+
+WEBP_TSAN_IGNORE_FUNCTION void VP8YUVInit(void) {
int i;
if (done) {
return;
}
+#ifndef USE_YUVj
for (i = 0; i < 256; ++i) {
VP8kVToR[i] = (89858 * (i - 128) + YUV_HALF) >> YUV_FIX;
VP8kUToG[i] = -22014 * (i - 128) + YUV_HALF;
@@ -44,9 +43,238 @@ void VP8YUVInit(void) {
VP8kClip[i - YUV_RANGE_MIN] = clip(k, 255);
VP8kClip4Bits[i - YUV_RANGE_MIN] = clip((k + 8) >> 4, 15);
}
+#else
+ for (i = 0; i < 256; ++i) {
+ VP8kVToR[i] = (91881 * (i - 128) + YUV_HALF) >> YUV_FIX;
+ VP8kUToG[i] = -22554 * (i - 128) + YUV_HALF;
+ VP8kVToG[i] = -46802 * (i - 128);
+ VP8kUToB[i] = (116130 * (i - 128) + YUV_HALF) >> YUV_FIX;
+ }
+ for (i = YUV_RANGE_MIN; i < YUV_RANGE_MAX; ++i) {
+ const int k = i;
+ VP8kClip[i - YUV_RANGE_MIN] = clip(k, 255);
+ VP8kClip4Bits[i - YUV_RANGE_MIN] = clip((k + 8) >> 4, 15);
+ }
+#endif
+
done = 1;
}
-#if defined(__cplusplus) || defined(c_plusplus)
-} // extern "C"
-#endif
+#else
+
+WEBP_TSAN_IGNORE_FUNCTION void VP8YUVInit(void) {}
+
+#endif // WEBP_YUV_USE_TABLE
+
+//-----------------------------------------------------------------------------
+// Plain-C version
+
+#define ROW_FUNC(FUNC_NAME, FUNC, XSTEP) \
+static void FUNC_NAME(const uint8_t* y, \
+ const uint8_t* u, const uint8_t* v, \
+ uint8_t* dst, int len) { \
+ const uint8_t* const end = dst + (len & ~1) * XSTEP; \
+ while (dst != end) { \
+ FUNC(y[0], u[0], v[0], dst); \
+ FUNC(y[1], u[0], v[0], dst + XSTEP); \
+ y += 2; \
+ ++u; \
+ ++v; \
+ dst += 2 * XSTEP; \
+ } \
+ if (len & 1) { \
+ FUNC(y[0], u[0], v[0], dst); \
+ } \
+} \
+
+// All variants implemented.
+ROW_FUNC(YuvToRgbRow, VP8YuvToRgb, 3)
+ROW_FUNC(YuvToBgrRow, VP8YuvToBgr, 3)
+ROW_FUNC(YuvToRgbaRow, VP8YuvToRgba, 4)
+ROW_FUNC(YuvToBgraRow, VP8YuvToBgra, 4)
+ROW_FUNC(YuvToArgbRow, VP8YuvToArgb, 4)
+ROW_FUNC(YuvToRgba4444Row, VP8YuvToRgba4444, 2)
+ROW_FUNC(YuvToRgb565Row, VP8YuvToRgb565, 2)
+
+#undef ROW_FUNC
+
+// Main call for processing a plane with a WebPSamplerRowFunc function:
+void WebPSamplerProcessPlane(const uint8_t* y, int y_stride,
+ const uint8_t* u, const uint8_t* v, int uv_stride,
+ uint8_t* dst, int dst_stride,
+ int width, int height, WebPSamplerRowFunc func) {
+ int j;
+ for (j = 0; j < height; ++j) {
+ func(y, u, v, dst, width);
+ y += y_stride;
+ if (j & 1) {
+ u += uv_stride;
+ v += uv_stride;
+ }
+ dst += dst_stride;
+ }
+}
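As a usage note (a hedged sketch, not code from this file): the row functions consume 4:2:0 input, so u/v are half-width rows and the plane helper only advances them every second output row. Assuming WebPSamplers[], WebPInitSamplers() and WebPSamplerProcessPlane() are exposed through ./dsp.h as in this tree, a caller might look like:

#include "./dsp.h"            // WebPSamplers[], WebPInitSamplers(), WebPSamplerProcessPlane()
#include "../webp/decode.h"   // MODE_RGB

// Hypothetical helper; buffer allocation and stride setup are the caller's job.
static void ConvertPictureSketch(const uint8_t* y, int y_stride,
                                 const uint8_t* u, const uint8_t* v, int uv_stride,
                                 uint8_t* rgb, int rgb_stride,
                                 int width, int height) {
  WebPInitSamplers();                          // fills WebPSamplers[] (CPU-specific variants)
  WebPSamplerProcessPlane(y, y_stride, u, v, uv_stride,
                          rgb, rgb_stride, width, height,
                          WebPSamplers[MODE_RGB]);   // 3 bytes per output pixel
}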
+
+//-----------------------------------------------------------------------------
+// Main call
+
+WebPSamplerRowFunc WebPSamplers[MODE_LAST];
+
+extern void WebPInitSamplersSSE2(void);
+extern void WebPInitSamplersMIPS32(void);
+extern void WebPInitSamplersMIPSdspR2(void);
+
+static volatile VP8CPUInfo yuv_last_cpuinfo_used =
+ (VP8CPUInfo)&yuv_last_cpuinfo_used;
+
+WEBP_TSAN_IGNORE_FUNCTION void WebPInitSamplers(void) {
+ if (yuv_last_cpuinfo_used == VP8GetCPUInfo) return;
+
+ WebPSamplers[MODE_RGB] = YuvToRgbRow;
+ WebPSamplers[MODE_RGBA] = YuvToRgbaRow;
+ WebPSamplers[MODE_BGR] = YuvToBgrRow;
+ WebPSamplers[MODE_BGRA] = YuvToBgraRow;
+ WebPSamplers[MODE_ARGB] = YuvToArgbRow;
+ WebPSamplers[MODE_RGBA_4444] = YuvToRgba4444Row;
+ WebPSamplers[MODE_RGB_565] = YuvToRgb565Row;
+ WebPSamplers[MODE_rgbA] = YuvToRgbaRow;
+ WebPSamplers[MODE_bgrA] = YuvToBgraRow;
+ WebPSamplers[MODE_Argb] = YuvToArgbRow;
+ WebPSamplers[MODE_rgbA_4444] = YuvToRgba4444Row;
+
+ // If defined, use CPUInfo() to overwrite some pointers with faster versions.
+ if (VP8GetCPUInfo != NULL) {
+#if defined(WEBP_USE_SSE2)
+ if (VP8GetCPUInfo(kSSE2)) {
+ WebPInitSamplersSSE2();
+ }
+#endif // WEBP_USE_SSE2
+#if defined(WEBP_USE_MIPS32)
+ if (VP8GetCPUInfo(kMIPS32)) {
+ WebPInitSamplersMIPS32();
+ }
+#endif // WEBP_USE_MIPS32
+#if defined(WEBP_USE_MIPS_DSP_R2)
+ if (VP8GetCPUInfo(kMIPSdspR2)) {
+ WebPInitSamplersMIPSdspR2();
+ }
+#endif // WEBP_USE_MIPS_DSP_R2
+ }
+ yuv_last_cpuinfo_used = VP8GetCPUInfo;
+}
+
+//-----------------------------------------------------------------------------
+// ARGB -> YUV converters
+
+static void ConvertARGBToY(const uint32_t* argb, uint8_t* y, int width) {
+ int i;
+ for (i = 0; i < width; ++i) {
+ const uint32_t p = argb[i];
+ y[i] = VP8RGBToY((p >> 16) & 0xff, (p >> 8) & 0xff, (p >> 0) & 0xff,
+ YUV_HALF);
+ }
+}
+
+void WebPConvertARGBToUV_C(const uint32_t* argb, uint8_t* u, uint8_t* v,
+ int src_width, int do_store) {
+ // No rounding. Last pixel is dealt with separately.
+ const int uv_width = src_width >> 1;
+ int i;
+ for (i = 0; i < uv_width; ++i) {
+ const uint32_t v0 = argb[2 * i + 0];
+ const uint32_t v1 = argb[2 * i + 1];
+ // VP8RGBToU/V expects four accumulated pixels. Hence we need to
+    // scale the r/g/b values by a factor of 2. We just shift v0/v1 one bit less.
+ const int r = ((v0 >> 15) & 0x1fe) + ((v1 >> 15) & 0x1fe);
+ const int g = ((v0 >> 7) & 0x1fe) + ((v1 >> 7) & 0x1fe);
+ const int b = ((v0 << 1) & 0x1fe) + ((v1 << 1) & 0x1fe);
+ const int tmp_u = VP8RGBToU(r, g, b, YUV_HALF << 2);
+ const int tmp_v = VP8RGBToV(r, g, b, YUV_HALF << 2);
+ if (do_store) {
+ u[i] = tmp_u;
+ v[i] = tmp_v;
+ } else {
+      // Approximate average-of-four; the difference is acceptable.
+ u[i] = (u[i] + tmp_u + 1) >> 1;
+ v[i] = (v[i] + tmp_v + 1) >> 1;
+ }
+ }
+ if (src_width & 1) { // last pixel
+ const uint32_t v0 = argb[2 * i + 0];
+ const int r = (v0 >> 14) & 0x3fc;
+ const int g = (v0 >> 6) & 0x3fc;
+ const int b = (v0 << 2) & 0x3fc;
+ const int tmp_u = VP8RGBToU(r, g, b, YUV_HALF << 2);
+ const int tmp_v = VP8RGBToV(r, g, b, YUV_HALF << 2);
+ if (do_store) {
+ u[i] = tmp_u;
+ v[i] = tmp_v;
+ } else {
+ u[i] = (u[i] + tmp_u + 1) >> 1;
+ v[i] = (v[i] + tmp_v + 1) >> 1;
+ }
+ }
+}
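The shift-and-mask expressions above really do produce 2*r, 2*g and 2*b from one packed ARGB word, which is what puts a pixel pair on the same scale as four accumulated pixels. A tiny standalone check:

#include <assert.h>
#include <stdint.h>

int main(void) {
  const uint32_t argb = 0xff8040c0u;               // A=0xff R=0x80 G=0x40 B=0xc0
  const int r = (argb >> 16) & 0xff;
  const int g = (argb >>  8) & 0xff;
  const int b = (argb >>  0) & 0xff;
  // Same expressions as in WebPConvertARGBToUV_C(), applied to a single pixel:
  assert((int)((argb >> 15) & 0x1fe) == 2 * r);    // 0x100
  assert((int)((argb >>  7) & 0x1fe) == 2 * g);    // 0x080
  assert((int)((argb <<  1) & 0x1fe) == 2 * b);    // 0x180
  return 0;
}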
+
+//-----------------------------------------------------------------------------
+
+static void ConvertRGB24ToY(const uint8_t* rgb, uint8_t* y, int width) {
+ int i;
+ for (i = 0; i < width; ++i, rgb += 3) {
+ y[i] = VP8RGBToY(rgb[0], rgb[1], rgb[2], YUV_HALF);
+ }
+}
+
+static void ConvertBGR24ToY(const uint8_t* bgr, uint8_t* y, int width) {
+ int i;
+ for (i = 0; i < width; ++i, bgr += 3) {
+ y[i] = VP8RGBToY(bgr[2], bgr[1], bgr[0], YUV_HALF);
+ }
+}
+
+void WebPConvertRGBA32ToUV_C(const uint16_t* rgb,
+ uint8_t* u, uint8_t* v, int width) {
+ int i;
+ for (i = 0; i < width; i += 1, rgb += 4) {
+ const int r = rgb[0], g = rgb[1], b = rgb[2];
+ u[i] = VP8RGBToU(r, g, b, YUV_HALF << 2);
+ v[i] = VP8RGBToV(r, g, b, YUV_HALF << 2);
+ }
+}
+
+//-----------------------------------------------------------------------------
+
+void (*WebPConvertRGB24ToY)(const uint8_t* rgb, uint8_t* y, int width);
+void (*WebPConvertBGR24ToY)(const uint8_t* bgr, uint8_t* y, int width);
+void (*WebPConvertRGBA32ToUV)(const uint16_t* rgb,
+ uint8_t* u, uint8_t* v, int width);
+
+void (*WebPConvertARGBToY)(const uint32_t* argb, uint8_t* y, int width);
+void (*WebPConvertARGBToUV)(const uint32_t* argb, uint8_t* u, uint8_t* v,
+ int src_width, int do_store);
+
+static volatile VP8CPUInfo rgba_to_yuv_last_cpuinfo_used =
+ (VP8CPUInfo)&rgba_to_yuv_last_cpuinfo_used;
+
+extern void WebPInitConvertARGBToYUVSSE2(void);
+
+WEBP_TSAN_IGNORE_FUNCTION void WebPInitConvertARGBToYUV(void) {
+ if (rgba_to_yuv_last_cpuinfo_used == VP8GetCPUInfo) return;
+
+ WebPConvertARGBToY = ConvertARGBToY;
+ WebPConvertARGBToUV = WebPConvertARGBToUV_C;
+
+ WebPConvertRGB24ToY = ConvertRGB24ToY;
+ WebPConvertBGR24ToY = ConvertBGR24ToY;
+
+ WebPConvertRGBA32ToUV = WebPConvertRGBA32ToUV_C;
+
+ if (VP8GetCPUInfo != NULL) {
+#if defined(WEBP_USE_SSE2)
+ if (VP8GetCPUInfo(kSSE2)) {
+ WebPInitConvertARGBToYUVSSE2();
+ }
+#endif // WEBP_USE_SSE2
+ }
+ rgba_to_yuv_last_cpuinfo_used = VP8GetCPUInfo;
+}
diff --git a/drivers/webp/dsp/yuv.h b/drivers/webp/dsp/yuv.h
index a569109c54..af435a5b3e 100644
--- a/drivers/webp/dsp/yuv.h
+++ b/drivers/webp/dsp/yuv.h
@@ -1,36 +1,165 @@
// Copyright 2010 Google Inc. All Rights Reserved.
//
-// This code is licensed under the same terms as WebM:
-// Software License Agreement: http://www.webmproject.org/license/software/
-// Additional IP Rights Grant: http://www.webmproject.org/license/additional/
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// inline YUV<->RGB conversion function
//
+// The exact naming is Y'CbCr, following the ITU-R BT.601 standard.
+// More information at: http://en.wikipedia.org/wiki/YCbCr
+// Y = 0.2569 * R + 0.5044 * G + 0.0979 * B + 16
+// U = -0.1483 * R - 0.2911 * G + 0.4394 * B + 128
+// V = 0.4394 * R - 0.3679 * G - 0.0715 * B + 128
+// We use 16bit fixed point operations for RGB->YUV conversion (YUV_FIX).
+//
+// For the Y'CbCr to RGB conversion, the BT.601 specification reads:
+// R = 1.164 * (Y-16) + 1.596 * (V-128)
+// G = 1.164 * (Y-16) - 0.813 * (V-128) - 0.391 * (U-128)
+// B = 1.164 * (Y-16) + 2.018 * (U-128)
+// where Y is in the [16,235] range, and U/V in the [16,240] range.
+// In the table-lookup version (WEBP_YUV_USE_TABLE), the common factor
+// "1.164 * (Y-16)" can be handled as an offset in the VP8kClip[] table.
+// So in this case the formulae should read:
+// R = 1.164 * [Y + 1.371 * (V-128) ] - 18.624
+// G = 1.164 * [Y - 0.698 * (V-128) - 0.336 * (U-128)] - 18.624
+// B = 1.164 * [Y + 1.733 * (U-128)] - 18.624
+// once factorized.
+// For YUV->RGB conversion, only 14bit fixed precision is used (YUV_FIX2).
+// That's the maximum possible for a convenient ARM implementation.
+//
// Author: Skal (pascal.massimino@gmail.com)
#ifndef WEBP_DSP_YUV_H_
#define WEBP_DSP_YUV_H_
+#include "./dsp.h"
#include "../dec/decode_vp8.h"
+// Define the following to use the LUT-based code:
+// #define WEBP_YUV_USE_TABLE
+
+#if defined(WEBP_EXPERIMENTAL_FEATURES)
+// Do NOT activate this feature for real compression. This is only experimental!
+// This flag is for comparison purposes against JPEG's "YUVj" natural colorspace.
+// This colorspace is close to Rec.601's Y'CbCr model, with the notable
+// difference of allowing a larger range for luma/chroma.
+// See http://en.wikipedia.org/wiki/YCbCr#JPEG_conversion paragraph, and its
+// difference with http://en.wikipedia.org/wiki/YCbCr#ITU-R_BT.601_conversion
+// #define USE_YUVj
+#endif
+
//------------------------------------------------------------------------------
// YUV -> RGB conversion
-#if defined(__cplusplus) || defined(c_plusplus)
+#ifdef __cplusplus
extern "C" {
#endif
-enum { YUV_FIX = 16, // fixed-point precision
- YUV_RANGE_MIN = -227, // min value of r/g/b output
- YUV_RANGE_MAX = 256 + 226 // max value of r/g/b output
+enum {
+ YUV_FIX = 16, // fixed-point precision for RGB->YUV
+ YUV_HALF = 1 << (YUV_FIX - 1),
+ YUV_MASK = (256 << YUV_FIX) - 1,
+ YUV_RANGE_MIN = -227, // min value of r/g/b output
+ YUV_RANGE_MAX = 256 + 226, // max value of r/g/b output
+
+ YUV_FIX2 = 14, // fixed-point precision for YUV->RGB
+ YUV_HALF2 = 1 << (YUV_FIX2 - 1),
+ YUV_MASK2 = (256 << YUV_FIX2) - 1
};
+
+// These constants are 14b fixed-point version of ITU-R BT.601 constants.
+#define kYScale 19077 // 1.164 = 255 / 219
+#define kVToR 26149 // 1.596 = 255 / 112 * 0.701
+#define kUToG 6419 // 0.391 = 255 / 112 * 0.886 * 0.114 / 0.587
+#define kVToG 13320 // 0.813 = 255 / 112 * 0.701 * 0.299 / 0.587
+#define kUToB 33050 // 2.018 = 255 / 112 * 0.886
+#define kRCst (-kYScale * 16 - kVToR * 128 + YUV_HALF2)
+#define kGCst (-kYScale * 16 + kUToG * 128 + kVToG * 128 + YUV_HALF2)
+#define kBCst (-kYScale * 16 - kUToB * 128 + YUV_HALF2)
+
+//------------------------------------------------------------------------------
+
+#if !defined(WEBP_YUV_USE_TABLE)
+
+// slower on x86 by ~7-8%, but bit-exact with the SSE2 version
+
+static WEBP_INLINE int VP8Clip8(int v) {
+ return ((v & ~YUV_MASK2) == 0) ? (v >> YUV_FIX2) : (v < 0) ? 0 : 255;
+}
+
+static WEBP_INLINE int VP8YUVToR(int y, int v) {
+ return VP8Clip8(kYScale * y + kVToR * v + kRCst);
+}
+
+static WEBP_INLINE int VP8YUVToG(int y, int u, int v) {
+ return VP8Clip8(kYScale * y - kUToG * u - kVToG * v + kGCst);
+}
+
+static WEBP_INLINE int VP8YUVToB(int y, int u) {
+ return VP8Clip8(kYScale * y + kUToB * u + kBCst);
+}
+
+static WEBP_INLINE void VP8YuvToRgb(int y, int u, int v,
+ uint8_t* const rgb) {
+ rgb[0] = VP8YUVToR(y, v);
+ rgb[1] = VP8YUVToG(y, u, v);
+ rgb[2] = VP8YUVToB(y, u);
+}
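Feeding a mid-grey sample through the helpers above is a quick sanity check: Y=U=V=128 should land near 1.164*(128-16), i.e. 130, on every channel. A standalone re-run of the same 14-bit arithmetic, with the constants copied from this header and the same clipping as VP8Clip8():

#include <assert.h>

// 14-bit fixed-point constants copied from the block above (YUV_FIX2 == 14).
#define K_YSCALE 19077
#define K_VTOR   26149
#define K_UTOG    6419
#define K_VTOG   13320
#define K_UTOB   33050
#define HALF2    (1 << 13)

static int Clip8Sketch(int v) {               // same clipping as VP8Clip8()
  return ((v & ~((256 << 14) - 1)) == 0) ? (v >> 14) : (v < 0) ? 0 : 255;
}

int main(void) {
  const int y = 128, u = 128, v = 128;
  const int r = Clip8Sketch(K_YSCALE * y + K_VTOR * v
                            - K_YSCALE * 16 - K_VTOR * 128 + HALF2);
  const int g = Clip8Sketch(K_YSCALE * y - K_UTOG * u - K_VTOG * v
                            - K_YSCALE * 16 + K_UTOG * 128 + K_VTOG * 128 + HALF2);
  const int b = Clip8Sketch(K_YSCALE * y + K_UTOB * u
                            - K_YSCALE * 16 - K_UTOB * 128 + HALF2);
  assert(r == 130 && g == 130 && b == 130);   // 1.164 * (128 - 16) = 130.368 -> 130
  return 0;
}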
+
+static WEBP_INLINE void VP8YuvToBgr(int y, int u, int v,
+ uint8_t* const bgr) {
+ bgr[0] = VP8YUVToB(y, u);
+ bgr[1] = VP8YUVToG(y, u, v);
+ bgr[2] = VP8YUVToR(y, v);
+}
+
+static WEBP_INLINE void VP8YuvToRgb565(int y, int u, int v,
+ uint8_t* const rgb) {
+ const int r = VP8YUVToR(y, v); // 5 usable bits
+ const int g = VP8YUVToG(y, u, v); // 6 usable bits
+ const int b = VP8YUVToB(y, u); // 5 usable bits
+ const int rg = (r & 0xf8) | (g >> 5);
+ const int gb = ((g << 3) & 0xe0) | (b >> 3);
+#ifdef WEBP_SWAP_16BIT_CSP
+ rgb[0] = gb;
+ rgb[1] = rg;
+#else
+ rgb[0] = rg;
+ rgb[1] = gb;
+#endif
+}
+
+static WEBP_INLINE void VP8YuvToRgba4444(int y, int u, int v,
+ uint8_t* const argb) {
+ const int r = VP8YUVToR(y, v); // 4 usable bits
+ const int g = VP8YUVToG(y, u, v); // 4 usable bits
+ const int b = VP8YUVToB(y, u); // 4 usable bits
+ const int rg = (r & 0xf0) | (g >> 4);
+ const int ba = (b & 0xf0) | 0x0f; // overwrite the lower 4 bits
+#ifdef WEBP_SWAP_16BIT_CSP
+ argb[0] = ba;
+ argb[1] = rg;
+#else
+ argb[0] = rg;
+ argb[1] = ba;
+#endif
+}
+
+#else
+
+// Table-based version, not totally equivalent to the SSE2 version.
+// The rounding difference is only +/-1 though.
+
extern int16_t VP8kVToR[256], VP8kUToB[256];
extern int32_t VP8kVToG[256], VP8kUToG[256];
extern uint8_t VP8kClip[YUV_RANGE_MAX - YUV_RANGE_MIN];
extern uint8_t VP8kClip4Bits[YUV_RANGE_MAX - YUV_RANGE_MIN];
-static WEBP_INLINE void VP8YuvToRgb(uint8_t y, uint8_t u, uint8_t v,
+static WEBP_INLINE void VP8YuvToRgb(int y, int u, int v,
uint8_t* const rgb) {
const int r_off = VP8kVToR[v];
const int g_off = (VP8kVToG[v] + VP8kUToG[u]) >> YUV_FIX;
@@ -40,42 +169,60 @@ static WEBP_INLINE void VP8YuvToRgb(uint8_t y, uint8_t u, uint8_t v,
rgb[2] = VP8kClip[y + b_off - YUV_RANGE_MIN];
}
-static WEBP_INLINE void VP8YuvToRgb565(uint8_t y, uint8_t u, uint8_t v,
- uint8_t* const rgb) {
+static WEBP_INLINE void VP8YuvToBgr(int y, int u, int v,
+ uint8_t* const bgr) {
const int r_off = VP8kVToR[v];
const int g_off = (VP8kVToG[v] + VP8kUToG[u]) >> YUV_FIX;
const int b_off = VP8kUToB[u];
- rgb[0] = ((VP8kClip[y + r_off - YUV_RANGE_MIN] & 0xf8) |
- (VP8kClip[y + g_off - YUV_RANGE_MIN] >> 5));
- rgb[1] = (((VP8kClip[y + g_off - YUV_RANGE_MIN] << 3) & 0xe0) |
- (VP8kClip[y + b_off - YUV_RANGE_MIN] >> 3));
-}
-
-static WEBP_INLINE void VP8YuvToArgb(uint8_t y, uint8_t u, uint8_t v,
- uint8_t* const argb) {
- argb[0] = 0xff;
- VP8YuvToRgb(y, u, v, argb + 1);
+ bgr[0] = VP8kClip[y + b_off - YUV_RANGE_MIN];
+ bgr[1] = VP8kClip[y + g_off - YUV_RANGE_MIN];
+ bgr[2] = VP8kClip[y + r_off - YUV_RANGE_MIN];
}
-static WEBP_INLINE void VP8YuvToRgba4444(uint8_t y, uint8_t u, uint8_t v,
- uint8_t* const argb) {
+static WEBP_INLINE void VP8YuvToRgb565(int y, int u, int v,
+ uint8_t* const rgb) {
const int r_off = VP8kVToR[v];
const int g_off = (VP8kVToG[v] + VP8kUToG[u]) >> YUV_FIX;
const int b_off = VP8kUToB[u];
- // Don't update alpha (last 4 bits of argb[1])
- argb[0] = ((VP8kClip4Bits[y + r_off - YUV_RANGE_MIN] << 4) |
- VP8kClip4Bits[y + g_off - YUV_RANGE_MIN]);
- argb[1] = 0x0f | (VP8kClip4Bits[y + b_off - YUV_RANGE_MIN] << 4);
+ const int rg = ((VP8kClip[y + r_off - YUV_RANGE_MIN] & 0xf8) |
+ (VP8kClip[y + g_off - YUV_RANGE_MIN] >> 5));
+ const int gb = (((VP8kClip[y + g_off - YUV_RANGE_MIN] << 3) & 0xe0) |
+ (VP8kClip[y + b_off - YUV_RANGE_MIN] >> 3));
+#ifdef WEBP_SWAP_16BIT_CSP
+ rgb[0] = gb;
+ rgb[1] = rg;
+#else
+ rgb[0] = rg;
+ rgb[1] = gb;
+#endif
}
-static WEBP_INLINE void VP8YuvToBgr(uint8_t y, uint8_t u, uint8_t v,
- uint8_t* const bgr) {
+static WEBP_INLINE void VP8YuvToRgba4444(int y, int u, int v,
+ uint8_t* const argb) {
const int r_off = VP8kVToR[v];
const int g_off = (VP8kVToG[v] + VP8kUToG[u]) >> YUV_FIX;
const int b_off = VP8kUToB[u];
- bgr[0] = VP8kClip[y + b_off - YUV_RANGE_MIN];
- bgr[1] = VP8kClip[y + g_off - YUV_RANGE_MIN];
- bgr[2] = VP8kClip[y + r_off - YUV_RANGE_MIN];
+ const int rg = ((VP8kClip4Bits[y + r_off - YUV_RANGE_MIN] << 4) |
+ VP8kClip4Bits[y + g_off - YUV_RANGE_MIN]);
+ const int ba = (VP8kClip4Bits[y + b_off - YUV_RANGE_MIN] << 4) | 0x0f;
+#ifdef WEBP_SWAP_16BIT_CSP
+ argb[0] = ba;
+ argb[1] = rg;
+#else
+ argb[0] = rg;
+ argb[1] = ba;
+#endif
+}
+
+#endif // WEBP_YUV_USE_TABLE
+
+//-----------------------------------------------------------------------------
+// Alpha handling variants
+
+static WEBP_INLINE void VP8YuvToArgb(uint8_t y, uint8_t u, uint8_t v,
+ uint8_t* const argb) {
+ argb[0] = 0xff;
+ VP8YuvToRgb(y, u, v, argb + 1);
}
static WEBP_INLINE void VP8YuvToBgra(uint8_t y, uint8_t u, uint8_t v,
@@ -93,35 +240,79 @@ static WEBP_INLINE void VP8YuvToRgba(uint8_t y, uint8_t u, uint8_t v,
// Must be called before everything, to initialize the tables.
void VP8YUVInit(void);
+//-----------------------------------------------------------------------------
+// SSE2 extra functions (mostly for upsampling_sse2.c)
+
+#if defined(WEBP_USE_SSE2)
+
+// When the following is defined, tables are initialized statically, adding ~12k
+// to the binary size. Otherwise, they are initialized at run-time (small cost).
+#define WEBP_YUV_USE_SSE2_TABLES
+
+// Process 32 pixels and store the result (24b or 32b per pixel) in *dst.
+void VP8YuvToRgba32(const uint8_t* y, const uint8_t* u, const uint8_t* v,
+ uint8_t* dst);
+void VP8YuvToRgb32(const uint8_t* y, const uint8_t* u, const uint8_t* v,
+ uint8_t* dst);
+void VP8YuvToBgra32(const uint8_t* y, const uint8_t* u, const uint8_t* v,
+ uint8_t* dst);
+void VP8YuvToBgr32(const uint8_t* y, const uint8_t* u, const uint8_t* v,
+ uint8_t* dst);
+
+// Must be called to initialize tables before using the functions.
+void VP8YUVInitSSE2(void);
+
+#endif // WEBP_USE_SSE2
+
//------------------------------------------------------------------------------
// RGB -> YUV conversion
-// The exact naming is Y'CbCr, following the ITU-R BT.601 standard.
-// More information at: http://en.wikipedia.org/wiki/YCbCr
-// Y = 0.2569 * R + 0.5044 * G + 0.0979 * B + 16
-// U = -0.1483 * R - 0.2911 * G + 0.4394 * B + 128
-// V = 0.4394 * R - 0.3679 * G - 0.0715 * B + 128
-// We use 16bit fixed point operations.
-static WEBP_INLINE int VP8ClipUV(int v) {
- v = (v + (257 << (YUV_FIX + 2 - 1))) >> (YUV_FIX + 2);
- return ((v & ~0xff) == 0) ? v : (v < 0) ? 0 : 255;
+// Helper functions that can be called with various rounding values:
+static WEBP_INLINE int VP8ClipUV(int uv, int rounding) {
+ uv = (uv + rounding + (128 << (YUV_FIX + 2))) >> (YUV_FIX + 2);
+ return ((uv & ~0xff) == 0) ? uv : (uv < 0) ? 0 : 255;
}
-static WEBP_INLINE int VP8RGBToY(int r, int g, int b) {
- const int kRound = (1 << (YUV_FIX - 1)) + (16 << YUV_FIX);
+#ifndef USE_YUVj
+
+static WEBP_INLINE int VP8RGBToY(int r, int g, int b, int rounding) {
const int luma = 16839 * r + 33059 * g + 6420 * b;
- return (luma + kRound) >> YUV_FIX; // no need to clip
+ return (luma + rounding + (16 << YUV_FIX)) >> YUV_FIX; // no need to clip
}
-static WEBP_INLINE int VP8RGBToU(int r, int g, int b) {
- return VP8ClipUV(-9719 * r - 19081 * g + 28800 * b);
+static WEBP_INLINE int VP8RGBToU(int r, int g, int b, int rounding) {
+ const int u = -9719 * r - 19081 * g + 28800 * b;
+ return VP8ClipUV(u, rounding);
}
-static WEBP_INLINE int VP8RGBToV(int r, int g, int b) {
- return VP8ClipUV(+28800 * r - 24116 * g - 4684 * b);
+static WEBP_INLINE int VP8RGBToV(int r, int g, int b, int rounding) {
+ const int v = +28800 * r - 24116 * g - 4684 * b;
+ return VP8ClipUV(v, rounding);
+}
+
+#else
+
+// This is the JPEG-YUV colorspace, for comparison only!
+// These are also 16bit precision coefficients from Rec.601, but with full
+// [0..255] output range.
+static WEBP_INLINE int VP8RGBToY(int r, int g, int b, int rounding) {
+ const int luma = 19595 * r + 38470 * g + 7471 * b;
+ return (luma + rounding) >> YUV_FIX; // no need to clip
}
-#if defined(__cplusplus) || defined(c_plusplus)
+static WEBP_INLINE int VP8RGBToU(int r, int g, int b, int rounding) {
+ const int u = -11058 * r - 21710 * g + 32768 * b;
+ return VP8ClipUV(u, rounding);
+}
+
+static WEBP_INLINE int VP8RGBToV(int r, int g, int b, int rounding) {
+ const int v = 32768 * r - 27439 * g - 5329 * b;
+ return VP8ClipUV(v, rounding);
+}
+
+#endif // USE_YUVj
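For the default (non-USE_YUVj) path, the 16-bit coefficients reproduce the studio-swing range: full white maps to Y=235 and black to Y=16 when called with the YUV_HALF rounding used by ConvertARGBToY(). A standalone check of that arithmetic:

#include <assert.h>

// Mirrors VP8RGBToY() from the non-USE_YUVj branch above (YUV_FIX == 16).
static int RGBToYSketch(int r, int g, int b, int rounding) {
  const int luma = 16839 * r + 33059 * g + 6420 * b;
  return (luma + rounding + (16 << 16)) >> 16;
}

int main(void) {
  const int half = 1 << 15;                            // YUV_HALF
  assert(RGBToYSketch(255, 255, 255, half) == 235);    // white -> top of studio range
  assert(RGBToYSketch(0, 0, 0, half) == 16);           // black -> bottom of studio range
  assert(RGBToYSketch(128, 128, 128, half) == 126);    // mid-grey
  return 0;
}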
+
+#ifdef __cplusplus
} // extern "C"
#endif