summaryrefslogtreecommitdiff
path: root/drivers/webp/dsp/upsampling_sse2.c
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/webp/dsp/upsampling_sse2.c')
-rw-r--r--drivers/webp/dsp/upsampling_sse2.c139
1 files changed, 65 insertions, 74 deletions
diff --git a/drivers/webp/dsp/upsampling_sse2.c b/drivers/webp/dsp/upsampling_sse2.c
index 0db0798c6d..8cb275a02b 100644
--- a/drivers/webp/dsp/upsampling_sse2.c
+++ b/drivers/webp/dsp/upsampling_sse2.c
@@ -1,10 +1,8 @@
// Copyright 2011 Google Inc. All Rights Reserved.
//
-// Use of this source code is governed by a BSD-style license
-// that can be found in the COPYING file in the root of the source
-// tree. An additional intellectual property rights grant can be found
-// in the file PATENTS. All contributing project authors may
-// be found in the AUTHORS file in the root of the source tree.
+// This code is licensed under the same terms as WebM:
+// Software License Agreement: http://www.webmproject.org/license/software/
+// Additional IP Rights Grant: http://www.webmproject.org/license/additional/
// -----------------------------------------------------------------------------
//
// SSE2 version of YUV to RGB upsampling functions.
@@ -20,6 +18,10 @@
#include <string.h>
#include "./yuv.h"
+#if defined(__cplusplus) || defined(c_plusplus)
+extern "C" {
+#endif
+
#ifdef FANCY_UPSAMPLING
// We compute (9*a + 3*b + 3*c + d + 8) / 16 as follows
@@ -47,14 +49,14 @@
(out) = _mm_sub_epi8(tmp0, tmp4); /* (k + in + 1) / 2 - lsb_correction */ \
} while (0)
-// pack and store two alternating pixel rows
+// pack and store two alterning pixel rows
#define PACK_AND_STORE(a, b, da, db, out) do { \
- const __m128i t_a = _mm_avg_epu8(a, da); /* (9a + 3b + 3c + d + 8) / 16 */ \
- const __m128i t_b = _mm_avg_epu8(b, db); /* (3a + 9b + c + 3d + 8) / 16 */ \
- const __m128i t_1 = _mm_unpacklo_epi8(t_a, t_b); \
- const __m128i t_2 = _mm_unpackhi_epi8(t_a, t_b); \
- _mm_store_si128(((__m128i*)(out)) + 0, t_1); \
- _mm_store_si128(((__m128i*)(out)) + 1, t_2); \
+ const __m128i ta = _mm_avg_epu8(a, da); /* (9a + 3b + 3c + d + 8) / 16 */ \
+ const __m128i tb = _mm_avg_epu8(b, db); /* (3a + 9b + c + 3d + 8) / 16 */ \
+ const __m128i t1 = _mm_unpacklo_epi8(ta, tb); \
+ const __m128i t2 = _mm_unpackhi_epi8(ta, tb); \
+ _mm_store_si128(((__m128i*)(out)) + 0, t1); \
+ _mm_store_si128(((__m128i*)(out)) + 1, t2); \
} while (0)
// Loads 17 pixels each from rows r1 and r2 and generates 32 pixels.
@@ -83,8 +85,8 @@
GET_M(ad, s, diag2); /* diag2 = (3a + b + c + 3d) / 8 */ \
\
/* pack the alternate pixels */ \
- PACK_AND_STORE(a, b, diag1, diag2, out + 0); /* store top */ \
- PACK_AND_STORE(c, d, diag2, diag1, out + 2 * 32); /* store bottom */ \
+ PACK_AND_STORE(a, b, diag1, diag2, &(out)[0 * 32]); \
+ PACK_AND_STORE(c, d, diag2, diag1, &(out)[2 * 32]); \
}
// Turn the macro into a function for reducing code-size when non-critical
@@ -104,68 +106,69 @@ static void Upsample32Pixels(const uint8_t r1[], const uint8_t r2[],
Upsample32Pixels(r1, r2, out); \
}
-#define CONVERT2RGB(FUNC, XSTEP, top_y, bottom_y, \
+#define CONVERT2RGB(FUNC, XSTEP, top_y, bottom_y, uv, \
top_dst, bottom_dst, cur_x, num_pixels) { \
int n; \
- for (n = 0; n < (num_pixels); ++n) { \
- FUNC(top_y[(cur_x) + n], r_u[n], r_v[n], \
- top_dst + ((cur_x) + n) * XSTEP); \
+ if (top_y) { \
+ for (n = 0; n < (num_pixels); ++n) { \
+ FUNC(top_y[(cur_x) + n], (uv)[n], (uv)[32 + n], \
+ top_dst + ((cur_x) + n) * XSTEP); \
+ } \
} \
- if (bottom_y != NULL) { \
+ if (bottom_y) { \
for (n = 0; n < (num_pixels); ++n) { \
- FUNC(bottom_y[(cur_x) + n], r_u[64 + n], r_v[64 + n], \
+ FUNC(bottom_y[(cur_x) + n], (uv)[64 + n], (uv)[64 + 32 + n], \
bottom_dst + ((cur_x) + n) * XSTEP); \
} \
} \
}
-#define CONVERT2RGB_32(FUNC, XSTEP, top_y, bottom_y, \
- top_dst, bottom_dst, cur_x) do { \
- FUNC##32(top_y + (cur_x), r_u, r_v, top_dst + (cur_x) * XSTEP); \
- if (bottom_y != NULL) { \
- FUNC##32(bottom_y + (cur_x), r_u + 64, r_v + 64, \
- bottom_dst + (cur_x) * XSTEP); \
- } \
-} while (0)
-
#define SSE2_UPSAMPLE_FUNC(FUNC_NAME, FUNC, XSTEP) \
static void FUNC_NAME(const uint8_t* top_y, const uint8_t* bottom_y, \
const uint8_t* top_u, const uint8_t* top_v, \
const uint8_t* cur_u, const uint8_t* cur_v, \
uint8_t* top_dst, uint8_t* bottom_dst, int len) { \
- int uv_pos, pos; \
- /* 16byte-aligned array to cache reconstructed u and v */ \
+ int b; \
+ /* 16 byte aligned array to cache reconstructed u and v */ \
uint8_t uv_buf[4 * 32 + 15]; \
- uint8_t* const r_u = (uint8_t*)((uintptr_t)(uv_buf + 15) & ~15); \
- uint8_t* const r_v = r_u + 32; \
+ uint8_t* const r_uv = (uint8_t*)((uintptr_t)(uv_buf + 15) & ~15); \
+ const int uv_len = (len + 1) >> 1; \
+ /* 17 pixels must be read-able for each block */ \
+ const int num_blocks = (uv_len - 1) >> 4; \
+ const int leftover = uv_len - num_blocks * 16; \
+ const int last_pos = 1 + 32 * num_blocks; \
\
- assert(top_y != NULL); \
- { /* Treat the first pixel in regular way */ \
- const int u_diag = ((top_u[0] + cur_u[0]) >> 1) + 1; \
- const int v_diag = ((top_v[0] + cur_v[0]) >> 1) + 1; \
- const int u0_t = (top_u[0] + u_diag) >> 1; \
- const int v0_t = (top_v[0] + v_diag) >> 1; \
- FUNC(top_y[0], u0_t, v0_t, top_dst); \
- if (bottom_y != NULL) { \
- const int u0_b = (cur_u[0] + u_diag) >> 1; \
- const int v0_b = (cur_v[0] + v_diag) >> 1; \
- FUNC(bottom_y[0], u0_b, v0_b, bottom_dst); \
- } \
+ const int u_diag = ((top_u[0] + cur_u[0]) >> 1) + 1; \
+ const int v_diag = ((top_v[0] + cur_v[0]) >> 1) + 1; \
+ \
+ assert(len > 0); \
+ /* Treat the first pixel in regular way */ \
+ if (top_y) { \
+ const int u0 = (top_u[0] + u_diag) >> 1; \
+ const int v0 = (top_v[0] + v_diag) >> 1; \
+ FUNC(top_y[0], u0, v0, top_dst); \
} \
- /* For UPSAMPLE_32PIXELS, 17 u/v values must be read-able for each block */ \
- for (pos = 1, uv_pos = 0; pos + 32 + 1 <= len; pos += 32, uv_pos += 16) { \
- UPSAMPLE_32PIXELS(top_u + uv_pos, cur_u + uv_pos, r_u); \
- UPSAMPLE_32PIXELS(top_v + uv_pos, cur_v + uv_pos, r_v); \
- CONVERT2RGB_32(FUNC, XSTEP, top_y, bottom_y, top_dst, bottom_dst, pos); \
+ if (bottom_y) { \
+ const int u0 = (cur_u[0] + u_diag) >> 1; \
+ const int v0 = (cur_v[0] + v_diag) >> 1; \
+ FUNC(bottom_y[0], u0, v0, bottom_dst); \
} \
- if (len > 1) { \
- const int left_over = ((len + 1) >> 1) - (pos >> 1); \
- assert(left_over > 0); \
- UPSAMPLE_LAST_BLOCK(top_u + uv_pos, cur_u + uv_pos, left_over, r_u); \
- UPSAMPLE_LAST_BLOCK(top_v + uv_pos, cur_v + uv_pos, left_over, r_v); \
- CONVERT2RGB(FUNC, XSTEP, top_y, bottom_y, top_dst, bottom_dst, \
- pos, len - pos); \
+ \
+ for (b = 0; b < num_blocks; ++b) { \
+ UPSAMPLE_32PIXELS(top_u, cur_u, r_uv + 0 * 32); \
+ UPSAMPLE_32PIXELS(top_v, cur_v, r_uv + 1 * 32); \
+ CONVERT2RGB(FUNC, XSTEP, top_y, bottom_y, r_uv, top_dst, bottom_dst, \
+ 32 * b + 1, 32) \
+ top_u += 16; \
+ cur_u += 16; \
+ top_v += 16; \
+ cur_v += 16; \
} \
+ \
+ UPSAMPLE_LAST_BLOCK(top_u, cur_u, leftover, r_uv + 0 * 32); \
+ UPSAMPLE_LAST_BLOCK(top_v, cur_v, leftover, r_uv + 1 * 32); \
+ CONVERT2RGB(FUNC, XSTEP, top_y, bottom_y, r_uv, top_dst, bottom_dst, \
+ last_pos, len - last_pos); \
}
// SSE2 variants of the fancy upsampler.
@@ -179,40 +182,28 @@ SSE2_UPSAMPLE_FUNC(UpsampleBgraLinePairSSE2, VP8YuvToBgra, 4)
#undef UPSAMPLE_32PIXELS
#undef UPSAMPLE_LAST_BLOCK
#undef CONVERT2RGB
-#undef CONVERT2RGB_32
#undef SSE2_UPSAMPLE_FUNC
-#endif // FANCY_UPSAMPLING
-
-#endif // WEBP_USE_SSE2
-
//------------------------------------------------------------------------------
-#ifdef FANCY_UPSAMPLING
-
extern WebPUpsampleLinePairFunc WebPUpsamplers[/* MODE_LAST */];
void WebPInitUpsamplersSSE2(void) {
-#if defined(WEBP_USE_SSE2)
- VP8YUVInitSSE2();
WebPUpsamplers[MODE_RGB] = UpsampleRgbLinePairSSE2;
WebPUpsamplers[MODE_RGBA] = UpsampleRgbaLinePairSSE2;
WebPUpsamplers[MODE_BGR] = UpsampleBgrLinePairSSE2;
WebPUpsamplers[MODE_BGRA] = UpsampleBgraLinePairSSE2;
-#endif // WEBP_USE_SSE2
}
void WebPInitPremultiplySSE2(void) {
-#if defined(WEBP_USE_SSE2)
WebPUpsamplers[MODE_rgbA] = UpsampleRgbaLinePairSSE2;
WebPUpsamplers[MODE_bgrA] = UpsampleBgraLinePairSSE2;
-#endif // WEBP_USE_SSE2
}
-#else
-
-// this empty function is to avoid an empty .o
-void WebPInitPremultiplySSE2(void) {}
-
#endif // FANCY_UPSAMPLING
+#if defined(__cplusplus) || defined(c_plusplus)
+} // extern "C"
+#endif
+
+#endif // WEBP_USE_SSE2