1 files changed, 101 insertions, 0 deletions
diff --git a/thirdparty/libwebp/src/dsp/neon.h b/thirdparty/libwebp/src/dsp/neon.h
new file mode 100644
index 0000000000..aa1dea1301
--- /dev/null
+++ b/thirdparty/libwebp/src/dsp/neon.h
@@ -0,0 +1,101 @@
+// Copyright 2014 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+//  NEON common code.
+
+#ifndef WEBP_DSP_NEON_H_
+#define WEBP_DSP_NEON_H_
+
+#include <arm_neon.h>
+
+#include "src/dsp/dsp.h"
+
+// Right now, some intrinsics functions seem slower, so we disable them
+// everywhere except newer clang/gcc or aarch64 where the inline assembly is
+// incompatible.
+#if LOCAL_CLANG_PREREQ(3,8) || LOCAL_GCC_PREREQ(4,9) || defined(__aarch64__)
+#define WEBP_USE_INTRINSICS   // use intrinsics when possible
+#endif
+
+#define INIT_VECTOR2(v, a, b) do {  \
+  v.val[0] = a;                     \
+  v.val[1] = b;                     \
+} while (0)
+
+#define INIT_VECTOR3(v, a, b, c) do {  \
+  v.val[0] = a;                        \
+  v.val[1] = b;                        \
+  v.val[2] = c;                        \
+} while (0)
+
+#define INIT_VECTOR4(v, a, b, c, d) do {  \
+  v.val[0] = a;                           \
+  v.val[1] = b;                           \
+  v.val[2] = c;                           \
+  v.val[3] = d;                           \
+} while (0)
+
+// if using intrinsics, this flag avoids some functions that make gcc-4.6.3
+// crash ("internal compiler error: in immed_double_const, at emit-rtl.").
+// (probably similar to gcc.gnu.org/bugzilla/show_bug.cgi?id=48183)
+#if !(LOCAL_CLANG_PREREQ(3,8) || LOCAL_GCC_PREREQ(4,8) || defined(__aarch64__))
+#define WORK_AROUND_GCC
+#endif
+
+static WEBP_INLINE int32x4x4_t Transpose4x4_NEON(const int32x4x4_t rows) {
+  uint64x2x2_t row01, row23;
+
+  row01.val[0] = vreinterpretq_u64_s32(rows.val[0]);
+  row01.val[1] = vreinterpretq_u64_s32(rows.val[1]);
+  row23.val[0] = vreinterpretq_u64_s32(rows.val[2]);
+  row23.val[1] = vreinterpretq_u64_s32(rows.val[3]);
+  // Transpose 64-bit values (there's no vswp equivalent)
+  {
+    const uint64x1_t row0h = vget_high_u64(row01.val[0]);
+    const uint64x1_t row2l = vget_low_u64(row23.val[0]);
+    const uint64x1_t row1h = vget_high_u64(row01.val[1]);
+    const uint64x1_t row3l = vget_low_u64(row23.val[1]);
+    row01.val[0] = vcombine_u64(vget_low_u64(row01.val[0]), row2l);
+    row23.val[0] = vcombine_u64(row0h, vget_high_u64(row23.val[0]));
+    row01.val[1] = vcombine_u64(vget_low_u64(row01.val[1]), row3l);
+    row23.val[1] = vcombine_u64(row1h, vget_high_u64(row23.val[1]));
+  }
+  {
+    const int32x4x2_t out01 = vtrnq_s32(vreinterpretq_s32_u64(row01.val[0]),
+                                        vreinterpretq_s32_u64(row01.val[1]));
+    const int32x4x2_t out23 = vtrnq_s32(vreinterpretq_s32_u64(row23.val[0]),
+                                        vreinterpretq_s32_u64(row23.val[1]));
+    int32x4x4_t out;
+    out.val[0] = out01.val[0];
+    out.val[1] = out01.val[1];
+    out.val[2] = out23.val[0];
+    out.val[3] = out23.val[1];
+    return out;
+  }
+}
+
+#if 0     // Useful debug macro.
+#include <stdio.h>
+#define PRINT_REG(REG, SIZE) do {                       \
+  int i;                                                \
+  printf("%s \t[%d]: 0x", #REG, SIZE);                  \
+  if (SIZE == 8) {                                      \
+    uint8_t _tmp[8];                                    \
+    vst1_u8(_tmp, (REG));                               \
+    for (i = 0; i < 8; ++i) printf("%.2x ", _tmp[i]);   \
+  } else if (SIZE == 16) {                              \
+    uint16_t _tmp[4];                                   \
+    vst1_u16(_tmp, (REG));                              \
+    for (i = 0; i < 4; ++i) printf("%.4x ", _tmp[i]);   \
+  }                                                     \
+  printf("\n");                                         \
+} while (0)
+#endif
+
+#endif  // WEBP_DSP_NEON_H_