summaryrefslogtreecommitdiff
path: root/thirdparty/libvpx/vpx_dsp/arm
diff options
context:
space:
mode:
authorBłażej Szczygieł <spaz16@wp.pl>2016-09-14 22:10:03 +0200
committerBłażej Szczygieł <spaz16@wp.pl>2016-10-19 13:34:28 +0200
commit5268443fdfd6f9f8172cede1140810ae21f7990a (patch)
tree1bfb077f7557a4ee18734c8bb38ec0ddc22c8418 /thirdparty/libvpx/vpx_dsp/arm
parent2d77a6f5d3beae3b341e4a7f331202bd1a010508 (diff)
Add libvpx thirdparty library
Only necessary files
Diffstat (limited to 'thirdparty/libvpx/vpx_dsp/arm')
-rw-r--r--thirdparty/libvpx/vpx_dsp/arm/idct16x16_1_add_neon.c61
-rw-r--r--thirdparty/libvpx/vpx_dsp/arm/idct16x16_add_neon.c1317
-rw-r--r--thirdparty/libvpx/vpx_dsp/arm/idct16x16_neon.c185
-rw-r--r--thirdparty/libvpx/vpx_dsp/arm/idct32x32_1_add_neon.c165
-rw-r--r--thirdparty/libvpx/vpx_dsp/arm/idct32x32_add_neon.c719
-rw-r--r--thirdparty/libvpx/vpx_dsp/arm/idct4x4_1_add_neon.c50
-rw-r--r--thirdparty/libvpx/vpx_dsp/arm/idct4x4_add_neon.c151
-rw-r--r--thirdparty/libvpx/vpx_dsp/arm/idct8x8_1_add_neon.c64
-rw-r--r--thirdparty/libvpx/vpx_dsp/arm/idct8x8_add_neon.c540
-rw-r--r--thirdparty/libvpx/vpx_dsp/arm/intrapred_neon.c822
-rw-r--r--thirdparty/libvpx/vpx_dsp/arm/intrapred_neon_asm.asm630
-rw-r--r--thirdparty/libvpx/vpx_dsp/arm/loopfilter_16_neon.c179
-rw-r--r--thirdparty/libvpx/vpx_dsp/arm/loopfilter_4_neon.c266
-rw-r--r--thirdparty/libvpx/vpx_dsp/arm/loopfilter_8_neon.c445
-rw-r--r--thirdparty/libvpx/vpx_dsp/arm/loopfilter_mb_neon.asm635
-rw-r--r--thirdparty/libvpx/vpx_dsp/arm/loopfilter_neon.c58
-rw-r--r--thirdparty/libvpx/vpx_dsp/arm/save_reg_neon.asm36
-rw-r--r--thirdparty/libvpx/vpx_dsp/arm/vpx_convolve8_avg_neon.c373
-rw-r--r--thirdparty/libvpx/vpx_dsp/arm/vpx_convolve8_neon.c340
-rw-r--r--thirdparty/libvpx/vpx_dsp/arm/vpx_convolve_avg_neon.c147
-rw-r--r--thirdparty/libvpx/vpx_dsp/arm/vpx_convolve_copy_neon.c94
-rw-r--r--thirdparty/libvpx/vpx_dsp/arm/vpx_convolve_neon.c72
22 files changed, 7349 insertions, 0 deletions
diff --git a/thirdparty/libvpx/vpx_dsp/arm/idct16x16_1_add_neon.c b/thirdparty/libvpx/vpx_dsp/arm/idct16x16_1_add_neon.c
new file mode 100644
index 0000000000..f734e48027
--- /dev/null
+++ b/thirdparty/libvpx/vpx_dsp/arm/idct16x16_1_add_neon.c
@@ -0,0 +1,61 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "vpx_dsp/inv_txfm.h"
+#include "vpx_ports/mem.h"
+
+void vpx_idct16x16_1_add_neon(
+ int16_t *input,
+ uint8_t *dest,
+ int dest_stride) {
+ uint8x8_t d2u8, d3u8, d30u8, d31u8;
+ uint64x1_t d2u64, d3u64, d4u64, d5u64;
+ uint16x8_t q0u16, q9u16, q10u16, q11u16, q12u16;
+ int16x8_t q0s16;
+ uint8_t *d1, *d2;
+ int16_t i, j, a1, cospi_16_64 = 11585;
+ int16_t out = dct_const_round_shift(input[0] * cospi_16_64);
+ out = dct_const_round_shift(out * cospi_16_64);
+ a1 = ROUND_POWER_OF_TWO(out, 6);
+
+ q0s16 = vdupq_n_s16(a1);
+ q0u16 = vreinterpretq_u16_s16(q0s16);
+
+ for (d1 = d2 = dest, i = 0; i < 4; i++) {
+ for (j = 0; j < 2; j++) {
+ d2u64 = vld1_u64((const uint64_t *)d1);
+ d3u64 = vld1_u64((const uint64_t *)(d1 + 8));
+ d1 += dest_stride;
+ d4u64 = vld1_u64((const uint64_t *)d1);
+ d5u64 = vld1_u64((const uint64_t *)(d1 + 8));
+ d1 += dest_stride;
+
+ q9u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d2u64));
+ q10u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d3u64));
+ q11u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d4u64));
+ q12u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d5u64));
+
+ d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16));
+ d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16));
+ d30u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16));
+ d31u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16));
+
+ vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8));
+ vst1_u64((uint64_t *)(d2 + 8), vreinterpret_u64_u8(d3u8));
+ d2 += dest_stride;
+ vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d30u8));
+ vst1_u64((uint64_t *)(d2 + 8), vreinterpret_u64_u8(d31u8));
+ d2 += dest_stride;
+ }
+ }
+ return;
+}
diff --git a/thirdparty/libvpx/vpx_dsp/arm/idct16x16_add_neon.c b/thirdparty/libvpx/vpx_dsp/arm/idct16x16_add_neon.c
new file mode 100644
index 0000000000..651ebb21f9
--- /dev/null
+++ b/thirdparty/libvpx/vpx_dsp/arm/idct16x16_add_neon.c
@@ -0,0 +1,1317 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_config.h"
+#include "vpx_dsp/txfm_common.h"
+
+static INLINE void TRANSPOSE8X8(
+ int16x8_t *q8s16,
+ int16x8_t *q9s16,
+ int16x8_t *q10s16,
+ int16x8_t *q11s16,
+ int16x8_t *q12s16,
+ int16x8_t *q13s16,
+ int16x8_t *q14s16,
+ int16x8_t *q15s16) {
+ int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16;
+ int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16;
+ int32x4x2_t q0x2s32, q1x2s32, q2x2s32, q3x2s32;
+ int16x8x2_t q0x2s16, q1x2s16, q2x2s16, q3x2s16;
+
+ d16s16 = vget_low_s16(*q8s16);
+ d17s16 = vget_high_s16(*q8s16);
+ d18s16 = vget_low_s16(*q9s16);
+ d19s16 = vget_high_s16(*q9s16);
+ d20s16 = vget_low_s16(*q10s16);
+ d21s16 = vget_high_s16(*q10s16);
+ d22s16 = vget_low_s16(*q11s16);
+ d23s16 = vget_high_s16(*q11s16);
+ d24s16 = vget_low_s16(*q12s16);
+ d25s16 = vget_high_s16(*q12s16);
+ d26s16 = vget_low_s16(*q13s16);
+ d27s16 = vget_high_s16(*q13s16);
+ d28s16 = vget_low_s16(*q14s16);
+ d29s16 = vget_high_s16(*q14s16);
+ d30s16 = vget_low_s16(*q15s16);
+ d31s16 = vget_high_s16(*q15s16);
+
+ *q8s16 = vcombine_s16(d16s16, d24s16); // vswp d17, d24
+ *q9s16 = vcombine_s16(d18s16, d26s16); // vswp d19, d26
+ *q10s16 = vcombine_s16(d20s16, d28s16); // vswp d21, d28
+ *q11s16 = vcombine_s16(d22s16, d30s16); // vswp d23, d30
+ *q12s16 = vcombine_s16(d17s16, d25s16);
+ *q13s16 = vcombine_s16(d19s16, d27s16);
+ *q14s16 = vcombine_s16(d21s16, d29s16);
+ *q15s16 = vcombine_s16(d23s16, d31s16);
+
+ q0x2s32 = vtrnq_s32(vreinterpretq_s32_s16(*q8s16),
+ vreinterpretq_s32_s16(*q10s16));
+ q1x2s32 = vtrnq_s32(vreinterpretq_s32_s16(*q9s16),
+ vreinterpretq_s32_s16(*q11s16));
+ q2x2s32 = vtrnq_s32(vreinterpretq_s32_s16(*q12s16),
+ vreinterpretq_s32_s16(*q14s16));
+ q3x2s32 = vtrnq_s32(vreinterpretq_s32_s16(*q13s16),
+ vreinterpretq_s32_s16(*q15s16));
+
+ q0x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q0x2s32.val[0]), // q8
+ vreinterpretq_s16_s32(q1x2s32.val[0])); // q9
+ q1x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q0x2s32.val[1]), // q10
+ vreinterpretq_s16_s32(q1x2s32.val[1])); // q11
+ q2x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q2x2s32.val[0]), // q12
+ vreinterpretq_s16_s32(q3x2s32.val[0])); // q13
+ q3x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q2x2s32.val[1]), // q14
+ vreinterpretq_s16_s32(q3x2s32.val[1])); // q15
+
+ *q8s16 = q0x2s16.val[0];
+ *q9s16 = q0x2s16.val[1];
+ *q10s16 = q1x2s16.val[0];
+ *q11s16 = q1x2s16.val[1];
+ *q12s16 = q2x2s16.val[0];
+ *q13s16 = q2x2s16.val[1];
+ *q14s16 = q3x2s16.val[0];
+ *q15s16 = q3x2s16.val[1];
+ return;
+}
+
+void vpx_idct16x16_256_add_neon_pass1(
+ int16_t *in,
+ int16_t *out,
+ int output_stride) {
+ int16x4_t d0s16, d1s16, d2s16, d3s16;
+ int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16;
+ int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16;
+ int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16;
+ uint64x1_t d16u64, d17u64, d18u64, d19u64, d20u64, d21u64, d22u64, d23u64;
+ uint64x1_t d24u64, d25u64, d26u64, d27u64, d28u64, d29u64, d30u64, d31u64;
+ int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16;
+ int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16;
+ int32x4_t q0s32, q1s32, q2s32, q3s32, q5s32, q6s32, q9s32;
+ int32x4_t q10s32, q11s32, q12s32, q13s32, q15s32;
+ int16x8x2_t q0x2s16;
+
+ q0x2s16 = vld2q_s16(in);
+ q8s16 = q0x2s16.val[0];
+ in += 16;
+ q0x2s16 = vld2q_s16(in);
+ q9s16 = q0x2s16.val[0];
+ in += 16;
+ q0x2s16 = vld2q_s16(in);
+ q10s16 = q0x2s16.val[0];
+ in += 16;
+ q0x2s16 = vld2q_s16(in);
+ q11s16 = q0x2s16.val[0];
+ in += 16;
+ q0x2s16 = vld2q_s16(in);
+ q12s16 = q0x2s16.val[0];
+ in += 16;
+ q0x2s16 = vld2q_s16(in);
+ q13s16 = q0x2s16.val[0];
+ in += 16;
+ q0x2s16 = vld2q_s16(in);
+ q14s16 = q0x2s16.val[0];
+ in += 16;
+ q0x2s16 = vld2q_s16(in);
+ q15s16 = q0x2s16.val[0];
+
+ TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16,
+ &q12s16, &q13s16, &q14s16, &q15s16);
+
+ d16s16 = vget_low_s16(q8s16);
+ d17s16 = vget_high_s16(q8s16);
+ d18s16 = vget_low_s16(q9s16);
+ d19s16 = vget_high_s16(q9s16);
+ d20s16 = vget_low_s16(q10s16);
+ d21s16 = vget_high_s16(q10s16);
+ d22s16 = vget_low_s16(q11s16);
+ d23s16 = vget_high_s16(q11s16);
+ d24s16 = vget_low_s16(q12s16);
+ d25s16 = vget_high_s16(q12s16);
+ d26s16 = vget_low_s16(q13s16);
+ d27s16 = vget_high_s16(q13s16);
+ d28s16 = vget_low_s16(q14s16);
+ d29s16 = vget_high_s16(q14s16);
+ d30s16 = vget_low_s16(q15s16);
+ d31s16 = vget_high_s16(q15s16);
+
+ // stage 3
+ d0s16 = vdup_n_s16(cospi_28_64);
+ d1s16 = vdup_n_s16(cospi_4_64);
+
+ q2s32 = vmull_s16(d18s16, d0s16);
+ q3s32 = vmull_s16(d19s16, d0s16);
+ q5s32 = vmull_s16(d18s16, d1s16);
+ q6s32 = vmull_s16(d19s16, d1s16);
+
+ q2s32 = vmlsl_s16(q2s32, d30s16, d1s16);
+ q3s32 = vmlsl_s16(q3s32, d31s16, d1s16);
+ q5s32 = vmlal_s16(q5s32, d30s16, d0s16);
+ q6s32 = vmlal_s16(q6s32, d31s16, d0s16);
+
+ d2s16 = vdup_n_s16(cospi_12_64);
+ d3s16 = vdup_n_s16(cospi_20_64);
+
+ d8s16 = vqrshrn_n_s32(q2s32, 14);
+ d9s16 = vqrshrn_n_s32(q3s32, 14);
+ d14s16 = vqrshrn_n_s32(q5s32, 14);
+ d15s16 = vqrshrn_n_s32(q6s32, 14);
+ q4s16 = vcombine_s16(d8s16, d9s16);
+ q7s16 = vcombine_s16(d14s16, d15s16);
+
+ q2s32 = vmull_s16(d26s16, d2s16);
+ q3s32 = vmull_s16(d27s16, d2s16);
+ q9s32 = vmull_s16(d26s16, d3s16);
+ q15s32 = vmull_s16(d27s16, d3s16);
+
+ q2s32 = vmlsl_s16(q2s32, d22s16, d3s16);
+ q3s32 = vmlsl_s16(q3s32, d23s16, d3s16);
+ q9s32 = vmlal_s16(q9s32, d22s16, d2s16);
+ q15s32 = vmlal_s16(q15s32, d23s16, d2s16);
+
+ d10s16 = vqrshrn_n_s32(q2s32, 14);
+ d11s16 = vqrshrn_n_s32(q3s32, 14);
+ d12s16 = vqrshrn_n_s32(q9s32, 14);
+ d13s16 = vqrshrn_n_s32(q15s32, 14);
+ q5s16 = vcombine_s16(d10s16, d11s16);
+ q6s16 = vcombine_s16(d12s16, d13s16);
+
+ // stage 4
+ d30s16 = vdup_n_s16(cospi_16_64);
+
+ q2s32 = vmull_s16(d16s16, d30s16);
+ q11s32 = vmull_s16(d17s16, d30s16);
+ q0s32 = vmull_s16(d24s16, d30s16);
+ q1s32 = vmull_s16(d25s16, d30s16);
+
+ d30s16 = vdup_n_s16(cospi_24_64);
+ d31s16 = vdup_n_s16(cospi_8_64);
+
+ q3s32 = vaddq_s32(q2s32, q0s32);
+ q12s32 = vaddq_s32(q11s32, q1s32);
+ q13s32 = vsubq_s32(q2s32, q0s32);
+ q1s32 = vsubq_s32(q11s32, q1s32);
+
+ d16s16 = vqrshrn_n_s32(q3s32, 14);
+ d17s16 = vqrshrn_n_s32(q12s32, 14);
+ d18s16 = vqrshrn_n_s32(q13s32, 14);
+ d19s16 = vqrshrn_n_s32(q1s32, 14);
+ q8s16 = vcombine_s16(d16s16, d17s16);
+ q9s16 = vcombine_s16(d18s16, d19s16);
+
+ q0s32 = vmull_s16(d20s16, d31s16);
+ q1s32 = vmull_s16(d21s16, d31s16);
+ q12s32 = vmull_s16(d20s16, d30s16);
+ q13s32 = vmull_s16(d21s16, d30s16);
+
+ q0s32 = vmlal_s16(q0s32, d28s16, d30s16);
+ q1s32 = vmlal_s16(q1s32, d29s16, d30s16);
+ q12s32 = vmlsl_s16(q12s32, d28s16, d31s16);
+ q13s32 = vmlsl_s16(q13s32, d29s16, d31s16);
+
+ d22s16 = vqrshrn_n_s32(q0s32, 14);
+ d23s16 = vqrshrn_n_s32(q1s32, 14);
+ d20s16 = vqrshrn_n_s32(q12s32, 14);
+ d21s16 = vqrshrn_n_s32(q13s32, 14);
+ q10s16 = vcombine_s16(d20s16, d21s16);
+ q11s16 = vcombine_s16(d22s16, d23s16);
+
+ q13s16 = vsubq_s16(q4s16, q5s16);
+ q4s16 = vaddq_s16(q4s16, q5s16);
+ q14s16 = vsubq_s16(q7s16, q6s16);
+ q15s16 = vaddq_s16(q6s16, q7s16);
+ d26s16 = vget_low_s16(q13s16);
+ d27s16 = vget_high_s16(q13s16);
+ d28s16 = vget_low_s16(q14s16);
+ d29s16 = vget_high_s16(q14s16);
+
+ // stage 5
+ q0s16 = vaddq_s16(q8s16, q11s16);
+ q1s16 = vaddq_s16(q9s16, q10s16);
+ q2s16 = vsubq_s16(q9s16, q10s16);
+ q3s16 = vsubq_s16(q8s16, q11s16);
+
+ d16s16 = vdup_n_s16(cospi_16_64);
+
+ q11s32 = vmull_s16(d26s16, d16s16);
+ q12s32 = vmull_s16(d27s16, d16s16);
+ q9s32 = vmull_s16(d28s16, d16s16);
+ q10s32 = vmull_s16(d29s16, d16s16);
+
+ q6s32 = vsubq_s32(q9s32, q11s32);
+ q13s32 = vsubq_s32(q10s32, q12s32);
+ q9s32 = vaddq_s32(q9s32, q11s32);
+ q10s32 = vaddq_s32(q10s32, q12s32);
+
+ d10s16 = vqrshrn_n_s32(q6s32, 14);
+ d11s16 = vqrshrn_n_s32(q13s32, 14);
+ d12s16 = vqrshrn_n_s32(q9s32, 14);
+ d13s16 = vqrshrn_n_s32(q10s32, 14);
+ q5s16 = vcombine_s16(d10s16, d11s16);
+ q6s16 = vcombine_s16(d12s16, d13s16);
+
+ // stage 6
+ q8s16 = vaddq_s16(q0s16, q15s16);
+ q9s16 = vaddq_s16(q1s16, q6s16);
+ q10s16 = vaddq_s16(q2s16, q5s16);
+ q11s16 = vaddq_s16(q3s16, q4s16);
+ q12s16 = vsubq_s16(q3s16, q4s16);
+ q13s16 = vsubq_s16(q2s16, q5s16);
+ q14s16 = vsubq_s16(q1s16, q6s16);
+ q15s16 = vsubq_s16(q0s16, q15s16);
+
+ d16u64 = vreinterpret_u64_s16(vget_low_s16(q8s16));
+ d17u64 = vreinterpret_u64_s16(vget_high_s16(q8s16));
+ d18u64 = vreinterpret_u64_s16(vget_low_s16(q9s16));
+ d19u64 = vreinterpret_u64_s16(vget_high_s16(q9s16));
+ d20u64 = vreinterpret_u64_s16(vget_low_s16(q10s16));
+ d21u64 = vreinterpret_u64_s16(vget_high_s16(q10s16));
+ d22u64 = vreinterpret_u64_s16(vget_low_s16(q11s16));
+ d23u64 = vreinterpret_u64_s16(vget_high_s16(q11s16));
+ d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16));
+ d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16));
+ d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16));
+ d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16));
+ d28u64 = vreinterpret_u64_s16(vget_low_s16(q14s16));
+ d29u64 = vreinterpret_u64_s16(vget_high_s16(q14s16));
+ d30u64 = vreinterpret_u64_s16(vget_low_s16(q15s16));
+ d31u64 = vreinterpret_u64_s16(vget_high_s16(q15s16));
+
+ // store the data
+ output_stride >>= 1; // output_stride / 2, out is int16_t
+ vst1_u64((uint64_t *)out, d16u64);
+ out += output_stride;
+ vst1_u64((uint64_t *)out, d17u64);
+ out += output_stride;
+ vst1_u64((uint64_t *)out, d18u64);
+ out += output_stride;
+ vst1_u64((uint64_t *)out, d19u64);
+ out += output_stride;
+ vst1_u64((uint64_t *)out, d20u64);
+ out += output_stride;
+ vst1_u64((uint64_t *)out, d21u64);
+ out += output_stride;
+ vst1_u64((uint64_t *)out, d22u64);
+ out += output_stride;
+ vst1_u64((uint64_t *)out, d23u64);
+ out += output_stride;
+ vst1_u64((uint64_t *)out, d24u64);
+ out += output_stride;
+ vst1_u64((uint64_t *)out, d25u64);
+ out += output_stride;
+ vst1_u64((uint64_t *)out, d26u64);
+ out += output_stride;
+ vst1_u64((uint64_t *)out, d27u64);
+ out += output_stride;
+ vst1_u64((uint64_t *)out, d28u64);
+ out += output_stride;
+ vst1_u64((uint64_t *)out, d29u64);
+ out += output_stride;
+ vst1_u64((uint64_t *)out, d30u64);
+ out += output_stride;
+ vst1_u64((uint64_t *)out, d31u64);
+ return;
+}
+
+void vpx_idct16x16_256_add_neon_pass2(
+ int16_t *src,
+ int16_t *out,
+ int16_t *pass1Output,
+ int16_t skip_adding,
+ uint8_t *dest,
+ int dest_stride) {
+ uint8_t *d;
+ uint8x8_t d12u8, d13u8;
+ int16x4_t d0s16, d1s16, d2s16, d3s16, d4s16, d5s16, d6s16, d7s16;
+ int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16;
+ int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16;
+ int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16;
+ uint64x1_t d24u64, d25u64, d26u64, d27u64;
+ int64x1_t d12s64, d13s64;
+ uint16x8_t q2u16, q3u16, q4u16, q5u16, q8u16;
+ uint16x8_t q9u16, q12u16, q13u16, q14u16, q15u16;
+ int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16;
+ int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16;
+ int32x4_t q0s32, q1s32, q2s32, q3s32, q4s32, q5s32, q6s32, q8s32, q9s32;
+ int32x4_t q10s32, q11s32, q12s32, q13s32;
+ int16x8x2_t q0x2s16;
+
+ q0x2s16 = vld2q_s16(src);
+ q8s16 = q0x2s16.val[0];
+ src += 16;
+ q0x2s16 = vld2q_s16(src);
+ q9s16 = q0x2s16.val[0];
+ src += 16;
+ q0x2s16 = vld2q_s16(src);
+ q10s16 = q0x2s16.val[0];
+ src += 16;
+ q0x2s16 = vld2q_s16(src);
+ q11s16 = q0x2s16.val[0];
+ src += 16;
+ q0x2s16 = vld2q_s16(src);
+ q12s16 = q0x2s16.val[0];
+ src += 16;
+ q0x2s16 = vld2q_s16(src);
+ q13s16 = q0x2s16.val[0];
+ src += 16;
+ q0x2s16 = vld2q_s16(src);
+ q14s16 = q0x2s16.val[0];
+ src += 16;
+ q0x2s16 = vld2q_s16(src);
+ q15s16 = q0x2s16.val[0];
+
+ TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16,
+ &q12s16, &q13s16, &q14s16, &q15s16);
+
+ d16s16 = vget_low_s16(q8s16);
+ d17s16 = vget_high_s16(q8s16);
+ d18s16 = vget_low_s16(q9s16);
+ d19s16 = vget_high_s16(q9s16);
+ d20s16 = vget_low_s16(q10s16);
+ d21s16 = vget_high_s16(q10s16);
+ d22s16 = vget_low_s16(q11s16);
+ d23s16 = vget_high_s16(q11s16);
+ d24s16 = vget_low_s16(q12s16);
+ d25s16 = vget_high_s16(q12s16);
+ d26s16 = vget_low_s16(q13s16);
+ d27s16 = vget_high_s16(q13s16);
+ d28s16 = vget_low_s16(q14s16);
+ d29s16 = vget_high_s16(q14s16);
+ d30s16 = vget_low_s16(q15s16);
+ d31s16 = vget_high_s16(q15s16);
+
+ // stage 3
+ d12s16 = vdup_n_s16(cospi_30_64);
+ d13s16 = vdup_n_s16(cospi_2_64);
+
+ q2s32 = vmull_s16(d16s16, d12s16);
+ q3s32 = vmull_s16(d17s16, d12s16);
+ q1s32 = vmull_s16(d16s16, d13s16);
+ q4s32 = vmull_s16(d17s16, d13s16);
+
+ q2s32 = vmlsl_s16(q2s32, d30s16, d13s16);
+ q3s32 = vmlsl_s16(q3s32, d31s16, d13s16);
+ q1s32 = vmlal_s16(q1s32, d30s16, d12s16);
+ q4s32 = vmlal_s16(q4s32, d31s16, d12s16);
+
+ d0s16 = vqrshrn_n_s32(q2s32, 14);
+ d1s16 = vqrshrn_n_s32(q3s32, 14);
+ d14s16 = vqrshrn_n_s32(q1s32, 14);
+ d15s16 = vqrshrn_n_s32(q4s32, 14);
+ q0s16 = vcombine_s16(d0s16, d1s16);
+ q7s16 = vcombine_s16(d14s16, d15s16);
+
+ d30s16 = vdup_n_s16(cospi_14_64);
+ d31s16 = vdup_n_s16(cospi_18_64);
+
+ q2s32 = vmull_s16(d24s16, d30s16);
+ q3s32 = vmull_s16(d25s16, d30s16);
+ q4s32 = vmull_s16(d24s16, d31s16);
+ q5s32 = vmull_s16(d25s16, d31s16);
+
+ q2s32 = vmlsl_s16(q2s32, d22s16, d31s16);
+ q3s32 = vmlsl_s16(q3s32, d23s16, d31s16);
+ q4s32 = vmlal_s16(q4s32, d22s16, d30s16);
+ q5s32 = vmlal_s16(q5s32, d23s16, d30s16);
+
+ d2s16 = vqrshrn_n_s32(q2s32, 14);
+ d3s16 = vqrshrn_n_s32(q3s32, 14);
+ d12s16 = vqrshrn_n_s32(q4s32, 14);
+ d13s16 = vqrshrn_n_s32(q5s32, 14);
+ q1s16 = vcombine_s16(d2s16, d3s16);
+ q6s16 = vcombine_s16(d12s16, d13s16);
+
+ d30s16 = vdup_n_s16(cospi_22_64);
+ d31s16 = vdup_n_s16(cospi_10_64);
+
+ q11s32 = vmull_s16(d20s16, d30s16);
+ q12s32 = vmull_s16(d21s16, d30s16);
+ q4s32 = vmull_s16(d20s16, d31s16);
+ q5s32 = vmull_s16(d21s16, d31s16);
+
+ q11s32 = vmlsl_s16(q11s32, d26s16, d31s16);
+ q12s32 = vmlsl_s16(q12s32, d27s16, d31s16);
+ q4s32 = vmlal_s16(q4s32, d26s16, d30s16);
+ q5s32 = vmlal_s16(q5s32, d27s16, d30s16);
+
+ d4s16 = vqrshrn_n_s32(q11s32, 14);
+ d5s16 = vqrshrn_n_s32(q12s32, 14);
+ d11s16 = vqrshrn_n_s32(q5s32, 14);
+ d10s16 = vqrshrn_n_s32(q4s32, 14);
+ q2s16 = vcombine_s16(d4s16, d5s16);
+ q5s16 = vcombine_s16(d10s16, d11s16);
+
+ d30s16 = vdup_n_s16(cospi_6_64);
+ d31s16 = vdup_n_s16(cospi_26_64);
+
+ q10s32 = vmull_s16(d28s16, d30s16);
+ q11s32 = vmull_s16(d29s16, d30s16);
+ q12s32 = vmull_s16(d28s16, d31s16);
+ q13s32 = vmull_s16(d29s16, d31s16);
+
+ q10s32 = vmlsl_s16(q10s32, d18s16, d31s16);
+ q11s32 = vmlsl_s16(q11s32, d19s16, d31s16);
+ q12s32 = vmlal_s16(q12s32, d18s16, d30s16);
+ q13s32 = vmlal_s16(q13s32, d19s16, d30s16);
+
+ d6s16 = vqrshrn_n_s32(q10s32, 14);
+ d7s16 = vqrshrn_n_s32(q11s32, 14);
+ d8s16 = vqrshrn_n_s32(q12s32, 14);
+ d9s16 = vqrshrn_n_s32(q13s32, 14);
+ q3s16 = vcombine_s16(d6s16, d7s16);
+ q4s16 = vcombine_s16(d8s16, d9s16);
+
+ // stage 3
+ q9s16 = vsubq_s16(q0s16, q1s16);
+ q0s16 = vaddq_s16(q0s16, q1s16);
+ q10s16 = vsubq_s16(q3s16, q2s16);
+ q11s16 = vaddq_s16(q2s16, q3s16);
+ q12s16 = vaddq_s16(q4s16, q5s16);
+ q13s16 = vsubq_s16(q4s16, q5s16);
+ q14s16 = vsubq_s16(q7s16, q6s16);
+ q7s16 = vaddq_s16(q6s16, q7s16);
+
+ // stage 4
+ d18s16 = vget_low_s16(q9s16);
+ d19s16 = vget_high_s16(q9s16);
+ d20s16 = vget_low_s16(q10s16);
+ d21s16 = vget_high_s16(q10s16);
+ d26s16 = vget_low_s16(q13s16);
+ d27s16 = vget_high_s16(q13s16);
+ d28s16 = vget_low_s16(q14s16);
+ d29s16 = vget_high_s16(q14s16);
+
+ d30s16 = vdup_n_s16(cospi_8_64);
+ d31s16 = vdup_n_s16(cospi_24_64);
+
+ q2s32 = vmull_s16(d18s16, d31s16);
+ q3s32 = vmull_s16(d19s16, d31s16);
+ q4s32 = vmull_s16(d28s16, d31s16);
+ q5s32 = vmull_s16(d29s16, d31s16);
+
+ q2s32 = vmlal_s16(q2s32, d28s16, d30s16);
+ q3s32 = vmlal_s16(q3s32, d29s16, d30s16);
+ q4s32 = vmlsl_s16(q4s32, d18s16, d30s16);
+ q5s32 = vmlsl_s16(q5s32, d19s16, d30s16);
+
+ d12s16 = vqrshrn_n_s32(q2s32, 14);
+ d13s16 = vqrshrn_n_s32(q3s32, 14);
+ d2s16 = vqrshrn_n_s32(q4s32, 14);
+ d3s16 = vqrshrn_n_s32(q5s32, 14);
+ q1s16 = vcombine_s16(d2s16, d3s16);
+ q6s16 = vcombine_s16(d12s16, d13s16);
+
+ q3s16 = q11s16;
+ q4s16 = q12s16;
+
+ d30s16 = vdup_n_s16(-cospi_8_64);
+ q11s32 = vmull_s16(d26s16, d30s16);
+ q12s32 = vmull_s16(d27s16, d30s16);
+ q8s32 = vmull_s16(d20s16, d30s16);
+ q9s32 = vmull_s16(d21s16, d30s16);
+
+ q11s32 = vmlsl_s16(q11s32, d20s16, d31s16);
+ q12s32 = vmlsl_s16(q12s32, d21s16, d31s16);
+ q8s32 = vmlal_s16(q8s32, d26s16, d31s16);
+ q9s32 = vmlal_s16(q9s32, d27s16, d31s16);
+
+ d4s16 = vqrshrn_n_s32(q11s32, 14);
+ d5s16 = vqrshrn_n_s32(q12s32, 14);
+ d10s16 = vqrshrn_n_s32(q8s32, 14);
+ d11s16 = vqrshrn_n_s32(q9s32, 14);
+ q2s16 = vcombine_s16(d4s16, d5s16);
+ q5s16 = vcombine_s16(d10s16, d11s16);
+
+ // stage 5
+ q8s16 = vaddq_s16(q0s16, q3s16);
+ q9s16 = vaddq_s16(q1s16, q2s16);
+ q10s16 = vsubq_s16(q1s16, q2s16);
+ q11s16 = vsubq_s16(q0s16, q3s16);
+ q12s16 = vsubq_s16(q7s16, q4s16);
+ q13s16 = vsubq_s16(q6s16, q5s16);
+ q14s16 = vaddq_s16(q6s16, q5s16);
+ q15s16 = vaddq_s16(q7s16, q4s16);
+
+ // stage 6
+ d20s16 = vget_low_s16(q10s16);
+ d21s16 = vget_high_s16(q10s16);
+ d22s16 = vget_low_s16(q11s16);
+ d23s16 = vget_high_s16(q11s16);
+ d24s16 = vget_low_s16(q12s16);
+ d25s16 = vget_high_s16(q12s16);
+ d26s16 = vget_low_s16(q13s16);
+ d27s16 = vget_high_s16(q13s16);
+
+ d14s16 = vdup_n_s16(cospi_16_64);
+
+ q3s32 = vmull_s16(d26s16, d14s16);
+ q4s32 = vmull_s16(d27s16, d14s16);
+ q0s32 = vmull_s16(d20s16, d14s16);
+ q1s32 = vmull_s16(d21s16, d14s16);
+
+ q5s32 = vsubq_s32(q3s32, q0s32);
+ q6s32 = vsubq_s32(q4s32, q1s32);
+ q10s32 = vaddq_s32(q3s32, q0s32);
+ q4s32 = vaddq_s32(q4s32, q1s32);
+
+ d4s16 = vqrshrn_n_s32(q5s32, 14);
+ d5s16 = vqrshrn_n_s32(q6s32, 14);
+ d10s16 = vqrshrn_n_s32(q10s32, 14);
+ d11s16 = vqrshrn_n_s32(q4s32, 14);
+ q2s16 = vcombine_s16(d4s16, d5s16);
+ q5s16 = vcombine_s16(d10s16, d11s16);
+
+ q0s32 = vmull_s16(d22s16, d14s16);
+ q1s32 = vmull_s16(d23s16, d14s16);
+ q13s32 = vmull_s16(d24s16, d14s16);
+ q6s32 = vmull_s16(d25s16, d14s16);
+
+ q10s32 = vsubq_s32(q13s32, q0s32);
+ q4s32 = vsubq_s32(q6s32, q1s32);
+ q13s32 = vaddq_s32(q13s32, q0s32);
+ q6s32 = vaddq_s32(q6s32, q1s32);
+
+ d6s16 = vqrshrn_n_s32(q10s32, 14);
+ d7s16 = vqrshrn_n_s32(q4s32, 14);
+ d8s16 = vqrshrn_n_s32(q13s32, 14);
+ d9s16 = vqrshrn_n_s32(q6s32, 14);
+ q3s16 = vcombine_s16(d6s16, d7s16);
+ q4s16 = vcombine_s16(d8s16, d9s16);
+
+ // stage 7
+ if (skip_adding != 0) {
+ d = dest;
+ // load the data in pass1
+ q0s16 = vld1q_s16(pass1Output);
+ pass1Output += 8;
+ q1s16 = vld1q_s16(pass1Output);
+ pass1Output += 8;
+ d12s64 = vld1_s64((int64_t *)dest);
+ dest += dest_stride;
+ d13s64 = vld1_s64((int64_t *)dest);
+ dest += dest_stride;
+
+ q12s16 = vaddq_s16(q0s16, q15s16);
+ q13s16 = vaddq_s16(q1s16, q14s16);
+ q12s16 = vrshrq_n_s16(q12s16, 6);
+ q13s16 = vrshrq_n_s16(q13s16, 6);
+ q12u16 = vaddw_u8(vreinterpretq_u16_s16(q12s16),
+ vreinterpret_u8_s64(d12s64));
+ q13u16 = vaddw_u8(vreinterpretq_u16_s16(q13s16),
+ vreinterpret_u8_s64(d13s64));
+ d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16));
+ d13u8 = vqmovun_s16(vreinterpretq_s16_u16(q13u16));
+ vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
+ d += dest_stride;
+ vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d13u8));
+ d += dest_stride;
+ q14s16 = vsubq_s16(q1s16, q14s16);
+ q15s16 = vsubq_s16(q0s16, q15s16);
+
+ q10s16 = vld1q_s16(pass1Output);
+ pass1Output += 8;
+ q11s16 = vld1q_s16(pass1Output);
+ pass1Output += 8;
+ d12s64 = vld1_s64((int64_t *)dest);
+ dest += dest_stride;
+ d13s64 = vld1_s64((int64_t *)dest);
+ dest += dest_stride;
+ q12s16 = vaddq_s16(q10s16, q5s16);
+ q13s16 = vaddq_s16(q11s16, q4s16);
+ q12s16 = vrshrq_n_s16(q12s16, 6);
+ q13s16 = vrshrq_n_s16(q13s16, 6);
+ q12u16 = vaddw_u8(vreinterpretq_u16_s16(q12s16),
+ vreinterpret_u8_s64(d12s64));
+ q13u16 = vaddw_u8(vreinterpretq_u16_s16(q13s16),
+ vreinterpret_u8_s64(d13s64));
+ d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16));
+ d13u8 = vqmovun_s16(vreinterpretq_s16_u16(q13u16));
+ vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
+ d += dest_stride;
+ vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d13u8));
+ d += dest_stride;
+ q4s16 = vsubq_s16(q11s16, q4s16);
+ q5s16 = vsubq_s16(q10s16, q5s16);
+
+ q0s16 = vld1q_s16(pass1Output);
+ pass1Output += 8;
+ q1s16 = vld1q_s16(pass1Output);
+ pass1Output += 8;
+ d12s64 = vld1_s64((int64_t *)dest);
+ dest += dest_stride;
+ d13s64 = vld1_s64((int64_t *)dest);
+ dest += dest_stride;
+ q12s16 = vaddq_s16(q0s16, q3s16);
+ q13s16 = vaddq_s16(q1s16, q2s16);
+ q12s16 = vrshrq_n_s16(q12s16, 6);
+ q13s16 = vrshrq_n_s16(q13s16, 6);
+ q12u16 = vaddw_u8(vreinterpretq_u16_s16(q12s16),
+ vreinterpret_u8_s64(d12s64));
+ q13u16 = vaddw_u8(vreinterpretq_u16_s16(q13s16),
+ vreinterpret_u8_s64(d13s64));
+ d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16));
+ d13u8 = vqmovun_s16(vreinterpretq_s16_u16(q13u16));
+ vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
+ d += dest_stride;
+ vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d13u8));
+ d += dest_stride;
+ q2s16 = vsubq_s16(q1s16, q2s16);
+ q3s16 = vsubq_s16(q0s16, q3s16);
+
+ q10s16 = vld1q_s16(pass1Output);
+ pass1Output += 8;
+ q11s16 = vld1q_s16(pass1Output);
+ d12s64 = vld1_s64((int64_t *)dest);
+ dest += dest_stride;
+ d13s64 = vld1_s64((int64_t *)dest);
+ dest += dest_stride;
+ q12s16 = vaddq_s16(q10s16, q9s16);
+ q13s16 = vaddq_s16(q11s16, q8s16);
+ q12s16 = vrshrq_n_s16(q12s16, 6);
+ q13s16 = vrshrq_n_s16(q13s16, 6);
+ q12u16 = vaddw_u8(vreinterpretq_u16_s16(q12s16),
+ vreinterpret_u8_s64(d12s64));
+ q13u16 = vaddw_u8(vreinterpretq_u16_s16(q13s16),
+ vreinterpret_u8_s64(d13s64));
+ d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16));
+ d13u8 = vqmovun_s16(vreinterpretq_s16_u16(q13u16));
+ vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
+ d += dest_stride;
+ vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d13u8));
+ d += dest_stride;
+ q8s16 = vsubq_s16(q11s16, q8s16);
+ q9s16 = vsubq_s16(q10s16, q9s16);
+
+ // store the data out 8,9,10,11,12,13,14,15
+ d12s64 = vld1_s64((int64_t *)dest);
+ dest += dest_stride;
+ q8s16 = vrshrq_n_s16(q8s16, 6);
+ q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16),
+ vreinterpret_u8_s64(d12s64));
+ d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16));
+ vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
+ d += dest_stride;
+
+ d12s64 = vld1_s64((int64_t *)dest);
+ dest += dest_stride;
+ q9s16 = vrshrq_n_s16(q9s16, 6);
+ q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16),
+ vreinterpret_u8_s64(d12s64));
+ d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16));
+ vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
+ d += dest_stride;
+
+ d12s64 = vld1_s64((int64_t *)dest);
+ dest += dest_stride;
+ q2s16 = vrshrq_n_s16(q2s16, 6);
+ q2u16 = vaddw_u8(vreinterpretq_u16_s16(q2s16),
+ vreinterpret_u8_s64(d12s64));
+ d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q2u16));
+ vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
+ d += dest_stride;
+
+ d12s64 = vld1_s64((int64_t *)dest);
+ dest += dest_stride;
+ q3s16 = vrshrq_n_s16(q3s16, 6);
+ q3u16 = vaddw_u8(vreinterpretq_u16_s16(q3s16),
+ vreinterpret_u8_s64(d12s64));
+ d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q3u16));
+ vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
+ d += dest_stride;
+
+ d12s64 = vld1_s64((int64_t *)dest);
+ dest += dest_stride;
+ q4s16 = vrshrq_n_s16(q4s16, 6);
+ q4u16 = vaddw_u8(vreinterpretq_u16_s16(q4s16),
+ vreinterpret_u8_s64(d12s64));
+ d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q4u16));
+ vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
+ d += dest_stride;
+
+ d12s64 = vld1_s64((int64_t *)dest);
+ dest += dest_stride;
+ q5s16 = vrshrq_n_s16(q5s16, 6);
+ q5u16 = vaddw_u8(vreinterpretq_u16_s16(q5s16),
+ vreinterpret_u8_s64(d12s64));
+ d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q5u16));
+ vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
+ d += dest_stride;
+
+ d12s64 = vld1_s64((int64_t *)dest);
+ dest += dest_stride;
+ q14s16 = vrshrq_n_s16(q14s16, 6);
+ q14u16 = vaddw_u8(vreinterpretq_u16_s16(q14s16),
+ vreinterpret_u8_s64(d12s64));
+ d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q14u16));
+ vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
+ d += dest_stride;
+
+ d12s64 = vld1_s64((int64_t *)dest);
+ q15s16 = vrshrq_n_s16(q15s16, 6);
+ q15u16 = vaddw_u8(vreinterpretq_u16_s16(q15s16),
+ vreinterpret_u8_s64(d12s64));
+ d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q15u16));
+ vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
+ } else { // skip_adding_dest
+ q0s16 = vld1q_s16(pass1Output);
+ pass1Output += 8;
+ q1s16 = vld1q_s16(pass1Output);
+ pass1Output += 8;
+ q12s16 = vaddq_s16(q0s16, q15s16);
+ q13s16 = vaddq_s16(q1s16, q14s16);
+ d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16));
+ d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16));
+ d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16));
+ d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16));
+ vst1_u64((uint64_t *)out, d24u64);
+ out += 4;
+ vst1_u64((uint64_t *)out, d25u64);
+ out += 12;
+ vst1_u64((uint64_t *)out, d26u64);
+ out += 4;
+ vst1_u64((uint64_t *)out, d27u64);
+ out += 12;
+ q14s16 = vsubq_s16(q1s16, q14s16);
+ q15s16 = vsubq_s16(q0s16, q15s16);
+
+ q10s16 = vld1q_s16(pass1Output);
+ pass1Output += 8;
+ q11s16 = vld1q_s16(pass1Output);
+ pass1Output += 8;
+ q12s16 = vaddq_s16(q10s16, q5s16);
+ q13s16 = vaddq_s16(q11s16, q4s16);
+ d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16));
+ d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16));
+ d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16));
+ d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16));
+ vst1_u64((uint64_t *)out, d24u64);
+ out += 4;
+ vst1_u64((uint64_t *)out, d25u64);
+ out += 12;
+ vst1_u64((uint64_t *)out, d26u64);
+ out += 4;
+ vst1_u64((uint64_t *)out, d27u64);
+ out += 12;
+ q4s16 = vsubq_s16(q11s16, q4s16);
+ q5s16 = vsubq_s16(q10s16, q5s16);
+
+ q0s16 = vld1q_s16(pass1Output);
+ pass1Output += 8;
+ q1s16 = vld1q_s16(pass1Output);
+ pass1Output += 8;
+ q12s16 = vaddq_s16(q0s16, q3s16);
+ q13s16 = vaddq_s16(q1s16, q2s16);
+ d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16));
+ d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16));
+ d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16));
+ d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16));
+ vst1_u64((uint64_t *)out, d24u64);
+ out += 4;
+ vst1_u64((uint64_t *)out, d25u64);
+ out += 12;
+ vst1_u64((uint64_t *)out, d26u64);
+ out += 4;
+ vst1_u64((uint64_t *)out, d27u64);
+ out += 12;
+ q2s16 = vsubq_s16(q1s16, q2s16);
+ q3s16 = vsubq_s16(q0s16, q3s16);
+
+ q10s16 = vld1q_s16(pass1Output);
+ pass1Output += 8;
+ q11s16 = vld1q_s16(pass1Output);
+ pass1Output += 8;
+ q12s16 = vaddq_s16(q10s16, q9s16);
+ q13s16 = vaddq_s16(q11s16, q8s16);
+ d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16));
+ d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16));
+ d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16));
+ d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16));
+ vst1_u64((uint64_t *)out, d24u64);
+ out += 4;
+ vst1_u64((uint64_t *)out, d25u64);
+ out += 12;
+ vst1_u64((uint64_t *)out, d26u64);
+ out += 4;
+ vst1_u64((uint64_t *)out, d27u64);
+ out += 12;
+ q8s16 = vsubq_s16(q11s16, q8s16);
+ q9s16 = vsubq_s16(q10s16, q9s16);
+
+ vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q8s16)));
+ out += 4;
+ vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q8s16)));
+ out += 12;
+ vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q9s16)));
+ out += 4;
+ vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q9s16)));
+ out += 12;
+ vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q2s16)));
+ out += 4;
+ vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q2s16)));
+ out += 12;
+ vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q3s16)));
+ out += 4;
+ vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q3s16)));
+ out += 12;
+ vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q4s16)));
+ out += 4;
+ vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q4s16)));
+ out += 12;
+ vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q5s16)));
+ out += 4;
+ vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q5s16)));
+ out += 12;
+ vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q14s16)));
+ out += 4;
+ vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q14s16)));
+ out += 12;
+ vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q15s16)));
+ out += 4;
+ vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q15s16)));
+ }
+ return;
+}
+
+void vpx_idct16x16_10_add_neon_pass1(
+ int16_t *in,
+ int16_t *out,
+ int output_stride) {
+ int16x4_t d4s16;
+ int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16;
+ uint64x1_t d4u64, d5u64, d18u64, d19u64, d20u64, d21u64, d22u64, d23u64;
+ uint64x1_t d24u64, d25u64, d26u64, d27u64, d28u64, d29u64, d30u64, d31u64;
+ int16x8_t q0s16, q1s16, q2s16, q4s16, q5s16, q6s16, q7s16;
+ int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16;
+ int32x4_t q6s32, q9s32;
+ int32x4_t q10s32, q11s32, q12s32, q15s32;
+ int16x8x2_t q0x2s16;
+
+ q0x2s16 = vld2q_s16(in);
+ q8s16 = q0x2s16.val[0];
+ in += 16;
+ q0x2s16 = vld2q_s16(in);
+ q9s16 = q0x2s16.val[0];
+ in += 16;
+ q0x2s16 = vld2q_s16(in);
+ q10s16 = q0x2s16.val[0];
+ in += 16;
+ q0x2s16 = vld2q_s16(in);
+ q11s16 = q0x2s16.val[0];
+ in += 16;
+ q0x2s16 = vld2q_s16(in);
+ q12s16 = q0x2s16.val[0];
+ in += 16;
+ q0x2s16 = vld2q_s16(in);
+ q13s16 = q0x2s16.val[0];
+ in += 16;
+ q0x2s16 = vld2q_s16(in);
+ q14s16 = q0x2s16.val[0];
+ in += 16;
+ q0x2s16 = vld2q_s16(in);
+ q15s16 = q0x2s16.val[0];
+
+ TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16,
+ &q12s16, &q13s16, &q14s16, &q15s16);
+
+ // stage 3
+ q0s16 = vdupq_n_s16(cospi_28_64 * 2);
+ q1s16 = vdupq_n_s16(cospi_4_64 * 2);
+
+ q4s16 = vqrdmulhq_s16(q9s16, q0s16);
+ q7s16 = vqrdmulhq_s16(q9s16, q1s16);
+
+ // stage 4
+ q1s16 = vdupq_n_s16(cospi_16_64 * 2);
+ d4s16 = vdup_n_s16(cospi_16_64);
+
+ q8s16 = vqrdmulhq_s16(q8s16, q1s16);
+
+ d8s16 = vget_low_s16(q4s16);
+ d9s16 = vget_high_s16(q4s16);
+ d14s16 = vget_low_s16(q7s16);
+ d15s16 = vget_high_s16(q7s16);
+ q9s32 = vmull_s16(d14s16, d4s16);
+ q10s32 = vmull_s16(d15s16, d4s16);
+ q12s32 = vmull_s16(d9s16, d4s16);
+ q11s32 = vmull_s16(d8s16, d4s16);
+
+ q15s32 = vsubq_s32(q10s32, q12s32);
+ q6s32 = vsubq_s32(q9s32, q11s32);
+ q9s32 = vaddq_s32(q9s32, q11s32);
+ q10s32 = vaddq_s32(q10s32, q12s32);
+
+ d11s16 = vqrshrn_n_s32(q15s32, 14);
+ d10s16 = vqrshrn_n_s32(q6s32, 14);
+ d12s16 = vqrshrn_n_s32(q9s32, 14);
+ d13s16 = vqrshrn_n_s32(q10s32, 14);
+ q5s16 = vcombine_s16(d10s16, d11s16);
+ q6s16 = vcombine_s16(d12s16, d13s16);
+
+ // stage 6
+ q2s16 = vaddq_s16(q8s16, q7s16);
+ q9s16 = vaddq_s16(q8s16, q6s16);
+ q10s16 = vaddq_s16(q8s16, q5s16);
+ q11s16 = vaddq_s16(q8s16, q4s16);
+ q12s16 = vsubq_s16(q8s16, q4s16);
+ q13s16 = vsubq_s16(q8s16, q5s16);
+ q14s16 = vsubq_s16(q8s16, q6s16);
+ q15s16 = vsubq_s16(q8s16, q7s16);
+
+ d4u64 = vreinterpret_u64_s16(vget_low_s16(q2s16));
+ d5u64 = vreinterpret_u64_s16(vget_high_s16(q2s16));
+ d18u64 = vreinterpret_u64_s16(vget_low_s16(q9s16));
+ d19u64 = vreinterpret_u64_s16(vget_high_s16(q9s16));
+ d20u64 = vreinterpret_u64_s16(vget_low_s16(q10s16));
+ d21u64 = vreinterpret_u64_s16(vget_high_s16(q10s16));
+ d22u64 = vreinterpret_u64_s16(vget_low_s16(q11s16));
+ d23u64 = vreinterpret_u64_s16(vget_high_s16(q11s16));
+ d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16));
+ d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16));
+ d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16));
+ d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16));
+ d28u64 = vreinterpret_u64_s16(vget_low_s16(q14s16));
+ d29u64 = vreinterpret_u64_s16(vget_high_s16(q14s16));
+ d30u64 = vreinterpret_u64_s16(vget_low_s16(q15s16));
+ d31u64 = vreinterpret_u64_s16(vget_high_s16(q15s16));
+
+ // store the data
+ output_stride >>= 1; // output_stride / 2, out is int16_t
+ vst1_u64((uint64_t *)out, d4u64);
+ out += output_stride;
+ vst1_u64((uint64_t *)out, d5u64);
+ out += output_stride;
+ vst1_u64((uint64_t *)out, d18u64);
+ out += output_stride;
+ vst1_u64((uint64_t *)out, d19u64);
+ out += output_stride;
+ vst1_u64((uint64_t *)out, d20u64);
+ out += output_stride;
+ vst1_u64((uint64_t *)out, d21u64);
+ out += output_stride;
+ vst1_u64((uint64_t *)out, d22u64);
+ out += output_stride;
+ vst1_u64((uint64_t *)out, d23u64);
+ out += output_stride;
+ vst1_u64((uint64_t *)out, d24u64);
+ out += output_stride;
+ vst1_u64((uint64_t *)out, d25u64);
+ out += output_stride;
+ vst1_u64((uint64_t *)out, d26u64);
+ out += output_stride;
+ vst1_u64((uint64_t *)out, d27u64);
+ out += output_stride;
+ vst1_u64((uint64_t *)out, d28u64);
+ out += output_stride;
+ vst1_u64((uint64_t *)out, d29u64);
+ out += output_stride;
+ vst1_u64((uint64_t *)out, d30u64);
+ out += output_stride;
+ vst1_u64((uint64_t *)out, d31u64);
+ return;
+}
+
+void vpx_idct16x16_10_add_neon_pass2(
+ int16_t *src,
+ int16_t *out,
+ int16_t *pass1Output,
+ int16_t skip_adding,
+ uint8_t *dest,
+ int dest_stride) {
+ int16x4_t d0s16, d1s16, d2s16, d3s16, d4s16, d5s16, d6s16, d7s16;
+ int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16;
+ int16x4_t d20s16, d21s16, d22s16, d23s16;
+ int16x4_t d24s16, d25s16, d26s16, d27s16, d30s16, d31s16;
+ uint64x1_t d4u64, d5u64, d6u64, d7u64, d8u64, d9u64, d10u64, d11u64;
+ uint64x1_t d16u64, d17u64, d18u64, d19u64;
+ uint64x1_t d24u64, d25u64, d26u64, d27u64, d28u64, d29u64, d30u64, d31u64;
+ int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16;
+ int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16;
+ int32x4_t q0s32, q1s32, q2s32, q3s32, q4s32, q5s32, q6s32, q8s32, q9s32;
+ int32x4_t q10s32, q11s32, q12s32, q13s32;
+ int16x8x2_t q0x2s16;
+ (void)skip_adding;
+ (void)dest;
+ (void)dest_stride;
+
+ q0x2s16 = vld2q_s16(src);
+ q8s16 = q0x2s16.val[0];
+ src += 16;
+ q0x2s16 = vld2q_s16(src);
+ q9s16 = q0x2s16.val[0];
+ src += 16;
+ q0x2s16 = vld2q_s16(src);
+ q10s16 = q0x2s16.val[0];
+ src += 16;
+ q0x2s16 = vld2q_s16(src);
+ q11s16 = q0x2s16.val[0];
+ src += 16;
+ q0x2s16 = vld2q_s16(src);
+ q12s16 = q0x2s16.val[0];
+ src += 16;
+ q0x2s16 = vld2q_s16(src);
+ q13s16 = q0x2s16.val[0];
+ src += 16;
+ q0x2s16 = vld2q_s16(src);
+ q14s16 = q0x2s16.val[0];
+ src += 16;
+ q0x2s16 = vld2q_s16(src);
+ q15s16 = q0x2s16.val[0];
+
+ TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16,
+ &q12s16, &q13s16, &q14s16, &q15s16);
+
+ // stage 3
+ q6s16 = vdupq_n_s16(cospi_30_64 * 2);
+ q0s16 = vqrdmulhq_s16(q8s16, q6s16);
+ q6s16 = vdupq_n_s16(cospi_2_64 * 2);
+ q7s16 = vqrdmulhq_s16(q8s16, q6s16);
+
+ q15s16 = vdupq_n_s16(-cospi_26_64 * 2);
+ q14s16 = vdupq_n_s16(cospi_6_64 * 2);
+ q3s16 = vqrdmulhq_s16(q9s16, q15s16);
+ q4s16 = vqrdmulhq_s16(q9s16, q14s16);
+
+ // stage 4
+ d0s16 = vget_low_s16(q0s16);
+ d1s16 = vget_high_s16(q0s16);
+ d6s16 = vget_low_s16(q3s16);
+ d7s16 = vget_high_s16(q3s16);
+ d8s16 = vget_low_s16(q4s16);
+ d9s16 = vget_high_s16(q4s16);
+ d14s16 = vget_low_s16(q7s16);
+ d15s16 = vget_high_s16(q7s16);
+
+ d30s16 = vdup_n_s16(cospi_8_64);
+ d31s16 = vdup_n_s16(cospi_24_64);
+
+ q12s32 = vmull_s16(d14s16, d31s16);
+ q5s32 = vmull_s16(d15s16, d31s16);
+ q2s32 = vmull_s16(d0s16, d31s16);
+ q11s32 = vmull_s16(d1s16, d31s16);
+
+ q12s32 = vmlsl_s16(q12s32, d0s16, d30s16);
+ q5s32 = vmlsl_s16(q5s32, d1s16, d30s16);
+ q2s32 = vmlal_s16(q2s32, d14s16, d30s16);
+ q11s32 = vmlal_s16(q11s32, d15s16, d30s16);
+
+ d2s16 = vqrshrn_n_s32(q12s32, 14);
+ d3s16 = vqrshrn_n_s32(q5s32, 14);
+ d12s16 = vqrshrn_n_s32(q2s32, 14);
+ d13s16 = vqrshrn_n_s32(q11s32, 14);
+ q1s16 = vcombine_s16(d2s16, d3s16);
+ q6s16 = vcombine_s16(d12s16, d13s16);
+
+ d30s16 = vdup_n_s16(-cospi_8_64);
+ q10s32 = vmull_s16(d8s16, d30s16);
+ q13s32 = vmull_s16(d9s16, d30s16);
+ q8s32 = vmull_s16(d6s16, d30s16);
+ q9s32 = vmull_s16(d7s16, d30s16);
+
+ q10s32 = vmlsl_s16(q10s32, d6s16, d31s16);
+ q13s32 = vmlsl_s16(q13s32, d7s16, d31s16);
+ q8s32 = vmlal_s16(q8s32, d8s16, d31s16);
+ q9s32 = vmlal_s16(q9s32, d9s16, d31s16);
+
+ d4s16 = vqrshrn_n_s32(q10s32, 14);
+ d5s16 = vqrshrn_n_s32(q13s32, 14);
+ d10s16 = vqrshrn_n_s32(q8s32, 14);
+ d11s16 = vqrshrn_n_s32(q9s32, 14);
+ q2s16 = vcombine_s16(d4s16, d5s16);
+ q5s16 = vcombine_s16(d10s16, d11s16);
+
+ // stage 5
+ q8s16 = vaddq_s16(q0s16, q3s16);
+ q9s16 = vaddq_s16(q1s16, q2s16);
+ q10s16 = vsubq_s16(q1s16, q2s16);
+ q11s16 = vsubq_s16(q0s16, q3s16);
+ q12s16 = vsubq_s16(q7s16, q4s16);
+ q13s16 = vsubq_s16(q6s16, q5s16);
+ q14s16 = vaddq_s16(q6s16, q5s16);
+ q15s16 = vaddq_s16(q7s16, q4s16);
+
+ // stage 6
+ d20s16 = vget_low_s16(q10s16);
+ d21s16 = vget_high_s16(q10s16);
+ d22s16 = vget_low_s16(q11s16);
+ d23s16 = vget_high_s16(q11s16);
+ d24s16 = vget_low_s16(q12s16);
+ d25s16 = vget_high_s16(q12s16);
+ d26s16 = vget_low_s16(q13s16);
+ d27s16 = vget_high_s16(q13s16);
+
+ d14s16 = vdup_n_s16(cospi_16_64);
+ q3s32 = vmull_s16(d26s16, d14s16);
+ q4s32 = vmull_s16(d27s16, d14s16);
+ q0s32 = vmull_s16(d20s16, d14s16);
+ q1s32 = vmull_s16(d21s16, d14s16);
+
+ q5s32 = vsubq_s32(q3s32, q0s32);
+ q6s32 = vsubq_s32(q4s32, q1s32);
+ q0s32 = vaddq_s32(q3s32, q0s32);
+ q4s32 = vaddq_s32(q4s32, q1s32);
+
+ d4s16 = vqrshrn_n_s32(q5s32, 14);
+ d5s16 = vqrshrn_n_s32(q6s32, 14);
+ d10s16 = vqrshrn_n_s32(q0s32, 14);
+ d11s16 = vqrshrn_n_s32(q4s32, 14);
+ q2s16 = vcombine_s16(d4s16, d5s16);
+ q5s16 = vcombine_s16(d10s16, d11s16);
+
+ q0s32 = vmull_s16(d22s16, d14s16);
+ q1s32 = vmull_s16(d23s16, d14s16);
+ q13s32 = vmull_s16(d24s16, d14s16);
+ q6s32 = vmull_s16(d25s16, d14s16);
+
+ q10s32 = vsubq_s32(q13s32, q0s32);
+ q4s32 = vsubq_s32(q6s32, q1s32);
+ q13s32 = vaddq_s32(q13s32, q0s32);
+ q6s32 = vaddq_s32(q6s32, q1s32);
+
+ d6s16 = vqrshrn_n_s32(q10s32, 14);
+ d7s16 = vqrshrn_n_s32(q4s32, 14);
+ d8s16 = vqrshrn_n_s32(q13s32, 14);
+ d9s16 = vqrshrn_n_s32(q6s32, 14);
+ q3s16 = vcombine_s16(d6s16, d7s16);
+ q4s16 = vcombine_s16(d8s16, d9s16);
+
+ // stage 7
+ q0s16 = vld1q_s16(pass1Output);
+ pass1Output += 8;
+ q1s16 = vld1q_s16(pass1Output);
+ pass1Output += 8;
+ q12s16 = vaddq_s16(q0s16, q15s16);
+ q13s16 = vaddq_s16(q1s16, q14s16);
+ d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16));
+ d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16));
+ d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16));
+ d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16));
+ vst1_u64((uint64_t *)out, d24u64);
+ out += 4;
+ vst1_u64((uint64_t *)out, d25u64);
+ out += 12;
+ vst1_u64((uint64_t *)out, d26u64);
+ out += 4;
+ vst1_u64((uint64_t *)out, d27u64);
+ out += 12;
+ q14s16 = vsubq_s16(q1s16, q14s16);
+ q15s16 = vsubq_s16(q0s16, q15s16);
+
+ q10s16 = vld1q_s16(pass1Output);
+ pass1Output += 8;
+ q11s16 = vld1q_s16(pass1Output);
+ pass1Output += 8;
+ q12s16 = vaddq_s16(q10s16, q5s16);
+ q13s16 = vaddq_s16(q11s16, q4s16);
+ d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16));
+ d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16));
+ d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16));
+ d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16));
+ vst1_u64((uint64_t *)out, d24u64);
+ out += 4;
+ vst1_u64((uint64_t *)out, d25u64);
+ out += 12;
+ vst1_u64((uint64_t *)out, d26u64);
+ out += 4;
+ vst1_u64((uint64_t *)out, d27u64);
+ out += 12;
+ q4s16 = vsubq_s16(q11s16, q4s16);
+ q5s16 = vsubq_s16(q10s16, q5s16);
+
+ q0s16 = vld1q_s16(pass1Output);
+ pass1Output += 8;
+ q1s16 = vld1q_s16(pass1Output);
+ pass1Output += 8;
+ q12s16 = vaddq_s16(q0s16, q3s16);
+ q13s16 = vaddq_s16(q1s16, q2s16);
+ d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16));
+ d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16));
+ d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16));
+ d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16));
+ vst1_u64((uint64_t *)out, d24u64);
+ out += 4;
+ vst1_u64((uint64_t *)out, d25u64);
+ out += 12;
+ vst1_u64((uint64_t *)out, d26u64);
+ out += 4;
+ vst1_u64((uint64_t *)out, d27u64);
+ out += 12;
+ q2s16 = vsubq_s16(q1s16, q2s16);
+ q3s16 = vsubq_s16(q0s16, q3s16);
+
+ q10s16 = vld1q_s16(pass1Output);
+ pass1Output += 8;
+ q11s16 = vld1q_s16(pass1Output);
+ q12s16 = vaddq_s16(q10s16, q9s16);
+ q13s16 = vaddq_s16(q11s16, q8s16);
+ d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16));
+ d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16));
+ d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16));
+ d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16));
+ vst1_u64((uint64_t *)out, d24u64);
+ out += 4;
+ vst1_u64((uint64_t *)out, d25u64);
+ out += 12;
+ vst1_u64((uint64_t *)out, d26u64);
+ out += 4;
+ vst1_u64((uint64_t *)out, d27u64);
+ out += 12;
+ q8s16 = vsubq_s16(q11s16, q8s16);
+ q9s16 = vsubq_s16(q10s16, q9s16);
+
+ d4u64 = vreinterpret_u64_s16(vget_low_s16(q2s16));
+ d5u64 = vreinterpret_u64_s16(vget_high_s16(q2s16));
+ d6u64 = vreinterpret_u64_s16(vget_low_s16(q3s16));
+ d7u64 = vreinterpret_u64_s16(vget_high_s16(q3s16));
+ d8u64 = vreinterpret_u64_s16(vget_low_s16(q4s16));
+ d9u64 = vreinterpret_u64_s16(vget_high_s16(q4s16));
+ d10u64 = vreinterpret_u64_s16(vget_low_s16(q5s16));
+ d11u64 = vreinterpret_u64_s16(vget_high_s16(q5s16));
+ d16u64 = vreinterpret_u64_s16(vget_low_s16(q8s16));
+ d17u64 = vreinterpret_u64_s16(vget_high_s16(q8s16));
+ d18u64 = vreinterpret_u64_s16(vget_low_s16(q9s16));
+ d19u64 = vreinterpret_u64_s16(vget_high_s16(q9s16));
+ d28u64 = vreinterpret_u64_s16(vget_low_s16(q14s16));
+ d29u64 = vreinterpret_u64_s16(vget_high_s16(q14s16));
+ d30u64 = vreinterpret_u64_s16(vget_low_s16(q15s16));
+ d31u64 = vreinterpret_u64_s16(vget_high_s16(q15s16));
+
+ vst1_u64((uint64_t *)out, d16u64);
+ out += 4;
+ vst1_u64((uint64_t *)out, d17u64);
+ out += 12;
+ vst1_u64((uint64_t *)out, d18u64);
+ out += 4;
+ vst1_u64((uint64_t *)out, d19u64);
+ out += 12;
+ vst1_u64((uint64_t *)out, d4u64);
+ out += 4;
+ vst1_u64((uint64_t *)out, d5u64);
+ out += 12;
+ vst1_u64((uint64_t *)out, d6u64);
+ out += 4;
+ vst1_u64((uint64_t *)out, d7u64);
+ out += 12;
+ vst1_u64((uint64_t *)out, d8u64);
+ out += 4;
+ vst1_u64((uint64_t *)out, d9u64);
+ out += 12;
+ vst1_u64((uint64_t *)out, d10u64);
+ out += 4;
+ vst1_u64((uint64_t *)out, d11u64);
+ out += 12;
+ vst1_u64((uint64_t *)out, d28u64);
+ out += 4;
+ vst1_u64((uint64_t *)out, d29u64);
+ out += 12;
+ vst1_u64((uint64_t *)out, d30u64);
+ out += 4;
+ vst1_u64((uint64_t *)out, d31u64);
+ return;
+}
diff --git a/thirdparty/libvpx/vpx_dsp/arm/idct16x16_neon.c b/thirdparty/libvpx/vpx_dsp/arm/idct16x16_neon.c
new file mode 100644
index 0000000000..352979aa16
--- /dev/null
+++ b/thirdparty/libvpx/vpx_dsp/arm/idct16x16_neon.c
@@ -0,0 +1,185 @@
+/*
+ * Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vpx_dsp/vpx_dsp_common.h"
+
+void vpx_idct16x16_256_add_neon_pass1(const int16_t *input,
+ int16_t *output,
+ int output_stride);
+void vpx_idct16x16_256_add_neon_pass2(const int16_t *src,
+ int16_t *output,
+ int16_t *pass1Output,
+ int16_t skip_adding,
+ uint8_t *dest,
+ int dest_stride);
+void vpx_idct16x16_10_add_neon_pass1(const int16_t *input,
+ int16_t *output,
+ int output_stride);
+void vpx_idct16x16_10_add_neon_pass2(const int16_t *src,
+ int16_t *output,
+ int16_t *pass1Output,
+ int16_t skip_adding,
+ uint8_t *dest,
+ int dest_stride);
+
+#if HAVE_NEON_ASM
+/* For ARM NEON, d8-d15 are callee-saved registers, and need to be saved. */
+extern void vpx_push_neon(int64_t *store);
+extern void vpx_pop_neon(int64_t *store);
+#endif // HAVE_NEON_ASM
+
+void vpx_idct16x16_256_add_neon(const int16_t *input,
+ uint8_t *dest, int dest_stride) {
+#if HAVE_NEON_ASM
+ int64_t store_reg[8];
+#endif
+ int16_t pass1_output[16*16] = {0};
+ int16_t row_idct_output[16*16] = {0};
+
+#if HAVE_NEON_ASM
+ // save d8-d15 register values.
+ vpx_push_neon(store_reg);
+#endif
+
+ /* Parallel idct on the upper 8 rows */
+ // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the
+ // stage 6 result in pass1_output.
+ vpx_idct16x16_256_add_neon_pass1(input, pass1_output, 8);
+
+ // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines
+ // with result in pass1(pass1_output) to calculate final result in stage 7
+ // which will be saved into row_idct_output.
+ vpx_idct16x16_256_add_neon_pass2(input+1,
+ row_idct_output,
+ pass1_output,
+ 0,
+ dest,
+ dest_stride);
+
+ /* Parallel idct on the lower 8 rows */
+ // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the
+ // stage 6 result in pass1_output.
+ vpx_idct16x16_256_add_neon_pass1(input+8*16, pass1_output, 8);
+
+ // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines
+ // with result in pass1(pass1_output) to calculate final result in stage 7
+ // which will be saved into row_idct_output.
+ vpx_idct16x16_256_add_neon_pass2(input+8*16+1,
+ row_idct_output+8,
+ pass1_output,
+ 0,
+ dest,
+ dest_stride);
+
+ /* Parallel idct on the left 8 columns */
+ // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the
+ // stage 6 result in pass1_output.
+ vpx_idct16x16_256_add_neon_pass1(row_idct_output, pass1_output, 8);
+
+ // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines
+ // with result in pass1(pass1_output) to calculate final result in stage 7.
+ // Then add the result to the destination data.
+ vpx_idct16x16_256_add_neon_pass2(row_idct_output+1,
+ row_idct_output,
+ pass1_output,
+ 1,
+ dest,
+ dest_stride);
+
+ /* Parallel idct on the right 8 columns */
+ // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the
+ // stage 6 result in pass1_output.
+ vpx_idct16x16_256_add_neon_pass1(row_idct_output+8*16, pass1_output, 8);
+
+ // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines
+ // with result in pass1(pass1_output) to calculate final result in stage 7.
+ // Then add the result to the destination data.
+ vpx_idct16x16_256_add_neon_pass2(row_idct_output+8*16+1,
+ row_idct_output+8,
+ pass1_output,
+ 1,
+ dest+8,
+ dest_stride);
+
+#if HAVE_NEON_ASM
+ // restore d8-d15 register values.
+ vpx_pop_neon(store_reg);
+#endif
+
+ return;
+}
+
+void vpx_idct16x16_10_add_neon(const int16_t *input,
+ uint8_t *dest, int dest_stride) {
+#if HAVE_NEON_ASM
+ int64_t store_reg[8];
+#endif
+ int16_t pass1_output[16*16] = {0};
+ int16_t row_idct_output[16*16] = {0};
+
+#if HAVE_NEON_ASM
+ // save d8-d15 register values.
+ vpx_push_neon(store_reg);
+#endif
+
+ /* Parallel idct on the upper 8 rows */
+ // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the
+ // stage 6 result in pass1_output.
+ vpx_idct16x16_10_add_neon_pass1(input, pass1_output, 8);
+
+ // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines
+ // with result in pass1(pass1_output) to calculate final result in stage 7
+ // which will be saved into row_idct_output.
+ vpx_idct16x16_10_add_neon_pass2(input+1,
+ row_idct_output,
+ pass1_output,
+ 0,
+ dest,
+ dest_stride);
+
+ /* Skip Parallel idct on the lower 8 rows as they are all 0s */
+
+ /* Parallel idct on the left 8 columns */
+ // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the
+ // stage 6 result in pass1_output.
+ vpx_idct16x16_256_add_neon_pass1(row_idct_output, pass1_output, 8);
+
+ // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines
+ // with result in pass1(pass1_output) to calculate final result in stage 7.
+ // Then add the result to the destination data.
+ vpx_idct16x16_256_add_neon_pass2(row_idct_output+1,
+ row_idct_output,
+ pass1_output,
+ 1,
+ dest,
+ dest_stride);
+
+ /* Parallel idct on the right 8 columns */
+ // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the
+ // stage 6 result in pass1_output.
+ vpx_idct16x16_256_add_neon_pass1(row_idct_output+8*16, pass1_output, 8);
+
+ // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines
+ // with result in pass1(pass1_output) to calculate final result in stage 7.
+ // Then add the result to the destination data.
+ vpx_idct16x16_256_add_neon_pass2(row_idct_output+8*16+1,
+ row_idct_output+8,
+ pass1_output,
+ 1,
+ dest+8,
+ dest_stride);
+
+#if HAVE_NEON_ASM
+ // restore d8-d15 register values.
+ vpx_pop_neon(store_reg);
+#endif
+
+ return;
+}
diff --git a/thirdparty/libvpx/vpx_dsp/arm/idct32x32_1_add_neon.c b/thirdparty/libvpx/vpx_dsp/arm/idct32x32_1_add_neon.c
new file mode 100644
index 0000000000..c25c0c4a5c
--- /dev/null
+++ b/thirdparty/libvpx/vpx_dsp/arm/idct32x32_1_add_neon.c
@@ -0,0 +1,165 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_config.h"
+
+#include "vpx_dsp/inv_txfm.h"
+#include "vpx_ports/mem.h"
+
+static INLINE void LD_16x8(
+ uint8_t *d,
+ int d_stride,
+ uint8x16_t *q8u8,
+ uint8x16_t *q9u8,
+ uint8x16_t *q10u8,
+ uint8x16_t *q11u8,
+ uint8x16_t *q12u8,
+ uint8x16_t *q13u8,
+ uint8x16_t *q14u8,
+ uint8x16_t *q15u8) {
+ *q8u8 = vld1q_u8(d);
+ d += d_stride;
+ *q9u8 = vld1q_u8(d);
+ d += d_stride;
+ *q10u8 = vld1q_u8(d);
+ d += d_stride;
+ *q11u8 = vld1q_u8(d);
+ d += d_stride;
+ *q12u8 = vld1q_u8(d);
+ d += d_stride;
+ *q13u8 = vld1q_u8(d);
+ d += d_stride;
+ *q14u8 = vld1q_u8(d);
+ d += d_stride;
+ *q15u8 = vld1q_u8(d);
+ return;
+}
+
+static INLINE void ADD_DIFF_16x8(
+ uint8x16_t qdiffu8,
+ uint8x16_t *q8u8,
+ uint8x16_t *q9u8,
+ uint8x16_t *q10u8,
+ uint8x16_t *q11u8,
+ uint8x16_t *q12u8,
+ uint8x16_t *q13u8,
+ uint8x16_t *q14u8,
+ uint8x16_t *q15u8) {
+ *q8u8 = vqaddq_u8(*q8u8, qdiffu8);
+ *q9u8 = vqaddq_u8(*q9u8, qdiffu8);
+ *q10u8 = vqaddq_u8(*q10u8, qdiffu8);
+ *q11u8 = vqaddq_u8(*q11u8, qdiffu8);
+ *q12u8 = vqaddq_u8(*q12u8, qdiffu8);
+ *q13u8 = vqaddq_u8(*q13u8, qdiffu8);
+ *q14u8 = vqaddq_u8(*q14u8, qdiffu8);
+ *q15u8 = vqaddq_u8(*q15u8, qdiffu8);
+ return;
+}
+
+static INLINE void SUB_DIFF_16x8(
+ uint8x16_t qdiffu8,
+ uint8x16_t *q8u8,
+ uint8x16_t *q9u8,
+ uint8x16_t *q10u8,
+ uint8x16_t *q11u8,
+ uint8x16_t *q12u8,
+ uint8x16_t *q13u8,
+ uint8x16_t *q14u8,
+ uint8x16_t *q15u8) {
+ *q8u8 = vqsubq_u8(*q8u8, qdiffu8);
+ *q9u8 = vqsubq_u8(*q9u8, qdiffu8);
+ *q10u8 = vqsubq_u8(*q10u8, qdiffu8);
+ *q11u8 = vqsubq_u8(*q11u8, qdiffu8);
+ *q12u8 = vqsubq_u8(*q12u8, qdiffu8);
+ *q13u8 = vqsubq_u8(*q13u8, qdiffu8);
+ *q14u8 = vqsubq_u8(*q14u8, qdiffu8);
+ *q15u8 = vqsubq_u8(*q15u8, qdiffu8);
+ return;
+}
+
+static INLINE void ST_16x8(
+ uint8_t *d,
+ int d_stride,
+ uint8x16_t *q8u8,
+ uint8x16_t *q9u8,
+ uint8x16_t *q10u8,
+ uint8x16_t *q11u8,
+ uint8x16_t *q12u8,
+ uint8x16_t *q13u8,
+ uint8x16_t *q14u8,
+ uint8x16_t *q15u8) {
+ vst1q_u8(d, *q8u8);
+ d += d_stride;
+ vst1q_u8(d, *q9u8);
+ d += d_stride;
+ vst1q_u8(d, *q10u8);
+ d += d_stride;
+ vst1q_u8(d, *q11u8);
+ d += d_stride;
+ vst1q_u8(d, *q12u8);
+ d += d_stride;
+ vst1q_u8(d, *q13u8);
+ d += d_stride;
+ vst1q_u8(d, *q14u8);
+ d += d_stride;
+ vst1q_u8(d, *q15u8);
+ return;
+}
+
+void vpx_idct32x32_1_add_neon(
+ int16_t *input,
+ uint8_t *dest,
+ int dest_stride) {
+ uint8x16_t q0u8, q8u8, q9u8, q10u8, q11u8, q12u8, q13u8, q14u8, q15u8;
+ int i, j, dest_stride8;
+ uint8_t *d;
+ int16_t a1, cospi_16_64 = 11585;
+ int16_t out = dct_const_round_shift(input[0] * cospi_16_64);
+
+ out = dct_const_round_shift(out * cospi_16_64);
+ a1 = ROUND_POWER_OF_TWO(out, 6);
+
+ dest_stride8 = dest_stride * 8;
+ if (a1 >= 0) { // diff_positive_32_32
+ a1 = a1 < 0 ? 0 : a1 > 255 ? 255 : a1;
+ q0u8 = vdupq_n_u8(a1);
+ for (i = 0; i < 2; i++, dest += 16) { // diff_positive_32_32_loop
+ d = dest;
+ for (j = 0; j < 4; j++) {
+ LD_16x8(d, dest_stride, &q8u8, &q9u8, &q10u8, &q11u8,
+ &q12u8, &q13u8, &q14u8, &q15u8);
+ ADD_DIFF_16x8(q0u8, &q8u8, &q9u8, &q10u8, &q11u8,
+ &q12u8, &q13u8, &q14u8, &q15u8);
+ ST_16x8(d, dest_stride, &q8u8, &q9u8, &q10u8, &q11u8,
+ &q12u8, &q13u8, &q14u8, &q15u8);
+ d += dest_stride8;
+ }
+ }
+ } else { // diff_negative_32_32
+ a1 = -a1;
+ a1 = a1 < 0 ? 0 : a1 > 255 ? 255 : a1;
+ q0u8 = vdupq_n_u8(a1);
+ for (i = 0; i < 2; i++, dest += 16) { // diff_negative_32_32_loop
+ d = dest;
+ for (j = 0; j < 4; j++) {
+ LD_16x8(d, dest_stride, &q8u8, &q9u8, &q10u8, &q11u8,
+ &q12u8, &q13u8, &q14u8, &q15u8);
+ SUB_DIFF_16x8(q0u8, &q8u8, &q9u8, &q10u8, &q11u8,
+ &q12u8, &q13u8, &q14u8, &q15u8);
+ ST_16x8(d, dest_stride, &q8u8, &q9u8, &q10u8, &q11u8,
+ &q12u8, &q13u8, &q14u8, &q15u8);
+ d += dest_stride8;
+ }
+ }
+ }
+ return;
+}
diff --git a/thirdparty/libvpx/vpx_dsp/arm/idct32x32_add_neon.c b/thirdparty/libvpx/vpx_dsp/arm/idct32x32_add_neon.c
new file mode 100644
index 0000000000..025437eb96
--- /dev/null
+++ b/thirdparty/libvpx/vpx_dsp/arm/idct32x32_add_neon.c
@@ -0,0 +1,719 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_config.h"
+#include "vpx_dsp/txfm_common.h"
+
+#define LOAD_FROM_TRANSPOSED(prev, first, second) \
+ q14s16 = vld1q_s16(trans_buf + first * 8); \
+ q13s16 = vld1q_s16(trans_buf + second * 8);
+
+#define LOAD_FROM_OUTPUT(prev, first, second, qA, qB) \
+ qA = vld1q_s16(out + first * 32); \
+ qB = vld1q_s16(out + second * 32);
+
+#define STORE_IN_OUTPUT(prev, first, second, qA, qB) \
+ vst1q_s16(out + first * 32, qA); \
+ vst1q_s16(out + second * 32, qB);
+
+#define STORE_COMBINE_CENTER_RESULTS(r10, r9) \
+ __STORE_COMBINE_CENTER_RESULTS(r10, r9, stride, \
+ q6s16, q7s16, q8s16, q9s16);
+static INLINE void __STORE_COMBINE_CENTER_RESULTS(
+ uint8_t *p1,
+ uint8_t *p2,
+ int stride,
+ int16x8_t q6s16,
+ int16x8_t q7s16,
+ int16x8_t q8s16,
+ int16x8_t q9s16) {
+ int16x4_t d8s16, d9s16, d10s16, d11s16;
+
+ d8s16 = vld1_s16((int16_t *)p1);
+ p1 += stride;
+ d11s16 = vld1_s16((int16_t *)p2);
+ p2 -= stride;
+ d9s16 = vld1_s16((int16_t *)p1);
+ d10s16 = vld1_s16((int16_t *)p2);
+
+ q7s16 = vrshrq_n_s16(q7s16, 6);
+ q8s16 = vrshrq_n_s16(q8s16, 6);
+ q9s16 = vrshrq_n_s16(q9s16, 6);
+ q6s16 = vrshrq_n_s16(q6s16, 6);
+
+ q7s16 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q7s16),
+ vreinterpret_u8_s16(d9s16)));
+ q8s16 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q8s16),
+ vreinterpret_u8_s16(d10s16)));
+ q9s16 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q9s16),
+ vreinterpret_u8_s16(d11s16)));
+ q6s16 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q6s16),
+ vreinterpret_u8_s16(d8s16)));
+
+ d9s16 = vreinterpret_s16_u8(vqmovun_s16(q7s16));
+ d10s16 = vreinterpret_s16_u8(vqmovun_s16(q8s16));
+ d11s16 = vreinterpret_s16_u8(vqmovun_s16(q9s16));
+ d8s16 = vreinterpret_s16_u8(vqmovun_s16(q6s16));
+
+ vst1_s16((int16_t *)p1, d9s16);
+ p1 -= stride;
+ vst1_s16((int16_t *)p2, d10s16);
+ p2 += stride;
+ vst1_s16((int16_t *)p1, d8s16);
+ vst1_s16((int16_t *)p2, d11s16);
+ return;
+}
+
+#define STORE_COMBINE_EXTREME_RESULTS(r7, r6); \
+ __STORE_COMBINE_EXTREME_RESULTS(r7, r6, stride, \
+ q4s16, q5s16, q6s16, q7s16);
+static INLINE void __STORE_COMBINE_EXTREME_RESULTS(
+ uint8_t *p1,
+ uint8_t *p2,
+ int stride,
+ int16x8_t q4s16,
+ int16x8_t q5s16,
+ int16x8_t q6s16,
+ int16x8_t q7s16) {
+ int16x4_t d4s16, d5s16, d6s16, d7s16;
+
+ d4s16 = vld1_s16((int16_t *)p1);
+ p1 += stride;
+ d7s16 = vld1_s16((int16_t *)p2);
+ p2 -= stride;
+ d5s16 = vld1_s16((int16_t *)p1);
+ d6s16 = vld1_s16((int16_t *)p2);
+
+ q5s16 = vrshrq_n_s16(q5s16, 6);
+ q6s16 = vrshrq_n_s16(q6s16, 6);
+ q7s16 = vrshrq_n_s16(q7s16, 6);
+ q4s16 = vrshrq_n_s16(q4s16, 6);
+
+ q5s16 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q5s16),
+ vreinterpret_u8_s16(d5s16)));
+ q6s16 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q6s16),
+ vreinterpret_u8_s16(d6s16)));
+ q7s16 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q7s16),
+ vreinterpret_u8_s16(d7s16)));
+ q4s16 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q4s16),
+ vreinterpret_u8_s16(d4s16)));
+
+ d5s16 = vreinterpret_s16_u8(vqmovun_s16(q5s16));
+ d6s16 = vreinterpret_s16_u8(vqmovun_s16(q6s16));
+ d7s16 = vreinterpret_s16_u8(vqmovun_s16(q7s16));
+ d4s16 = vreinterpret_s16_u8(vqmovun_s16(q4s16));
+
+ vst1_s16((int16_t *)p1, d5s16);
+ p1 -= stride;
+ vst1_s16((int16_t *)p2, d6s16);
+ p2 += stride;
+ vst1_s16((int16_t *)p2, d7s16);
+ vst1_s16((int16_t *)p1, d4s16);
+ return;
+}
+
+#define DO_BUTTERFLY_STD(const_1, const_2, qA, qB) \
+ DO_BUTTERFLY(q14s16, q13s16, const_1, const_2, qA, qB);
+static INLINE void DO_BUTTERFLY(
+ int16x8_t q14s16,
+ int16x8_t q13s16,
+ int16_t first_const,
+ int16_t second_const,
+ int16x8_t *qAs16,
+ int16x8_t *qBs16) {
+ int16x4_t d30s16, d31s16;
+ int32x4_t q8s32, q9s32, q10s32, q11s32, q12s32, q15s32;
+ int16x4_t dCs16, dDs16, dAs16, dBs16;
+
+ dCs16 = vget_low_s16(q14s16);
+ dDs16 = vget_high_s16(q14s16);
+ dAs16 = vget_low_s16(q13s16);
+ dBs16 = vget_high_s16(q13s16);
+
+ d30s16 = vdup_n_s16(first_const);
+ d31s16 = vdup_n_s16(second_const);
+
+ q8s32 = vmull_s16(dCs16, d30s16);
+ q10s32 = vmull_s16(dAs16, d31s16);
+ q9s32 = vmull_s16(dDs16, d30s16);
+ q11s32 = vmull_s16(dBs16, d31s16);
+ q12s32 = vmull_s16(dCs16, d31s16);
+
+ q8s32 = vsubq_s32(q8s32, q10s32);
+ q9s32 = vsubq_s32(q9s32, q11s32);
+
+ q10s32 = vmull_s16(dDs16, d31s16);
+ q11s32 = vmull_s16(dAs16, d30s16);
+ q15s32 = vmull_s16(dBs16, d30s16);
+
+ q11s32 = vaddq_s32(q12s32, q11s32);
+ q10s32 = vaddq_s32(q10s32, q15s32);
+
+ *qAs16 = vcombine_s16(vqrshrn_n_s32(q8s32, 14),
+ vqrshrn_n_s32(q9s32, 14));
+ *qBs16 = vcombine_s16(vqrshrn_n_s32(q11s32, 14),
+ vqrshrn_n_s32(q10s32, 14));
+ return;
+}
+
+static INLINE void idct32_transpose_pair(
+ int16_t *input,
+ int16_t *t_buf) {
+ int16_t *in;
+ int i;
+ const int stride = 32;
+ int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16;
+ int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16;
+ int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16;
+ int32x4x2_t q0x2s32, q1x2s32, q2x2s32, q3x2s32;
+ int16x8x2_t q0x2s16, q1x2s16, q2x2s16, q3x2s16;
+
+ for (i = 0; i < 4; i++, input += 8) {
+ in = input;
+ q8s16 = vld1q_s16(in);
+ in += stride;
+ q9s16 = vld1q_s16(in);
+ in += stride;
+ q10s16 = vld1q_s16(in);
+ in += stride;
+ q11s16 = vld1q_s16(in);
+ in += stride;
+ q12s16 = vld1q_s16(in);
+ in += stride;
+ q13s16 = vld1q_s16(in);
+ in += stride;
+ q14s16 = vld1q_s16(in);
+ in += stride;
+ q15s16 = vld1q_s16(in);
+
+ d16s16 = vget_low_s16(q8s16);
+ d17s16 = vget_high_s16(q8s16);
+ d18s16 = vget_low_s16(q9s16);
+ d19s16 = vget_high_s16(q9s16);
+ d20s16 = vget_low_s16(q10s16);
+ d21s16 = vget_high_s16(q10s16);
+ d22s16 = vget_low_s16(q11s16);
+ d23s16 = vget_high_s16(q11s16);
+ d24s16 = vget_low_s16(q12s16);
+ d25s16 = vget_high_s16(q12s16);
+ d26s16 = vget_low_s16(q13s16);
+ d27s16 = vget_high_s16(q13s16);
+ d28s16 = vget_low_s16(q14s16);
+ d29s16 = vget_high_s16(q14s16);
+ d30s16 = vget_low_s16(q15s16);
+ d31s16 = vget_high_s16(q15s16);
+
+ q8s16 = vcombine_s16(d16s16, d24s16); // vswp d17, d24
+ q9s16 = vcombine_s16(d18s16, d26s16); // vswp d19, d26
+ q10s16 = vcombine_s16(d20s16, d28s16); // vswp d21, d28
+ q11s16 = vcombine_s16(d22s16, d30s16); // vswp d23, d30
+ q12s16 = vcombine_s16(d17s16, d25s16);
+ q13s16 = vcombine_s16(d19s16, d27s16);
+ q14s16 = vcombine_s16(d21s16, d29s16);
+ q15s16 = vcombine_s16(d23s16, d31s16);
+
+ q0x2s32 = vtrnq_s32(vreinterpretq_s32_s16(q8s16),
+ vreinterpretq_s32_s16(q10s16));
+ q1x2s32 = vtrnq_s32(vreinterpretq_s32_s16(q9s16),
+ vreinterpretq_s32_s16(q11s16));
+ q2x2s32 = vtrnq_s32(vreinterpretq_s32_s16(q12s16),
+ vreinterpretq_s32_s16(q14s16));
+ q3x2s32 = vtrnq_s32(vreinterpretq_s32_s16(q13s16),
+ vreinterpretq_s32_s16(q15s16));
+
+ q0x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q0x2s32.val[0]), // q8
+ vreinterpretq_s16_s32(q1x2s32.val[0])); // q9
+ q1x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q0x2s32.val[1]), // q10
+ vreinterpretq_s16_s32(q1x2s32.val[1])); // q11
+ q2x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q2x2s32.val[0]), // q12
+ vreinterpretq_s16_s32(q3x2s32.val[0])); // q13
+ q3x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q2x2s32.val[1]), // q14
+ vreinterpretq_s16_s32(q3x2s32.val[1])); // q15
+
+ vst1q_s16(t_buf, q0x2s16.val[0]);
+ t_buf += 8;
+ vst1q_s16(t_buf, q0x2s16.val[1]);
+ t_buf += 8;
+ vst1q_s16(t_buf, q1x2s16.val[0]);
+ t_buf += 8;
+ vst1q_s16(t_buf, q1x2s16.val[1]);
+ t_buf += 8;
+ vst1q_s16(t_buf, q2x2s16.val[0]);
+ t_buf += 8;
+ vst1q_s16(t_buf, q2x2s16.val[1]);
+ t_buf += 8;
+ vst1q_s16(t_buf, q3x2s16.val[0]);
+ t_buf += 8;
+ vst1q_s16(t_buf, q3x2s16.val[1]);
+ t_buf += 8;
+ }
+ return;
+}
+
+static INLINE void idct32_bands_end_1st_pass(
+ int16_t *out,
+ int16x8_t q2s16,
+ int16x8_t q3s16,
+ int16x8_t q6s16,
+ int16x8_t q7s16,
+ int16x8_t q8s16,
+ int16x8_t q9s16,
+ int16x8_t q10s16,
+ int16x8_t q11s16,
+ int16x8_t q12s16,
+ int16x8_t q13s16,
+ int16x8_t q14s16,
+ int16x8_t q15s16) {
+ int16x8_t q0s16, q1s16, q4s16, q5s16;
+
+ STORE_IN_OUTPUT(17, 16, 17, q6s16, q7s16);
+ STORE_IN_OUTPUT(17, 14, 15, q8s16, q9s16);
+
+ LOAD_FROM_OUTPUT(15, 30, 31, q0s16, q1s16);
+ q4s16 = vaddq_s16(q2s16, q1s16);
+ q5s16 = vaddq_s16(q3s16, q0s16);
+ q6s16 = vsubq_s16(q3s16, q0s16);
+ q7s16 = vsubq_s16(q2s16, q1s16);
+ STORE_IN_OUTPUT(31, 30, 31, q6s16, q7s16);
+ STORE_IN_OUTPUT(31, 0, 1, q4s16, q5s16);
+
+ LOAD_FROM_OUTPUT(1, 12, 13, q0s16, q1s16);
+ q2s16 = vaddq_s16(q10s16, q1s16);
+ q3s16 = vaddq_s16(q11s16, q0s16);
+ q4s16 = vsubq_s16(q11s16, q0s16);
+ q5s16 = vsubq_s16(q10s16, q1s16);
+
+ LOAD_FROM_OUTPUT(13, 18, 19, q0s16, q1s16);
+ q8s16 = vaddq_s16(q4s16, q1s16);
+ q9s16 = vaddq_s16(q5s16, q0s16);
+ q6s16 = vsubq_s16(q5s16, q0s16);
+ q7s16 = vsubq_s16(q4s16, q1s16);
+ STORE_IN_OUTPUT(19, 18, 19, q6s16, q7s16);
+ STORE_IN_OUTPUT(19, 12, 13, q8s16, q9s16);
+
+ LOAD_FROM_OUTPUT(13, 28, 29, q0s16, q1s16);
+ q4s16 = vaddq_s16(q2s16, q1s16);
+ q5s16 = vaddq_s16(q3s16, q0s16);
+ q6s16 = vsubq_s16(q3s16, q0s16);
+ q7s16 = vsubq_s16(q2s16, q1s16);
+ STORE_IN_OUTPUT(29, 28, 29, q6s16, q7s16);
+ STORE_IN_OUTPUT(29, 2, 3, q4s16, q5s16);
+
+ LOAD_FROM_OUTPUT(3, 10, 11, q0s16, q1s16);
+ q2s16 = vaddq_s16(q12s16, q1s16);
+ q3s16 = vaddq_s16(q13s16, q0s16);
+ q4s16 = vsubq_s16(q13s16, q0s16);
+ q5s16 = vsubq_s16(q12s16, q1s16);
+
+ LOAD_FROM_OUTPUT(11, 20, 21, q0s16, q1s16);
+ q8s16 = vaddq_s16(q4s16, q1s16);
+ q9s16 = vaddq_s16(q5s16, q0s16);
+ q6s16 = vsubq_s16(q5s16, q0s16);
+ q7s16 = vsubq_s16(q4s16, q1s16);
+ STORE_IN_OUTPUT(21, 20, 21, q6s16, q7s16);
+ STORE_IN_OUTPUT(21, 10, 11, q8s16, q9s16);
+
+ LOAD_FROM_OUTPUT(11, 26, 27, q0s16, q1s16);
+ q4s16 = vaddq_s16(q2s16, q1s16);
+ q5s16 = vaddq_s16(q3s16, q0s16);
+ q6s16 = vsubq_s16(q3s16, q0s16);
+ q7s16 = vsubq_s16(q2s16, q1s16);
+ STORE_IN_OUTPUT(27, 26, 27, q6s16, q7s16);
+ STORE_IN_OUTPUT(27, 4, 5, q4s16, q5s16);
+
+ LOAD_FROM_OUTPUT(5, 8, 9, q0s16, q1s16);
+ q2s16 = vaddq_s16(q14s16, q1s16);
+ q3s16 = vaddq_s16(q15s16, q0s16);
+ q4s16 = vsubq_s16(q15s16, q0s16);
+ q5s16 = vsubq_s16(q14s16, q1s16);
+
+ LOAD_FROM_OUTPUT(9, 22, 23, q0s16, q1s16);
+ q8s16 = vaddq_s16(q4s16, q1s16);
+ q9s16 = vaddq_s16(q5s16, q0s16);
+ q6s16 = vsubq_s16(q5s16, q0s16);
+ q7s16 = vsubq_s16(q4s16, q1s16);
+ STORE_IN_OUTPUT(23, 22, 23, q6s16, q7s16);
+ STORE_IN_OUTPUT(23, 8, 9, q8s16, q9s16);
+
+ LOAD_FROM_OUTPUT(9, 24, 25, q0s16, q1s16);
+ q4s16 = vaddq_s16(q2s16, q1s16);
+ q5s16 = vaddq_s16(q3s16, q0s16);
+ q6s16 = vsubq_s16(q3s16, q0s16);
+ q7s16 = vsubq_s16(q2s16, q1s16);
+ STORE_IN_OUTPUT(25, 24, 25, q6s16, q7s16);
+ STORE_IN_OUTPUT(25, 6, 7, q4s16, q5s16);
+ return;
+}
+
+static INLINE void idct32_bands_end_2nd_pass(
+ int16_t *out,
+ uint8_t *dest,
+ int stride,
+ int16x8_t q2s16,
+ int16x8_t q3s16,
+ int16x8_t q6s16,
+ int16x8_t q7s16,
+ int16x8_t q8s16,
+ int16x8_t q9s16,
+ int16x8_t q10s16,
+ int16x8_t q11s16,
+ int16x8_t q12s16,
+ int16x8_t q13s16,
+ int16x8_t q14s16,
+ int16x8_t q15s16) {
+ uint8_t *r6 = dest + 31 * stride;
+ uint8_t *r7 = dest/* + 0 * stride*/;
+ uint8_t *r9 = dest + 15 * stride;
+ uint8_t *r10 = dest + 16 * stride;
+ int str2 = stride << 1;
+ int16x8_t q0s16, q1s16, q4s16, q5s16;
+
+ STORE_COMBINE_CENTER_RESULTS(r10, r9);
+ r10 += str2; r9 -= str2;
+
+ LOAD_FROM_OUTPUT(17, 30, 31, q0s16, q1s16)
+ q4s16 = vaddq_s16(q2s16, q1s16);
+ q5s16 = vaddq_s16(q3s16, q0s16);
+ q6s16 = vsubq_s16(q3s16, q0s16);
+ q7s16 = vsubq_s16(q2s16, q1s16);
+ STORE_COMBINE_EXTREME_RESULTS(r7, r6);
+ r7 += str2; r6 -= str2;
+
+ LOAD_FROM_OUTPUT(31, 12, 13, q0s16, q1s16)
+ q2s16 = vaddq_s16(q10s16, q1s16);
+ q3s16 = vaddq_s16(q11s16, q0s16);
+ q4s16 = vsubq_s16(q11s16, q0s16);
+ q5s16 = vsubq_s16(q10s16, q1s16);
+
+ LOAD_FROM_OUTPUT(13, 18, 19, q0s16, q1s16)
+ q8s16 = vaddq_s16(q4s16, q1s16);
+ q9s16 = vaddq_s16(q5s16, q0s16);
+ q6s16 = vsubq_s16(q5s16, q0s16);
+ q7s16 = vsubq_s16(q4s16, q1s16);
+ STORE_COMBINE_CENTER_RESULTS(r10, r9);
+ r10 += str2; r9 -= str2;
+
+ LOAD_FROM_OUTPUT(19, 28, 29, q0s16, q1s16)
+ q4s16 = vaddq_s16(q2s16, q1s16);
+ q5s16 = vaddq_s16(q3s16, q0s16);
+ q6s16 = vsubq_s16(q3s16, q0s16);
+ q7s16 = vsubq_s16(q2s16, q1s16);
+ STORE_COMBINE_EXTREME_RESULTS(r7, r6);
+ r7 += str2; r6 -= str2;
+
+ LOAD_FROM_OUTPUT(29, 10, 11, q0s16, q1s16)
+ q2s16 = vaddq_s16(q12s16, q1s16);
+ q3s16 = vaddq_s16(q13s16, q0s16);
+ q4s16 = vsubq_s16(q13s16, q0s16);
+ q5s16 = vsubq_s16(q12s16, q1s16);
+
+ LOAD_FROM_OUTPUT(11, 20, 21, q0s16, q1s16)
+ q8s16 = vaddq_s16(q4s16, q1s16);
+ q9s16 = vaddq_s16(q5s16, q0s16);
+ q6s16 = vsubq_s16(q5s16, q0s16);
+ q7s16 = vsubq_s16(q4s16, q1s16);
+ STORE_COMBINE_CENTER_RESULTS(r10, r9);
+ r10 += str2; r9 -= str2;
+
+ LOAD_FROM_OUTPUT(21, 26, 27, q0s16, q1s16)
+ q4s16 = vaddq_s16(q2s16, q1s16);
+ q5s16 = vaddq_s16(q3s16, q0s16);
+ q6s16 = vsubq_s16(q3s16, q0s16);
+ q7s16 = vsubq_s16(q2s16, q1s16);
+ STORE_COMBINE_EXTREME_RESULTS(r7, r6);
+ r7 += str2; r6 -= str2;
+
+ LOAD_FROM_OUTPUT(27, 8, 9, q0s16, q1s16)
+ q2s16 = vaddq_s16(q14s16, q1s16);
+ q3s16 = vaddq_s16(q15s16, q0s16);
+ q4s16 = vsubq_s16(q15s16, q0s16);
+ q5s16 = vsubq_s16(q14s16, q1s16);
+
+ LOAD_FROM_OUTPUT(9, 22, 23, q0s16, q1s16)
+ q8s16 = vaddq_s16(q4s16, q1s16);
+ q9s16 = vaddq_s16(q5s16, q0s16);
+ q6s16 = vsubq_s16(q5s16, q0s16);
+ q7s16 = vsubq_s16(q4s16, q1s16);
+ STORE_COMBINE_CENTER_RESULTS(r10, r9);
+
+ LOAD_FROM_OUTPUT(23, 24, 25, q0s16, q1s16)
+ q4s16 = vaddq_s16(q2s16, q1s16);
+ q5s16 = vaddq_s16(q3s16, q0s16);
+ q6s16 = vsubq_s16(q3s16, q0s16);
+ q7s16 = vsubq_s16(q2s16, q1s16);
+ STORE_COMBINE_EXTREME_RESULTS(r7, r6);
+ return;
+}
+
+void vpx_idct32x32_1024_add_neon(
+ int16_t *input,
+ uint8_t *dest,
+ int stride) {
+ int i, idct32_pass_loop;
+ int16_t trans_buf[32 * 8];
+ int16_t pass1[32 * 32];
+ int16_t pass2[32 * 32];
+ int16_t *out;
+ int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16;
+ int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16;
+
+ for (idct32_pass_loop = 0, out = pass1;
+ idct32_pass_loop < 2;
+ idct32_pass_loop++,
+ input = pass1, // the input of pass2 is the result of pass1
+ out = pass2) {
+ for (i = 0;
+ i < 4; i++,
+ input += 32 * 8, out += 8) { // idct32_bands_loop
+ idct32_transpose_pair(input, trans_buf);
+
+ // -----------------------------------------
+ // BLOCK A: 16-19,28-31
+ // -----------------------------------------
+ // generate 16,17,30,31
+ // part of stage 1
+ LOAD_FROM_TRANSPOSED(0, 1, 31)
+ DO_BUTTERFLY_STD(cospi_31_64, cospi_1_64, &q0s16, &q2s16)
+ LOAD_FROM_TRANSPOSED(31, 17, 15)
+ DO_BUTTERFLY_STD(cospi_15_64, cospi_17_64, &q1s16, &q3s16)
+ // part of stage 2
+ q4s16 = vaddq_s16(q0s16, q1s16);
+ q13s16 = vsubq_s16(q0s16, q1s16);
+ q6s16 = vaddq_s16(q2s16, q3s16);
+ q14s16 = vsubq_s16(q2s16, q3s16);
+ // part of stage 3
+ DO_BUTTERFLY_STD(cospi_28_64, cospi_4_64, &q5s16, &q7s16)
+
+ // generate 18,19,28,29
+ // part of stage 1
+ LOAD_FROM_TRANSPOSED(15, 9, 23)
+ DO_BUTTERFLY_STD(cospi_23_64, cospi_9_64, &q0s16, &q2s16)
+ LOAD_FROM_TRANSPOSED(23, 25, 7)
+ DO_BUTTERFLY_STD(cospi_7_64, cospi_25_64, &q1s16, &q3s16)
+ // part of stage 2
+ q13s16 = vsubq_s16(q3s16, q2s16);
+ q3s16 = vaddq_s16(q3s16, q2s16);
+ q14s16 = vsubq_s16(q1s16, q0s16);
+ q2s16 = vaddq_s16(q1s16, q0s16);
+ // part of stage 3
+ DO_BUTTERFLY_STD(-cospi_4_64, -cospi_28_64, &q1s16, &q0s16)
+ // part of stage 4
+ q8s16 = vaddq_s16(q4s16, q2s16);
+ q9s16 = vaddq_s16(q5s16, q0s16);
+ q10s16 = vaddq_s16(q7s16, q1s16);
+ q15s16 = vaddq_s16(q6s16, q3s16);
+ q13s16 = vsubq_s16(q5s16, q0s16);
+ q14s16 = vsubq_s16(q7s16, q1s16);
+ STORE_IN_OUTPUT(0, 16, 31, q8s16, q15s16)
+ STORE_IN_OUTPUT(31, 17, 30, q9s16, q10s16)
+ // part of stage 5
+ DO_BUTTERFLY_STD(cospi_24_64, cospi_8_64, &q0s16, &q1s16)
+ STORE_IN_OUTPUT(30, 29, 18, q1s16, q0s16)
+ // part of stage 4
+ q13s16 = vsubq_s16(q4s16, q2s16);
+ q14s16 = vsubq_s16(q6s16, q3s16);
+ // part of stage 5
+ DO_BUTTERFLY_STD(cospi_24_64, cospi_8_64, &q4s16, &q6s16)
+ STORE_IN_OUTPUT(18, 19, 28, q4s16, q6s16)
+
+ // -----------------------------------------
+ // BLOCK B: 20-23,24-27
+ // -----------------------------------------
+ // generate 20,21,26,27
+ // part of stage 1
+ LOAD_FROM_TRANSPOSED(7, 5, 27)
+ DO_BUTTERFLY_STD(cospi_27_64, cospi_5_64, &q0s16, &q2s16)
+ LOAD_FROM_TRANSPOSED(27, 21, 11)
+ DO_BUTTERFLY_STD(cospi_11_64, cospi_21_64, &q1s16, &q3s16)
+ // part of stage 2
+ q13s16 = vsubq_s16(q0s16, q1s16);
+ q0s16 = vaddq_s16(q0s16, q1s16);
+ q14s16 = vsubq_s16(q2s16, q3s16);
+ q2s16 = vaddq_s16(q2s16, q3s16);
+ // part of stage 3
+ DO_BUTTERFLY_STD(cospi_12_64, cospi_20_64, &q1s16, &q3s16)
+
+ // generate 22,23,24,25
+ // part of stage 1
+ LOAD_FROM_TRANSPOSED(11, 13, 19)
+ DO_BUTTERFLY_STD(cospi_19_64, cospi_13_64, &q5s16, &q7s16)
+ LOAD_FROM_TRANSPOSED(19, 29, 3)
+ DO_BUTTERFLY_STD(cospi_3_64, cospi_29_64, &q4s16, &q6s16)
+ // part of stage 2
+ q14s16 = vsubq_s16(q4s16, q5s16);
+ q5s16 = vaddq_s16(q4s16, q5s16);
+ q13s16 = vsubq_s16(q6s16, q7s16);
+ q6s16 = vaddq_s16(q6s16, q7s16);
+ // part of stage 3
+ DO_BUTTERFLY_STD(-cospi_20_64, -cospi_12_64, &q4s16, &q7s16)
+ // part of stage 4
+ q10s16 = vaddq_s16(q7s16, q1s16);
+ q11s16 = vaddq_s16(q5s16, q0s16);
+ q12s16 = vaddq_s16(q6s16, q2s16);
+ q15s16 = vaddq_s16(q4s16, q3s16);
+ // part of stage 6
+ LOAD_FROM_OUTPUT(28, 16, 17, q14s16, q13s16)
+ q8s16 = vaddq_s16(q14s16, q11s16);
+ q9s16 = vaddq_s16(q13s16, q10s16);
+ q13s16 = vsubq_s16(q13s16, q10s16);
+ q11s16 = vsubq_s16(q14s16, q11s16);
+ STORE_IN_OUTPUT(17, 17, 16, q9s16, q8s16)
+ LOAD_FROM_OUTPUT(16, 30, 31, q14s16, q9s16)
+ q8s16 = vsubq_s16(q9s16, q12s16);
+ q10s16 = vaddq_s16(q14s16, q15s16);
+ q14s16 = vsubq_s16(q14s16, q15s16);
+ q12s16 = vaddq_s16(q9s16, q12s16);
+ STORE_IN_OUTPUT(31, 30, 31, q10s16, q12s16)
+ // part of stage 7
+ DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q13s16, &q14s16)
+ STORE_IN_OUTPUT(31, 25, 22, q14s16, q13s16)
+ q13s16 = q11s16;
+ q14s16 = q8s16;
+ DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q13s16, &q14s16)
+ STORE_IN_OUTPUT(22, 24, 23, q14s16, q13s16)
+ // part of stage 4
+ q14s16 = vsubq_s16(q5s16, q0s16);
+ q13s16 = vsubq_s16(q6s16, q2s16);
+ DO_BUTTERFLY_STD(-cospi_8_64, -cospi_24_64, &q5s16, &q6s16);
+ q14s16 = vsubq_s16(q7s16, q1s16);
+ q13s16 = vsubq_s16(q4s16, q3s16);
+ DO_BUTTERFLY_STD(-cospi_8_64, -cospi_24_64, &q0s16, &q1s16);
+ // part of stage 6
+ LOAD_FROM_OUTPUT(23, 18, 19, q14s16, q13s16)
+ q8s16 = vaddq_s16(q14s16, q1s16);
+ q9s16 = vaddq_s16(q13s16, q6s16);
+ q13s16 = vsubq_s16(q13s16, q6s16);
+ q1s16 = vsubq_s16(q14s16, q1s16);
+ STORE_IN_OUTPUT(19, 18, 19, q8s16, q9s16)
+ LOAD_FROM_OUTPUT(19, 28, 29, q8s16, q9s16)
+ q14s16 = vsubq_s16(q8s16, q5s16);
+ q10s16 = vaddq_s16(q8s16, q5s16);
+ q11s16 = vaddq_s16(q9s16, q0s16);
+ q0s16 = vsubq_s16(q9s16, q0s16);
+ STORE_IN_OUTPUT(29, 28, 29, q10s16, q11s16)
+ // part of stage 7
+ DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q13s16, &q14s16)
+ STORE_IN_OUTPUT(29, 20, 27, q13s16, q14s16)
+ DO_BUTTERFLY(q0s16, q1s16, cospi_16_64, cospi_16_64,
+ &q1s16, &q0s16);
+ STORE_IN_OUTPUT(27, 21, 26, q1s16, q0s16)
+
+ // -----------------------------------------
+ // BLOCK C: 8-10,11-15
+ // -----------------------------------------
+ // generate 8,9,14,15
+ // part of stage 2
+ LOAD_FROM_TRANSPOSED(3, 2, 30)
+ DO_BUTTERFLY_STD(cospi_30_64, cospi_2_64, &q0s16, &q2s16)
+ LOAD_FROM_TRANSPOSED(30, 18, 14)
+ DO_BUTTERFLY_STD(cospi_14_64, cospi_18_64, &q1s16, &q3s16)
+ // part of stage 3
+ q13s16 = vsubq_s16(q0s16, q1s16);
+ q0s16 = vaddq_s16(q0s16, q1s16);
+ q14s16 = vsubq_s16(q2s16, q3s16);
+ q2s16 = vaddq_s16(q2s16, q3s16);
+ // part of stage 4
+ DO_BUTTERFLY_STD(cospi_24_64, cospi_8_64, &q1s16, &q3s16)
+
+ // generate 10,11,12,13
+ // part of stage 2
+ LOAD_FROM_TRANSPOSED(14, 10, 22)
+ DO_BUTTERFLY_STD(cospi_22_64, cospi_10_64, &q5s16, &q7s16)
+ LOAD_FROM_TRANSPOSED(22, 26, 6)
+ DO_BUTTERFLY_STD(cospi_6_64, cospi_26_64, &q4s16, &q6s16)
+ // part of stage 3
+ q14s16 = vsubq_s16(q4s16, q5s16);
+ q5s16 = vaddq_s16(q4s16, q5s16);
+ q13s16 = vsubq_s16(q6s16, q7s16);
+ q6s16 = vaddq_s16(q6s16, q7s16);
+ // part of stage 4
+ DO_BUTTERFLY_STD(-cospi_8_64, -cospi_24_64, &q4s16, &q7s16)
+ // part of stage 5
+ q8s16 = vaddq_s16(q0s16, q5s16);
+ q9s16 = vaddq_s16(q1s16, q7s16);
+ q13s16 = vsubq_s16(q1s16, q7s16);
+ q14s16 = vsubq_s16(q3s16, q4s16);
+ q10s16 = vaddq_s16(q3s16, q4s16);
+ q15s16 = vaddq_s16(q2s16, q6s16);
+ STORE_IN_OUTPUT(26, 8, 15, q8s16, q15s16)
+ STORE_IN_OUTPUT(15, 9, 14, q9s16, q10s16)
+ // part of stage 6
+ DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q1s16, &q3s16)
+ STORE_IN_OUTPUT(14, 13, 10, q3s16, q1s16)
+ q13s16 = vsubq_s16(q0s16, q5s16);
+ q14s16 = vsubq_s16(q2s16, q6s16);
+ DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q1s16, &q3s16)
+ STORE_IN_OUTPUT(10, 11, 12, q1s16, q3s16)
+
+ // -----------------------------------------
+ // BLOCK D: 0-3,4-7
+ // -----------------------------------------
+ // generate 4,5,6,7
+ // part of stage 3
+ LOAD_FROM_TRANSPOSED(6, 4, 28)
+ DO_BUTTERFLY_STD(cospi_28_64, cospi_4_64, &q0s16, &q2s16)
+ LOAD_FROM_TRANSPOSED(28, 20, 12)
+ DO_BUTTERFLY_STD(cospi_12_64, cospi_20_64, &q1s16, &q3s16)
+ // part of stage 4
+ q13s16 = vsubq_s16(q0s16, q1s16);
+ q0s16 = vaddq_s16(q0s16, q1s16);
+ q14s16 = vsubq_s16(q2s16, q3s16);
+ q2s16 = vaddq_s16(q2s16, q3s16);
+ // part of stage 5
+ DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q1s16, &q3s16)
+
+ // generate 0,1,2,3
+ // part of stage 4
+ LOAD_FROM_TRANSPOSED(12, 0, 16)
+ DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q5s16, &q7s16)
+ LOAD_FROM_TRANSPOSED(16, 8, 24)
+ DO_BUTTERFLY_STD(cospi_24_64, cospi_8_64, &q14s16, &q6s16)
+ // part of stage 5
+ q4s16 = vaddq_s16(q7s16, q6s16);
+ q7s16 = vsubq_s16(q7s16, q6s16);
+ q6s16 = vsubq_s16(q5s16, q14s16);
+ q5s16 = vaddq_s16(q5s16, q14s16);
+ // part of stage 6
+ q8s16 = vaddq_s16(q4s16, q2s16);
+ q9s16 = vaddq_s16(q5s16, q3s16);
+ q10s16 = vaddq_s16(q6s16, q1s16);
+ q11s16 = vaddq_s16(q7s16, q0s16);
+ q12s16 = vsubq_s16(q7s16, q0s16);
+ q13s16 = vsubq_s16(q6s16, q1s16);
+ q14s16 = vsubq_s16(q5s16, q3s16);
+ q15s16 = vsubq_s16(q4s16, q2s16);
+ // part of stage 7
+ LOAD_FROM_OUTPUT(12, 14, 15, q0s16, q1s16)
+ q2s16 = vaddq_s16(q8s16, q1s16);
+ q3s16 = vaddq_s16(q9s16, q0s16);
+ q4s16 = vsubq_s16(q9s16, q0s16);
+ q5s16 = vsubq_s16(q8s16, q1s16);
+ LOAD_FROM_OUTPUT(15, 16, 17, q0s16, q1s16)
+ q8s16 = vaddq_s16(q4s16, q1s16);
+ q9s16 = vaddq_s16(q5s16, q0s16);
+ q6s16 = vsubq_s16(q5s16, q0s16);
+ q7s16 = vsubq_s16(q4s16, q1s16);
+
+ if (idct32_pass_loop == 0) {
+ idct32_bands_end_1st_pass(out,
+ q2s16, q3s16, q6s16, q7s16, q8s16, q9s16,
+ q10s16, q11s16, q12s16, q13s16, q14s16, q15s16);
+ } else {
+ idct32_bands_end_2nd_pass(out, dest, stride,
+ q2s16, q3s16, q6s16, q7s16, q8s16, q9s16,
+ q10s16, q11s16, q12s16, q13s16, q14s16, q15s16);
+ dest += 8;
+ }
+ }
+ }
+ return;
+}
diff --git a/thirdparty/libvpx/vpx_dsp/arm/idct4x4_1_add_neon.c b/thirdparty/libvpx/vpx_dsp/arm/idct4x4_1_add_neon.c
new file mode 100644
index 0000000000..ea618700c9
--- /dev/null
+++ b/thirdparty/libvpx/vpx_dsp/arm/idct4x4_1_add_neon.c
@@ -0,0 +1,50 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "vpx_dsp/inv_txfm.h"
+#include "vpx_ports/mem.h"
+
+void vpx_idct4x4_1_add_neon(
+ int16_t *input,
+ uint8_t *dest,
+ int dest_stride) {
+ uint8x8_t d6u8;
+ uint32x2_t d2u32 = vdup_n_u32(0);
+ uint16x8_t q8u16;
+ int16x8_t q0s16;
+ uint8_t *d1, *d2;
+ int16_t i, a1, cospi_16_64 = 11585;
+ int16_t out = dct_const_round_shift(input[0] * cospi_16_64);
+ out = dct_const_round_shift(out * cospi_16_64);
+ a1 = ROUND_POWER_OF_TWO(out, 4);
+
+ q0s16 = vdupq_n_s16(a1);
+
+ // dc_only_idct_add
+ d1 = d2 = dest;
+ for (i = 0; i < 2; i++) {
+ d2u32 = vld1_lane_u32((const uint32_t *)d1, d2u32, 0);
+ d1 += dest_stride;
+ d2u32 = vld1_lane_u32((const uint32_t *)d1, d2u32, 1);
+ d1 += dest_stride;
+
+ q8u16 = vaddw_u8(vreinterpretq_u16_s16(q0s16),
+ vreinterpret_u8_u32(d2u32));
+ d6u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16));
+
+ vst1_lane_u32((uint32_t *)d2, vreinterpret_u32_u8(d6u8), 0);
+ d2 += dest_stride;
+ vst1_lane_u32((uint32_t *)d2, vreinterpret_u32_u8(d6u8), 1);
+ d2 += dest_stride;
+ }
+ return;
+}
diff --git a/thirdparty/libvpx/vpx_dsp/arm/idct4x4_add_neon.c b/thirdparty/libvpx/vpx_dsp/arm/idct4x4_add_neon.c
new file mode 100644
index 0000000000..3c975c99b7
--- /dev/null
+++ b/thirdparty/libvpx/vpx_dsp/arm/idct4x4_add_neon.c
@@ -0,0 +1,151 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+void vpx_idct4x4_16_add_neon(
+ int16_t *input,
+ uint8_t *dest,
+ int dest_stride) {
+ uint8x8_t d26u8, d27u8;
+ uint32x2_t d26u32, d27u32;
+ uint16x8_t q8u16, q9u16;
+ int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16;
+ int16x4_t d22s16, d23s16, d24s16, d26s16, d27s16, d28s16, d29s16;
+ int16x8_t q8s16, q9s16, q13s16, q14s16;
+ int32x4_t q1s32, q13s32, q14s32, q15s32;
+ int16x4x2_t d0x2s16, d1x2s16;
+ int32x4x2_t q0x2s32;
+ uint8_t *d;
+ int16_t cospi_8_64 = 15137;
+ int16_t cospi_16_64 = 11585;
+ int16_t cospi_24_64 = 6270;
+
+ d26u32 = d27u32 = vdup_n_u32(0);
+
+ q8s16 = vld1q_s16(input);
+ q9s16 = vld1q_s16(input + 8);
+
+ d16s16 = vget_low_s16(q8s16);
+ d17s16 = vget_high_s16(q8s16);
+ d18s16 = vget_low_s16(q9s16);
+ d19s16 = vget_high_s16(q9s16);
+
+ d0x2s16 = vtrn_s16(d16s16, d17s16);
+ d1x2s16 = vtrn_s16(d18s16, d19s16);
+ q8s16 = vcombine_s16(d0x2s16.val[0], d0x2s16.val[1]);
+ q9s16 = vcombine_s16(d1x2s16.val[0], d1x2s16.val[1]);
+
+ d20s16 = vdup_n_s16(cospi_8_64);
+ d21s16 = vdup_n_s16(cospi_16_64);
+
+ q0x2s32 = vtrnq_s32(vreinterpretq_s32_s16(q8s16),
+ vreinterpretq_s32_s16(q9s16));
+ d16s16 = vget_low_s16(vreinterpretq_s16_s32(q0x2s32.val[0]));
+ d17s16 = vget_high_s16(vreinterpretq_s16_s32(q0x2s32.val[0]));
+ d18s16 = vget_low_s16(vreinterpretq_s16_s32(q0x2s32.val[1]));
+ d19s16 = vget_high_s16(vreinterpretq_s16_s32(q0x2s32.val[1]));
+
+ d22s16 = vdup_n_s16(cospi_24_64);
+
+ // stage 1
+ d23s16 = vadd_s16(d16s16, d18s16);
+ d24s16 = vsub_s16(d16s16, d18s16);
+
+ q15s32 = vmull_s16(d17s16, d22s16);
+ q1s32 = vmull_s16(d17s16, d20s16);
+ q13s32 = vmull_s16(d23s16, d21s16);
+ q14s32 = vmull_s16(d24s16, d21s16);
+
+ q15s32 = vmlsl_s16(q15s32, d19s16, d20s16);
+ q1s32 = vmlal_s16(q1s32, d19s16, d22s16);
+
+ d26s16 = vqrshrn_n_s32(q13s32, 14);
+ d27s16 = vqrshrn_n_s32(q14s32, 14);
+ d29s16 = vqrshrn_n_s32(q15s32, 14);
+ d28s16 = vqrshrn_n_s32(q1s32, 14);
+ q13s16 = vcombine_s16(d26s16, d27s16);
+ q14s16 = vcombine_s16(d28s16, d29s16);
+
+ // stage 2
+ q8s16 = vaddq_s16(q13s16, q14s16);
+ q9s16 = vsubq_s16(q13s16, q14s16);
+
+ d16s16 = vget_low_s16(q8s16);
+ d17s16 = vget_high_s16(q8s16);
+ d18s16 = vget_high_s16(q9s16); // vswp d18 d19
+ d19s16 = vget_low_s16(q9s16);
+
+ d0x2s16 = vtrn_s16(d16s16, d17s16);
+ d1x2s16 = vtrn_s16(d18s16, d19s16);
+ q8s16 = vcombine_s16(d0x2s16.val[0], d0x2s16.val[1]);
+ q9s16 = vcombine_s16(d1x2s16.val[0], d1x2s16.val[1]);
+
+ q0x2s32 = vtrnq_s32(vreinterpretq_s32_s16(q8s16),
+ vreinterpretq_s32_s16(q9s16));
+ d16s16 = vget_low_s16(vreinterpretq_s16_s32(q0x2s32.val[0]));
+ d17s16 = vget_high_s16(vreinterpretq_s16_s32(q0x2s32.val[0]));
+ d18s16 = vget_low_s16(vreinterpretq_s16_s32(q0x2s32.val[1]));
+ d19s16 = vget_high_s16(vreinterpretq_s16_s32(q0x2s32.val[1]));
+
+ // do the transform on columns
+ // stage 1
+ d23s16 = vadd_s16(d16s16, d18s16);
+ d24s16 = vsub_s16(d16s16, d18s16);
+
+ q15s32 = vmull_s16(d17s16, d22s16);
+ q1s32 = vmull_s16(d17s16, d20s16);
+ q13s32 = vmull_s16(d23s16, d21s16);
+ q14s32 = vmull_s16(d24s16, d21s16);
+
+ q15s32 = vmlsl_s16(q15s32, d19s16, d20s16);
+ q1s32 = vmlal_s16(q1s32, d19s16, d22s16);
+
+ d26s16 = vqrshrn_n_s32(q13s32, 14);
+ d27s16 = vqrshrn_n_s32(q14s32, 14);
+ d29s16 = vqrshrn_n_s32(q15s32, 14);
+ d28s16 = vqrshrn_n_s32(q1s32, 14);
+ q13s16 = vcombine_s16(d26s16, d27s16);
+ q14s16 = vcombine_s16(d28s16, d29s16);
+
+ // stage 2
+ q8s16 = vaddq_s16(q13s16, q14s16);
+ q9s16 = vsubq_s16(q13s16, q14s16);
+
+ q8s16 = vrshrq_n_s16(q8s16, 4);
+ q9s16 = vrshrq_n_s16(q9s16, 4);
+
+ d = dest;
+ d26u32 = vld1_lane_u32((const uint32_t *)d, d26u32, 0);
+ d += dest_stride;
+ d26u32 = vld1_lane_u32((const uint32_t *)d, d26u32, 1);
+ d += dest_stride;
+ d27u32 = vld1_lane_u32((const uint32_t *)d, d27u32, 1);
+ d += dest_stride;
+ d27u32 = vld1_lane_u32((const uint32_t *)d, d27u32, 0);
+
+ q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16),
+ vreinterpret_u8_u32(d26u32));
+ q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16),
+ vreinterpret_u8_u32(d27u32));
+
+ d26u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16));
+ d27u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16));
+
+ d = dest;
+ vst1_lane_u32((uint32_t *)d, vreinterpret_u32_u8(d26u8), 0);
+ d += dest_stride;
+ vst1_lane_u32((uint32_t *)d, vreinterpret_u32_u8(d26u8), 1);
+ d += dest_stride;
+ vst1_lane_u32((uint32_t *)d, vreinterpret_u32_u8(d27u8), 1);
+ d += dest_stride;
+ vst1_lane_u32((uint32_t *)d, vreinterpret_u32_u8(d27u8), 0);
+ return;
+}
diff --git a/thirdparty/libvpx/vpx_dsp/arm/idct8x8_1_add_neon.c b/thirdparty/libvpx/vpx_dsp/arm/idct8x8_1_add_neon.c
new file mode 100644
index 0000000000..c1b801fad5
--- /dev/null
+++ b/thirdparty/libvpx/vpx_dsp/arm/idct8x8_1_add_neon.c
@@ -0,0 +1,64 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "vpx_dsp/inv_txfm.h"
+#include "vpx_ports/mem.h"
+
+void vpx_idct8x8_1_add_neon(
+ int16_t *input,
+ uint8_t *dest,
+ int dest_stride) {
+ uint8x8_t d2u8, d3u8, d30u8, d31u8;
+ uint64x1_t d2u64, d3u64, d4u64, d5u64;
+ uint16x8_t q0u16, q9u16, q10u16, q11u16, q12u16;
+ int16x8_t q0s16;
+ uint8_t *d1, *d2;
+ int16_t i, a1, cospi_16_64 = 11585;
+ int16_t out = dct_const_round_shift(input[0] * cospi_16_64);
+ out = dct_const_round_shift(out * cospi_16_64);
+ a1 = ROUND_POWER_OF_TWO(out, 5);
+
+ q0s16 = vdupq_n_s16(a1);
+ q0u16 = vreinterpretq_u16_s16(q0s16);
+
+ d1 = d2 = dest;
+ for (i = 0; i < 2; i++) {
+ d2u64 = vld1_u64((const uint64_t *)d1);
+ d1 += dest_stride;
+ d3u64 = vld1_u64((const uint64_t *)d1);
+ d1 += dest_stride;
+ d4u64 = vld1_u64((const uint64_t *)d1);
+ d1 += dest_stride;
+ d5u64 = vld1_u64((const uint64_t *)d1);
+ d1 += dest_stride;
+
+ q9u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d2u64));
+ q10u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d3u64));
+ q11u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d4u64));
+ q12u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d5u64));
+
+ d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16));
+ d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16));
+ d30u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16));
+ d31u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16));
+
+ vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8));
+ d2 += dest_stride;
+ vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d3u8));
+ d2 += dest_stride;
+ vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d30u8));
+ d2 += dest_stride;
+ vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d31u8));
+ d2 += dest_stride;
+ }
+ return;
+}
diff --git a/thirdparty/libvpx/vpx_dsp/arm/idct8x8_add_neon.c b/thirdparty/libvpx/vpx_dsp/arm/idct8x8_add_neon.c
new file mode 100644
index 0000000000..4b2c2a6f83
--- /dev/null
+++ b/thirdparty/libvpx/vpx_dsp/arm/idct8x8_add_neon.c
@@ -0,0 +1,540 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_config.h"
+#include "vpx_dsp/txfm_common.h"
+
+static INLINE void TRANSPOSE8X8(
+ int16x8_t *q8s16,
+ int16x8_t *q9s16,
+ int16x8_t *q10s16,
+ int16x8_t *q11s16,
+ int16x8_t *q12s16,
+ int16x8_t *q13s16,
+ int16x8_t *q14s16,
+ int16x8_t *q15s16) {
+ int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16;
+ int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16;
+ int32x4x2_t q0x2s32, q1x2s32, q2x2s32, q3x2s32;
+ int16x8x2_t q0x2s16, q1x2s16, q2x2s16, q3x2s16;
+
+ d16s16 = vget_low_s16(*q8s16);
+ d17s16 = vget_high_s16(*q8s16);
+ d18s16 = vget_low_s16(*q9s16);
+ d19s16 = vget_high_s16(*q9s16);
+ d20s16 = vget_low_s16(*q10s16);
+ d21s16 = vget_high_s16(*q10s16);
+ d22s16 = vget_low_s16(*q11s16);
+ d23s16 = vget_high_s16(*q11s16);
+ d24s16 = vget_low_s16(*q12s16);
+ d25s16 = vget_high_s16(*q12s16);
+ d26s16 = vget_low_s16(*q13s16);
+ d27s16 = vget_high_s16(*q13s16);
+ d28s16 = vget_low_s16(*q14s16);
+ d29s16 = vget_high_s16(*q14s16);
+ d30s16 = vget_low_s16(*q15s16);
+ d31s16 = vget_high_s16(*q15s16);
+
+ *q8s16 = vcombine_s16(d16s16, d24s16); // vswp d17, d24
+ *q9s16 = vcombine_s16(d18s16, d26s16); // vswp d19, d26
+ *q10s16 = vcombine_s16(d20s16, d28s16); // vswp d21, d28
+ *q11s16 = vcombine_s16(d22s16, d30s16); // vswp d23, d30
+ *q12s16 = vcombine_s16(d17s16, d25s16);
+ *q13s16 = vcombine_s16(d19s16, d27s16);
+ *q14s16 = vcombine_s16(d21s16, d29s16);
+ *q15s16 = vcombine_s16(d23s16, d31s16);
+
+ q0x2s32 = vtrnq_s32(vreinterpretq_s32_s16(*q8s16),
+ vreinterpretq_s32_s16(*q10s16));
+ q1x2s32 = vtrnq_s32(vreinterpretq_s32_s16(*q9s16),
+ vreinterpretq_s32_s16(*q11s16));
+ q2x2s32 = vtrnq_s32(vreinterpretq_s32_s16(*q12s16),
+ vreinterpretq_s32_s16(*q14s16));
+ q3x2s32 = vtrnq_s32(vreinterpretq_s32_s16(*q13s16),
+ vreinterpretq_s32_s16(*q15s16));
+
+ q0x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q0x2s32.val[0]), // q8
+ vreinterpretq_s16_s32(q1x2s32.val[0])); // q9
+ q1x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q0x2s32.val[1]), // q10
+ vreinterpretq_s16_s32(q1x2s32.val[1])); // q11
+ q2x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q2x2s32.val[0]), // q12
+ vreinterpretq_s16_s32(q3x2s32.val[0])); // q13
+ q3x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q2x2s32.val[1]), // q14
+ vreinterpretq_s16_s32(q3x2s32.val[1])); // q15
+
+ *q8s16 = q0x2s16.val[0];
+ *q9s16 = q0x2s16.val[1];
+ *q10s16 = q1x2s16.val[0];
+ *q11s16 = q1x2s16.val[1];
+ *q12s16 = q2x2s16.val[0];
+ *q13s16 = q2x2s16.val[1];
+ *q14s16 = q3x2s16.val[0];
+ *q15s16 = q3x2s16.val[1];
+ return;
+}
+
+static INLINE void IDCT8x8_1D(
+ int16x8_t *q8s16,
+ int16x8_t *q9s16,
+ int16x8_t *q10s16,
+ int16x8_t *q11s16,
+ int16x8_t *q12s16,
+ int16x8_t *q13s16,
+ int16x8_t *q14s16,
+ int16x8_t *q15s16) {
+ int16x4_t d0s16, d1s16, d2s16, d3s16;
+ int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16;
+ int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16;
+ int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16;
+ int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16;
+ int32x4_t q2s32, q3s32, q5s32, q6s32, q8s32, q9s32;
+ int32x4_t q10s32, q11s32, q12s32, q13s32, q15s32;
+
+ d0s16 = vdup_n_s16(cospi_28_64);
+ d1s16 = vdup_n_s16(cospi_4_64);
+ d2s16 = vdup_n_s16(cospi_12_64);
+ d3s16 = vdup_n_s16(cospi_20_64);
+
+ d16s16 = vget_low_s16(*q8s16);
+ d17s16 = vget_high_s16(*q8s16);
+ d18s16 = vget_low_s16(*q9s16);
+ d19s16 = vget_high_s16(*q9s16);
+ d20s16 = vget_low_s16(*q10s16);
+ d21s16 = vget_high_s16(*q10s16);
+ d22s16 = vget_low_s16(*q11s16);
+ d23s16 = vget_high_s16(*q11s16);
+ d24s16 = vget_low_s16(*q12s16);
+ d25s16 = vget_high_s16(*q12s16);
+ d26s16 = vget_low_s16(*q13s16);
+ d27s16 = vget_high_s16(*q13s16);
+ d28s16 = vget_low_s16(*q14s16);
+ d29s16 = vget_high_s16(*q14s16);
+ d30s16 = vget_low_s16(*q15s16);
+ d31s16 = vget_high_s16(*q15s16);
+
+ q2s32 = vmull_s16(d18s16, d0s16);
+ q3s32 = vmull_s16(d19s16, d0s16);
+ q5s32 = vmull_s16(d26s16, d2s16);
+ q6s32 = vmull_s16(d27s16, d2s16);
+
+ q2s32 = vmlsl_s16(q2s32, d30s16, d1s16);
+ q3s32 = vmlsl_s16(q3s32, d31s16, d1s16);
+ q5s32 = vmlsl_s16(q5s32, d22s16, d3s16);
+ q6s32 = vmlsl_s16(q6s32, d23s16, d3s16);
+
+ d8s16 = vqrshrn_n_s32(q2s32, 14);
+ d9s16 = vqrshrn_n_s32(q3s32, 14);
+ d10s16 = vqrshrn_n_s32(q5s32, 14);
+ d11s16 = vqrshrn_n_s32(q6s32, 14);
+ q4s16 = vcombine_s16(d8s16, d9s16);
+ q5s16 = vcombine_s16(d10s16, d11s16);
+
+ q2s32 = vmull_s16(d18s16, d1s16);
+ q3s32 = vmull_s16(d19s16, d1s16);
+ q9s32 = vmull_s16(d26s16, d3s16);
+ q13s32 = vmull_s16(d27s16, d3s16);
+
+ q2s32 = vmlal_s16(q2s32, d30s16, d0s16);
+ q3s32 = vmlal_s16(q3s32, d31s16, d0s16);
+ q9s32 = vmlal_s16(q9s32, d22s16, d2s16);
+ q13s32 = vmlal_s16(q13s32, d23s16, d2s16);
+
+ d14s16 = vqrshrn_n_s32(q2s32, 14);
+ d15s16 = vqrshrn_n_s32(q3s32, 14);
+ d12s16 = vqrshrn_n_s32(q9s32, 14);
+ d13s16 = vqrshrn_n_s32(q13s32, 14);
+ q6s16 = vcombine_s16(d12s16, d13s16);
+ q7s16 = vcombine_s16(d14s16, d15s16);
+
+ d0s16 = vdup_n_s16(cospi_16_64);
+
+ q2s32 = vmull_s16(d16s16, d0s16);
+ q3s32 = vmull_s16(d17s16, d0s16);
+ q13s32 = vmull_s16(d16s16, d0s16);
+ q15s32 = vmull_s16(d17s16, d0s16);
+
+ q2s32 = vmlal_s16(q2s32, d24s16, d0s16);
+ q3s32 = vmlal_s16(q3s32, d25s16, d0s16);
+ q13s32 = vmlsl_s16(q13s32, d24s16, d0s16);
+ q15s32 = vmlsl_s16(q15s32, d25s16, d0s16);
+
+ d0s16 = vdup_n_s16(cospi_24_64);
+ d1s16 = vdup_n_s16(cospi_8_64);
+
+ d18s16 = vqrshrn_n_s32(q2s32, 14);
+ d19s16 = vqrshrn_n_s32(q3s32, 14);
+ d22s16 = vqrshrn_n_s32(q13s32, 14);
+ d23s16 = vqrshrn_n_s32(q15s32, 14);
+ *q9s16 = vcombine_s16(d18s16, d19s16);
+ *q11s16 = vcombine_s16(d22s16, d23s16);
+
+ q2s32 = vmull_s16(d20s16, d0s16);
+ q3s32 = vmull_s16(d21s16, d0s16);
+ q8s32 = vmull_s16(d20s16, d1s16);
+ q12s32 = vmull_s16(d21s16, d1s16);
+
+ q2s32 = vmlsl_s16(q2s32, d28s16, d1s16);
+ q3s32 = vmlsl_s16(q3s32, d29s16, d1s16);
+ q8s32 = vmlal_s16(q8s32, d28s16, d0s16);
+ q12s32 = vmlal_s16(q12s32, d29s16, d0s16);
+
+ d26s16 = vqrshrn_n_s32(q2s32, 14);
+ d27s16 = vqrshrn_n_s32(q3s32, 14);
+ d30s16 = vqrshrn_n_s32(q8s32, 14);
+ d31s16 = vqrshrn_n_s32(q12s32, 14);
+ *q13s16 = vcombine_s16(d26s16, d27s16);
+ *q15s16 = vcombine_s16(d30s16, d31s16);
+
+ q0s16 = vaddq_s16(*q9s16, *q15s16);
+ q1s16 = vaddq_s16(*q11s16, *q13s16);
+ q2s16 = vsubq_s16(*q11s16, *q13s16);
+ q3s16 = vsubq_s16(*q9s16, *q15s16);
+
+ *q13s16 = vsubq_s16(q4s16, q5s16);
+ q4s16 = vaddq_s16(q4s16, q5s16);
+ *q14s16 = vsubq_s16(q7s16, q6s16);
+ q7s16 = vaddq_s16(q7s16, q6s16);
+ d26s16 = vget_low_s16(*q13s16);
+ d27s16 = vget_high_s16(*q13s16);
+ d28s16 = vget_low_s16(*q14s16);
+ d29s16 = vget_high_s16(*q14s16);
+
+ d16s16 = vdup_n_s16(cospi_16_64);
+
+ q9s32 = vmull_s16(d28s16, d16s16);
+ q10s32 = vmull_s16(d29s16, d16s16);
+ q11s32 = vmull_s16(d28s16, d16s16);
+ q12s32 = vmull_s16(d29s16, d16s16);
+
+ q9s32 = vmlsl_s16(q9s32, d26s16, d16s16);
+ q10s32 = vmlsl_s16(q10s32, d27s16, d16s16);
+ q11s32 = vmlal_s16(q11s32, d26s16, d16s16);
+ q12s32 = vmlal_s16(q12s32, d27s16, d16s16);
+
+ d10s16 = vqrshrn_n_s32(q9s32, 14);
+ d11s16 = vqrshrn_n_s32(q10s32, 14);
+ d12s16 = vqrshrn_n_s32(q11s32, 14);
+ d13s16 = vqrshrn_n_s32(q12s32, 14);
+ q5s16 = vcombine_s16(d10s16, d11s16);
+ q6s16 = vcombine_s16(d12s16, d13s16);
+
+ *q8s16 = vaddq_s16(q0s16, q7s16);
+ *q9s16 = vaddq_s16(q1s16, q6s16);
+ *q10s16 = vaddq_s16(q2s16, q5s16);
+ *q11s16 = vaddq_s16(q3s16, q4s16);
+ *q12s16 = vsubq_s16(q3s16, q4s16);
+ *q13s16 = vsubq_s16(q2s16, q5s16);
+ *q14s16 = vsubq_s16(q1s16, q6s16);
+ *q15s16 = vsubq_s16(q0s16, q7s16);
+ return;
+}
+
+void vpx_idct8x8_64_add_neon(
+ int16_t *input,
+ uint8_t *dest,
+ int dest_stride) {
+ uint8_t *d1, *d2;
+ uint8x8_t d0u8, d1u8, d2u8, d3u8;
+ uint64x1_t d0u64, d1u64, d2u64, d3u64;
+ int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16;
+ uint16x8_t q8u16, q9u16, q10u16, q11u16;
+
+ q8s16 = vld1q_s16(input);
+ q9s16 = vld1q_s16(input + 8);
+ q10s16 = vld1q_s16(input + 16);
+ q11s16 = vld1q_s16(input + 24);
+ q12s16 = vld1q_s16(input + 32);
+ q13s16 = vld1q_s16(input + 40);
+ q14s16 = vld1q_s16(input + 48);
+ q15s16 = vld1q_s16(input + 56);
+
+ TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16,
+ &q12s16, &q13s16, &q14s16, &q15s16);
+
+ IDCT8x8_1D(&q8s16, &q9s16, &q10s16, &q11s16,
+ &q12s16, &q13s16, &q14s16, &q15s16);
+
+ TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16,
+ &q12s16, &q13s16, &q14s16, &q15s16);
+
+ IDCT8x8_1D(&q8s16, &q9s16, &q10s16, &q11s16,
+ &q12s16, &q13s16, &q14s16, &q15s16);
+
+ q8s16 = vrshrq_n_s16(q8s16, 5);
+ q9s16 = vrshrq_n_s16(q9s16, 5);
+ q10s16 = vrshrq_n_s16(q10s16, 5);
+ q11s16 = vrshrq_n_s16(q11s16, 5);
+ q12s16 = vrshrq_n_s16(q12s16, 5);
+ q13s16 = vrshrq_n_s16(q13s16, 5);
+ q14s16 = vrshrq_n_s16(q14s16, 5);
+ q15s16 = vrshrq_n_s16(q15s16, 5);
+
+ d1 = d2 = dest;
+
+ d0u64 = vld1_u64((uint64_t *)d1);
+ d1 += dest_stride;
+ d1u64 = vld1_u64((uint64_t *)d1);
+ d1 += dest_stride;
+ d2u64 = vld1_u64((uint64_t *)d1);
+ d1 += dest_stride;
+ d3u64 = vld1_u64((uint64_t *)d1);
+ d1 += dest_stride;
+
+ q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16),
+ vreinterpret_u8_u64(d0u64));
+ q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16),
+ vreinterpret_u8_u64(d1u64));
+ q10u16 = vaddw_u8(vreinterpretq_u16_s16(q10s16),
+ vreinterpret_u8_u64(d2u64));
+ q11u16 = vaddw_u8(vreinterpretq_u16_s16(q11s16),
+ vreinterpret_u8_u64(d3u64));
+
+ d0u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16));
+ d1u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16));
+ d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16));
+ d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16));
+
+ vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d0u8));
+ d2 += dest_stride;
+ vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d1u8));
+ d2 += dest_stride;
+ vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8));
+ d2 += dest_stride;
+ vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d3u8));
+ d2 += dest_stride;
+
+ q8s16 = q12s16;
+ q9s16 = q13s16;
+ q10s16 = q14s16;
+ q11s16 = q15s16;
+
+ d0u64 = vld1_u64((uint64_t *)d1);
+ d1 += dest_stride;
+ d1u64 = vld1_u64((uint64_t *)d1);
+ d1 += dest_stride;
+ d2u64 = vld1_u64((uint64_t *)d1);
+ d1 += dest_stride;
+ d3u64 = vld1_u64((uint64_t *)d1);
+ d1 += dest_stride;
+
+ q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16),
+ vreinterpret_u8_u64(d0u64));
+ q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16),
+ vreinterpret_u8_u64(d1u64));
+ q10u16 = vaddw_u8(vreinterpretq_u16_s16(q10s16),
+ vreinterpret_u8_u64(d2u64));
+ q11u16 = vaddw_u8(vreinterpretq_u16_s16(q11s16),
+ vreinterpret_u8_u64(d3u64));
+
+ d0u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16));
+ d1u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16));
+ d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16));
+ d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16));
+
+ vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d0u8));
+ d2 += dest_stride;
+ vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d1u8));
+ d2 += dest_stride;
+ vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8));
+ d2 += dest_stride;
+ vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d3u8));
+ d2 += dest_stride;
+ return;
+}
+
+void vpx_idct8x8_12_add_neon(
+ int16_t *input,
+ uint8_t *dest,
+ int dest_stride) {
+ uint8_t *d1, *d2;
+ uint8x8_t d0u8, d1u8, d2u8, d3u8;
+ int16x4_t d10s16, d11s16, d12s16, d13s16, d16s16;
+ int16x4_t d26s16, d27s16, d28s16, d29s16;
+ uint64x1_t d0u64, d1u64, d2u64, d3u64;
+ int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16;
+ int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16;
+ uint16x8_t q8u16, q9u16, q10u16, q11u16;
+ int32x4_t q9s32, q10s32, q11s32, q12s32;
+
+ q8s16 = vld1q_s16(input);
+ q9s16 = vld1q_s16(input + 8);
+ q10s16 = vld1q_s16(input + 16);
+ q11s16 = vld1q_s16(input + 24);
+ q12s16 = vld1q_s16(input + 32);
+ q13s16 = vld1q_s16(input + 40);
+ q14s16 = vld1q_s16(input + 48);
+ q15s16 = vld1q_s16(input + 56);
+
+ TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16,
+ &q12s16, &q13s16, &q14s16, &q15s16);
+
+ // First transform rows
+ // stage 1
+ q0s16 = vdupq_n_s16(cospi_28_64 * 2);
+ q1s16 = vdupq_n_s16(cospi_4_64 * 2);
+
+ q4s16 = vqrdmulhq_s16(q9s16, q0s16);
+
+ q0s16 = vdupq_n_s16(-cospi_20_64 * 2);
+
+ q7s16 = vqrdmulhq_s16(q9s16, q1s16);
+
+ q1s16 = vdupq_n_s16(cospi_12_64 * 2);
+
+ q5s16 = vqrdmulhq_s16(q11s16, q0s16);
+
+ q0s16 = vdupq_n_s16(cospi_16_64 * 2);
+
+ q6s16 = vqrdmulhq_s16(q11s16, q1s16);
+
+ // stage 2 & stage 3 - even half
+ q1s16 = vdupq_n_s16(cospi_24_64 * 2);
+
+ q9s16 = vqrdmulhq_s16(q8s16, q0s16);
+
+ q0s16 = vdupq_n_s16(cospi_8_64 * 2);
+
+ q13s16 = vqrdmulhq_s16(q10s16, q1s16);
+
+ q15s16 = vqrdmulhq_s16(q10s16, q0s16);
+
+ // stage 3 -odd half
+ q0s16 = vaddq_s16(q9s16, q15s16);
+ q1s16 = vaddq_s16(q9s16, q13s16);
+ q2s16 = vsubq_s16(q9s16, q13s16);
+ q3s16 = vsubq_s16(q9s16, q15s16);
+
+ // stage 2 - odd half
+ q13s16 = vsubq_s16(q4s16, q5s16);
+ q4s16 = vaddq_s16(q4s16, q5s16);
+ q14s16 = vsubq_s16(q7s16, q6s16);
+ q7s16 = vaddq_s16(q7s16, q6s16);
+ d26s16 = vget_low_s16(q13s16);
+ d27s16 = vget_high_s16(q13s16);
+ d28s16 = vget_low_s16(q14s16);
+ d29s16 = vget_high_s16(q14s16);
+
+ d16s16 = vdup_n_s16(cospi_16_64);
+ q9s32 = vmull_s16(d28s16, d16s16);
+ q10s32 = vmull_s16(d29s16, d16s16);
+ q11s32 = vmull_s16(d28s16, d16s16);
+ q12s32 = vmull_s16(d29s16, d16s16);
+
+ q9s32 = vmlsl_s16(q9s32, d26s16, d16s16);
+ q10s32 = vmlsl_s16(q10s32, d27s16, d16s16);
+ q11s32 = vmlal_s16(q11s32, d26s16, d16s16);
+ q12s32 = vmlal_s16(q12s32, d27s16, d16s16);
+
+ d10s16 = vqrshrn_n_s32(q9s32, 14);
+ d11s16 = vqrshrn_n_s32(q10s32, 14);
+ d12s16 = vqrshrn_n_s32(q11s32, 14);
+ d13s16 = vqrshrn_n_s32(q12s32, 14);
+ q5s16 = vcombine_s16(d10s16, d11s16);
+ q6s16 = vcombine_s16(d12s16, d13s16);
+
+ // stage 4
+ q8s16 = vaddq_s16(q0s16, q7s16);
+ q9s16 = vaddq_s16(q1s16, q6s16);
+ q10s16 = vaddq_s16(q2s16, q5s16);
+ q11s16 = vaddq_s16(q3s16, q4s16);
+ q12s16 = vsubq_s16(q3s16, q4s16);
+ q13s16 = vsubq_s16(q2s16, q5s16);
+ q14s16 = vsubq_s16(q1s16, q6s16);
+ q15s16 = vsubq_s16(q0s16, q7s16);
+
+ TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16,
+ &q12s16, &q13s16, &q14s16, &q15s16);
+
+ IDCT8x8_1D(&q8s16, &q9s16, &q10s16, &q11s16,
+ &q12s16, &q13s16, &q14s16, &q15s16);
+
+ q8s16 = vrshrq_n_s16(q8s16, 5);
+ q9s16 = vrshrq_n_s16(q9s16, 5);
+ q10s16 = vrshrq_n_s16(q10s16, 5);
+ q11s16 = vrshrq_n_s16(q11s16, 5);
+ q12s16 = vrshrq_n_s16(q12s16, 5);
+ q13s16 = vrshrq_n_s16(q13s16, 5);
+ q14s16 = vrshrq_n_s16(q14s16, 5);
+ q15s16 = vrshrq_n_s16(q15s16, 5);
+
+ d1 = d2 = dest;
+
+ d0u64 = vld1_u64((uint64_t *)d1);
+ d1 += dest_stride;
+ d1u64 = vld1_u64((uint64_t *)d1);
+ d1 += dest_stride;
+ d2u64 = vld1_u64((uint64_t *)d1);
+ d1 += dest_stride;
+ d3u64 = vld1_u64((uint64_t *)d1);
+ d1 += dest_stride;
+
+ q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16),
+ vreinterpret_u8_u64(d0u64));
+ q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16),
+ vreinterpret_u8_u64(d1u64));
+ q10u16 = vaddw_u8(vreinterpretq_u16_s16(q10s16),
+ vreinterpret_u8_u64(d2u64));
+ q11u16 = vaddw_u8(vreinterpretq_u16_s16(q11s16),
+ vreinterpret_u8_u64(d3u64));
+
+ d0u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16));
+ d1u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16));
+ d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16));
+ d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16));
+
+ vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d0u8));
+ d2 += dest_stride;
+ vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d1u8));
+ d2 += dest_stride;
+ vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8));
+ d2 += dest_stride;
+ vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d3u8));
+ d2 += dest_stride;
+
+ q8s16 = q12s16;
+ q9s16 = q13s16;
+ q10s16 = q14s16;
+ q11s16 = q15s16;
+
+ d0u64 = vld1_u64((uint64_t *)d1);
+ d1 += dest_stride;
+ d1u64 = vld1_u64((uint64_t *)d1);
+ d1 += dest_stride;
+ d2u64 = vld1_u64((uint64_t *)d1);
+ d1 += dest_stride;
+ d3u64 = vld1_u64((uint64_t *)d1);
+ d1 += dest_stride;
+
+ q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16),
+ vreinterpret_u8_u64(d0u64));
+ q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16),
+ vreinterpret_u8_u64(d1u64));
+ q10u16 = vaddw_u8(vreinterpretq_u16_s16(q10s16),
+ vreinterpret_u8_u64(d2u64));
+ q11u16 = vaddw_u8(vreinterpretq_u16_s16(q11s16),
+ vreinterpret_u8_u64(d3u64));
+
+ d0u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16));
+ d1u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16));
+ d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16));
+ d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16));
+
+ vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d0u8));
+ d2 += dest_stride;
+ vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d1u8));
+ d2 += dest_stride;
+ vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8));
+ d2 += dest_stride;
+ vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d3u8));
+ d2 += dest_stride;
+ return;
+}
diff --git a/thirdparty/libvpx/vpx_dsp/arm/intrapred_neon.c b/thirdparty/libvpx/vpx_dsp/arm/intrapred_neon.c
new file mode 100644
index 0000000000..0a376104d2
--- /dev/null
+++ b/thirdparty/libvpx/vpx_dsp/arm/intrapred_neon.c
@@ -0,0 +1,822 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_integer.h"
+
+//------------------------------------------------------------------------------
+// DC 4x4
+
+// 'do_above' and 'do_left' facilitate branch removal when inlined.
+static INLINE void dc_4x4(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left,
+ int do_above, int do_left) {
+ uint16x8_t sum_top;
+ uint16x8_t sum_left;
+ uint8x8_t dc0;
+
+ if (do_above) {
+ const uint8x8_t A = vld1_u8(above); // top row
+ const uint16x4_t p0 = vpaddl_u8(A); // cascading summation of the top
+ const uint16x4_t p1 = vpadd_u16(p0, p0);
+ sum_top = vcombine_u16(p1, p1);
+ }
+
+ if (do_left) {
+ const uint8x8_t L = vld1_u8(left); // left border
+ const uint16x4_t p0 = vpaddl_u8(L); // cascading summation of the left
+ const uint16x4_t p1 = vpadd_u16(p0, p0);
+ sum_left = vcombine_u16(p1, p1);
+ }
+
+ if (do_above && do_left) {
+ const uint16x8_t sum = vaddq_u16(sum_left, sum_top);
+ dc0 = vrshrn_n_u16(sum, 3);
+ } else if (do_above) {
+ dc0 = vrshrn_n_u16(sum_top, 2);
+ } else if (do_left) {
+ dc0 = vrshrn_n_u16(sum_left, 2);
+ } else {
+ dc0 = vdup_n_u8(0x80);
+ }
+
+ {
+ const uint8x8_t dc = vdup_lane_u8(dc0, 0);
+ int i;
+ for (i = 0; i < 4; ++i) {
+ vst1_lane_u32((uint32_t*)(dst + i * stride), vreinterpret_u32_u8(dc), 0);
+ }
+ }
+}
+
+void vpx_dc_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ dc_4x4(dst, stride, above, left, 1, 1);
+}
+
+void vpx_dc_left_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ (void)above;
+ dc_4x4(dst, stride, NULL, left, 0, 1);
+}
+
+void vpx_dc_top_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ (void)left;
+ dc_4x4(dst, stride, above, NULL, 1, 0);
+}
+
+void vpx_dc_128_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ (void)above;
+ (void)left;
+ dc_4x4(dst, stride, NULL, NULL, 0, 0);
+}
+
+//------------------------------------------------------------------------------
+// DC 8x8
+
+// 'do_above' and 'do_left' facilitate branch removal when inlined.
+static INLINE void dc_8x8(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left,
+ int do_above, int do_left) {
+ uint16x8_t sum_top;
+ uint16x8_t sum_left;
+ uint8x8_t dc0;
+
+ if (do_above) {
+ const uint8x8_t A = vld1_u8(above); // top row
+ const uint16x4_t p0 = vpaddl_u8(A); // cascading summation of the top
+ const uint16x4_t p1 = vpadd_u16(p0, p0);
+ const uint16x4_t p2 = vpadd_u16(p1, p1);
+ sum_top = vcombine_u16(p2, p2);
+ }
+
+ if (do_left) {
+ const uint8x8_t L = vld1_u8(left); // left border
+ const uint16x4_t p0 = vpaddl_u8(L); // cascading summation of the left
+ const uint16x4_t p1 = vpadd_u16(p0, p0);
+ const uint16x4_t p2 = vpadd_u16(p1, p1);
+ sum_left = vcombine_u16(p2, p2);
+ }
+
+ if (do_above && do_left) {
+ const uint16x8_t sum = vaddq_u16(sum_left, sum_top);
+ dc0 = vrshrn_n_u16(sum, 4);
+ } else if (do_above) {
+ dc0 = vrshrn_n_u16(sum_top, 3);
+ } else if (do_left) {
+ dc0 = vrshrn_n_u16(sum_left, 3);
+ } else {
+ dc0 = vdup_n_u8(0x80);
+ }
+
+ {
+ const uint8x8_t dc = vdup_lane_u8(dc0, 0);
+ int i;
+ for (i = 0; i < 8; ++i) {
+ vst1_u32((uint32_t*)(dst + i * stride), vreinterpret_u32_u8(dc));
+ }
+ }
+}
+
+void vpx_dc_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ dc_8x8(dst, stride, above, left, 1, 1);
+}
+
+void vpx_dc_left_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ (void)above;
+ dc_8x8(dst, stride, NULL, left, 0, 1);
+}
+
+void vpx_dc_top_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ (void)left;
+ dc_8x8(dst, stride, above, NULL, 1, 0);
+}
+
+void vpx_dc_128_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ (void)above;
+ (void)left;
+ dc_8x8(dst, stride, NULL, NULL, 0, 0);
+}
+
+//------------------------------------------------------------------------------
+// DC 16x16
+
+// 'do_above' and 'do_left' facilitate branch removal when inlined.
+static INLINE void dc_16x16(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left,
+ int do_above, int do_left) {
+ uint16x8_t sum_top;
+ uint16x8_t sum_left;
+ uint8x8_t dc0;
+
+ if (do_above) {
+ const uint8x16_t A = vld1q_u8(above); // top row
+ const uint16x8_t p0 = vpaddlq_u8(A); // cascading summation of the top
+ const uint16x4_t p1 = vadd_u16(vget_low_u16(p0), vget_high_u16(p0));
+ const uint16x4_t p2 = vpadd_u16(p1, p1);
+ const uint16x4_t p3 = vpadd_u16(p2, p2);
+ sum_top = vcombine_u16(p3, p3);
+ }
+
+ if (do_left) {
+ const uint8x16_t L = vld1q_u8(left); // left row
+ const uint16x8_t p0 = vpaddlq_u8(L); // cascading summation of the left
+ const uint16x4_t p1 = vadd_u16(vget_low_u16(p0), vget_high_u16(p0));
+ const uint16x4_t p2 = vpadd_u16(p1, p1);
+ const uint16x4_t p3 = vpadd_u16(p2, p2);
+ sum_left = vcombine_u16(p3, p3);
+ }
+
+ if (do_above && do_left) {
+ const uint16x8_t sum = vaddq_u16(sum_left, sum_top);
+ dc0 = vrshrn_n_u16(sum, 5);
+ } else if (do_above) {
+ dc0 = vrshrn_n_u16(sum_top, 4);
+ } else if (do_left) {
+ dc0 = vrshrn_n_u16(sum_left, 4);
+ } else {
+ dc0 = vdup_n_u8(0x80);
+ }
+
+ {
+ const uint8x16_t dc = vdupq_lane_u8(dc0, 0);
+ int i;
+ for (i = 0; i < 16; ++i) {
+ vst1q_u8(dst + i * stride, dc);
+ }
+ }
+}
+
+void vpx_dc_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ dc_16x16(dst, stride, above, left, 1, 1);
+}
+
+void vpx_dc_left_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ (void)above;
+ dc_16x16(dst, stride, NULL, left, 0, 1);
+}
+
+void vpx_dc_top_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ (void)left;
+ dc_16x16(dst, stride, above, NULL, 1, 0);
+}
+
+void vpx_dc_128_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ (void)above;
+ (void)left;
+ dc_16x16(dst, stride, NULL, NULL, 0, 0);
+}
+
+//------------------------------------------------------------------------------
+// DC 32x32
+
+// 'do_above' and 'do_left' facilitate branch removal when inlined.
+static INLINE void dc_32x32(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left,
+ int do_above, int do_left) {
+ uint16x8_t sum_top;
+ uint16x8_t sum_left;
+ uint8x8_t dc0;
+
+ if (do_above) {
+ const uint8x16_t A0 = vld1q_u8(above); // top row
+ const uint8x16_t A1 = vld1q_u8(above + 16);
+ const uint16x8_t p0 = vpaddlq_u8(A0); // cascading summation of the top
+ const uint16x8_t p1 = vpaddlq_u8(A1);
+ const uint16x8_t p2 = vaddq_u16(p0, p1);
+ const uint16x4_t p3 = vadd_u16(vget_low_u16(p2), vget_high_u16(p2));
+ const uint16x4_t p4 = vpadd_u16(p3, p3);
+ const uint16x4_t p5 = vpadd_u16(p4, p4);
+ sum_top = vcombine_u16(p5, p5);
+ }
+
+ if (do_left) {
+ const uint8x16_t L0 = vld1q_u8(left); // left row
+ const uint8x16_t L1 = vld1q_u8(left + 16);
+ const uint16x8_t p0 = vpaddlq_u8(L0); // cascading summation of the left
+ const uint16x8_t p1 = vpaddlq_u8(L1);
+ const uint16x8_t p2 = vaddq_u16(p0, p1);
+ const uint16x4_t p3 = vadd_u16(vget_low_u16(p2), vget_high_u16(p2));
+ const uint16x4_t p4 = vpadd_u16(p3, p3);
+ const uint16x4_t p5 = vpadd_u16(p4, p4);
+ sum_left = vcombine_u16(p5, p5);
+ }
+
+ if (do_above && do_left) {
+ const uint16x8_t sum = vaddq_u16(sum_left, sum_top);
+ dc0 = vrshrn_n_u16(sum, 6);
+ } else if (do_above) {
+ dc0 = vrshrn_n_u16(sum_top, 5);
+ } else if (do_left) {
+ dc0 = vrshrn_n_u16(sum_left, 5);
+ } else {
+ dc0 = vdup_n_u8(0x80);
+ }
+
+ {
+ const uint8x16_t dc = vdupq_lane_u8(dc0, 0);
+ int i;
+ for (i = 0; i < 32; ++i) {
+ vst1q_u8(dst + i * stride, dc);
+ vst1q_u8(dst + i * stride + 16, dc);
+ }
+ }
+}
+
+void vpx_dc_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ dc_32x32(dst, stride, above, left, 1, 1);
+}
+
+void vpx_dc_left_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ (void)above;
+ dc_32x32(dst, stride, NULL, left, 0, 1);
+}
+
+void vpx_dc_top_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ (void)left;
+ dc_32x32(dst, stride, above, NULL, 1, 0);
+}
+
+void vpx_dc_128_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ (void)above;
+ (void)left;
+ dc_32x32(dst, stride, NULL, NULL, 0, 0);
+}
+
+// -----------------------------------------------------------------------------
+
+void vpx_d45_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const uint64x1_t A0 = vreinterpret_u64_u8(vld1_u8(above)); // top row
+ const uint64x1_t A1 = vshr_n_u64(A0, 8);
+ const uint64x1_t A2 = vshr_n_u64(A0, 16);
+ const uint8x8_t ABCDEFGH = vreinterpret_u8_u64(A0);
+ const uint8x8_t BCDEFGH0 = vreinterpret_u8_u64(A1);
+ const uint8x8_t CDEFGH00 = vreinterpret_u8_u64(A2);
+ const uint8x8_t avg1 = vhadd_u8(ABCDEFGH, CDEFGH00);
+ const uint8x8_t avg2 = vrhadd_u8(avg1, BCDEFGH0);
+ const uint64x1_t avg2_u64 = vreinterpret_u64_u8(avg2);
+ const uint32x2_t r0 = vreinterpret_u32_u8(avg2);
+ const uint32x2_t r1 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 8));
+ const uint32x2_t r2 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 16));
+ const uint32x2_t r3 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 24));
+ (void)left;
+ vst1_lane_u32((uint32_t *)(dst + 0 * stride), r0, 0);
+ vst1_lane_u32((uint32_t *)(dst + 1 * stride), r1, 0);
+ vst1_lane_u32((uint32_t *)(dst + 2 * stride), r2, 0);
+ vst1_lane_u32((uint32_t *)(dst + 3 * stride), r3, 0);
+ dst[3 * stride + 3] = above[7];
+}
+
+void vpx_d45_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ static const uint8_t shuffle1[8] = { 1, 2, 3, 4, 5, 6, 7, 7 };
+ static const uint8_t shuffle2[8] = { 2, 3, 4, 5, 6, 7, 7, 7 };
+ const uint8x8_t sh_12345677 = vld1_u8(shuffle1);
+ const uint8x8_t sh_23456777 = vld1_u8(shuffle2);
+ const uint8x8_t A0 = vld1_u8(above); // top row
+ const uint8x8_t A1 = vtbl1_u8(A0, sh_12345677);
+ const uint8x8_t A2 = vtbl1_u8(A0, sh_23456777);
+ const uint8x8_t avg1 = vhadd_u8(A0, A2);
+ uint8x8_t row = vrhadd_u8(avg1, A1);
+ int i;
+ (void)left;
+ for (i = 0; i < 7; ++i) {
+ vst1_u8(dst + i * stride, row);
+ row = vtbl1_u8(row, sh_12345677);
+ }
+ vst1_u8(dst + i * stride, row);
+}
+
+void vpx_d45_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const uint8x16_t A0 = vld1q_u8(above); // top row
+ const uint8x16_t above_right = vld1q_dup_u8(above + 15);
+ const uint8x16_t A1 = vextq_u8(A0, above_right, 1);
+ const uint8x16_t A2 = vextq_u8(A0, above_right, 2);
+ const uint8x16_t avg1 = vhaddq_u8(A0, A2);
+ uint8x16_t row = vrhaddq_u8(avg1, A1);
+ int i;
+ (void)left;
+ for (i = 0; i < 15; ++i) {
+ vst1q_u8(dst + i * stride, row);
+ row = vextq_u8(row, above_right, 1);
+ }
+ vst1q_u8(dst + i * stride, row);
+}
+
+// -----------------------------------------------------------------------------
+
+void vpx_d135_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const uint8x8_t XABCD_u8 = vld1_u8(above - 1);
+ const uint64x1_t XABCD = vreinterpret_u64_u8(XABCD_u8);
+ const uint64x1_t ____XABC = vshl_n_u64(XABCD, 32);
+ const uint32x2_t zero = vdup_n_u32(0);
+ const uint32x2_t IJKL = vld1_lane_u32((const uint32_t *)left, zero, 0);
+ const uint8x8_t IJKL_u8 = vreinterpret_u8_u32(IJKL);
+ const uint64x1_t LKJI____ = vreinterpret_u64_u8(vrev32_u8(IJKL_u8));
+ const uint64x1_t LKJIXABC = vorr_u64(LKJI____, ____XABC);
+ const uint8x8_t KJIXABC_ = vreinterpret_u8_u64(vshr_n_u64(LKJIXABC, 8));
+ const uint8x8_t JIXABC__ = vreinterpret_u8_u64(vshr_n_u64(LKJIXABC, 16));
+ const uint8_t D = vget_lane_u8(XABCD_u8, 4);
+ const uint8x8_t JIXABCD_ = vset_lane_u8(D, JIXABC__, 6);
+ const uint8x8_t LKJIXABC_u8 = vreinterpret_u8_u64(LKJIXABC);
+ const uint8x8_t avg1 = vhadd_u8(JIXABCD_, LKJIXABC_u8);
+ const uint8x8_t avg2 = vrhadd_u8(avg1, KJIXABC_);
+ const uint64x1_t avg2_u64 = vreinterpret_u64_u8(avg2);
+ const uint32x2_t r3 = vreinterpret_u32_u8(avg2);
+ const uint32x2_t r2 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 8));
+ const uint32x2_t r1 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 16));
+ const uint32x2_t r0 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 24));
+ vst1_lane_u32((uint32_t *)(dst + 0 * stride), r0, 0);
+ vst1_lane_u32((uint32_t *)(dst + 1 * stride), r1, 0);
+ vst1_lane_u32((uint32_t *)(dst + 2 * stride), r2, 0);
+ vst1_lane_u32((uint32_t *)(dst + 3 * stride), r3, 0);
+}
+
+#if !HAVE_NEON_ASM
+
+void vpx_v_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ int i;
+ uint32x2_t d0u32 = vdup_n_u32(0);
+ (void)left;
+
+ d0u32 = vld1_lane_u32((const uint32_t *)above, d0u32, 0);
+ for (i = 0; i < 4; i++, dst += stride)
+ vst1_lane_u32((uint32_t *)dst, d0u32, 0);
+}
+
+void vpx_v_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ int i;
+ uint8x8_t d0u8 = vdup_n_u8(0);
+ (void)left;
+
+ d0u8 = vld1_u8(above);
+ for (i = 0; i < 8; i++, dst += stride)
+ vst1_u8(dst, d0u8);
+}
+
+void vpx_v_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ int i;
+ uint8x16_t q0u8 = vdupq_n_u8(0);
+ (void)left;
+
+ q0u8 = vld1q_u8(above);
+ for (i = 0; i < 16; i++, dst += stride)
+ vst1q_u8(dst, q0u8);
+}
+
+void vpx_v_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ int i;
+ uint8x16_t q0u8 = vdupq_n_u8(0);
+ uint8x16_t q1u8 = vdupq_n_u8(0);
+ (void)left;
+
+ q0u8 = vld1q_u8(above);
+ q1u8 = vld1q_u8(above + 16);
+ for (i = 0; i < 32; i++, dst += stride) {
+ vst1q_u8(dst, q0u8);
+ vst1q_u8(dst + 16, q1u8);
+ }
+}
+
+void vpx_h_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ uint8x8_t d0u8 = vdup_n_u8(0);
+ uint32x2_t d1u32 = vdup_n_u32(0);
+ (void)above;
+
+ d1u32 = vld1_lane_u32((const uint32_t *)left, d1u32, 0);
+
+ d0u8 = vdup_lane_u8(vreinterpret_u8_u32(d1u32), 0);
+ vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0);
+ dst += stride;
+ d0u8 = vdup_lane_u8(vreinterpret_u8_u32(d1u32), 1);
+ vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0);
+ dst += stride;
+ d0u8 = vdup_lane_u8(vreinterpret_u8_u32(d1u32), 2);
+ vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0);
+ dst += stride;
+ d0u8 = vdup_lane_u8(vreinterpret_u8_u32(d1u32), 3);
+ vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0);
+}
+
+void vpx_h_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ uint8x8_t d0u8 = vdup_n_u8(0);
+ uint64x1_t d1u64 = vdup_n_u64(0);
+ (void)above;
+
+ d1u64 = vld1_u64((const uint64_t *)left);
+
+ d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 0);
+ vst1_u8(dst, d0u8);
+ dst += stride;
+ d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 1);
+ vst1_u8(dst, d0u8);
+ dst += stride;
+ d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 2);
+ vst1_u8(dst, d0u8);
+ dst += stride;
+ d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 3);
+ vst1_u8(dst, d0u8);
+ dst += stride;
+ d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 4);
+ vst1_u8(dst, d0u8);
+ dst += stride;
+ d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 5);
+ vst1_u8(dst, d0u8);
+ dst += stride;
+ d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 6);
+ vst1_u8(dst, d0u8);
+ dst += stride;
+ d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 7);
+ vst1_u8(dst, d0u8);
+}
+
+void vpx_h_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ int j;
+ uint8x8_t d2u8 = vdup_n_u8(0);
+ uint8x16_t q0u8 = vdupq_n_u8(0);
+ uint8x16_t q1u8 = vdupq_n_u8(0);
+ (void)above;
+
+ q1u8 = vld1q_u8(left);
+ d2u8 = vget_low_u8(q1u8);
+ for (j = 0; j < 2; j++, d2u8 = vget_high_u8(q1u8)) {
+ q0u8 = vdupq_lane_u8(d2u8, 0);
+ vst1q_u8(dst, q0u8);
+ dst += stride;
+ q0u8 = vdupq_lane_u8(d2u8, 1);
+ vst1q_u8(dst, q0u8);
+ dst += stride;
+ q0u8 = vdupq_lane_u8(d2u8, 2);
+ vst1q_u8(dst, q0u8);
+ dst += stride;
+ q0u8 = vdupq_lane_u8(d2u8, 3);
+ vst1q_u8(dst, q0u8);
+ dst += stride;
+ q0u8 = vdupq_lane_u8(d2u8, 4);
+ vst1q_u8(dst, q0u8);
+ dst += stride;
+ q0u8 = vdupq_lane_u8(d2u8, 5);
+ vst1q_u8(dst, q0u8);
+ dst += stride;
+ q0u8 = vdupq_lane_u8(d2u8, 6);
+ vst1q_u8(dst, q0u8);
+ dst += stride;
+ q0u8 = vdupq_lane_u8(d2u8, 7);
+ vst1q_u8(dst, q0u8);
+ dst += stride;
+ }
+}
+
+void vpx_h_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ int j, k;
+ uint8x8_t d2u8 = vdup_n_u8(0);
+ uint8x16_t q0u8 = vdupq_n_u8(0);
+ uint8x16_t q1u8 = vdupq_n_u8(0);
+ (void)above;
+
+ for (k = 0; k < 2; k++, left += 16) {
+ q1u8 = vld1q_u8(left);
+ d2u8 = vget_low_u8(q1u8);
+ for (j = 0; j < 2; j++, d2u8 = vget_high_u8(q1u8)) {
+ q0u8 = vdupq_lane_u8(d2u8, 0);
+ vst1q_u8(dst, q0u8);
+ vst1q_u8(dst + 16, q0u8);
+ dst += stride;
+ q0u8 = vdupq_lane_u8(d2u8, 1);
+ vst1q_u8(dst, q0u8);
+ vst1q_u8(dst + 16, q0u8);
+ dst += stride;
+ q0u8 = vdupq_lane_u8(d2u8, 2);
+ vst1q_u8(dst, q0u8);
+ vst1q_u8(dst + 16, q0u8);
+ dst += stride;
+ q0u8 = vdupq_lane_u8(d2u8, 3);
+ vst1q_u8(dst, q0u8);
+ vst1q_u8(dst + 16, q0u8);
+ dst += stride;
+ q0u8 = vdupq_lane_u8(d2u8, 4);
+ vst1q_u8(dst, q0u8);
+ vst1q_u8(dst + 16, q0u8);
+ dst += stride;
+ q0u8 = vdupq_lane_u8(d2u8, 5);
+ vst1q_u8(dst, q0u8);
+ vst1q_u8(dst + 16, q0u8);
+ dst += stride;
+ q0u8 = vdupq_lane_u8(d2u8, 6);
+ vst1q_u8(dst, q0u8);
+ vst1q_u8(dst + 16, q0u8);
+ dst += stride;
+ q0u8 = vdupq_lane_u8(d2u8, 7);
+ vst1q_u8(dst, q0u8);
+ vst1q_u8(dst + 16, q0u8);
+ dst += stride;
+ }
+ }
+}
+
+void vpx_tm_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ int i;
+ uint16x8_t q1u16, q3u16;
+ int16x8_t q1s16;
+ uint8x8_t d0u8 = vdup_n_u8(0);
+ uint32x2_t d2u32 = vdup_n_u32(0);
+
+ d0u8 = vld1_dup_u8(above - 1);
+ d2u32 = vld1_lane_u32((const uint32_t *)above, d2u32, 0);
+ q3u16 = vsubl_u8(vreinterpret_u8_u32(d2u32), d0u8);
+ for (i = 0; i < 4; i++, dst += stride) {
+ q1u16 = vdupq_n_u16((uint16_t)left[i]);
+ q1s16 = vaddq_s16(vreinterpretq_s16_u16(q1u16),
+ vreinterpretq_s16_u16(q3u16));
+ d0u8 = vqmovun_s16(q1s16);
+ vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0);
+ }
+}
+
+void vpx_tm_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ int j;
+ uint16x8_t q0u16, q3u16, q10u16;
+ int16x8_t q0s16;
+ uint16x4_t d20u16;
+ uint8x8_t d0u8, d2u8, d30u8;
+
+ d0u8 = vld1_dup_u8(above - 1);
+ d30u8 = vld1_u8(left);
+ d2u8 = vld1_u8(above);
+ q10u16 = vmovl_u8(d30u8);
+ q3u16 = vsubl_u8(d2u8, d0u8);
+ d20u16 = vget_low_u16(q10u16);
+ for (j = 0; j < 2; j++, d20u16 = vget_high_u16(q10u16)) {
+ q0u16 = vdupq_lane_u16(d20u16, 0);
+ q0s16 = vaddq_s16(vreinterpretq_s16_u16(q3u16),
+ vreinterpretq_s16_u16(q0u16));
+ d0u8 = vqmovun_s16(q0s16);
+ vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d0u8));
+ dst += stride;
+ q0u16 = vdupq_lane_u16(d20u16, 1);
+ q0s16 = vaddq_s16(vreinterpretq_s16_u16(q3u16),
+ vreinterpretq_s16_u16(q0u16));
+ d0u8 = vqmovun_s16(q0s16);
+ vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d0u8));
+ dst += stride;
+ q0u16 = vdupq_lane_u16(d20u16, 2);
+ q0s16 = vaddq_s16(vreinterpretq_s16_u16(q3u16),
+ vreinterpretq_s16_u16(q0u16));
+ d0u8 = vqmovun_s16(q0s16);
+ vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d0u8));
+ dst += stride;
+ q0u16 = vdupq_lane_u16(d20u16, 3);
+ q0s16 = vaddq_s16(vreinterpretq_s16_u16(q3u16),
+ vreinterpretq_s16_u16(q0u16));
+ d0u8 = vqmovun_s16(q0s16);
+ vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d0u8));
+ dst += stride;
+ }
+}
+
+void vpx_tm_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ int j, k;
+ uint16x8_t q0u16, q2u16, q3u16, q8u16, q10u16;
+ uint8x16_t q0u8, q1u8;
+ int16x8_t q0s16, q1s16, q8s16, q11s16;
+ uint16x4_t d20u16;
+ uint8x8_t d2u8, d3u8, d18u8, d22u8, d23u8;
+
+ q0u8 = vld1q_dup_u8(above - 1);
+ q1u8 = vld1q_u8(above);
+ q2u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q0u8));
+ q3u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q0u8));
+ for (k = 0; k < 2; k++, left += 8) {
+ d18u8 = vld1_u8(left);
+ q10u16 = vmovl_u8(d18u8);
+ d20u16 = vget_low_u16(q10u16);
+ for (j = 0; j < 2; j++, d20u16 = vget_high_u16(q10u16)) {
+ q0u16 = vdupq_lane_u16(d20u16, 0);
+ q8u16 = vdupq_lane_u16(d20u16, 1);
+ q1s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
+ vreinterpretq_s16_u16(q2u16));
+ q0s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
+ vreinterpretq_s16_u16(q3u16));
+ q11s16 = vaddq_s16(vreinterpretq_s16_u16(q8u16),
+ vreinterpretq_s16_u16(q2u16));
+ q8s16 = vaddq_s16(vreinterpretq_s16_u16(q8u16),
+ vreinterpretq_s16_u16(q3u16));
+ d2u8 = vqmovun_s16(q1s16);
+ d3u8 = vqmovun_s16(q0s16);
+ d22u8 = vqmovun_s16(q11s16);
+ d23u8 = vqmovun_s16(q8s16);
+ vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d2u8));
+ vst1_u64((uint64_t *)(dst + 8), vreinterpret_u64_u8(d3u8));
+ dst += stride;
+ vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d22u8));
+ vst1_u64((uint64_t *)(dst + 8), vreinterpret_u64_u8(d23u8));
+ dst += stride;
+
+ q0u16 = vdupq_lane_u16(d20u16, 2);
+ q8u16 = vdupq_lane_u16(d20u16, 3);
+ q1s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
+ vreinterpretq_s16_u16(q2u16));
+ q0s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
+ vreinterpretq_s16_u16(q3u16));
+ q11s16 = vaddq_s16(vreinterpretq_s16_u16(q8u16),
+ vreinterpretq_s16_u16(q2u16));
+ q8s16 = vaddq_s16(vreinterpretq_s16_u16(q8u16),
+ vreinterpretq_s16_u16(q3u16));
+ d2u8 = vqmovun_s16(q1s16);
+ d3u8 = vqmovun_s16(q0s16);
+ d22u8 = vqmovun_s16(q11s16);
+ d23u8 = vqmovun_s16(q8s16);
+ vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d2u8));
+ vst1_u64((uint64_t *)(dst + 8), vreinterpret_u64_u8(d3u8));
+ dst += stride;
+ vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d22u8));
+ vst1_u64((uint64_t *)(dst + 8), vreinterpret_u64_u8(d23u8));
+ dst += stride;
+ }
+ }
+}
+
+void vpx_tm_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ int j, k;
+ uint16x8_t q0u16, q3u16, q8u16, q9u16, q10u16, q11u16;
+ uint8x16_t q0u8, q1u8, q2u8;
+ int16x8_t q12s16, q13s16, q14s16, q15s16;
+ uint16x4_t d6u16;
+ uint8x8_t d0u8, d1u8, d2u8, d3u8, d26u8;
+
+ q0u8 = vld1q_dup_u8(above - 1);
+ q1u8 = vld1q_u8(above);
+ q2u8 = vld1q_u8(above + 16);
+ q8u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q0u8));
+ q9u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q0u8));
+ q10u16 = vsubl_u8(vget_low_u8(q2u8), vget_low_u8(q0u8));
+ q11u16 = vsubl_u8(vget_high_u8(q2u8), vget_high_u8(q0u8));
+ for (k = 0; k < 4; k++, left += 8) {
+ d26u8 = vld1_u8(left);
+ q3u16 = vmovl_u8(d26u8);
+ d6u16 = vget_low_u16(q3u16);
+ for (j = 0; j < 2; j++, d6u16 = vget_high_u16(q3u16)) {
+ q0u16 = vdupq_lane_u16(d6u16, 0);
+ q12s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
+ vreinterpretq_s16_u16(q8u16));
+ q13s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
+ vreinterpretq_s16_u16(q9u16));
+ q14s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
+ vreinterpretq_s16_u16(q10u16));
+ q15s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
+ vreinterpretq_s16_u16(q11u16));
+ d0u8 = vqmovun_s16(q12s16);
+ d1u8 = vqmovun_s16(q13s16);
+ d2u8 = vqmovun_s16(q14s16);
+ d3u8 = vqmovun_s16(q15s16);
+ q0u8 = vcombine_u8(d0u8, d1u8);
+ q1u8 = vcombine_u8(d2u8, d3u8);
+ vst1q_u64((uint64_t *)dst, vreinterpretq_u64_u8(q0u8));
+ vst1q_u64((uint64_t *)(dst + 16), vreinterpretq_u64_u8(q1u8));
+ dst += stride;
+
+ q0u16 = vdupq_lane_u16(d6u16, 1);
+ q12s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
+ vreinterpretq_s16_u16(q8u16));
+ q13s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
+ vreinterpretq_s16_u16(q9u16));
+ q14s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
+ vreinterpretq_s16_u16(q10u16));
+ q15s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
+ vreinterpretq_s16_u16(q11u16));
+ d0u8 = vqmovun_s16(q12s16);
+ d1u8 = vqmovun_s16(q13s16);
+ d2u8 = vqmovun_s16(q14s16);
+ d3u8 = vqmovun_s16(q15s16);
+ q0u8 = vcombine_u8(d0u8, d1u8);
+ q1u8 = vcombine_u8(d2u8, d3u8);
+ vst1q_u64((uint64_t *)dst, vreinterpretq_u64_u8(q0u8));
+ vst1q_u64((uint64_t *)(dst + 16), vreinterpretq_u64_u8(q1u8));
+ dst += stride;
+
+ q0u16 = vdupq_lane_u16(d6u16, 2);
+ q12s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
+ vreinterpretq_s16_u16(q8u16));
+ q13s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
+ vreinterpretq_s16_u16(q9u16));
+ q14s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
+ vreinterpretq_s16_u16(q10u16));
+ q15s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
+ vreinterpretq_s16_u16(q11u16));
+ d0u8 = vqmovun_s16(q12s16);
+ d1u8 = vqmovun_s16(q13s16);
+ d2u8 = vqmovun_s16(q14s16);
+ d3u8 = vqmovun_s16(q15s16);
+ q0u8 = vcombine_u8(d0u8, d1u8);
+ q1u8 = vcombine_u8(d2u8, d3u8);
+ vst1q_u64((uint64_t *)dst, vreinterpretq_u64_u8(q0u8));
+ vst1q_u64((uint64_t *)(dst + 16), vreinterpretq_u64_u8(q1u8));
+ dst += stride;
+
+ q0u16 = vdupq_lane_u16(d6u16, 3);
+ q12s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
+ vreinterpretq_s16_u16(q8u16));
+ q13s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
+ vreinterpretq_s16_u16(q9u16));
+ q14s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
+ vreinterpretq_s16_u16(q10u16));
+ q15s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
+ vreinterpretq_s16_u16(q11u16));
+ d0u8 = vqmovun_s16(q12s16);
+ d1u8 = vqmovun_s16(q13s16);
+ d2u8 = vqmovun_s16(q14s16);
+ d3u8 = vqmovun_s16(q15s16);
+ q0u8 = vcombine_u8(d0u8, d1u8);
+ q1u8 = vcombine_u8(d2u8, d3u8);
+ vst1q_u64((uint64_t *)dst, vreinterpretq_u64_u8(q0u8));
+ vst1q_u64((uint64_t *)(dst + 16), vreinterpretq_u64_u8(q1u8));
+ dst += stride;
+ }
+ }
+}
+#endif // !HAVE_NEON_ASM
diff --git a/thirdparty/libvpx/vpx_dsp/arm/intrapred_neon_asm.asm b/thirdparty/libvpx/vpx_dsp/arm/intrapred_neon_asm.asm
new file mode 100644
index 0000000000..115790d480
--- /dev/null
+++ b/thirdparty/libvpx/vpx_dsp/arm/intrapred_neon_asm.asm
@@ -0,0 +1,630 @@
+;
+; Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+ EXPORT |vpx_v_predictor_4x4_neon|
+ EXPORT |vpx_v_predictor_8x8_neon|
+ EXPORT |vpx_v_predictor_16x16_neon|
+ EXPORT |vpx_v_predictor_32x32_neon|
+ EXPORT |vpx_h_predictor_4x4_neon|
+ EXPORT |vpx_h_predictor_8x8_neon|
+ EXPORT |vpx_h_predictor_16x16_neon|
+ EXPORT |vpx_h_predictor_32x32_neon|
+ EXPORT |vpx_tm_predictor_4x4_neon|
+ EXPORT |vpx_tm_predictor_8x8_neon|
+ EXPORT |vpx_tm_predictor_16x16_neon|
+ EXPORT |vpx_tm_predictor_32x32_neon|
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+
+;void vpx_v_predictor_4x4_neon(uint8_t *dst, ptrdiff_t y_stride,
+; const uint8_t *above,
+; const uint8_t *left)
+; r0 uint8_t *dst
+; r1 ptrdiff_t y_stride
+; r2 const uint8_t *above
+; r3 const uint8_t *left
+
+|vpx_v_predictor_4x4_neon| PROC
+ vld1.32 {d0[0]}, [r2]
+ vst1.32 {d0[0]}, [r0], r1
+ vst1.32 {d0[0]}, [r0], r1
+ vst1.32 {d0[0]}, [r0], r1
+ vst1.32 {d0[0]}, [r0], r1
+ bx lr
+ ENDP ; |vpx_v_predictor_4x4_neon|
+
+;void vpx_v_predictor_8x8_neon(uint8_t *dst, ptrdiff_t y_stride,
+; const uint8_t *above,
+; const uint8_t *left)
+; r0 uint8_t *dst
+; r1 ptrdiff_t y_stride
+; r2 const uint8_t *above
+; r3 const uint8_t *left
+
+|vpx_v_predictor_8x8_neon| PROC
+ vld1.8 {d0}, [r2]
+ vst1.8 {d0}, [r0], r1
+ vst1.8 {d0}, [r0], r1
+ vst1.8 {d0}, [r0], r1
+ vst1.8 {d0}, [r0], r1
+ vst1.8 {d0}, [r0], r1
+ vst1.8 {d0}, [r0], r1
+ vst1.8 {d0}, [r0], r1
+ vst1.8 {d0}, [r0], r1
+ bx lr
+ ENDP ; |vpx_v_predictor_8x8_neon|
+
+;void vpx_v_predictor_16x16_neon(uint8_t *dst, ptrdiff_t y_stride,
+; const uint8_t *above,
+; const uint8_t *left)
+; r0 uint8_t *dst
+; r1 ptrdiff_t y_stride
+; r2 const uint8_t *above
+; r3 const uint8_t *left
+
+|vpx_v_predictor_16x16_neon| PROC
+ vld1.8 {q0}, [r2]
+ vst1.8 {q0}, [r0], r1
+ vst1.8 {q0}, [r0], r1
+ vst1.8 {q0}, [r0], r1
+ vst1.8 {q0}, [r0], r1
+ vst1.8 {q0}, [r0], r1
+ vst1.8 {q0}, [r0], r1
+ vst1.8 {q0}, [r0], r1
+ vst1.8 {q0}, [r0], r1
+ vst1.8 {q0}, [r0], r1
+ vst1.8 {q0}, [r0], r1
+ vst1.8 {q0}, [r0], r1
+ vst1.8 {q0}, [r0], r1
+ vst1.8 {q0}, [r0], r1
+ vst1.8 {q0}, [r0], r1
+ vst1.8 {q0}, [r0], r1
+ vst1.8 {q0}, [r0], r1
+ bx lr
+ ENDP ; |vpx_v_predictor_16x16_neon|
+
+;void vpx_v_predictor_32x32_neon(uint8_t *dst, ptrdiff_t y_stride,
+; const uint8_t *above,
+; const uint8_t *left)
+; r0 uint8_t *dst
+; r1 ptrdiff_t y_stride
+; r2 const uint8_t *above
+; r3 const uint8_t *left
+
+|vpx_v_predictor_32x32_neon| PROC
+ vld1.8 {q0, q1}, [r2]
+ mov r2, #2
+loop_v
+ vst1.8 {q0, q1}, [r0], r1
+ vst1.8 {q0, q1}, [r0], r1
+ vst1.8 {q0, q1}, [r0], r1
+ vst1.8 {q0, q1}, [r0], r1
+ vst1.8 {q0, q1}, [r0], r1
+ vst1.8 {q0, q1}, [r0], r1
+ vst1.8 {q0, q1}, [r0], r1
+ vst1.8 {q0, q1}, [r0], r1
+ vst1.8 {q0, q1}, [r0], r1
+ vst1.8 {q0, q1}, [r0], r1
+ vst1.8 {q0, q1}, [r0], r1
+ vst1.8 {q0, q1}, [r0], r1
+ vst1.8 {q0, q1}, [r0], r1
+ vst1.8 {q0, q1}, [r0], r1
+ vst1.8 {q0, q1}, [r0], r1
+ vst1.8 {q0, q1}, [r0], r1
+ subs r2, r2, #1
+ bgt loop_v
+ bx lr
+ ENDP ; |vpx_v_predictor_32x32_neon|
+
+;void vpx_h_predictor_4x4_neon(uint8_t *dst, ptrdiff_t y_stride,
+; const uint8_t *above,
+; const uint8_t *left)
+; r0 uint8_t *dst
+; r1 ptrdiff_t y_stride
+; r2 const uint8_t *above
+; r3 const uint8_t *left
+
+|vpx_h_predictor_4x4_neon| PROC
+ vld1.32 {d1[0]}, [r3]
+ vdup.8 d0, d1[0]
+ vst1.32 {d0[0]}, [r0], r1
+ vdup.8 d0, d1[1]
+ vst1.32 {d0[0]}, [r0], r1
+ vdup.8 d0, d1[2]
+ vst1.32 {d0[0]}, [r0], r1
+ vdup.8 d0, d1[3]
+ vst1.32 {d0[0]}, [r0], r1
+ bx lr
+ ENDP ; |vpx_h_predictor_4x4_neon|
+
+;void vpx_h_predictor_8x8_neon(uint8_t *dst, ptrdiff_t y_stride,
+; const uint8_t *above,
+; const uint8_t *left)
+; r0 uint8_t *dst
+; r1 ptrdiff_t y_stride
+; r2 const uint8_t *above
+; r3 const uint8_t *left
+
+|vpx_h_predictor_8x8_neon| PROC
+ vld1.64 {d1}, [r3]
+ vdup.8 d0, d1[0]
+ vst1.64 {d0}, [r0], r1
+ vdup.8 d0, d1[1]
+ vst1.64 {d0}, [r0], r1
+ vdup.8 d0, d1[2]
+ vst1.64 {d0}, [r0], r1
+ vdup.8 d0, d1[3]
+ vst1.64 {d0}, [r0], r1
+ vdup.8 d0, d1[4]
+ vst1.64 {d0}, [r0], r1
+ vdup.8 d0, d1[5]
+ vst1.64 {d0}, [r0], r1
+ vdup.8 d0, d1[6]
+ vst1.64 {d0}, [r0], r1
+ vdup.8 d0, d1[7]
+ vst1.64 {d0}, [r0], r1
+ bx lr
+ ENDP ; |vpx_h_predictor_8x8_neon|
+
+;void vpx_h_predictor_16x16_neon(uint8_t *dst, ptrdiff_t y_stride,
+; const uint8_t *above,
+; const uint8_t *left)
+; r0 uint8_t *dst
+; r1 ptrdiff_t y_stride
+; r2 const uint8_t *above
+; r3 const uint8_t *left
+
+|vpx_h_predictor_16x16_neon| PROC
+ vld1.8 {q1}, [r3]
+ vdup.8 q0, d2[0]
+ vst1.8 {q0}, [r0], r1
+ vdup.8 q0, d2[1]
+ vst1.8 {q0}, [r0], r1
+ vdup.8 q0, d2[2]
+ vst1.8 {q0}, [r0], r1
+ vdup.8 q0, d2[3]
+ vst1.8 {q0}, [r0], r1
+ vdup.8 q0, d2[4]
+ vst1.8 {q0}, [r0], r1
+ vdup.8 q0, d2[5]
+ vst1.8 {q0}, [r0], r1
+ vdup.8 q0, d2[6]
+ vst1.8 {q0}, [r0], r1
+ vdup.8 q0, d2[7]
+ vst1.8 {q0}, [r0], r1
+ vdup.8 q0, d3[0]
+ vst1.8 {q0}, [r0], r1
+ vdup.8 q0, d3[1]
+ vst1.8 {q0}, [r0], r1
+ vdup.8 q0, d3[2]
+ vst1.8 {q0}, [r0], r1
+ vdup.8 q0, d3[3]
+ vst1.8 {q0}, [r0], r1
+ vdup.8 q0, d3[4]
+ vst1.8 {q0}, [r0], r1
+ vdup.8 q0, d3[5]
+ vst1.8 {q0}, [r0], r1
+ vdup.8 q0, d3[6]
+ vst1.8 {q0}, [r0], r1
+ vdup.8 q0, d3[7]
+ vst1.8 {q0}, [r0], r1
+ bx lr
+ ENDP ; |vpx_h_predictor_16x16_neon|
+
+;void vpx_h_predictor_32x32_neon(uint8_t *dst, ptrdiff_t y_stride,
+; const uint8_t *above,
+; const uint8_t *left)
+; r0 uint8_t *dst
+; r1 ptrdiff_t y_stride
+; r2 const uint8_t *above
+; r3 const uint8_t *left
+
+|vpx_h_predictor_32x32_neon| PROC
+ sub r1, r1, #16
+ mov r2, #2
+loop_h
+ vld1.8 {q1}, [r3]!
+ vdup.8 q0, d2[0]
+ vst1.8 {q0}, [r0]!
+ vst1.8 {q0}, [r0], r1
+ vdup.8 q0, d2[1]
+ vst1.8 {q0}, [r0]!
+ vst1.8 {q0}, [r0], r1
+ vdup.8 q0, d2[2]
+ vst1.8 {q0}, [r0]!
+ vst1.8 {q0}, [r0], r1
+ vdup.8 q0, d2[3]
+ vst1.8 {q0}, [r0]!
+ vst1.8 {q0}, [r0], r1
+ vdup.8 q0, d2[4]
+ vst1.8 {q0}, [r0]!
+ vst1.8 {q0}, [r0], r1
+ vdup.8 q0, d2[5]
+ vst1.8 {q0}, [r0]!
+ vst1.8 {q0}, [r0], r1
+ vdup.8 q0, d2[6]
+ vst1.8 {q0}, [r0]!
+ vst1.8 {q0}, [r0], r1
+ vdup.8 q0, d2[7]
+ vst1.8 {q0}, [r0]!
+ vst1.8 {q0}, [r0], r1
+ vdup.8 q0, d3[0]
+ vst1.8 {q0}, [r0]!
+ vst1.8 {q0}, [r0], r1
+ vdup.8 q0, d3[1]
+ vst1.8 {q0}, [r0]!
+ vst1.8 {q0}, [r0], r1
+ vdup.8 q0, d3[2]
+ vst1.8 {q0}, [r0]!
+ vst1.8 {q0}, [r0], r1
+ vdup.8 q0, d3[3]
+ vst1.8 {q0}, [r0]!
+ vst1.8 {q0}, [r0], r1
+ vdup.8 q0, d3[4]
+ vst1.8 {q0}, [r0]!
+ vst1.8 {q0}, [r0], r1
+ vdup.8 q0, d3[5]
+ vst1.8 {q0}, [r0]!
+ vst1.8 {q0}, [r0], r1
+ vdup.8 q0, d3[6]
+ vst1.8 {q0}, [r0]!
+ vst1.8 {q0}, [r0], r1
+ vdup.8 q0, d3[7]
+ vst1.8 {q0}, [r0]!
+ vst1.8 {q0}, [r0], r1
+ subs r2, r2, #1
+ bgt loop_h
+ bx lr
+ ENDP ; |vpx_h_predictor_32x32_neon|
+
+;void vpx_tm_predictor_4x4_neon (uint8_t *dst, ptrdiff_t y_stride,
+; const uint8_t *above,
+; const uint8_t *left)
+; r0 uint8_t *dst
+; r1 ptrdiff_t y_stride
+; r2 const uint8_t *above
+; r3 const uint8_t *left
+
+|vpx_tm_predictor_4x4_neon| PROC
+ ; Load ytop_left = above[-1];
+ sub r12, r2, #1
+ vld1.u8 {d0[]}, [r12]
+
+ ; Load above 4 pixels
+ vld1.32 {d2[0]}, [r2]
+
+ ; Compute above - ytop_left
+ vsubl.u8 q3, d2, d0
+
+ ; Load left row by row and compute left + (above - ytop_left)
+ ; 1st row and 2nd row
+ vld1.u8 {d2[]}, [r3]!
+ vld1.u8 {d4[]}, [r3]!
+ vmovl.u8 q1, d2
+ vmovl.u8 q2, d4
+ vadd.s16 q1, q1, q3
+ vadd.s16 q2, q2, q3
+ vqmovun.s16 d0, q1
+ vqmovun.s16 d1, q2
+ vst1.32 {d0[0]}, [r0], r1
+ vst1.32 {d1[0]}, [r0], r1
+
+ ; 3rd row and 4th row
+ vld1.u8 {d2[]}, [r3]!
+ vld1.u8 {d4[]}, [r3]
+ vmovl.u8 q1, d2
+ vmovl.u8 q2, d4
+ vadd.s16 q1, q1, q3
+ vadd.s16 q2, q2, q3
+ vqmovun.s16 d0, q1
+ vqmovun.s16 d1, q2
+ vst1.32 {d0[0]}, [r0], r1
+ vst1.32 {d1[0]}, [r0], r1
+ bx lr
+ ENDP ; |vpx_tm_predictor_4x4_neon|
+
+;void vpx_tm_predictor_8x8_neon (uint8_t *dst, ptrdiff_t y_stride,
+; const uint8_t *above,
+; const uint8_t *left)
+; r0 uint8_t *dst
+; r1 ptrdiff_t y_stride
+; r2 const uint8_t *above
+; r3 const uint8_t *left
+
+|vpx_tm_predictor_8x8_neon| PROC
+ ; Load ytop_left = above[-1];
+ sub r12, r2, #1
+ vld1.8 {d0[]}, [r12]
+
+ ; preload 8 left
+ vld1.8 {d30}, [r3]
+
+ ; Load above 8 pixels
+ vld1.64 {d2}, [r2]
+
+ vmovl.u8 q10, d30
+
+ ; Compute above - ytop_left
+ vsubl.u8 q3, d2, d0
+
+ ; Load left row by row and compute left + (above - ytop_left)
+ ; 1st row and 2nd row
+ vdup.16 q0, d20[0]
+ vdup.16 q1, d20[1]
+ vadd.s16 q0, q3, q0
+ vadd.s16 q1, q3, q1
+
+ ; 3rd row and 4th row
+ vdup.16 q8, d20[2]
+ vdup.16 q9, d20[3]
+ vadd.s16 q8, q3, q8
+ vadd.s16 q9, q3, q9
+
+ vqmovun.s16 d0, q0
+ vqmovun.s16 d1, q1
+ vqmovun.s16 d2, q8
+ vqmovun.s16 d3, q9
+
+ vst1.64 {d0}, [r0], r1
+ vst1.64 {d1}, [r0], r1
+ vst1.64 {d2}, [r0], r1
+ vst1.64 {d3}, [r0], r1
+
+ ; 5th row and 6th row
+ vdup.16 q0, d21[0]
+ vdup.16 q1, d21[1]
+ vadd.s16 q0, q3, q0
+ vadd.s16 q1, q3, q1
+
+ ; 7th row and 8th row
+ vdup.16 q8, d21[2]
+ vdup.16 q9, d21[3]
+ vadd.s16 q8, q3, q8
+ vadd.s16 q9, q3, q9
+
+ vqmovun.s16 d0, q0
+ vqmovun.s16 d1, q1
+ vqmovun.s16 d2, q8
+ vqmovun.s16 d3, q9
+
+ vst1.64 {d0}, [r0], r1
+ vst1.64 {d1}, [r0], r1
+ vst1.64 {d2}, [r0], r1
+ vst1.64 {d3}, [r0], r1
+
+ bx lr
+ ENDP ; |vpx_tm_predictor_8x8_neon|
+
+;void vpx_tm_predictor_16x16_neon (uint8_t *dst, ptrdiff_t y_stride,
+; const uint8_t *above,
+; const uint8_t *left)
+; r0 uint8_t *dst
+; r1 ptrdiff_t y_stride
+; r2 const uint8_t *above
+; r3 const uint8_t *left
+
+|vpx_tm_predictor_16x16_neon| PROC
+ ; Load ytop_left = above[-1];
+ sub r12, r2, #1
+ vld1.8 {d0[]}, [r12]
+
+ ; Load above 8 pixels
+ vld1.8 {q1}, [r2]
+
+ ; preload 8 left into r12
+ vld1.8 {d18}, [r3]!
+
+ ; Compute above - ytop_left
+ vsubl.u8 q2, d2, d0
+ vsubl.u8 q3, d3, d0
+
+ vmovl.u8 q10, d18
+
+ ; Load left row by row and compute left + (above - ytop_left)
+ ; Process 8 rows in each single loop and loop 2 times to process 16 rows.
+ mov r2, #2
+
+loop_16x16_neon
+ ; Process two rows.
+ vdup.16 q0, d20[0]
+ vdup.16 q8, d20[1]
+ vadd.s16 q1, q0, q2
+ vadd.s16 q0, q0, q3
+ vadd.s16 q11, q8, q2
+ vadd.s16 q8, q8, q3
+ vqmovun.s16 d2, q1
+ vqmovun.s16 d3, q0
+ vqmovun.s16 d22, q11
+ vqmovun.s16 d23, q8
+ vdup.16 q0, d20[2] ; proload next 2 rows data
+ vdup.16 q8, d20[3]
+ vst1.64 {d2,d3}, [r0], r1
+ vst1.64 {d22,d23}, [r0], r1
+
+ ; Process two rows.
+ vadd.s16 q1, q0, q2
+ vadd.s16 q0, q0, q3
+ vadd.s16 q11, q8, q2
+ vadd.s16 q8, q8, q3
+ vqmovun.s16 d2, q1
+ vqmovun.s16 d3, q0
+ vqmovun.s16 d22, q11
+ vqmovun.s16 d23, q8
+ vdup.16 q0, d21[0] ; proload next 2 rows data
+ vdup.16 q8, d21[1]
+ vst1.64 {d2,d3}, [r0], r1
+ vst1.64 {d22,d23}, [r0], r1
+
+ vadd.s16 q1, q0, q2
+ vadd.s16 q0, q0, q3
+ vadd.s16 q11, q8, q2
+ vadd.s16 q8, q8, q3
+ vqmovun.s16 d2, q1
+ vqmovun.s16 d3, q0
+ vqmovun.s16 d22, q11
+ vqmovun.s16 d23, q8
+ vdup.16 q0, d21[2] ; proload next 2 rows data
+ vdup.16 q8, d21[3]
+ vst1.64 {d2,d3}, [r0], r1
+ vst1.64 {d22,d23}, [r0], r1
+
+
+ vadd.s16 q1, q0, q2
+ vadd.s16 q0, q0, q3
+ vadd.s16 q11, q8, q2
+ vadd.s16 q8, q8, q3
+ vqmovun.s16 d2, q1
+ vqmovun.s16 d3, q0
+ vqmovun.s16 d22, q11
+ vqmovun.s16 d23, q8
+ vld1.8 {d18}, [r3]! ; preload 8 left into r12
+ vmovl.u8 q10, d18
+ vst1.64 {d2,d3}, [r0], r1
+ vst1.64 {d22,d23}, [r0], r1
+
+ subs r2, r2, #1
+ bgt loop_16x16_neon
+
+ bx lr
+ ENDP ; |vpx_tm_predictor_16x16_neon|
+
+;void vpx_tm_predictor_32x32_neon (uint8_t *dst, ptrdiff_t y_stride,
+; const uint8_t *above,
+; const uint8_t *left)
+; r0 uint8_t *dst
+; r1 ptrdiff_t y_stride
+; r2 const uint8_t *above
+; r3 const uint8_t *left
+
+|vpx_tm_predictor_32x32_neon| PROC
+ ; Load ytop_left = above[-1];
+ sub r12, r2, #1
+ vld1.8 {d0[]}, [r12]
+
+ ; Load above 32 pixels
+ vld1.8 {q1}, [r2]!
+ vld1.8 {q2}, [r2]
+
+ ; preload 8 left pixels
+ vld1.8 {d26}, [r3]!
+
+ ; Compute above - ytop_left
+ vsubl.u8 q8, d2, d0
+ vsubl.u8 q9, d3, d0
+ vsubl.u8 q10, d4, d0
+ vsubl.u8 q11, d5, d0
+
+ vmovl.u8 q3, d26
+
+ ; Load left row by row and compute left + (above - ytop_left)
+ ; Process 8 rows in each single loop and loop 4 times to process 32 rows.
+ mov r2, #4
+
+loop_32x32_neon
+ ; Process two rows.
+ vdup.16 q0, d6[0]
+ vdup.16 q2, d6[1]
+ vadd.s16 q12, q0, q8
+ vadd.s16 q13, q0, q9
+ vadd.s16 q14, q0, q10
+ vadd.s16 q15, q0, q11
+ vqmovun.s16 d0, q12
+ vqmovun.s16 d1, q13
+ vadd.s16 q12, q2, q8
+ vadd.s16 q13, q2, q9
+ vqmovun.s16 d2, q14
+ vqmovun.s16 d3, q15
+ vadd.s16 q14, q2, q10
+ vadd.s16 q15, q2, q11
+ vst1.64 {d0-d3}, [r0], r1
+ vqmovun.s16 d24, q12
+ vqmovun.s16 d25, q13
+ vqmovun.s16 d26, q14
+ vqmovun.s16 d27, q15
+ vdup.16 q1, d6[2]
+ vdup.16 q2, d6[3]
+ vst1.64 {d24-d27}, [r0], r1
+
+ ; Process two rows.
+ vadd.s16 q12, q1, q8
+ vadd.s16 q13, q1, q9
+ vadd.s16 q14, q1, q10
+ vadd.s16 q15, q1, q11
+ vqmovun.s16 d0, q12
+ vqmovun.s16 d1, q13
+ vadd.s16 q12, q2, q8
+ vadd.s16 q13, q2, q9
+ vqmovun.s16 d2, q14
+ vqmovun.s16 d3, q15
+ vadd.s16 q14, q2, q10
+ vadd.s16 q15, q2, q11
+ vst1.64 {d0-d3}, [r0], r1
+ vqmovun.s16 d24, q12
+ vqmovun.s16 d25, q13
+ vqmovun.s16 d26, q14
+ vqmovun.s16 d27, q15
+ vdup.16 q0, d7[0]
+ vdup.16 q2, d7[1]
+ vst1.64 {d24-d27}, [r0], r1
+
+ ; Process two rows.
+ vadd.s16 q12, q0, q8
+ vadd.s16 q13, q0, q9
+ vadd.s16 q14, q0, q10
+ vadd.s16 q15, q0, q11
+ vqmovun.s16 d0, q12
+ vqmovun.s16 d1, q13
+ vadd.s16 q12, q2, q8
+ vadd.s16 q13, q2, q9
+ vqmovun.s16 d2, q14
+ vqmovun.s16 d3, q15
+ vadd.s16 q14, q2, q10
+ vadd.s16 q15, q2, q11
+ vst1.64 {d0-d3}, [r0], r1
+ vqmovun.s16 d24, q12
+ vqmovun.s16 d25, q13
+ vqmovun.s16 d26, q14
+ vqmovun.s16 d27, q15
+ vdup.16 q0, d7[2]
+ vdup.16 q2, d7[3]
+ vst1.64 {d24-d27}, [r0], r1
+
+ ; Process two rows.
+ vadd.s16 q12, q0, q8
+ vadd.s16 q13, q0, q9
+ vadd.s16 q14, q0, q10
+ vadd.s16 q15, q0, q11
+ vqmovun.s16 d0, q12
+ vqmovun.s16 d1, q13
+ vadd.s16 q12, q2, q8
+ vadd.s16 q13, q2, q9
+ vqmovun.s16 d2, q14
+ vqmovun.s16 d3, q15
+ vadd.s16 q14, q2, q10
+ vadd.s16 q15, q2, q11
+ vst1.64 {d0-d3}, [r0], r1
+ vqmovun.s16 d24, q12
+ vqmovun.s16 d25, q13
+ vld1.8 {d0}, [r3]! ; preload 8 left pixels
+ vqmovun.s16 d26, q14
+ vqmovun.s16 d27, q15
+ vmovl.u8 q3, d0
+ vst1.64 {d24-d27}, [r0], r1
+
+ subs r2, r2, #1
+ bgt loop_32x32_neon
+
+ bx lr
+ ENDP ; |vpx_tm_predictor_32x32_neon|
+
+ END
diff --git a/thirdparty/libvpx/vpx_dsp/arm/loopfilter_16_neon.c b/thirdparty/libvpx/vpx_dsp/arm/loopfilter_16_neon.c
new file mode 100644
index 0000000000..d24e6adc8a
--- /dev/null
+++ b/thirdparty/libvpx/vpx_dsp/arm/loopfilter_16_neon.c
@@ -0,0 +1,179 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "./vpx_config.h"
+#include "vpx/vpx_integer.h"
+
+static INLINE void loop_filter_neon_16(
+ uint8x16_t qblimit, // blimit
+ uint8x16_t qlimit, // limit
+ uint8x16_t qthresh, // thresh
+ uint8x16_t q3, // p3
+ uint8x16_t q4, // p2
+ uint8x16_t q5, // p1
+ uint8x16_t q6, // p0
+ uint8x16_t q7, // q0
+ uint8x16_t q8, // q1
+ uint8x16_t q9, // q2
+ uint8x16_t q10, // q3
+ uint8x16_t *q5r, // p1
+ uint8x16_t *q6r, // p0
+ uint8x16_t *q7r, // q0
+ uint8x16_t *q8r) { // q1
+ uint8x16_t q1u8, q2u8, q11u8, q12u8, q13u8, q14u8, q15u8;
+ int16x8_t q2s16, q11s16;
+ uint16x8_t q4u16;
+ int8x16_t q0s8, q1s8, q2s8, q11s8, q12s8, q13s8;
+ int8x8_t d2s8, d3s8;
+
+ q11u8 = vabdq_u8(q3, q4);
+ q12u8 = vabdq_u8(q4, q5);
+ q13u8 = vabdq_u8(q5, q6);
+ q14u8 = vabdq_u8(q8, q7);
+ q3 = vabdq_u8(q9, q8);
+ q4 = vabdq_u8(q10, q9);
+
+ q11u8 = vmaxq_u8(q11u8, q12u8);
+ q12u8 = vmaxq_u8(q13u8, q14u8);
+ q3 = vmaxq_u8(q3, q4);
+ q15u8 = vmaxq_u8(q11u8, q12u8);
+
+ q9 = vabdq_u8(q6, q7);
+
+ // vp8_hevmask
+ q13u8 = vcgtq_u8(q13u8, qthresh);
+ q14u8 = vcgtq_u8(q14u8, qthresh);
+ q15u8 = vmaxq_u8(q15u8, q3);
+
+ q2u8 = vabdq_u8(q5, q8);
+ q9 = vqaddq_u8(q9, q9);
+
+ q15u8 = vcgeq_u8(qlimit, q15u8);
+
+ // vp8_filter() function
+ // convert to signed
+ q10 = vdupq_n_u8(0x80);
+ q8 = veorq_u8(q8, q10);
+ q7 = veorq_u8(q7, q10);
+ q6 = veorq_u8(q6, q10);
+ q5 = veorq_u8(q5, q10);
+
+ q2u8 = vshrq_n_u8(q2u8, 1);
+ q9 = vqaddq_u8(q9, q2u8);
+
+ q2s16 = vsubl_s8(vget_low_s8(vreinterpretq_s8_u8(q7)),
+ vget_low_s8(vreinterpretq_s8_u8(q6)));
+ q11s16 = vsubl_s8(vget_high_s8(vreinterpretq_s8_u8(q7)),
+ vget_high_s8(vreinterpretq_s8_u8(q6)));
+
+ q9 = vcgeq_u8(qblimit, q9);
+
+ q1s8 = vqsubq_s8(vreinterpretq_s8_u8(q5),
+ vreinterpretq_s8_u8(q8));
+
+ q14u8 = vorrq_u8(q13u8, q14u8);
+
+ q4u16 = vdupq_n_u16(3);
+ q2s16 = vmulq_s16(q2s16, vreinterpretq_s16_u16(q4u16));
+ q11s16 = vmulq_s16(q11s16, vreinterpretq_s16_u16(q4u16));
+
+ q1u8 = vandq_u8(vreinterpretq_u8_s8(q1s8), q14u8);
+ q15u8 = vandq_u8(q15u8, q9);
+
+ q1s8 = vreinterpretq_s8_u8(q1u8);
+ q2s16 = vaddw_s8(q2s16, vget_low_s8(q1s8));
+ q11s16 = vaddw_s8(q11s16, vget_high_s8(q1s8));
+
+ q4 = vdupq_n_u8(3);
+ q9 = vdupq_n_u8(4);
+ // vp8_filter = clamp(vp8_filter + 3 * ( qs0 - ps0))
+ d2s8 = vqmovn_s16(q2s16);
+ d3s8 = vqmovn_s16(q11s16);
+ q1s8 = vcombine_s8(d2s8, d3s8);
+ q1u8 = vandq_u8(vreinterpretq_u8_s8(q1s8), q15u8);
+ q1s8 = vreinterpretq_s8_u8(q1u8);
+
+ q2s8 = vqaddq_s8(q1s8, vreinterpretq_s8_u8(q4));
+ q1s8 = vqaddq_s8(q1s8, vreinterpretq_s8_u8(q9));
+ q2s8 = vshrq_n_s8(q2s8, 3);
+ q1s8 = vshrq_n_s8(q1s8, 3);
+
+ q11s8 = vqaddq_s8(vreinterpretq_s8_u8(q6), q2s8);
+ q0s8 = vqsubq_s8(vreinterpretq_s8_u8(q7), q1s8);
+
+ q1s8 = vrshrq_n_s8(q1s8, 1);
+ q1s8 = vbicq_s8(q1s8, vreinterpretq_s8_u8(q14u8));
+
+ q13s8 = vqaddq_s8(vreinterpretq_s8_u8(q5), q1s8);
+ q12s8 = vqsubq_s8(vreinterpretq_s8_u8(q8), q1s8);
+
+ *q8r = veorq_u8(vreinterpretq_u8_s8(q12s8), q10);
+ *q7r = veorq_u8(vreinterpretq_u8_s8(q0s8), q10);
+ *q6r = veorq_u8(vreinterpretq_u8_s8(q11s8), q10);
+ *q5r = veorq_u8(vreinterpretq_u8_s8(q13s8), q10);
+ return;
+}
+
+void vpx_lpf_horizontal_4_dual_neon(uint8_t *s, int p /* pitch */,
+ const uint8_t *blimit0,
+ const uint8_t *limit0,
+ const uint8_t *thresh0,
+ const uint8_t *blimit1,
+ const uint8_t *limit1,
+ const uint8_t *thresh1) {
+ uint8x8_t dblimit0, dlimit0, dthresh0, dblimit1, dlimit1, dthresh1;
+ uint8x16_t qblimit, qlimit, qthresh;
+ uint8x16_t q3u8, q4u8, q5u8, q6u8, q7u8, q8u8, q9u8, q10u8;
+
+ dblimit0 = vld1_u8(blimit0);
+ dlimit0 = vld1_u8(limit0);
+ dthresh0 = vld1_u8(thresh0);
+ dblimit1 = vld1_u8(blimit1);
+ dlimit1 = vld1_u8(limit1);
+ dthresh1 = vld1_u8(thresh1);
+ qblimit = vcombine_u8(dblimit0, dblimit1);
+ qlimit = vcombine_u8(dlimit0, dlimit1);
+ qthresh = vcombine_u8(dthresh0, dthresh1);
+
+ s -= (p << 2);
+
+ q3u8 = vld1q_u8(s);
+ s += p;
+ q4u8 = vld1q_u8(s);
+ s += p;
+ q5u8 = vld1q_u8(s);
+ s += p;
+ q6u8 = vld1q_u8(s);
+ s += p;
+ q7u8 = vld1q_u8(s);
+ s += p;
+ q8u8 = vld1q_u8(s);
+ s += p;
+ q9u8 = vld1q_u8(s);
+ s += p;
+ q10u8 = vld1q_u8(s);
+
+ loop_filter_neon_16(qblimit, qlimit, qthresh,
+ q3u8, q4u8, q5u8, q6u8, q7u8, q8u8, q9u8, q10u8,
+ &q5u8, &q6u8, &q7u8, &q8u8);
+
+ s -= (p * 5);
+ vst1q_u8(s, q5u8);
+ s += p;
+ vst1q_u8(s, q6u8);
+ s += p;
+ vst1q_u8(s, q7u8);
+ s += p;
+ vst1q_u8(s, q8u8);
+ return;
+}
diff --git a/thirdparty/libvpx/vpx_dsp/arm/loopfilter_4_neon.c b/thirdparty/libvpx/vpx_dsp/arm/loopfilter_4_neon.c
new file mode 100644
index 0000000000..7f3ee70b94
--- /dev/null
+++ b/thirdparty/libvpx/vpx_dsp/arm/loopfilter_4_neon.c
@@ -0,0 +1,266 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_dsp_rtcd.h"
+
+static INLINE void loop_filter_neon(
+ uint8x8_t dblimit, // flimit
+ uint8x8_t dlimit, // limit
+ uint8x8_t dthresh, // thresh
+ uint8x8_t d3u8, // p3
+ uint8x8_t d4u8, // p2
+ uint8x8_t d5u8, // p1
+ uint8x8_t d6u8, // p0
+ uint8x8_t d7u8, // q0
+ uint8x8_t d16u8, // q1
+ uint8x8_t d17u8, // q2
+ uint8x8_t d18u8, // q3
+ uint8x8_t *d4ru8, // p1
+ uint8x8_t *d5ru8, // p0
+ uint8x8_t *d6ru8, // q0
+ uint8x8_t *d7ru8) { // q1
+ uint8x8_t d19u8, d20u8, d21u8, d22u8, d23u8, d27u8, d28u8;
+ int16x8_t q12s16;
+ int8x8_t d19s8, d20s8, d21s8, d26s8, d27s8, d28s8;
+
+ d19u8 = vabd_u8(d3u8, d4u8);
+ d20u8 = vabd_u8(d4u8, d5u8);
+ d21u8 = vabd_u8(d5u8, d6u8);
+ d22u8 = vabd_u8(d16u8, d7u8);
+ d3u8 = vabd_u8(d17u8, d16u8);
+ d4u8 = vabd_u8(d18u8, d17u8);
+
+ d19u8 = vmax_u8(d19u8, d20u8);
+ d20u8 = vmax_u8(d21u8, d22u8);
+ d3u8 = vmax_u8(d3u8, d4u8);
+ d23u8 = vmax_u8(d19u8, d20u8);
+
+ d17u8 = vabd_u8(d6u8, d7u8);
+
+ d21u8 = vcgt_u8(d21u8, dthresh);
+ d22u8 = vcgt_u8(d22u8, dthresh);
+ d23u8 = vmax_u8(d23u8, d3u8);
+
+ d28u8 = vabd_u8(d5u8, d16u8);
+ d17u8 = vqadd_u8(d17u8, d17u8);
+
+ d23u8 = vcge_u8(dlimit, d23u8);
+
+ d18u8 = vdup_n_u8(0x80);
+ d5u8 = veor_u8(d5u8, d18u8);
+ d6u8 = veor_u8(d6u8, d18u8);
+ d7u8 = veor_u8(d7u8, d18u8);
+ d16u8 = veor_u8(d16u8, d18u8);
+
+ d28u8 = vshr_n_u8(d28u8, 1);
+ d17u8 = vqadd_u8(d17u8, d28u8);
+
+ d19u8 = vdup_n_u8(3);
+
+ d28s8 = vsub_s8(vreinterpret_s8_u8(d7u8),
+ vreinterpret_s8_u8(d6u8));
+
+ d17u8 = vcge_u8(dblimit, d17u8);
+
+ d27s8 = vqsub_s8(vreinterpret_s8_u8(d5u8),
+ vreinterpret_s8_u8(d16u8));
+
+ d22u8 = vorr_u8(d21u8, d22u8);
+
+ q12s16 = vmull_s8(d28s8, vreinterpret_s8_u8(d19u8));
+
+ d27u8 = vand_u8(vreinterpret_u8_s8(d27s8), d22u8);
+ d23u8 = vand_u8(d23u8, d17u8);
+
+ q12s16 = vaddw_s8(q12s16, vreinterpret_s8_u8(d27u8));
+
+ d17u8 = vdup_n_u8(4);
+
+ d27s8 = vqmovn_s16(q12s16);
+ d27u8 = vand_u8(vreinterpret_u8_s8(d27s8), d23u8);
+ d27s8 = vreinterpret_s8_u8(d27u8);
+
+ d28s8 = vqadd_s8(d27s8, vreinterpret_s8_u8(d19u8));
+ d27s8 = vqadd_s8(d27s8, vreinterpret_s8_u8(d17u8));
+ d28s8 = vshr_n_s8(d28s8, 3);
+ d27s8 = vshr_n_s8(d27s8, 3);
+
+ d19s8 = vqadd_s8(vreinterpret_s8_u8(d6u8), d28s8);
+ d26s8 = vqsub_s8(vreinterpret_s8_u8(d7u8), d27s8);
+
+ d27s8 = vrshr_n_s8(d27s8, 1);
+ d27s8 = vbic_s8(d27s8, vreinterpret_s8_u8(d22u8));
+
+ d21s8 = vqadd_s8(vreinterpret_s8_u8(d5u8), d27s8);
+ d20s8 = vqsub_s8(vreinterpret_s8_u8(d16u8), d27s8);
+
+ *d4ru8 = veor_u8(vreinterpret_u8_s8(d21s8), d18u8);
+ *d5ru8 = veor_u8(vreinterpret_u8_s8(d19s8), d18u8);
+ *d6ru8 = veor_u8(vreinterpret_u8_s8(d26s8), d18u8);
+ *d7ru8 = veor_u8(vreinterpret_u8_s8(d20s8), d18u8);
+ return;
+}
+
+void vpx_lpf_horizontal_4_neon(
+ uint8_t *src,
+ int pitch,
+ const uint8_t *blimit,
+ const uint8_t *limit,
+ const uint8_t *thresh) {
+ int i;
+ uint8_t *s, *psrc;
+ uint8x8_t dblimit, dlimit, dthresh;
+ uint8x8_t d3u8, d4u8, d5u8, d6u8, d7u8, d16u8, d17u8, d18u8;
+
+ dblimit = vld1_u8(blimit);
+ dlimit = vld1_u8(limit);
+ dthresh = vld1_u8(thresh);
+
+ psrc = src - (pitch << 2);
+ for (i = 0; i < 1; i++) {
+ s = psrc + i * 8;
+
+ d3u8 = vld1_u8(s);
+ s += pitch;
+ d4u8 = vld1_u8(s);
+ s += pitch;
+ d5u8 = vld1_u8(s);
+ s += pitch;
+ d6u8 = vld1_u8(s);
+ s += pitch;
+ d7u8 = vld1_u8(s);
+ s += pitch;
+ d16u8 = vld1_u8(s);
+ s += pitch;
+ d17u8 = vld1_u8(s);
+ s += pitch;
+ d18u8 = vld1_u8(s);
+
+ loop_filter_neon(dblimit, dlimit, dthresh,
+ d3u8, d4u8, d5u8, d6u8, d7u8, d16u8, d17u8, d18u8,
+ &d4u8, &d5u8, &d6u8, &d7u8);
+
+ s -= (pitch * 5);
+ vst1_u8(s, d4u8);
+ s += pitch;
+ vst1_u8(s, d5u8);
+ s += pitch;
+ vst1_u8(s, d6u8);
+ s += pitch;
+ vst1_u8(s, d7u8);
+ }
+ return;
+}
+
+void vpx_lpf_vertical_4_neon(
+ uint8_t *src,
+ int pitch,
+ const uint8_t *blimit,
+ const uint8_t *limit,
+ const uint8_t *thresh) {
+ int i, pitch8;
+ uint8_t *s;
+ uint8x8_t dblimit, dlimit, dthresh;
+ uint8x8_t d3u8, d4u8, d5u8, d6u8, d7u8, d16u8, d17u8, d18u8;
+ uint32x2x2_t d2tmp0, d2tmp1, d2tmp2, d2tmp3;
+ uint16x4x2_t d2tmp4, d2tmp5, d2tmp6, d2tmp7;
+ uint8x8x2_t d2tmp8, d2tmp9, d2tmp10, d2tmp11;
+ uint8x8x4_t d4Result;
+
+ dblimit = vld1_u8(blimit);
+ dlimit = vld1_u8(limit);
+ dthresh = vld1_u8(thresh);
+
+ pitch8 = pitch * 8;
+ for (i = 0; i < 1; i++, src += pitch8) {
+ s = src - (i + 1) * 4;
+
+ d3u8 = vld1_u8(s);
+ s += pitch;
+ d4u8 = vld1_u8(s);
+ s += pitch;
+ d5u8 = vld1_u8(s);
+ s += pitch;
+ d6u8 = vld1_u8(s);
+ s += pitch;
+ d7u8 = vld1_u8(s);
+ s += pitch;
+ d16u8 = vld1_u8(s);
+ s += pitch;
+ d17u8 = vld1_u8(s);
+ s += pitch;
+ d18u8 = vld1_u8(s);
+
+ d2tmp0 = vtrn_u32(vreinterpret_u32_u8(d3u8),
+ vreinterpret_u32_u8(d7u8));
+ d2tmp1 = vtrn_u32(vreinterpret_u32_u8(d4u8),
+ vreinterpret_u32_u8(d16u8));
+ d2tmp2 = vtrn_u32(vreinterpret_u32_u8(d5u8),
+ vreinterpret_u32_u8(d17u8));
+ d2tmp3 = vtrn_u32(vreinterpret_u32_u8(d6u8),
+ vreinterpret_u32_u8(d18u8));
+
+ d2tmp4 = vtrn_u16(vreinterpret_u16_u32(d2tmp0.val[0]),
+ vreinterpret_u16_u32(d2tmp2.val[0]));
+ d2tmp5 = vtrn_u16(vreinterpret_u16_u32(d2tmp1.val[0]),
+ vreinterpret_u16_u32(d2tmp3.val[0]));
+ d2tmp6 = vtrn_u16(vreinterpret_u16_u32(d2tmp0.val[1]),
+ vreinterpret_u16_u32(d2tmp2.val[1]));
+ d2tmp7 = vtrn_u16(vreinterpret_u16_u32(d2tmp1.val[1]),
+ vreinterpret_u16_u32(d2tmp3.val[1]));
+
+ d2tmp8 = vtrn_u8(vreinterpret_u8_u16(d2tmp4.val[0]),
+ vreinterpret_u8_u16(d2tmp5.val[0]));
+ d2tmp9 = vtrn_u8(vreinterpret_u8_u16(d2tmp4.val[1]),
+ vreinterpret_u8_u16(d2tmp5.val[1]));
+ d2tmp10 = vtrn_u8(vreinterpret_u8_u16(d2tmp6.val[0]),
+ vreinterpret_u8_u16(d2tmp7.val[0]));
+ d2tmp11 = vtrn_u8(vreinterpret_u8_u16(d2tmp6.val[1]),
+ vreinterpret_u8_u16(d2tmp7.val[1]));
+
+ d3u8 = d2tmp8.val[0];
+ d4u8 = d2tmp8.val[1];
+ d5u8 = d2tmp9.val[0];
+ d6u8 = d2tmp9.val[1];
+ d7u8 = d2tmp10.val[0];
+ d16u8 = d2tmp10.val[1];
+ d17u8 = d2tmp11.val[0];
+ d18u8 = d2tmp11.val[1];
+
+ loop_filter_neon(dblimit, dlimit, dthresh,
+ d3u8, d4u8, d5u8, d6u8, d7u8, d16u8, d17u8, d18u8,
+ &d4u8, &d5u8, &d6u8, &d7u8);
+
+ d4Result.val[0] = d4u8;
+ d4Result.val[1] = d5u8;
+ d4Result.val[2] = d6u8;
+ d4Result.val[3] = d7u8;
+
+ src -= 2;
+ vst4_lane_u8(src, d4Result, 0);
+ src += pitch;
+ vst4_lane_u8(src, d4Result, 1);
+ src += pitch;
+ vst4_lane_u8(src, d4Result, 2);
+ src += pitch;
+ vst4_lane_u8(src, d4Result, 3);
+ src += pitch;
+ vst4_lane_u8(src, d4Result, 4);
+ src += pitch;
+ vst4_lane_u8(src, d4Result, 5);
+ src += pitch;
+ vst4_lane_u8(src, d4Result, 6);
+ src += pitch;
+ vst4_lane_u8(src, d4Result, 7);
+ }
+ return;
+}
diff --git a/thirdparty/libvpx/vpx_dsp/arm/loopfilter_8_neon.c b/thirdparty/libvpx/vpx_dsp/arm/loopfilter_8_neon.c
new file mode 100644
index 0000000000..ec3757380d
--- /dev/null
+++ b/thirdparty/libvpx/vpx_dsp/arm/loopfilter_8_neon.c
@@ -0,0 +1,445 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_dsp_rtcd.h"
+
+static INLINE void mbloop_filter_neon(
+ uint8x8_t dblimit, // mblimit
+ uint8x8_t dlimit, // limit
+ uint8x8_t dthresh, // thresh
+ uint8x8_t d3u8, // p2
+ uint8x8_t d4u8, // p2
+ uint8x8_t d5u8, // p1
+ uint8x8_t d6u8, // p0
+ uint8x8_t d7u8, // q0
+ uint8x8_t d16u8, // q1
+ uint8x8_t d17u8, // q2
+ uint8x8_t d18u8, // q3
+ uint8x8_t *d0ru8, // p1
+ uint8x8_t *d1ru8, // p1
+ uint8x8_t *d2ru8, // p0
+ uint8x8_t *d3ru8, // q0
+ uint8x8_t *d4ru8, // q1
+ uint8x8_t *d5ru8) { // q1
+ uint32_t flat;
+ uint8x8_t d0u8, d1u8, d2u8, d19u8, d20u8, d21u8, d22u8, d23u8, d24u8;
+ uint8x8_t d25u8, d26u8, d27u8, d28u8, d29u8, d30u8, d31u8;
+ int16x8_t q15s16;
+ uint16x8_t q10u16, q14u16;
+ int8x8_t d21s8, d24s8, d25s8, d26s8, d28s8, d29s8, d30s8;
+
+ d19u8 = vabd_u8(d3u8, d4u8);
+ d20u8 = vabd_u8(d4u8, d5u8);
+ d21u8 = vabd_u8(d5u8, d6u8);
+ d22u8 = vabd_u8(d16u8, d7u8);
+ d23u8 = vabd_u8(d17u8, d16u8);
+ d24u8 = vabd_u8(d18u8, d17u8);
+
+ d19u8 = vmax_u8(d19u8, d20u8);
+ d20u8 = vmax_u8(d21u8, d22u8);
+
+ d25u8 = vabd_u8(d6u8, d4u8);
+
+ d23u8 = vmax_u8(d23u8, d24u8);
+
+ d26u8 = vabd_u8(d7u8, d17u8);
+
+ d19u8 = vmax_u8(d19u8, d20u8);
+
+ d24u8 = vabd_u8(d6u8, d7u8);
+ d27u8 = vabd_u8(d3u8, d6u8);
+ d28u8 = vabd_u8(d18u8, d7u8);
+
+ d19u8 = vmax_u8(d19u8, d23u8);
+
+ d23u8 = vabd_u8(d5u8, d16u8);
+ d24u8 = vqadd_u8(d24u8, d24u8);
+
+
+ d19u8 = vcge_u8(dlimit, d19u8);
+
+
+ d25u8 = vmax_u8(d25u8, d26u8);
+ d26u8 = vmax_u8(d27u8, d28u8);
+
+ d23u8 = vshr_n_u8(d23u8, 1);
+
+ d25u8 = vmax_u8(d25u8, d26u8);
+
+ d24u8 = vqadd_u8(d24u8, d23u8);
+
+ d20u8 = vmax_u8(d20u8, d25u8);
+
+ d23u8 = vdup_n_u8(1);
+ d24u8 = vcge_u8(dblimit, d24u8);
+
+ d21u8 = vcgt_u8(d21u8, dthresh);
+
+ d20u8 = vcge_u8(d23u8, d20u8);
+
+ d19u8 = vand_u8(d19u8, d24u8);
+
+ d23u8 = vcgt_u8(d22u8, dthresh);
+
+ d20u8 = vand_u8(d20u8, d19u8);
+
+ d22u8 = vdup_n_u8(0x80);
+
+ d23u8 = vorr_u8(d21u8, d23u8);
+
+ q10u16 = vcombine_u16(vreinterpret_u16_u8(d20u8),
+ vreinterpret_u16_u8(d21u8));
+
+ d30u8 = vshrn_n_u16(q10u16, 4);
+ flat = vget_lane_u32(vreinterpret_u32_u8(d30u8), 0);
+
+ if (flat == 0xffffffff) { // Check for all 1's, power_branch_only
+ d27u8 = vdup_n_u8(3);
+ d21u8 = vdup_n_u8(2);
+ q14u16 = vaddl_u8(d6u8, d7u8);
+ q14u16 = vmlal_u8(q14u16, d3u8, d27u8);
+ q14u16 = vmlal_u8(q14u16, d4u8, d21u8);
+ q14u16 = vaddw_u8(q14u16, d5u8);
+ *d0ru8 = vqrshrn_n_u16(q14u16, 3);
+
+ q14u16 = vsubw_u8(q14u16, d3u8);
+ q14u16 = vsubw_u8(q14u16, d4u8);
+ q14u16 = vaddw_u8(q14u16, d5u8);
+ q14u16 = vaddw_u8(q14u16, d16u8);
+ *d1ru8 = vqrshrn_n_u16(q14u16, 3);
+
+ q14u16 = vsubw_u8(q14u16, d3u8);
+ q14u16 = vsubw_u8(q14u16, d5u8);
+ q14u16 = vaddw_u8(q14u16, d6u8);
+ q14u16 = vaddw_u8(q14u16, d17u8);
+ *d2ru8 = vqrshrn_n_u16(q14u16, 3);
+
+ q14u16 = vsubw_u8(q14u16, d3u8);
+ q14u16 = vsubw_u8(q14u16, d6u8);
+ q14u16 = vaddw_u8(q14u16, d7u8);
+ q14u16 = vaddw_u8(q14u16, d18u8);
+ *d3ru8 = vqrshrn_n_u16(q14u16, 3);
+
+ q14u16 = vsubw_u8(q14u16, d4u8);
+ q14u16 = vsubw_u8(q14u16, d7u8);
+ q14u16 = vaddw_u8(q14u16, d16u8);
+ q14u16 = vaddw_u8(q14u16, d18u8);
+ *d4ru8 = vqrshrn_n_u16(q14u16, 3);
+
+ q14u16 = vsubw_u8(q14u16, d5u8);
+ q14u16 = vsubw_u8(q14u16, d16u8);
+ q14u16 = vaddw_u8(q14u16, d17u8);
+ q14u16 = vaddw_u8(q14u16, d18u8);
+ *d5ru8 = vqrshrn_n_u16(q14u16, 3);
+ } else {
+ d21u8 = veor_u8(d7u8, d22u8);
+ d24u8 = veor_u8(d6u8, d22u8);
+ d25u8 = veor_u8(d5u8, d22u8);
+ d26u8 = veor_u8(d16u8, d22u8);
+
+ d27u8 = vdup_n_u8(3);
+
+ d28s8 = vsub_s8(vreinterpret_s8_u8(d21u8), vreinterpret_s8_u8(d24u8));
+ d29s8 = vqsub_s8(vreinterpret_s8_u8(d25u8), vreinterpret_s8_u8(d26u8));
+
+ q15s16 = vmull_s8(d28s8, vreinterpret_s8_u8(d27u8));
+
+ d29s8 = vand_s8(d29s8, vreinterpret_s8_u8(d23u8));
+
+ q15s16 = vaddw_s8(q15s16, d29s8);
+
+ d29u8 = vdup_n_u8(4);
+
+ d28s8 = vqmovn_s16(q15s16);
+
+ d28s8 = vand_s8(d28s8, vreinterpret_s8_u8(d19u8));
+
+ d30s8 = vqadd_s8(d28s8, vreinterpret_s8_u8(d27u8));
+ d29s8 = vqadd_s8(d28s8, vreinterpret_s8_u8(d29u8));
+ d30s8 = vshr_n_s8(d30s8, 3);
+ d29s8 = vshr_n_s8(d29s8, 3);
+
+ d24s8 = vqadd_s8(vreinterpret_s8_u8(d24u8), d30s8);
+ d21s8 = vqsub_s8(vreinterpret_s8_u8(d21u8), d29s8);
+
+ d29s8 = vrshr_n_s8(d29s8, 1);
+ d29s8 = vbic_s8(d29s8, vreinterpret_s8_u8(d23u8));
+
+ d25s8 = vqadd_s8(vreinterpret_s8_u8(d25u8), d29s8);
+ d26s8 = vqsub_s8(vreinterpret_s8_u8(d26u8), d29s8);
+
+ if (flat == 0) { // filter_branch_only
+ *d0ru8 = d4u8;
+ *d1ru8 = veor_u8(vreinterpret_u8_s8(d25s8), d22u8);
+ *d2ru8 = veor_u8(vreinterpret_u8_s8(d24s8), d22u8);
+ *d3ru8 = veor_u8(vreinterpret_u8_s8(d21s8), d22u8);
+ *d4ru8 = veor_u8(vreinterpret_u8_s8(d26s8), d22u8);
+ *d5ru8 = d17u8;
+ return;
+ }
+
+ d21u8 = veor_u8(vreinterpret_u8_s8(d21s8), d22u8);
+ d24u8 = veor_u8(vreinterpret_u8_s8(d24s8), d22u8);
+ d25u8 = veor_u8(vreinterpret_u8_s8(d25s8), d22u8);
+ d26u8 = veor_u8(vreinterpret_u8_s8(d26s8), d22u8);
+
+ d23u8 = vdup_n_u8(2);
+ q14u16 = vaddl_u8(d6u8, d7u8);
+ q14u16 = vmlal_u8(q14u16, d3u8, d27u8);
+ q14u16 = vmlal_u8(q14u16, d4u8, d23u8);
+
+ d0u8 = vbsl_u8(d20u8, dblimit, d4u8);
+
+ q14u16 = vaddw_u8(q14u16, d5u8);
+
+ d1u8 = vbsl_u8(d20u8, dlimit, d25u8);
+
+ d30u8 = vqrshrn_n_u16(q14u16, 3);
+
+ q14u16 = vsubw_u8(q14u16, d3u8);
+ q14u16 = vsubw_u8(q14u16, d4u8);
+ q14u16 = vaddw_u8(q14u16, d5u8);
+ q14u16 = vaddw_u8(q14u16, d16u8);
+
+ d2u8 = vbsl_u8(d20u8, dthresh, d24u8);
+
+ d31u8 = vqrshrn_n_u16(q14u16, 3);
+
+ q14u16 = vsubw_u8(q14u16, d3u8);
+ q14u16 = vsubw_u8(q14u16, d5u8);
+ q14u16 = vaddw_u8(q14u16, d6u8);
+ q14u16 = vaddw_u8(q14u16, d17u8);
+
+ *d0ru8 = vbsl_u8(d20u8, d30u8, d0u8);
+
+ d23u8 = vqrshrn_n_u16(q14u16, 3);
+
+ q14u16 = vsubw_u8(q14u16, d3u8);
+ q14u16 = vsubw_u8(q14u16, d6u8);
+ q14u16 = vaddw_u8(q14u16, d7u8);
+
+ *d1ru8 = vbsl_u8(d20u8, d31u8, d1u8);
+
+ q14u16 = vaddw_u8(q14u16, d18u8);
+
+ *d2ru8 = vbsl_u8(d20u8, d23u8, d2u8);
+
+ d22u8 = vqrshrn_n_u16(q14u16, 3);
+
+ q14u16 = vsubw_u8(q14u16, d4u8);
+ q14u16 = vsubw_u8(q14u16, d7u8);
+ q14u16 = vaddw_u8(q14u16, d16u8);
+
+ d3u8 = vbsl_u8(d20u8, d3u8, d21u8);
+
+ q14u16 = vaddw_u8(q14u16, d18u8);
+
+ d4u8 = vbsl_u8(d20u8, d4u8, d26u8);
+
+ d6u8 = vqrshrn_n_u16(q14u16, 3);
+
+ q14u16 = vsubw_u8(q14u16, d5u8);
+ q14u16 = vsubw_u8(q14u16, d16u8);
+ q14u16 = vaddw_u8(q14u16, d17u8);
+ q14u16 = vaddw_u8(q14u16, d18u8);
+
+ d5u8 = vbsl_u8(d20u8, d5u8, d17u8);
+
+ d7u8 = vqrshrn_n_u16(q14u16, 3);
+
+ *d3ru8 = vbsl_u8(d20u8, d22u8, d3u8);
+ *d4ru8 = vbsl_u8(d20u8, d6u8, d4u8);
+ *d5ru8 = vbsl_u8(d20u8, d7u8, d5u8);
+ }
+ return;
+}
+
+void vpx_lpf_horizontal_8_neon(
+ uint8_t *src,
+ int pitch,
+ const uint8_t *blimit,
+ const uint8_t *limit,
+ const uint8_t *thresh) {
+ int i;
+ uint8_t *s, *psrc;
+ uint8x8_t dblimit, dlimit, dthresh;
+ uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8;
+ uint8x8_t d16u8, d17u8, d18u8;
+
+ dblimit = vld1_u8(blimit);
+ dlimit = vld1_u8(limit);
+ dthresh = vld1_u8(thresh);
+
+ psrc = src - (pitch << 2);
+ for (i = 0; i < 1; i++) {
+ s = psrc + i * 8;
+
+ d3u8 = vld1_u8(s);
+ s += pitch;
+ d4u8 = vld1_u8(s);
+ s += pitch;
+ d5u8 = vld1_u8(s);
+ s += pitch;
+ d6u8 = vld1_u8(s);
+ s += pitch;
+ d7u8 = vld1_u8(s);
+ s += pitch;
+ d16u8 = vld1_u8(s);
+ s += pitch;
+ d17u8 = vld1_u8(s);
+ s += pitch;
+ d18u8 = vld1_u8(s);
+
+ mbloop_filter_neon(dblimit, dlimit, dthresh,
+ d3u8, d4u8, d5u8, d6u8, d7u8, d16u8, d17u8, d18u8,
+ &d0u8, &d1u8, &d2u8, &d3u8, &d4u8, &d5u8);
+
+ s -= (pitch * 6);
+ vst1_u8(s, d0u8);
+ s += pitch;
+ vst1_u8(s, d1u8);
+ s += pitch;
+ vst1_u8(s, d2u8);
+ s += pitch;
+ vst1_u8(s, d3u8);
+ s += pitch;
+ vst1_u8(s, d4u8);
+ s += pitch;
+ vst1_u8(s, d5u8);
+ }
+ return;
+}
+
+void vpx_lpf_vertical_8_neon(
+ uint8_t *src,
+ int pitch,
+ const uint8_t *blimit,
+ const uint8_t *limit,
+ const uint8_t *thresh) {
+ int i;
+ uint8_t *s;
+ uint8x8_t dblimit, dlimit, dthresh;
+ uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8;
+ uint8x8_t d16u8, d17u8, d18u8;
+ uint32x2x2_t d2tmp0, d2tmp1, d2tmp2, d2tmp3;
+ uint16x4x2_t d2tmp4, d2tmp5, d2tmp6, d2tmp7;
+ uint8x8x2_t d2tmp8, d2tmp9, d2tmp10, d2tmp11;
+ uint8x8x4_t d4Result;
+ uint8x8x2_t d2Result;
+
+ dblimit = vld1_u8(blimit);
+ dlimit = vld1_u8(limit);
+ dthresh = vld1_u8(thresh);
+
+ for (i = 0; i < 1; i++) {
+ s = src + (i * (pitch << 3)) - 4;
+
+ d3u8 = vld1_u8(s);
+ s += pitch;
+ d4u8 = vld1_u8(s);
+ s += pitch;
+ d5u8 = vld1_u8(s);
+ s += pitch;
+ d6u8 = vld1_u8(s);
+ s += pitch;
+ d7u8 = vld1_u8(s);
+ s += pitch;
+ d16u8 = vld1_u8(s);
+ s += pitch;
+ d17u8 = vld1_u8(s);
+ s += pitch;
+ d18u8 = vld1_u8(s);
+
+ d2tmp0 = vtrn_u32(vreinterpret_u32_u8(d3u8),
+ vreinterpret_u32_u8(d7u8));
+ d2tmp1 = vtrn_u32(vreinterpret_u32_u8(d4u8),
+ vreinterpret_u32_u8(d16u8));
+ d2tmp2 = vtrn_u32(vreinterpret_u32_u8(d5u8),
+ vreinterpret_u32_u8(d17u8));
+ d2tmp3 = vtrn_u32(vreinterpret_u32_u8(d6u8),
+ vreinterpret_u32_u8(d18u8));
+
+ d2tmp4 = vtrn_u16(vreinterpret_u16_u32(d2tmp0.val[0]),
+ vreinterpret_u16_u32(d2tmp2.val[0]));
+ d2tmp5 = vtrn_u16(vreinterpret_u16_u32(d2tmp1.val[0]),
+ vreinterpret_u16_u32(d2tmp3.val[0]));
+ d2tmp6 = vtrn_u16(vreinterpret_u16_u32(d2tmp0.val[1]),
+ vreinterpret_u16_u32(d2tmp2.val[1]));
+ d2tmp7 = vtrn_u16(vreinterpret_u16_u32(d2tmp1.val[1]),
+ vreinterpret_u16_u32(d2tmp3.val[1]));
+
+ d2tmp8 = vtrn_u8(vreinterpret_u8_u16(d2tmp4.val[0]),
+ vreinterpret_u8_u16(d2tmp5.val[0]));
+ d2tmp9 = vtrn_u8(vreinterpret_u8_u16(d2tmp4.val[1]),
+ vreinterpret_u8_u16(d2tmp5.val[1]));
+ d2tmp10 = vtrn_u8(vreinterpret_u8_u16(d2tmp6.val[0]),
+ vreinterpret_u8_u16(d2tmp7.val[0]));
+ d2tmp11 = vtrn_u8(vreinterpret_u8_u16(d2tmp6.val[1]),
+ vreinterpret_u8_u16(d2tmp7.val[1]));
+
+ d3u8 = d2tmp8.val[0];
+ d4u8 = d2tmp8.val[1];
+ d5u8 = d2tmp9.val[0];
+ d6u8 = d2tmp9.val[1];
+ d7u8 = d2tmp10.val[0];
+ d16u8 = d2tmp10.val[1];
+ d17u8 = d2tmp11.val[0];
+ d18u8 = d2tmp11.val[1];
+
+ mbloop_filter_neon(dblimit, dlimit, dthresh,
+ d3u8, d4u8, d5u8, d6u8, d7u8, d16u8, d17u8, d18u8,
+ &d0u8, &d1u8, &d2u8, &d3u8, &d4u8, &d5u8);
+
+ d4Result.val[0] = d0u8;
+ d4Result.val[1] = d1u8;
+ d4Result.val[2] = d2u8;
+ d4Result.val[3] = d3u8;
+
+ d2Result.val[0] = d4u8;
+ d2Result.val[1] = d5u8;
+
+ s = src - 3;
+ vst4_lane_u8(s, d4Result, 0);
+ s += pitch;
+ vst4_lane_u8(s, d4Result, 1);
+ s += pitch;
+ vst4_lane_u8(s, d4Result, 2);
+ s += pitch;
+ vst4_lane_u8(s, d4Result, 3);
+ s += pitch;
+ vst4_lane_u8(s, d4Result, 4);
+ s += pitch;
+ vst4_lane_u8(s, d4Result, 5);
+ s += pitch;
+ vst4_lane_u8(s, d4Result, 6);
+ s += pitch;
+ vst4_lane_u8(s, d4Result, 7);
+
+ s = src + 1;
+ vst2_lane_u8(s, d2Result, 0);
+ s += pitch;
+ vst2_lane_u8(s, d2Result, 1);
+ s += pitch;
+ vst2_lane_u8(s, d2Result, 2);
+ s += pitch;
+ vst2_lane_u8(s, d2Result, 3);
+ s += pitch;
+ vst2_lane_u8(s, d2Result, 4);
+ s += pitch;
+ vst2_lane_u8(s, d2Result, 5);
+ s += pitch;
+ vst2_lane_u8(s, d2Result, 6);
+ s += pitch;
+ vst2_lane_u8(s, d2Result, 7);
+ }
+ return;
+}
diff --git a/thirdparty/libvpx/vpx_dsp/arm/loopfilter_mb_neon.asm b/thirdparty/libvpx/vpx_dsp/arm/loopfilter_mb_neon.asm
new file mode 100644
index 0000000000..d5da7a8409
--- /dev/null
+++ b/thirdparty/libvpx/vpx_dsp/arm/loopfilter_mb_neon.asm
@@ -0,0 +1,635 @@
+;
+; Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+ EXPORT |vpx_lpf_horizontal_edge_8_neon|
+ EXPORT |vpx_lpf_horizontal_edge_16_neon|
+ EXPORT |vpx_lpf_vertical_16_neon|
+ ARM
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+
+; void mb_lpf_horizontal_edge(uint8_t *s, int p,
+; const uint8_t *blimit,
+; const uint8_t *limit,
+; const uint8_t *thresh,
+; int count)
+; r0 uint8_t *s,
+; r1 int p, /* pitch */
+; r2 const uint8_t *blimit,
+; r3 const uint8_t *limit,
+; sp const uint8_t *thresh,
+; r12 int count
+|mb_lpf_horizontal_edge| PROC
+ push {r4-r8, lr}
+ vpush {d8-d15}
+ ldr r4, [sp, #88] ; load thresh
+
+h_count
+ vld1.8 {d16[]}, [r2] ; load *blimit
+ vld1.8 {d17[]}, [r3] ; load *limit
+ vld1.8 {d18[]}, [r4] ; load *thresh
+
+ sub r8, r0, r1, lsl #3 ; move src pointer down by 8 lines
+
+ vld1.u8 {d0}, [r8@64], r1 ; p7
+ vld1.u8 {d1}, [r8@64], r1 ; p6
+ vld1.u8 {d2}, [r8@64], r1 ; p5
+ vld1.u8 {d3}, [r8@64], r1 ; p4
+ vld1.u8 {d4}, [r8@64], r1 ; p3
+ vld1.u8 {d5}, [r8@64], r1 ; p2
+ vld1.u8 {d6}, [r8@64], r1 ; p1
+ vld1.u8 {d7}, [r8@64], r1 ; p0
+ vld1.u8 {d8}, [r8@64], r1 ; q0
+ vld1.u8 {d9}, [r8@64], r1 ; q1
+ vld1.u8 {d10}, [r8@64], r1 ; q2
+ vld1.u8 {d11}, [r8@64], r1 ; q3
+ vld1.u8 {d12}, [r8@64], r1 ; q4
+ vld1.u8 {d13}, [r8@64], r1 ; q5
+ vld1.u8 {d14}, [r8@64], r1 ; q6
+ vld1.u8 {d15}, [r8@64], r1 ; q7
+
+ bl vpx_wide_mbfilter_neon
+
+ tst r7, #1
+ beq h_mbfilter
+
+ ; flat && mask were not set for any of the channels. Just store the values
+ ; from filter.
+ sub r8, r0, r1, lsl #1
+
+ vst1.u8 {d25}, [r8@64], r1 ; store op1
+ vst1.u8 {d24}, [r8@64], r1 ; store op0
+ vst1.u8 {d23}, [r8@64], r1 ; store oq0
+ vst1.u8 {d26}, [r8@64], r1 ; store oq1
+
+ b h_next
+
+h_mbfilter
+ tst r7, #2
+ beq h_wide_mbfilter
+
+ ; flat2 was not set for any of the channels. Just store the values from
+ ; mbfilter.
+ sub r8, r0, r1, lsl #1
+ sub r8, r8, r1
+
+ vst1.u8 {d18}, [r8@64], r1 ; store op2
+ vst1.u8 {d19}, [r8@64], r1 ; store op1
+ vst1.u8 {d20}, [r8@64], r1 ; store op0
+ vst1.u8 {d21}, [r8@64], r1 ; store oq0
+ vst1.u8 {d22}, [r8@64], r1 ; store oq1
+ vst1.u8 {d23}, [r8@64], r1 ; store oq2
+
+ b h_next
+
+h_wide_mbfilter
+ sub r8, r0, r1, lsl #3
+ add r8, r8, r1
+
+ vst1.u8 {d16}, [r8@64], r1 ; store op6
+ vst1.u8 {d24}, [r8@64], r1 ; store op5
+ vst1.u8 {d25}, [r8@64], r1 ; store op4
+ vst1.u8 {d26}, [r8@64], r1 ; store op3
+ vst1.u8 {d27}, [r8@64], r1 ; store op2
+ vst1.u8 {d18}, [r8@64], r1 ; store op1
+ vst1.u8 {d19}, [r8@64], r1 ; store op0
+ vst1.u8 {d20}, [r8@64], r1 ; store oq0
+ vst1.u8 {d21}, [r8@64], r1 ; store oq1
+ vst1.u8 {d22}, [r8@64], r1 ; store oq2
+ vst1.u8 {d23}, [r8@64], r1 ; store oq3
+ vst1.u8 {d1}, [r8@64], r1 ; store oq4
+ vst1.u8 {d2}, [r8@64], r1 ; store oq5
+ vst1.u8 {d3}, [r8@64], r1 ; store oq6
+
+h_next
+ add r0, r0, #8
+ subs r12, r12, #1
+ bne h_count
+
+ vpop {d8-d15}
+ pop {r4-r8, pc}
+
+ ENDP ; |mb_lpf_horizontal_edge|
+
+; void vpx_lpf_horizontal_edge_8_neon(uint8_t *s, int pitch,
+; const uint8_t *blimit,
+; const uint8_t *limit,
+; const uint8_t *thresh)
+; r0 uint8_t *s,
+; r1 int pitch,
+; r2 const uint8_t *blimit,
+; r3 const uint8_t *limit,
+; sp const uint8_t *thresh
+|vpx_lpf_horizontal_edge_8_neon| PROC
+ mov r12, #1
+ b mb_lpf_horizontal_edge
+ ENDP ; |vpx_lpf_horizontal_edge_8_neon|
+
+; void vpx_lpf_horizontal_edge_16_neon(uint8_t *s, int pitch,
+; const uint8_t *blimit,
+; const uint8_t *limit,
+; const uint8_t *thresh)
+; r0 uint8_t *s,
+; r1 int pitch,
+; r2 const uint8_t *blimit,
+; r3 const uint8_t *limit,
+; sp const uint8_t *thresh
+|vpx_lpf_horizontal_edge_16_neon| PROC
+ mov r12, #2
+ b mb_lpf_horizontal_edge
+ ENDP ; |vpx_lpf_horizontal_edge_16_neon|
+
+; void vpx_lpf_vertical_16_neon(uint8_t *s, int p,
+; const uint8_t *blimit,
+; const uint8_t *limit,
+; const uint8_t *thresh)
+; r0 uint8_t *s,
+; r1 int p, /* pitch */
+; r2 const uint8_t *blimit,
+; r3 const uint8_t *limit,
+; sp const uint8_t *thresh,
+|vpx_lpf_vertical_16_neon| PROC
+ push {r4-r8, lr}
+ vpush {d8-d15}
+ ldr r4, [sp, #88] ; load thresh
+
+ vld1.8 {d16[]}, [r2] ; load *blimit
+ vld1.8 {d17[]}, [r3] ; load *limit
+ vld1.8 {d18[]}, [r4] ; load *thresh
+
+ sub r8, r0, #8
+
+ vld1.8 {d0}, [r8@64], r1
+ vld1.8 {d8}, [r0@64], r1
+ vld1.8 {d1}, [r8@64], r1
+ vld1.8 {d9}, [r0@64], r1
+ vld1.8 {d2}, [r8@64], r1
+ vld1.8 {d10}, [r0@64], r1
+ vld1.8 {d3}, [r8@64], r1
+ vld1.8 {d11}, [r0@64], r1
+ vld1.8 {d4}, [r8@64], r1
+ vld1.8 {d12}, [r0@64], r1
+ vld1.8 {d5}, [r8@64], r1
+ vld1.8 {d13}, [r0@64], r1
+ vld1.8 {d6}, [r8@64], r1
+ vld1.8 {d14}, [r0@64], r1
+ vld1.8 {d7}, [r8@64], r1
+ vld1.8 {d15}, [r0@64], r1
+
+ sub r0, r0, r1, lsl #3
+
+ vtrn.32 q0, q2
+ vtrn.32 q1, q3
+ vtrn.32 q4, q6
+ vtrn.32 q5, q7
+
+ vtrn.16 q0, q1
+ vtrn.16 q2, q3
+ vtrn.16 q4, q5
+ vtrn.16 q6, q7
+
+ vtrn.8 d0, d1
+ vtrn.8 d2, d3
+ vtrn.8 d4, d5
+ vtrn.8 d6, d7
+
+ vtrn.8 d8, d9
+ vtrn.8 d10, d11
+ vtrn.8 d12, d13
+ vtrn.8 d14, d15
+
+ bl vpx_wide_mbfilter_neon
+
+ tst r7, #1
+ beq v_mbfilter
+
+ ; flat && mask were not set for any of the channels. Just store the values
+ ; from filter.
+ sub r8, r0, #2
+
+ vswp d23, d25
+
+ vst4.8 {d23[0], d24[0], d25[0], d26[0]}, [r8], r1
+ vst4.8 {d23[1], d24[1], d25[1], d26[1]}, [r8], r1
+ vst4.8 {d23[2], d24[2], d25[2], d26[2]}, [r8], r1
+ vst4.8 {d23[3], d24[3], d25[3], d26[3]}, [r8], r1
+ vst4.8 {d23[4], d24[4], d25[4], d26[4]}, [r8], r1
+ vst4.8 {d23[5], d24[5], d25[5], d26[5]}, [r8], r1
+ vst4.8 {d23[6], d24[6], d25[6], d26[6]}, [r8], r1
+ vst4.8 {d23[7], d24[7], d25[7], d26[7]}, [r8], r1
+
+ b v_end
+
+v_mbfilter
+ tst r7, #2
+ beq v_wide_mbfilter
+
+ ; flat2 was not set for any of the channels. Just store the values from
+ ; mbfilter.
+ sub r8, r0, #3
+
+ vst3.8 {d18[0], d19[0], d20[0]}, [r8], r1
+ vst3.8 {d21[0], d22[0], d23[0]}, [r0], r1
+ vst3.8 {d18[1], d19[1], d20[1]}, [r8], r1
+ vst3.8 {d21[1], d22[1], d23[1]}, [r0], r1
+ vst3.8 {d18[2], d19[2], d20[2]}, [r8], r1
+ vst3.8 {d21[2], d22[2], d23[2]}, [r0], r1
+ vst3.8 {d18[3], d19[3], d20[3]}, [r8], r1
+ vst3.8 {d21[3], d22[3], d23[3]}, [r0], r1
+ vst3.8 {d18[4], d19[4], d20[4]}, [r8], r1
+ vst3.8 {d21[4], d22[4], d23[4]}, [r0], r1
+ vst3.8 {d18[5], d19[5], d20[5]}, [r8], r1
+ vst3.8 {d21[5], d22[5], d23[5]}, [r0], r1
+ vst3.8 {d18[6], d19[6], d20[6]}, [r8], r1
+ vst3.8 {d21[6], d22[6], d23[6]}, [r0], r1
+ vst3.8 {d18[7], d19[7], d20[7]}, [r8], r1
+ vst3.8 {d21[7], d22[7], d23[7]}, [r0], r1
+
+ b v_end
+
+v_wide_mbfilter
+ sub r8, r0, #8
+
+ vtrn.32 d0, d26
+ vtrn.32 d16, d27
+ vtrn.32 d24, d18
+ vtrn.32 d25, d19
+
+ vtrn.16 d0, d24
+ vtrn.16 d16, d25
+ vtrn.16 d26, d18
+ vtrn.16 d27, d19
+
+ vtrn.8 d0, d16
+ vtrn.8 d24, d25
+ vtrn.8 d26, d27
+ vtrn.8 d18, d19
+
+ vtrn.32 d20, d1
+ vtrn.32 d21, d2
+ vtrn.32 d22, d3
+ vtrn.32 d23, d15
+
+ vtrn.16 d20, d22
+ vtrn.16 d21, d23
+ vtrn.16 d1, d3
+ vtrn.16 d2, d15
+
+ vtrn.8 d20, d21
+ vtrn.8 d22, d23
+ vtrn.8 d1, d2
+ vtrn.8 d3, d15
+
+ vst1.8 {d0}, [r8@64], r1
+ vst1.8 {d20}, [r0@64], r1
+ vst1.8 {d16}, [r8@64], r1
+ vst1.8 {d21}, [r0@64], r1
+ vst1.8 {d24}, [r8@64], r1
+ vst1.8 {d22}, [r0@64], r1
+ vst1.8 {d25}, [r8@64], r1
+ vst1.8 {d23}, [r0@64], r1
+ vst1.8 {d26}, [r8@64], r1
+ vst1.8 {d1}, [r0@64], r1
+ vst1.8 {d27}, [r8@64], r1
+ vst1.8 {d2}, [r0@64], r1
+ vst1.8 {d18}, [r8@64], r1
+ vst1.8 {d3}, [r0@64], r1
+ vst1.8 {d19}, [r8@64], r1
+ vst1.8 {d15}, [r0@64], r1
+
+v_end
+ vpop {d8-d15}
+ pop {r4-r8, pc}
+
+ ENDP ; |vpx_lpf_vertical_16_neon|
+
+; void vpx_wide_mbfilter_neon();
+; This is a helper function for the loopfilters. The invidual functions do the
+; necessary load, transpose (if necessary) and store.
+;
+; r0-r3 PRESERVE
+; d16 blimit
+; d17 limit
+; d18 thresh
+; d0 p7
+; d1 p6
+; d2 p5
+; d3 p4
+; d4 p3
+; d5 p2
+; d6 p1
+; d7 p0
+; d8 q0
+; d9 q1
+; d10 q2
+; d11 q3
+; d12 q4
+; d13 q5
+; d14 q6
+; d15 q7
+|vpx_wide_mbfilter_neon| PROC
+ mov r7, #0
+
+ ; filter_mask
+ vabd.u8 d19, d4, d5 ; abs(p3 - p2)
+ vabd.u8 d20, d5, d6 ; abs(p2 - p1)
+ vabd.u8 d21, d6, d7 ; abs(p1 - p0)
+ vabd.u8 d22, d9, d8 ; abs(q1 - q0)
+ vabd.u8 d23, d10, d9 ; abs(q2 - q1)
+ vabd.u8 d24, d11, d10 ; abs(q3 - q2)
+
+ ; only compare the largest value to limit
+ vmax.u8 d19, d19, d20 ; max(abs(p3 - p2), abs(p2 - p1))
+ vmax.u8 d20, d21, d22 ; max(abs(p1 - p0), abs(q1 - q0))
+ vmax.u8 d23, d23, d24 ; max(abs(q2 - q1), abs(q3 - q2))
+ vmax.u8 d19, d19, d20
+
+ vabd.u8 d24, d7, d8 ; abs(p0 - q0)
+
+ vmax.u8 d19, d19, d23
+
+ vabd.u8 d23, d6, d9 ; a = abs(p1 - q1)
+ vqadd.u8 d24, d24, d24 ; b = abs(p0 - q0) * 2
+
+ ; abs () > limit
+ vcge.u8 d19, d17, d19
+
+ ; flatmask4
+ vabd.u8 d25, d7, d5 ; abs(p0 - p2)
+ vabd.u8 d26, d8, d10 ; abs(q0 - q2)
+ vabd.u8 d27, d4, d7 ; abs(p3 - p0)
+ vabd.u8 d28, d11, d8 ; abs(q3 - q0)
+
+ ; only compare the largest value to thresh
+ vmax.u8 d25, d25, d26 ; max(abs(p0 - p2), abs(q0 - q2))
+ vmax.u8 d26, d27, d28 ; max(abs(p3 - p0), abs(q3 - q0))
+ vmax.u8 d25, d25, d26
+ vmax.u8 d20, d20, d25
+
+ vshr.u8 d23, d23, #1 ; a = a / 2
+ vqadd.u8 d24, d24, d23 ; a = b + a
+
+ vmov.u8 d30, #1
+ vcge.u8 d24, d16, d24 ; (a > blimit * 2 + limit) * -1
+
+ vcge.u8 d20, d30, d20 ; flat
+
+ vand d19, d19, d24 ; mask
+
+ ; hevmask
+ vcgt.u8 d21, d21, d18 ; (abs(p1 - p0) > thresh)*-1
+ vcgt.u8 d22, d22, d18 ; (abs(q1 - q0) > thresh)*-1
+ vorr d21, d21, d22 ; hev
+
+ vand d16, d20, d19 ; flat && mask
+ vmov r5, r6, d16
+
+ ; flatmask5(1, p7, p6, p5, p4, p0, q0, q4, q5, q6, q7)
+ vabd.u8 d22, d3, d7 ; abs(p4 - p0)
+ vabd.u8 d23, d12, d8 ; abs(q4 - q0)
+ vabd.u8 d24, d7, d2 ; abs(p0 - p5)
+ vabd.u8 d25, d8, d13 ; abs(q0 - q5)
+ vabd.u8 d26, d1, d7 ; abs(p6 - p0)
+ vabd.u8 d27, d14, d8 ; abs(q6 - q0)
+ vabd.u8 d28, d0, d7 ; abs(p7 - p0)
+ vabd.u8 d29, d15, d8 ; abs(q7 - q0)
+
+ ; only compare the largest value to thresh
+ vmax.u8 d22, d22, d23 ; max(abs(p4 - p0), abs(q4 - q0))
+ vmax.u8 d23, d24, d25 ; max(abs(p0 - p5), abs(q0 - q5))
+ vmax.u8 d24, d26, d27 ; max(abs(p6 - p0), abs(q6 - q0))
+ vmax.u8 d25, d28, d29 ; max(abs(p7 - p0), abs(q7 - q0))
+
+ vmax.u8 d26, d22, d23
+ vmax.u8 d27, d24, d25
+ vmax.u8 d23, d26, d27
+
+ vcge.u8 d18, d30, d23 ; flat2
+
+ vmov.u8 d22, #0x80
+
+ orrs r5, r5, r6 ; Check for 0
+ orreq r7, r7, #1 ; Only do filter branch
+
+ vand d17, d18, d16 ; flat2 && flat && mask
+ vmov r5, r6, d17
+
+ ; mbfilter() function
+
+ ; filter() function
+ ; convert to signed
+ veor d23, d8, d22 ; qs0
+ veor d24, d7, d22 ; ps0
+ veor d25, d6, d22 ; ps1
+ veor d26, d9, d22 ; qs1
+
+ vmov.u8 d27, #3
+
+ vsub.s8 d28, d23, d24 ; ( qs0 - ps0)
+ vqsub.s8 d29, d25, d26 ; filter = clamp(ps1-qs1)
+ vmull.s8 q15, d28, d27 ; 3 * ( qs0 - ps0)
+ vand d29, d29, d21 ; filter &= hev
+ vaddw.s8 q15, q15, d29 ; filter + 3 * (qs0 - ps0)
+ vmov.u8 d29, #4
+
+ ; filter = clamp(filter + 3 * ( qs0 - ps0))
+ vqmovn.s16 d28, q15
+
+ vand d28, d28, d19 ; filter &= mask
+
+ vqadd.s8 d30, d28, d27 ; filter2 = clamp(filter+3)
+ vqadd.s8 d29, d28, d29 ; filter1 = clamp(filter+4)
+ vshr.s8 d30, d30, #3 ; filter2 >>= 3
+ vshr.s8 d29, d29, #3 ; filter1 >>= 3
+
+
+ vqadd.s8 d24, d24, d30 ; op0 = clamp(ps0 + filter2)
+ vqsub.s8 d23, d23, d29 ; oq0 = clamp(qs0 - filter1)
+
+ ; outer tap adjustments: ++filter1 >> 1
+ vrshr.s8 d29, d29, #1
+ vbic d29, d29, d21 ; filter &= ~hev
+
+ vqadd.s8 d25, d25, d29 ; op1 = clamp(ps1 + filter)
+ vqsub.s8 d26, d26, d29 ; oq1 = clamp(qs1 - filter)
+
+ veor d24, d24, d22 ; *f_op0 = u^0x80
+ veor d23, d23, d22 ; *f_oq0 = u^0x80
+ veor d25, d25, d22 ; *f_op1 = u^0x80
+ veor d26, d26, d22 ; *f_oq1 = u^0x80
+
+ tst r7, #1
+ bxne lr
+
+ orrs r5, r5, r6 ; Check for 0
+ orreq r7, r7, #2 ; Only do mbfilter branch
+
+ ; mbfilter flat && mask branch
+ ; TODO(fgalligan): Can I decrease the cycles shifting to consective d's
+ ; and using vibt on the q's?
+ vmov.u8 d29, #2
+ vaddl.u8 q15, d7, d8 ; op2 = p0 + q0
+ vmlal.u8 q15, d4, d27 ; op2 = p0 + q0 + p3 * 3
+ vmlal.u8 q15, d5, d29 ; op2 = p0 + q0 + p3 * 3 + p2 * 2
+ vaddl.u8 q10, d4, d5
+ vaddw.u8 q15, d6 ; op2=p1 + p0 + q0 + p3 * 3 + p2 *2
+ vaddl.u8 q14, d6, d9
+ vqrshrn.u16 d18, q15, #3 ; r_op2
+
+ vsub.i16 q15, q10
+ vaddl.u8 q10, d4, d6
+ vadd.i16 q15, q14
+ vaddl.u8 q14, d7, d10
+ vqrshrn.u16 d19, q15, #3 ; r_op1
+
+ vsub.i16 q15, q10
+ vadd.i16 q15, q14
+ vaddl.u8 q14, d8, d11
+ vqrshrn.u16 d20, q15, #3 ; r_op0
+
+ vsubw.u8 q15, d4 ; oq0 = op0 - p3
+ vsubw.u8 q15, d7 ; oq0 -= p0
+ vadd.i16 q15, q14
+ vaddl.u8 q14, d9, d11
+ vqrshrn.u16 d21, q15, #3 ; r_oq0
+
+ vsubw.u8 q15, d5 ; oq1 = oq0 - p2
+ vsubw.u8 q15, d8 ; oq1 -= q0
+ vadd.i16 q15, q14
+ vaddl.u8 q14, d10, d11
+ vqrshrn.u16 d22, q15, #3 ; r_oq1
+
+ vsubw.u8 q15, d6 ; oq2 = oq0 - p1
+ vsubw.u8 q15, d9 ; oq2 -= q1
+ vadd.i16 q15, q14
+ vqrshrn.u16 d27, q15, #3 ; r_oq2
+
+ ; Filter does not set op2 or oq2, so use p2 and q2.
+ vbif d18, d5, d16 ; t_op2 |= p2 & ~(flat & mask)
+ vbif d19, d25, d16 ; t_op1 |= f_op1 & ~(flat & mask)
+ vbif d20, d24, d16 ; t_op0 |= f_op0 & ~(flat & mask)
+ vbif d21, d23, d16 ; t_oq0 |= f_oq0 & ~(flat & mask)
+ vbif d22, d26, d16 ; t_oq1 |= f_oq1 & ~(flat & mask)
+
+ vbit d23, d27, d16 ; t_oq2 |= r_oq2 & (flat & mask)
+ vbif d23, d10, d16 ; t_oq2 |= q2 & ~(flat & mask)
+
+ tst r7, #2
+ bxne lr
+
+ ; wide_mbfilter flat2 && flat && mask branch
+ vmov.u8 d16, #7
+ vaddl.u8 q15, d7, d8 ; op6 = p0 + q0
+ vaddl.u8 q12, d2, d3
+ vaddl.u8 q13, d4, d5
+ vaddl.u8 q14, d1, d6
+ vmlal.u8 q15, d0, d16 ; op6 += p7 * 3
+ vadd.i16 q12, q13
+ vadd.i16 q15, q14
+ vaddl.u8 q14, d2, d9
+ vadd.i16 q15, q12
+ vaddl.u8 q12, d0, d1
+ vaddw.u8 q15, d1
+ vaddl.u8 q13, d0, d2
+ vadd.i16 q14, q15, q14
+ vqrshrn.u16 d16, q15, #4 ; w_op6
+
+ vsub.i16 q15, q14, q12
+ vaddl.u8 q14, d3, d10
+ vqrshrn.u16 d24, q15, #4 ; w_op5
+
+ vsub.i16 q15, q13
+ vaddl.u8 q13, d0, d3
+ vadd.i16 q15, q14
+ vaddl.u8 q14, d4, d11
+ vqrshrn.u16 d25, q15, #4 ; w_op4
+
+ vadd.i16 q15, q14
+ vaddl.u8 q14, d0, d4
+ vsub.i16 q15, q13
+ vsub.i16 q14, q15, q14
+ vqrshrn.u16 d26, q15, #4 ; w_op3
+
+ vaddw.u8 q15, q14, d5 ; op2 += p2
+ vaddl.u8 q14, d0, d5
+ vaddw.u8 q15, d12 ; op2 += q4
+ vbif d26, d4, d17 ; op3 |= p3 & ~(f2 & f & m)
+ vqrshrn.u16 d27, q15, #4 ; w_op2
+
+ vsub.i16 q15, q14
+ vaddl.u8 q14, d0, d6
+ vaddw.u8 q15, d6 ; op1 += p1
+ vaddw.u8 q15, d13 ; op1 += q5
+ vbif d27, d18, d17 ; op2 |= t_op2 & ~(f2 & f & m)
+ vqrshrn.u16 d18, q15, #4 ; w_op1
+
+ vsub.i16 q15, q14
+ vaddl.u8 q14, d0, d7
+ vaddw.u8 q15, d7 ; op0 += p0
+ vaddw.u8 q15, d14 ; op0 += q6
+ vbif d18, d19, d17 ; op1 |= t_op1 & ~(f2 & f & m)
+ vqrshrn.u16 d19, q15, #4 ; w_op0
+
+ vsub.i16 q15, q14
+ vaddl.u8 q14, d1, d8
+ vaddw.u8 q15, d8 ; oq0 += q0
+ vaddw.u8 q15, d15 ; oq0 += q7
+ vbif d19, d20, d17 ; op0 |= t_op0 & ~(f2 & f & m)
+ vqrshrn.u16 d20, q15, #4 ; w_oq0
+
+ vsub.i16 q15, q14
+ vaddl.u8 q14, d2, d9
+ vaddw.u8 q15, d9 ; oq1 += q1
+ vaddl.u8 q4, d10, d15
+ vaddw.u8 q15, d15 ; oq1 += q7
+ vbif d20, d21, d17 ; oq0 |= t_oq0 & ~(f2 & f & m)
+ vqrshrn.u16 d21, q15, #4 ; w_oq1
+
+ vsub.i16 q15, q14
+ vaddl.u8 q14, d3, d10
+ vadd.i16 q15, q4
+ vaddl.u8 q4, d11, d15
+ vbif d21, d22, d17 ; oq1 |= t_oq1 & ~(f2 & f & m)
+ vqrshrn.u16 d22, q15, #4 ; w_oq2
+
+ vsub.i16 q15, q14
+ vaddl.u8 q14, d4, d11
+ vadd.i16 q15, q4
+ vaddl.u8 q4, d12, d15
+ vbif d22, d23, d17 ; oq2 |= t_oq2 & ~(f2 & f & m)
+ vqrshrn.u16 d23, q15, #4 ; w_oq3
+
+ vsub.i16 q15, q14
+ vaddl.u8 q14, d5, d12
+ vadd.i16 q15, q4
+ vaddl.u8 q4, d13, d15
+ vbif d16, d1, d17 ; op6 |= p6 & ~(f2 & f & m)
+ vqrshrn.u16 d1, q15, #4 ; w_oq4
+
+ vsub.i16 q15, q14
+ vaddl.u8 q14, d6, d13
+ vadd.i16 q15, q4
+ vaddl.u8 q4, d14, d15
+ vbif d24, d2, d17 ; op5 |= p5 & ~(f2 & f & m)
+ vqrshrn.u16 d2, q15, #4 ; w_oq5
+
+ vsub.i16 q15, q14
+ vbif d25, d3, d17 ; op4 |= p4 & ~(f2 & f & m)
+ vadd.i16 q15, q4
+ vbif d23, d11, d17 ; oq3 |= q3 & ~(f2 & f & m)
+ vqrshrn.u16 d3, q15, #4 ; w_oq6
+ vbif d1, d12, d17 ; oq4 |= q4 & ~(f2 & f & m)
+ vbif d2, d13, d17 ; oq5 |= q5 & ~(f2 & f & m)
+ vbif d3, d14, d17 ; oq6 |= q6 & ~(f2 & f & m)
+
+ bx lr
+ ENDP ; |vpx_wide_mbfilter_neon|
+
+ END
diff --git a/thirdparty/libvpx/vpx_dsp/arm/loopfilter_neon.c b/thirdparty/libvpx/vpx_dsp/arm/loopfilter_neon.c
new file mode 100644
index 0000000000..aa31f29358
--- /dev/null
+++ b/thirdparty/libvpx/vpx_dsp/arm/loopfilter_neon.c
@@ -0,0 +1,58 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "./vpx_config.h"
+#include "vpx/vpx_integer.h"
+
+void vpx_lpf_vertical_4_dual_neon(uint8_t *s, int p,
+ const uint8_t *blimit0,
+ const uint8_t *limit0,
+ const uint8_t *thresh0,
+ const uint8_t *blimit1,
+ const uint8_t *limit1,
+ const uint8_t *thresh1) {
+ vpx_lpf_vertical_4_neon(s, p, blimit0, limit0, thresh0);
+ vpx_lpf_vertical_4_neon(s + 8 * p, p, blimit1, limit1, thresh1);
+}
+
+#if HAVE_NEON_ASM
+void vpx_lpf_horizontal_8_dual_neon(uint8_t *s, int p /* pitch */,
+ const uint8_t *blimit0,
+ const uint8_t *limit0,
+ const uint8_t *thresh0,
+ const uint8_t *blimit1,
+ const uint8_t *limit1,
+ const uint8_t *thresh1) {
+ vpx_lpf_horizontal_8_neon(s, p, blimit0, limit0, thresh0);
+ vpx_lpf_horizontal_8_neon(s + 8, p, blimit1, limit1, thresh1);
+}
+
+void vpx_lpf_vertical_8_dual_neon(uint8_t *s, int p,
+ const uint8_t *blimit0,
+ const uint8_t *limit0,
+ const uint8_t *thresh0,
+ const uint8_t *blimit1,
+ const uint8_t *limit1,
+ const uint8_t *thresh1) {
+ vpx_lpf_vertical_8_neon(s, p, blimit0, limit0, thresh0);
+ vpx_lpf_vertical_8_neon(s + 8 * p, p, blimit1, limit1, thresh1);
+}
+
+void vpx_lpf_vertical_16_dual_neon(uint8_t *s, int p,
+ const uint8_t *blimit,
+ const uint8_t *limit,
+ const uint8_t *thresh) {
+ vpx_lpf_vertical_16_neon(s, p, blimit, limit, thresh);
+ vpx_lpf_vertical_16_neon(s + 8 * p, p, blimit, limit, thresh);
+}
+#endif // HAVE_NEON_ASM
diff --git a/thirdparty/libvpx/vpx_dsp/arm/save_reg_neon.asm b/thirdparty/libvpx/vpx_dsp/arm/save_reg_neon.asm
new file mode 100644
index 0000000000..c9ca10801d
--- /dev/null
+++ b/thirdparty/libvpx/vpx_dsp/arm/save_reg_neon.asm
@@ -0,0 +1,36 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ EXPORT |vpx_push_neon|
+ EXPORT |vpx_pop_neon|
+
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+
+|vpx_push_neon| PROC
+ vst1.i64 {d8, d9, d10, d11}, [r0]!
+ vst1.i64 {d12, d13, d14, d15}, [r0]!
+ bx lr
+
+ ENDP
+
+|vpx_pop_neon| PROC
+ vld1.i64 {d8, d9, d10, d11}, [r0]!
+ vld1.i64 {d12, d13, d14, d15}, [r0]!
+ bx lr
+
+ ENDP
+
+ END
+
diff --git a/thirdparty/libvpx/vpx_dsp/arm/vpx_convolve8_avg_neon.c b/thirdparty/libvpx/vpx_dsp/arm/vpx_convolve8_avg_neon.c
new file mode 100644
index 0000000000..8632250138
--- /dev/null
+++ b/thirdparty/libvpx/vpx_dsp/arm/vpx_convolve8_avg_neon.c
@@ -0,0 +1,373 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_ports/mem.h"
+
+static INLINE int32x4_t MULTIPLY_BY_Q0(
+ int16x4_t dsrc0,
+ int16x4_t dsrc1,
+ int16x4_t dsrc2,
+ int16x4_t dsrc3,
+ int16x4_t dsrc4,
+ int16x4_t dsrc5,
+ int16x4_t dsrc6,
+ int16x4_t dsrc7,
+ int16x8_t q0s16) {
+ int32x4_t qdst;
+ int16x4_t d0s16, d1s16;
+
+ d0s16 = vget_low_s16(q0s16);
+ d1s16 = vget_high_s16(q0s16);
+
+ qdst = vmull_lane_s16(dsrc0, d0s16, 0);
+ qdst = vmlal_lane_s16(qdst, dsrc1, d0s16, 1);
+ qdst = vmlal_lane_s16(qdst, dsrc2, d0s16, 2);
+ qdst = vmlal_lane_s16(qdst, dsrc3, d0s16, 3);
+ qdst = vmlal_lane_s16(qdst, dsrc4, d1s16, 0);
+ qdst = vmlal_lane_s16(qdst, dsrc5, d1s16, 1);
+ qdst = vmlal_lane_s16(qdst, dsrc6, d1s16, 2);
+ qdst = vmlal_lane_s16(qdst, dsrc7, d1s16, 3);
+ return qdst;
+}
+
+void vpx_convolve8_avg_horiz_neon(
+ const uint8_t *src,
+ ptrdiff_t src_stride,
+ uint8_t *dst,
+ ptrdiff_t dst_stride,
+ const int16_t *filter_x,
+ int x_step_q4,
+ const int16_t *filter_y, // unused
+ int y_step_q4, // unused
+ int w,
+ int h) {
+ int width;
+ const uint8_t *s;
+ uint8_t *d;
+ uint8x8_t d2u8, d3u8, d24u8, d25u8, d26u8, d27u8, d28u8, d29u8;
+ uint32x2_t d2u32, d3u32, d6u32, d7u32, d28u32, d29u32, d30u32, d31u32;
+ uint8x16_t q1u8, q3u8, q12u8, q13u8, q14u8, q15u8;
+ int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d22s16, d23s16;
+ int16x4_t d24s16, d25s16, d26s16, d27s16;
+ uint16x4_t d2u16, d3u16, d4u16, d5u16, d16u16, d17u16, d18u16, d19u16;
+ int16x8_t q0s16;
+ uint16x8_t q1u16, q2u16, q8u16, q9u16, q10u16, q11u16, q12u16, q13u16;
+ int32x4_t q1s32, q2s32, q14s32, q15s32;
+ uint16x8x2_t q0x2u16;
+ uint8x8x2_t d0x2u8, d1x2u8;
+ uint32x2x2_t d0x2u32;
+ uint16x4x2_t d0x2u16, d1x2u16;
+ uint32x4x2_t q0x2u32;
+
+ assert(x_step_q4 == 16);
+
+ q0s16 = vld1q_s16(filter_x);
+
+ src -= 3; // adjust for taps
+ for (; h > 0; h -= 4) { // loop_horiz_v
+ s = src;
+ d24u8 = vld1_u8(s);
+ s += src_stride;
+ d25u8 = vld1_u8(s);
+ s += src_stride;
+ d26u8 = vld1_u8(s);
+ s += src_stride;
+ d27u8 = vld1_u8(s);
+
+ q12u8 = vcombine_u8(d24u8, d25u8);
+ q13u8 = vcombine_u8(d26u8, d27u8);
+
+ q0x2u16 = vtrnq_u16(vreinterpretq_u16_u8(q12u8),
+ vreinterpretq_u16_u8(q13u8));
+ d24u8 = vreinterpret_u8_u16(vget_low_u16(q0x2u16.val[0]));
+ d25u8 = vreinterpret_u8_u16(vget_high_u16(q0x2u16.val[0]));
+ d26u8 = vreinterpret_u8_u16(vget_low_u16(q0x2u16.val[1]));
+ d27u8 = vreinterpret_u8_u16(vget_high_u16(q0x2u16.val[1]));
+ d0x2u8 = vtrn_u8(d24u8, d25u8);
+ d1x2u8 = vtrn_u8(d26u8, d27u8);
+
+ __builtin_prefetch(src + src_stride * 4);
+ __builtin_prefetch(src + src_stride * 5);
+
+ q8u16 = vmovl_u8(d0x2u8.val[0]);
+ q9u16 = vmovl_u8(d0x2u8.val[1]);
+ q10u16 = vmovl_u8(d1x2u8.val[0]);
+ q11u16 = vmovl_u8(d1x2u8.val[1]);
+
+ src += 7;
+ d16u16 = vget_low_u16(q8u16);
+ d17u16 = vget_high_u16(q8u16);
+ d18u16 = vget_low_u16(q9u16);
+ d19u16 = vget_high_u16(q9u16);
+ q8u16 = vcombine_u16(d16u16, d18u16); // vswp 17 18
+ q9u16 = vcombine_u16(d17u16, d19u16);
+
+ d20s16 = vreinterpret_s16_u16(vget_low_u16(q10u16));
+ d23s16 = vreinterpret_s16_u16(vget_high_u16(q10u16)); // vmov 23 21
+ for (width = w;
+ width > 0;
+ width -= 4, src += 4, dst += 4) { // loop_horiz
+ s = src;
+ d28u32 = vld1_dup_u32((const uint32_t *)s);
+ s += src_stride;
+ d29u32 = vld1_dup_u32((const uint32_t *)s);
+ s += src_stride;
+ d31u32 = vld1_dup_u32((const uint32_t *)s);
+ s += src_stride;
+ d30u32 = vld1_dup_u32((const uint32_t *)s);
+
+ __builtin_prefetch(src + 64);
+
+ d0x2u16 = vtrn_u16(vreinterpret_u16_u32(d28u32),
+ vreinterpret_u16_u32(d31u32));
+ d1x2u16 = vtrn_u16(vreinterpret_u16_u32(d29u32),
+ vreinterpret_u16_u32(d30u32));
+ d0x2u8 = vtrn_u8(vreinterpret_u8_u16(d0x2u16.val[0]), // d28
+ vreinterpret_u8_u16(d1x2u16.val[0])); // d29
+ d1x2u8 = vtrn_u8(vreinterpret_u8_u16(d0x2u16.val[1]), // d31
+ vreinterpret_u8_u16(d1x2u16.val[1])); // d30
+
+ __builtin_prefetch(src + 64 + src_stride);
+
+ q14u8 = vcombine_u8(d0x2u8.val[0], d0x2u8.val[1]);
+ q15u8 = vcombine_u8(d1x2u8.val[1], d1x2u8.val[0]);
+ q0x2u32 = vtrnq_u32(vreinterpretq_u32_u8(q14u8),
+ vreinterpretq_u32_u8(q15u8));
+
+ d28u8 = vreinterpret_u8_u32(vget_low_u32(q0x2u32.val[0]));
+ d29u8 = vreinterpret_u8_u32(vget_high_u32(q0x2u32.val[0]));
+ q12u16 = vmovl_u8(d28u8);
+ q13u16 = vmovl_u8(d29u8);
+
+ __builtin_prefetch(src + 64 + src_stride * 2);
+
+ d = dst;
+ d6u32 = vld1_lane_u32((const uint32_t *)d, d6u32, 0);
+ d += dst_stride;
+ d7u32 = vld1_lane_u32((const uint32_t *)d, d7u32, 0);
+ d += dst_stride;
+ d6u32 = vld1_lane_u32((const uint32_t *)d, d6u32, 1);
+ d += dst_stride;
+ d7u32 = vld1_lane_u32((const uint32_t *)d, d7u32, 1);
+
+ d16s16 = vreinterpret_s16_u16(vget_low_u16(q8u16));
+ d17s16 = vreinterpret_s16_u16(vget_high_u16(q8u16));
+ d18s16 = vreinterpret_s16_u16(vget_low_u16(q9u16));
+ d19s16 = vreinterpret_s16_u16(vget_high_u16(q9u16));
+ d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
+ d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
+ d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
+ d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
+ d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));
+
+ q1s32 = MULTIPLY_BY_Q0(d16s16, d17s16, d20s16, d22s16,
+ d18s16, d19s16, d23s16, d24s16, q0s16);
+ q2s32 = MULTIPLY_BY_Q0(d17s16, d20s16, d22s16, d18s16,
+ d19s16, d23s16, d24s16, d26s16, q0s16);
+ q14s32 = MULTIPLY_BY_Q0(d20s16, d22s16, d18s16, d19s16,
+ d23s16, d24s16, d26s16, d27s16, q0s16);
+ q15s32 = MULTIPLY_BY_Q0(d22s16, d18s16, d19s16, d23s16,
+ d24s16, d26s16, d27s16, d25s16, q0s16);
+
+ __builtin_prefetch(src + 64 + src_stride * 3);
+
+ d2u16 = vqrshrun_n_s32(q1s32, 7);
+ d3u16 = vqrshrun_n_s32(q2s32, 7);
+ d4u16 = vqrshrun_n_s32(q14s32, 7);
+ d5u16 = vqrshrun_n_s32(q15s32, 7);
+
+ q1u16 = vcombine_u16(d2u16, d3u16);
+ q2u16 = vcombine_u16(d4u16, d5u16);
+
+ d2u8 = vqmovn_u16(q1u16);
+ d3u8 = vqmovn_u16(q2u16);
+
+ d0x2u16 = vtrn_u16(vreinterpret_u16_u8(d2u8),
+ vreinterpret_u16_u8(d3u8));
+ d0x2u32 = vtrn_u32(vreinterpret_u32_u16(d0x2u16.val[0]),
+ vreinterpret_u32_u16(d0x2u16.val[1]));
+ d0x2u8 = vtrn_u8(vreinterpret_u8_u32(d0x2u32.val[0]),
+ vreinterpret_u8_u32(d0x2u32.val[1]));
+
+ q1u8 = vcombine_u8(d0x2u8.val[0], d0x2u8.val[1]);
+ q3u8 = vreinterpretq_u8_u32(vcombine_u32(d6u32, d7u32));
+
+ q1u8 = vrhaddq_u8(q1u8, q3u8);
+
+ d2u32 = vreinterpret_u32_u8(vget_low_u8(q1u8));
+ d3u32 = vreinterpret_u32_u8(vget_high_u8(q1u8));
+
+ d = dst;
+ vst1_lane_u32((uint32_t *)d, d2u32, 0);
+ d += dst_stride;
+ vst1_lane_u32((uint32_t *)d, d3u32, 0);
+ d += dst_stride;
+ vst1_lane_u32((uint32_t *)d, d2u32, 1);
+ d += dst_stride;
+ vst1_lane_u32((uint32_t *)d, d3u32, 1);
+
+ q8u16 = q9u16;
+ d20s16 = d23s16;
+ q11u16 = q12u16;
+ q9u16 = q13u16;
+ d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
+ }
+ src += src_stride * 4 - w - 7;
+ dst += dst_stride * 4 - w;
+ }
+ return;
+}
+
+void vpx_convolve8_avg_vert_neon(
+ const uint8_t *src,
+ ptrdiff_t src_stride,
+ uint8_t *dst,
+ ptrdiff_t dst_stride,
+ const int16_t *filter_x, // unused
+ int x_step_q4, // unused
+ const int16_t *filter_y,
+ int y_step_q4,
+ int w,
+ int h) {
+ int height;
+ const uint8_t *s;
+ uint8_t *d;
+ uint8x8_t d2u8, d3u8;
+ uint32x2_t d2u32, d3u32, d6u32, d7u32;
+ uint32x2_t d16u32, d18u32, d20u32, d22u32, d24u32, d26u32;
+ uint8x16_t q1u8, q3u8;
+ int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16;
+ int16x4_t d24s16, d25s16, d26s16, d27s16;
+ uint16x4_t d2u16, d3u16, d4u16, d5u16;
+ int16x8_t q0s16;
+ uint16x8_t q1u16, q2u16, q8u16, q9u16, q10u16, q11u16, q12u16, q13u16;
+ int32x4_t q1s32, q2s32, q14s32, q15s32;
+
+ assert(y_step_q4 == 16);
+
+ src -= src_stride * 3;
+ q0s16 = vld1q_s16(filter_y);
+ for (; w > 0; w -= 4, src += 4, dst += 4) { // loop_vert_h
+ s = src;
+ d16u32 = vld1_lane_u32((const uint32_t *)s, d16u32, 0);
+ s += src_stride;
+ d16u32 = vld1_lane_u32((const uint32_t *)s, d16u32, 1);
+ s += src_stride;
+ d18u32 = vld1_lane_u32((const uint32_t *)s, d18u32, 0);
+ s += src_stride;
+ d18u32 = vld1_lane_u32((const uint32_t *)s, d18u32, 1);
+ s += src_stride;
+ d20u32 = vld1_lane_u32((const uint32_t *)s, d20u32, 0);
+ s += src_stride;
+ d20u32 = vld1_lane_u32((const uint32_t *)s, d20u32, 1);
+ s += src_stride;
+ d22u32 = vld1_lane_u32((const uint32_t *)s, d22u32, 0);
+ s += src_stride;
+
+ q8u16 = vmovl_u8(vreinterpret_u8_u32(d16u32));
+ q9u16 = vmovl_u8(vreinterpret_u8_u32(d18u32));
+ q10u16 = vmovl_u8(vreinterpret_u8_u32(d20u32));
+ q11u16 = vmovl_u8(vreinterpret_u8_u32(d22u32));
+
+ d18s16 = vreinterpret_s16_u16(vget_low_u16(q9u16));
+ d19s16 = vreinterpret_s16_u16(vget_high_u16(q9u16));
+ d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
+ d = dst;
+ for (height = h; height > 0; height -= 4) { // loop_vert
+ d24u32 = vld1_lane_u32((const uint32_t *)s, d24u32, 0);
+ s += src_stride;
+ d26u32 = vld1_lane_u32((const uint32_t *)s, d26u32, 0);
+ s += src_stride;
+ d26u32 = vld1_lane_u32((const uint32_t *)s, d26u32, 1);
+ s += src_stride;
+ d24u32 = vld1_lane_u32((const uint32_t *)s, d24u32, 1);
+ s += src_stride;
+
+ q12u16 = vmovl_u8(vreinterpret_u8_u32(d24u32));
+ q13u16 = vmovl_u8(vreinterpret_u8_u32(d26u32));
+
+ d6u32 = vld1_lane_u32((const uint32_t *)d, d6u32, 0);
+ d += dst_stride;
+ d6u32 = vld1_lane_u32((const uint32_t *)d, d6u32, 1);
+ d += dst_stride;
+ d7u32 = vld1_lane_u32((const uint32_t *)d, d7u32, 0);
+ d += dst_stride;
+ d7u32 = vld1_lane_u32((const uint32_t *)d, d7u32, 1);
+ d -= dst_stride * 3;
+
+ d16s16 = vreinterpret_s16_u16(vget_low_u16(q8u16));
+ d17s16 = vreinterpret_s16_u16(vget_high_u16(q8u16));
+ d20s16 = vreinterpret_s16_u16(vget_low_u16(q10u16));
+ d21s16 = vreinterpret_s16_u16(vget_high_u16(q10u16));
+ d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
+ d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
+ d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
+ d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));
+
+ __builtin_prefetch(s);
+ __builtin_prefetch(s + src_stride);
+ q1s32 = MULTIPLY_BY_Q0(d16s16, d17s16, d18s16, d19s16,
+ d20s16, d21s16, d22s16, d24s16, q0s16);
+ __builtin_prefetch(s + src_stride * 2);
+ __builtin_prefetch(s + src_stride * 3);
+ q2s32 = MULTIPLY_BY_Q0(d17s16, d18s16, d19s16, d20s16,
+ d21s16, d22s16, d24s16, d26s16, q0s16);
+ __builtin_prefetch(d);
+ __builtin_prefetch(d + dst_stride);
+ q14s32 = MULTIPLY_BY_Q0(d18s16, d19s16, d20s16, d21s16,
+ d22s16, d24s16, d26s16, d27s16, q0s16);
+ __builtin_prefetch(d + dst_stride * 2);
+ __builtin_prefetch(d + dst_stride * 3);
+ q15s32 = MULTIPLY_BY_Q0(d19s16, d20s16, d21s16, d22s16,
+ d24s16, d26s16, d27s16, d25s16, q0s16);
+
+ d2u16 = vqrshrun_n_s32(q1s32, 7);
+ d3u16 = vqrshrun_n_s32(q2s32, 7);
+ d4u16 = vqrshrun_n_s32(q14s32, 7);
+ d5u16 = vqrshrun_n_s32(q15s32, 7);
+
+ q1u16 = vcombine_u16(d2u16, d3u16);
+ q2u16 = vcombine_u16(d4u16, d5u16);
+
+ d2u8 = vqmovn_u16(q1u16);
+ d3u8 = vqmovn_u16(q2u16);
+
+ q1u8 = vcombine_u8(d2u8, d3u8);
+ q3u8 = vreinterpretq_u8_u32(vcombine_u32(d6u32, d7u32));
+
+ q1u8 = vrhaddq_u8(q1u8, q3u8);
+
+ d2u32 = vreinterpret_u32_u8(vget_low_u8(q1u8));
+ d3u32 = vreinterpret_u32_u8(vget_high_u8(q1u8));
+
+ vst1_lane_u32((uint32_t *)d, d2u32, 0);
+ d += dst_stride;
+ vst1_lane_u32((uint32_t *)d, d2u32, 1);
+ d += dst_stride;
+ vst1_lane_u32((uint32_t *)d, d3u32, 0);
+ d += dst_stride;
+ vst1_lane_u32((uint32_t *)d, d3u32, 1);
+ d += dst_stride;
+
+ q8u16 = q10u16;
+ d18s16 = d22s16;
+ d19s16 = d24s16;
+ q10u16 = q13u16;
+ d22s16 = d25s16;
+ }
+ }
+ return;
+}
diff --git a/thirdparty/libvpx/vpx_dsp/arm/vpx_convolve8_neon.c b/thirdparty/libvpx/vpx_dsp/arm/vpx_convolve8_neon.c
new file mode 100644
index 0000000000..9bd715e2c6
--- /dev/null
+++ b/thirdparty/libvpx/vpx_dsp/arm/vpx_convolve8_neon.c
@@ -0,0 +1,340 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_ports/mem.h"
+
+static INLINE int32x4_t MULTIPLY_BY_Q0(
+ int16x4_t dsrc0,
+ int16x4_t dsrc1,
+ int16x4_t dsrc2,
+ int16x4_t dsrc3,
+ int16x4_t dsrc4,
+ int16x4_t dsrc5,
+ int16x4_t dsrc6,
+ int16x4_t dsrc7,
+ int16x8_t q0s16) {
+ int32x4_t qdst;
+ int16x4_t d0s16, d1s16;
+
+ d0s16 = vget_low_s16(q0s16);
+ d1s16 = vget_high_s16(q0s16);
+
+ qdst = vmull_lane_s16(dsrc0, d0s16, 0);
+ qdst = vmlal_lane_s16(qdst, dsrc1, d0s16, 1);
+ qdst = vmlal_lane_s16(qdst, dsrc2, d0s16, 2);
+ qdst = vmlal_lane_s16(qdst, dsrc3, d0s16, 3);
+ qdst = vmlal_lane_s16(qdst, dsrc4, d1s16, 0);
+ qdst = vmlal_lane_s16(qdst, dsrc5, d1s16, 1);
+ qdst = vmlal_lane_s16(qdst, dsrc6, d1s16, 2);
+ qdst = vmlal_lane_s16(qdst, dsrc7, d1s16, 3);
+ return qdst;
+}
+
+void vpx_convolve8_horiz_neon(
+ const uint8_t *src,
+ ptrdiff_t src_stride,
+ uint8_t *dst,
+ ptrdiff_t dst_stride,
+ const int16_t *filter_x,
+ int x_step_q4,
+ const int16_t *filter_y, // unused
+ int y_step_q4, // unused
+ int w,
+ int h) {
+ int width;
+ const uint8_t *s, *psrc;
+ uint8_t *d, *pdst;
+ uint8x8_t d2u8, d3u8, d24u8, d25u8, d26u8, d27u8, d28u8, d29u8;
+ uint32x2_t d2u32, d3u32, d28u32, d29u32, d30u32, d31u32;
+ uint8x16_t q12u8, q13u8, q14u8, q15u8;
+ int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d22s16, d23s16;
+ int16x4_t d24s16, d25s16, d26s16, d27s16;
+ uint16x4_t d2u16, d3u16, d4u16, d5u16, d16u16, d17u16, d18u16, d19u16;
+ int16x8_t q0s16;
+ uint16x8_t q1u16, q2u16, q8u16, q9u16, q10u16, q11u16, q12u16, q13u16;
+ int32x4_t q1s32, q2s32, q14s32, q15s32;
+ uint16x8x2_t q0x2u16;
+ uint8x8x2_t d0x2u8, d1x2u8;
+ uint32x2x2_t d0x2u32;
+ uint16x4x2_t d0x2u16, d1x2u16;
+ uint32x4x2_t q0x2u32;
+
+ assert(x_step_q4 == 16);
+
+ q0s16 = vld1q_s16(filter_x);
+
+ src -= 3; // adjust for taps
+ for (; h > 0; h -= 4,
+ src += src_stride * 4,
+ dst += dst_stride * 4) { // loop_horiz_v
+ s = src;
+ d24u8 = vld1_u8(s);
+ s += src_stride;
+ d25u8 = vld1_u8(s);
+ s += src_stride;
+ d26u8 = vld1_u8(s);
+ s += src_stride;
+ d27u8 = vld1_u8(s);
+
+ q12u8 = vcombine_u8(d24u8, d25u8);
+ q13u8 = vcombine_u8(d26u8, d27u8);
+
+ q0x2u16 = vtrnq_u16(vreinterpretq_u16_u8(q12u8),
+ vreinterpretq_u16_u8(q13u8));
+ d24u8 = vreinterpret_u8_u16(vget_low_u16(q0x2u16.val[0]));
+ d25u8 = vreinterpret_u8_u16(vget_high_u16(q0x2u16.val[0]));
+ d26u8 = vreinterpret_u8_u16(vget_low_u16(q0x2u16.val[1]));
+ d27u8 = vreinterpret_u8_u16(vget_high_u16(q0x2u16.val[1]));
+ d0x2u8 = vtrn_u8(d24u8, d25u8);
+ d1x2u8 = vtrn_u8(d26u8, d27u8);
+
+ __builtin_prefetch(src + src_stride * 4);
+ __builtin_prefetch(src + src_stride * 5);
+ __builtin_prefetch(src + src_stride * 6);
+
+ q8u16 = vmovl_u8(d0x2u8.val[0]);
+ q9u16 = vmovl_u8(d0x2u8.val[1]);
+ q10u16 = vmovl_u8(d1x2u8.val[0]);
+ q11u16 = vmovl_u8(d1x2u8.val[1]);
+
+ d16u16 = vget_low_u16(q8u16);
+ d17u16 = vget_high_u16(q8u16);
+ d18u16 = vget_low_u16(q9u16);
+ d19u16 = vget_high_u16(q9u16);
+ q8u16 = vcombine_u16(d16u16, d18u16); // vswp 17 18
+ q9u16 = vcombine_u16(d17u16, d19u16);
+
+ d20s16 = vreinterpret_s16_u16(vget_low_u16(q10u16));
+ d23s16 = vreinterpret_s16_u16(vget_high_u16(q10u16)); // vmov 23 21
+ for (width = w, psrc = src + 7, pdst = dst;
+ width > 0;
+ width -= 4, psrc += 4, pdst += 4) { // loop_horiz
+ s = psrc;
+ d28u32 = vld1_dup_u32((const uint32_t *)s);
+ s += src_stride;
+ d29u32 = vld1_dup_u32((const uint32_t *)s);
+ s += src_stride;
+ d31u32 = vld1_dup_u32((const uint32_t *)s);
+ s += src_stride;
+ d30u32 = vld1_dup_u32((const uint32_t *)s);
+
+ __builtin_prefetch(psrc + 64);
+
+ d0x2u16 = vtrn_u16(vreinterpret_u16_u32(d28u32),
+ vreinterpret_u16_u32(d31u32));
+ d1x2u16 = vtrn_u16(vreinterpret_u16_u32(d29u32),
+ vreinterpret_u16_u32(d30u32));
+ d0x2u8 = vtrn_u8(vreinterpret_u8_u16(d0x2u16.val[0]), // d28
+ vreinterpret_u8_u16(d1x2u16.val[0])); // d29
+ d1x2u8 = vtrn_u8(vreinterpret_u8_u16(d0x2u16.val[1]), // d31
+ vreinterpret_u8_u16(d1x2u16.val[1])); // d30
+
+ __builtin_prefetch(psrc + 64 + src_stride);
+
+ q14u8 = vcombine_u8(d0x2u8.val[0], d0x2u8.val[1]);
+ q15u8 = vcombine_u8(d1x2u8.val[1], d1x2u8.val[0]);
+ q0x2u32 = vtrnq_u32(vreinterpretq_u32_u8(q14u8),
+ vreinterpretq_u32_u8(q15u8));
+
+ d28u8 = vreinterpret_u8_u32(vget_low_u32(q0x2u32.val[0]));
+ d29u8 = vreinterpret_u8_u32(vget_high_u32(q0x2u32.val[0]));
+ q12u16 = vmovl_u8(d28u8);
+ q13u16 = vmovl_u8(d29u8);
+
+ __builtin_prefetch(psrc + 64 + src_stride * 2);
+
+ d16s16 = vreinterpret_s16_u16(vget_low_u16(q8u16));
+ d17s16 = vreinterpret_s16_u16(vget_high_u16(q8u16));
+ d18s16 = vreinterpret_s16_u16(vget_low_u16(q9u16));
+ d19s16 = vreinterpret_s16_u16(vget_high_u16(q9u16));
+ d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
+ d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
+ d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
+ d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
+ d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));
+
+ q1s32 = MULTIPLY_BY_Q0(d16s16, d17s16, d20s16, d22s16,
+ d18s16, d19s16, d23s16, d24s16, q0s16);
+ q2s32 = MULTIPLY_BY_Q0(d17s16, d20s16, d22s16, d18s16,
+ d19s16, d23s16, d24s16, d26s16, q0s16);
+ q14s32 = MULTIPLY_BY_Q0(d20s16, d22s16, d18s16, d19s16,
+ d23s16, d24s16, d26s16, d27s16, q0s16);
+ q15s32 = MULTIPLY_BY_Q0(d22s16, d18s16, d19s16, d23s16,
+ d24s16, d26s16, d27s16, d25s16, q0s16);
+
+ __builtin_prefetch(psrc + 60 + src_stride * 3);
+
+ d2u16 = vqrshrun_n_s32(q1s32, 7);
+ d3u16 = vqrshrun_n_s32(q2s32, 7);
+ d4u16 = vqrshrun_n_s32(q14s32, 7);
+ d5u16 = vqrshrun_n_s32(q15s32, 7);
+
+ q1u16 = vcombine_u16(d2u16, d3u16);
+ q2u16 = vcombine_u16(d4u16, d5u16);
+
+ d2u8 = vqmovn_u16(q1u16);
+ d3u8 = vqmovn_u16(q2u16);
+
+ d0x2u16 = vtrn_u16(vreinterpret_u16_u8(d2u8),
+ vreinterpret_u16_u8(d3u8));
+ d0x2u32 = vtrn_u32(vreinterpret_u32_u16(d0x2u16.val[0]),
+ vreinterpret_u32_u16(d0x2u16.val[1]));
+ d0x2u8 = vtrn_u8(vreinterpret_u8_u32(d0x2u32.val[0]),
+ vreinterpret_u8_u32(d0x2u32.val[1]));
+
+ d2u32 = vreinterpret_u32_u8(d0x2u8.val[0]);
+ d3u32 = vreinterpret_u32_u8(d0x2u8.val[1]);
+
+ d = pdst;
+ vst1_lane_u32((uint32_t *)d, d2u32, 0);
+ d += dst_stride;
+ vst1_lane_u32((uint32_t *)d, d3u32, 0);
+ d += dst_stride;
+ vst1_lane_u32((uint32_t *)d, d2u32, 1);
+ d += dst_stride;
+ vst1_lane_u32((uint32_t *)d, d3u32, 1);
+
+ q8u16 = q9u16;
+ d20s16 = d23s16;
+ q11u16 = q12u16;
+ q9u16 = q13u16;
+ d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
+ }
+ }
+ return;
+}
+
+void vpx_convolve8_vert_neon(
+ const uint8_t *src,
+ ptrdiff_t src_stride,
+ uint8_t *dst,
+ ptrdiff_t dst_stride,
+ const int16_t *filter_x, // unused
+ int x_step_q4, // unused
+ const int16_t *filter_y,
+ int y_step_q4,
+ int w,
+ int h) {
+ int height;
+ const uint8_t *s;
+ uint8_t *d;
+ uint32x2_t d2u32, d3u32;
+ uint32x2_t d16u32, d18u32, d20u32, d22u32, d24u32, d26u32;
+ int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16;
+ int16x4_t d24s16, d25s16, d26s16, d27s16;
+ uint16x4_t d2u16, d3u16, d4u16, d5u16;
+ int16x8_t q0s16;
+ uint16x8_t q1u16, q2u16, q8u16, q9u16, q10u16, q11u16, q12u16, q13u16;
+ int32x4_t q1s32, q2s32, q14s32, q15s32;
+
+ assert(y_step_q4 == 16);
+
+ src -= src_stride * 3;
+ q0s16 = vld1q_s16(filter_y);
+ for (; w > 0; w -= 4, src += 4, dst += 4) { // loop_vert_h
+ s = src;
+ d16u32 = vld1_lane_u32((const uint32_t *)s, d16u32, 0);
+ s += src_stride;
+ d16u32 = vld1_lane_u32((const uint32_t *)s, d16u32, 1);
+ s += src_stride;
+ d18u32 = vld1_lane_u32((const uint32_t *)s, d18u32, 0);
+ s += src_stride;
+ d18u32 = vld1_lane_u32((const uint32_t *)s, d18u32, 1);
+ s += src_stride;
+ d20u32 = vld1_lane_u32((const uint32_t *)s, d20u32, 0);
+ s += src_stride;
+ d20u32 = vld1_lane_u32((const uint32_t *)s, d20u32, 1);
+ s += src_stride;
+ d22u32 = vld1_lane_u32((const uint32_t *)s, d22u32, 0);
+ s += src_stride;
+
+ q8u16 = vmovl_u8(vreinterpret_u8_u32(d16u32));
+ q9u16 = vmovl_u8(vreinterpret_u8_u32(d18u32));
+ q10u16 = vmovl_u8(vreinterpret_u8_u32(d20u32));
+ q11u16 = vmovl_u8(vreinterpret_u8_u32(d22u32));
+
+ d18s16 = vreinterpret_s16_u16(vget_low_u16(q9u16));
+ d19s16 = vreinterpret_s16_u16(vget_high_u16(q9u16));
+ d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
+ d = dst;
+ for (height = h; height > 0; height -= 4) { // loop_vert
+ d24u32 = vld1_lane_u32((const uint32_t *)s, d24u32, 0);
+ s += src_stride;
+ d26u32 = vld1_lane_u32((const uint32_t *)s, d26u32, 0);
+ s += src_stride;
+ d26u32 = vld1_lane_u32((const uint32_t *)s, d26u32, 1);
+ s += src_stride;
+ d24u32 = vld1_lane_u32((const uint32_t *)s, d24u32, 1);
+ s += src_stride;
+
+ q12u16 = vmovl_u8(vreinterpret_u8_u32(d24u32));
+ q13u16 = vmovl_u8(vreinterpret_u8_u32(d26u32));
+
+ d16s16 = vreinterpret_s16_u16(vget_low_u16(q8u16));
+ d17s16 = vreinterpret_s16_u16(vget_high_u16(q8u16));
+ d20s16 = vreinterpret_s16_u16(vget_low_u16(q10u16));
+ d21s16 = vreinterpret_s16_u16(vget_high_u16(q10u16));
+ d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
+ d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
+ d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
+ d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));
+
+ __builtin_prefetch(d);
+ __builtin_prefetch(d + dst_stride);
+ q1s32 = MULTIPLY_BY_Q0(d16s16, d17s16, d18s16, d19s16,
+ d20s16, d21s16, d22s16, d24s16, q0s16);
+ __builtin_prefetch(d + dst_stride * 2);
+ __builtin_prefetch(d + dst_stride * 3);
+ q2s32 = MULTIPLY_BY_Q0(d17s16, d18s16, d19s16, d20s16,
+ d21s16, d22s16, d24s16, d26s16, q0s16);
+ __builtin_prefetch(s);
+ __builtin_prefetch(s + src_stride);
+ q14s32 = MULTIPLY_BY_Q0(d18s16, d19s16, d20s16, d21s16,
+ d22s16, d24s16, d26s16, d27s16, q0s16);
+ __builtin_prefetch(s + src_stride * 2);
+ __builtin_prefetch(s + src_stride * 3);
+ q15s32 = MULTIPLY_BY_Q0(d19s16, d20s16, d21s16, d22s16,
+ d24s16, d26s16, d27s16, d25s16, q0s16);
+
+ d2u16 = vqrshrun_n_s32(q1s32, 7);
+ d3u16 = vqrshrun_n_s32(q2s32, 7);
+ d4u16 = vqrshrun_n_s32(q14s32, 7);
+ d5u16 = vqrshrun_n_s32(q15s32, 7);
+
+ q1u16 = vcombine_u16(d2u16, d3u16);
+ q2u16 = vcombine_u16(d4u16, d5u16);
+
+ d2u32 = vreinterpret_u32_u8(vqmovn_u16(q1u16));
+ d3u32 = vreinterpret_u32_u8(vqmovn_u16(q2u16));
+
+ vst1_lane_u32((uint32_t *)d, d2u32, 0);
+ d += dst_stride;
+ vst1_lane_u32((uint32_t *)d, d2u32, 1);
+ d += dst_stride;
+ vst1_lane_u32((uint32_t *)d, d3u32, 0);
+ d += dst_stride;
+ vst1_lane_u32((uint32_t *)d, d3u32, 1);
+ d += dst_stride;
+
+ q8u16 = q10u16;
+ d18s16 = d22s16;
+ d19s16 = d24s16;
+ q10u16 = q13u16;
+ d22s16 = d25s16;
+ }
+ }
+ return;
+}
diff --git a/thirdparty/libvpx/vpx_dsp/arm/vpx_convolve_avg_neon.c b/thirdparty/libvpx/vpx_dsp/arm/vpx_convolve_avg_neon.c
new file mode 100644
index 0000000000..dc58a332f8
--- /dev/null
+++ b/thirdparty/libvpx/vpx_dsp/arm/vpx_convolve_avg_neon.c
@@ -0,0 +1,147 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_integer.h"
+
+void vpx_convolve_avg_neon(
+ const uint8_t *src, // r0
+ ptrdiff_t src_stride, // r1
+ uint8_t *dst, // r2
+ ptrdiff_t dst_stride, // r3
+ const int16_t *filter_x,
+ int filter_x_stride,
+ const int16_t *filter_y,
+ int filter_y_stride,
+ int w,
+ int h) {
+ uint8_t *d;
+ uint8x8_t d0u8, d1u8, d2u8, d3u8;
+ uint32x2_t d0u32, d2u32;
+ uint8x16_t q0u8, q1u8, q2u8, q3u8, q8u8, q9u8, q10u8, q11u8;
+ (void)filter_x; (void)filter_x_stride;
+ (void)filter_y; (void)filter_y_stride;
+
+ d = dst;
+ if (w > 32) { // avg64
+ for (; h > 0; h -= 1) {
+ q0u8 = vld1q_u8(src);
+ q1u8 = vld1q_u8(src + 16);
+ q2u8 = vld1q_u8(src + 32);
+ q3u8 = vld1q_u8(src + 48);
+ src += src_stride;
+ q8u8 = vld1q_u8(d);
+ q9u8 = vld1q_u8(d + 16);
+ q10u8 = vld1q_u8(d + 32);
+ q11u8 = vld1q_u8(d + 48);
+ d += dst_stride;
+
+ q0u8 = vrhaddq_u8(q0u8, q8u8);
+ q1u8 = vrhaddq_u8(q1u8, q9u8);
+ q2u8 = vrhaddq_u8(q2u8, q10u8);
+ q3u8 = vrhaddq_u8(q3u8, q11u8);
+
+ vst1q_u8(dst, q0u8);
+ vst1q_u8(dst + 16, q1u8);
+ vst1q_u8(dst + 32, q2u8);
+ vst1q_u8(dst + 48, q3u8);
+ dst += dst_stride;
+ }
+ } else if (w == 32) { // avg32
+ for (; h > 0; h -= 2) {
+ q0u8 = vld1q_u8(src);
+ q1u8 = vld1q_u8(src + 16);
+ src += src_stride;
+ q2u8 = vld1q_u8(src);
+ q3u8 = vld1q_u8(src + 16);
+ src += src_stride;
+ q8u8 = vld1q_u8(d);
+ q9u8 = vld1q_u8(d + 16);
+ d += dst_stride;
+ q10u8 = vld1q_u8(d);
+ q11u8 = vld1q_u8(d + 16);
+ d += dst_stride;
+
+ q0u8 = vrhaddq_u8(q0u8, q8u8);
+ q1u8 = vrhaddq_u8(q1u8, q9u8);
+ q2u8 = vrhaddq_u8(q2u8, q10u8);
+ q3u8 = vrhaddq_u8(q3u8, q11u8);
+
+ vst1q_u8(dst, q0u8);
+ vst1q_u8(dst + 16, q1u8);
+ dst += dst_stride;
+ vst1q_u8(dst, q2u8);
+ vst1q_u8(dst + 16, q3u8);
+ dst += dst_stride;
+ }
+ } else if (w > 8) { // avg16
+ for (; h > 0; h -= 2) {
+ q0u8 = vld1q_u8(src);
+ src += src_stride;
+ q1u8 = vld1q_u8(src);
+ src += src_stride;
+ q2u8 = vld1q_u8(d);
+ d += dst_stride;
+ q3u8 = vld1q_u8(d);
+ d += dst_stride;
+
+ q0u8 = vrhaddq_u8(q0u8, q2u8);
+ q1u8 = vrhaddq_u8(q1u8, q3u8);
+
+ vst1q_u8(dst, q0u8);
+ dst += dst_stride;
+ vst1q_u8(dst, q1u8);
+ dst += dst_stride;
+ }
+ } else if (w == 8) { // avg8
+ for (; h > 0; h -= 2) {
+ d0u8 = vld1_u8(src);
+ src += src_stride;
+ d1u8 = vld1_u8(src);
+ src += src_stride;
+ d2u8 = vld1_u8(d);
+ d += dst_stride;
+ d3u8 = vld1_u8(d);
+ d += dst_stride;
+
+ q0u8 = vcombine_u8(d0u8, d1u8);
+ q1u8 = vcombine_u8(d2u8, d3u8);
+ q0u8 = vrhaddq_u8(q0u8, q1u8);
+
+ vst1_u8(dst, vget_low_u8(q0u8));
+ dst += dst_stride;
+ vst1_u8(dst, vget_high_u8(q0u8));
+ dst += dst_stride;
+ }
+ } else { // avg4
+ for (; h > 0; h -= 2) {
+ d0u32 = vld1_lane_u32((const uint32_t *)src, d0u32, 0);
+ src += src_stride;
+ d0u32 = vld1_lane_u32((const uint32_t *)src, d0u32, 1);
+ src += src_stride;
+ d2u32 = vld1_lane_u32((const uint32_t *)d, d2u32, 0);
+ d += dst_stride;
+ d2u32 = vld1_lane_u32((const uint32_t *)d, d2u32, 1);
+ d += dst_stride;
+
+ d0u8 = vrhadd_u8(vreinterpret_u8_u32(d0u32),
+ vreinterpret_u8_u32(d2u32));
+
+ d0u32 = vreinterpret_u32_u8(d0u8);
+ vst1_lane_u32((uint32_t *)dst, d0u32, 0);
+ dst += dst_stride;
+ vst1_lane_u32((uint32_t *)dst, d0u32, 1);
+ dst += dst_stride;
+ }
+ }
+ return;
+}
diff --git a/thirdparty/libvpx/vpx_dsp/arm/vpx_convolve_copy_neon.c b/thirdparty/libvpx/vpx_dsp/arm/vpx_convolve_copy_neon.c
new file mode 100644
index 0000000000..d8fb97a861
--- /dev/null
+++ b/thirdparty/libvpx/vpx_dsp/arm/vpx_convolve_copy_neon.c
@@ -0,0 +1,94 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_integer.h"
+
+void vpx_convolve_copy_neon(
+ const uint8_t *src, // r0
+ ptrdiff_t src_stride, // r1
+ uint8_t *dst, // r2
+ ptrdiff_t dst_stride, // r3
+ const int16_t *filter_x,
+ int filter_x_stride,
+ const int16_t *filter_y,
+ int filter_y_stride,
+ int w,
+ int h) {
+ uint8x8_t d0u8, d2u8;
+ uint8x16_t q0u8, q1u8, q2u8, q3u8;
+ (void)filter_x; (void)filter_x_stride;
+ (void)filter_y; (void)filter_y_stride;
+
+ if (w > 32) { // copy64
+ for (; h > 0; h--) {
+ q0u8 = vld1q_u8(src);
+ q1u8 = vld1q_u8(src + 16);
+ q2u8 = vld1q_u8(src + 32);
+ q3u8 = vld1q_u8(src + 48);
+ src += src_stride;
+
+ vst1q_u8(dst, q0u8);
+ vst1q_u8(dst + 16, q1u8);
+ vst1q_u8(dst + 32, q2u8);
+ vst1q_u8(dst + 48, q3u8);
+ dst += dst_stride;
+ }
+ } else if (w == 32) { // copy32
+ for (; h > 0; h -= 2) {
+ q0u8 = vld1q_u8(src);
+ q1u8 = vld1q_u8(src + 16);
+ src += src_stride;
+ q2u8 = vld1q_u8(src);
+ q3u8 = vld1q_u8(src + 16);
+ src += src_stride;
+
+ vst1q_u8(dst, q0u8);
+ vst1q_u8(dst + 16, q1u8);
+ dst += dst_stride;
+ vst1q_u8(dst, q2u8);
+ vst1q_u8(dst + 16, q3u8);
+ dst += dst_stride;
+ }
+ } else if (w > 8) { // copy16
+ for (; h > 0; h -= 2) {
+ q0u8 = vld1q_u8(src);
+ src += src_stride;
+ q1u8 = vld1q_u8(src);
+ src += src_stride;
+
+ vst1q_u8(dst, q0u8);
+ dst += dst_stride;
+ vst1q_u8(dst, q1u8);
+ dst += dst_stride;
+ }
+ } else if (w == 8) { // copy8
+ for (; h > 0; h -= 2) {
+ d0u8 = vld1_u8(src);
+ src += src_stride;
+ d2u8 = vld1_u8(src);
+ src += src_stride;
+
+ vst1_u8(dst, d0u8);
+ dst += dst_stride;
+ vst1_u8(dst, d2u8);
+ dst += dst_stride;
+ }
+ } else { // copy4
+ for (; h > 0; h--) {
+ *(uint32_t *)dst = *(const uint32_t *)src;
+ src += src_stride;
+ dst += dst_stride;
+ }
+ }
+ return;
+}
diff --git a/thirdparty/libvpx/vpx_dsp/arm/vpx_convolve_neon.c b/thirdparty/libvpx/vpx_dsp/arm/vpx_convolve_neon.c
new file mode 100644
index 0000000000..1506ce6203
--- /dev/null
+++ b/thirdparty/libvpx/vpx_dsp/arm/vpx_convolve_neon.c
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_ports/mem.h"
+
+void vpx_convolve8_neon(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4,
+ int w, int h) {
+ /* Given our constraints: w <= 64, h <= 64, taps == 8 we can reduce the
+ * maximum buffer size to 64 * 64 + 7 (+ 1 to make it divisible by 4).
+ */
+ DECLARE_ALIGNED(8, uint8_t, temp[64 * 72]);
+
+ // Account for the vertical phase needing 3 lines prior and 4 lines post
+ int intermediate_height = h + 7;
+
+ assert(y_step_q4 == 16);
+ assert(x_step_q4 == 16);
+
+ /* Filter starting 3 lines back. The neon implementation will ignore the
+ * given height and filter a multiple of 4 lines. Since this goes in to
+ * the temp buffer which has lots of extra room and is subsequently discarded
+ * this is safe if somewhat less than ideal.
+ */
+ vpx_convolve8_horiz_neon(src - src_stride * 3, src_stride,
+ temp, 64,
+ filter_x, x_step_q4, filter_y, y_step_q4,
+ w, intermediate_height);
+
+ /* Step into the temp buffer 3 lines to get the actual frame data */
+ vpx_convolve8_vert_neon(temp + 64 * 3, 64,
+ dst, dst_stride,
+ filter_x, x_step_q4, filter_y, y_step_q4,
+ w, h);
+}
+
+void vpx_convolve8_avg_neon(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4,
+ int w, int h) {
+ DECLARE_ALIGNED(8, uint8_t, temp[64 * 72]);
+ int intermediate_height = h + 7;
+
+ assert(y_step_q4 == 16);
+ assert(x_step_q4 == 16);
+
+ /* This implementation has the same issues as above. In addition, we only want
+ * to average the values after both passes.
+ */
+ vpx_convolve8_horiz_neon(src - src_stride * 3, src_stride,
+ temp, 64,
+ filter_x, x_step_q4, filter_y, y_step_q4,
+ w, intermediate_height);
+ vpx_convolve8_avg_vert_neon(temp + 64 * 3,
+ 64, dst, dst_stride,
+ filter_x, x_step_q4, filter_y, y_step_q4,
+ w, h);
+}