summaryrefslogtreecommitdiff
path: root/thirdparty/libvpx/vpx_dsp
diff options
context:
space:
mode:
Diffstat (limited to 'thirdparty/libvpx/vpx_dsp')
-rw-r--r--thirdparty/libvpx/vpx_dsp/arm/armasm_ms/intrapred_neon_asm.asm643
-rw-r--r--thirdparty/libvpx/vpx_dsp/arm/armasm_ms/loopfilter_mb_neon.asm641
-rw-r--r--thirdparty/libvpx/vpx_dsp/arm/armasm_ms/save_reg_neon.asm39
-rw-r--r--thirdparty/libvpx/vpx_dsp/arm/gas/intrapred_neon_asm.s658
-rw-r--r--thirdparty/libvpx/vpx_dsp/arm/gas/loopfilter_mb_neon.s647
-rw-r--r--thirdparty/libvpx/vpx_dsp/arm/gas/save_reg_neon.s44
-rw-r--r--thirdparty/libvpx/vpx_dsp/arm/gas_apple/intrapred_neon_asm.s660
-rw-r--r--thirdparty/libvpx/vpx_dsp/arm/gas_apple/loopfilter_mb_neon.s649
-rw-r--r--thirdparty/libvpx/vpx_dsp/arm/gas_apple/save_reg_neon.s46
-rw-r--r--thirdparty/libvpx/vpx_dsp/arm/idct16x16_1_add_neon.c61
-rw-r--r--thirdparty/libvpx/vpx_dsp/arm/idct16x16_add_neon.c1317
-rw-r--r--thirdparty/libvpx/vpx_dsp/arm/idct16x16_neon.c185
-rw-r--r--thirdparty/libvpx/vpx_dsp/arm/idct32x32_1_add_neon.c165
-rw-r--r--thirdparty/libvpx/vpx_dsp/arm/idct32x32_add_neon.c719
-rw-r--r--thirdparty/libvpx/vpx_dsp/arm/idct4x4_1_add_neon.c50
-rw-r--r--thirdparty/libvpx/vpx_dsp/arm/idct4x4_add_neon.c151
-rw-r--r--thirdparty/libvpx/vpx_dsp/arm/idct8x8_1_add_neon.c64
-rw-r--r--thirdparty/libvpx/vpx_dsp/arm/idct8x8_add_neon.c540
-rw-r--r--thirdparty/libvpx/vpx_dsp/arm/intrapred_neon.c822
-rw-r--r--thirdparty/libvpx/vpx_dsp/arm/loopfilter_16_neon.c179
-rw-r--r--thirdparty/libvpx/vpx_dsp/arm/loopfilter_4_neon.c266
-rw-r--r--thirdparty/libvpx/vpx_dsp/arm/loopfilter_8_neon.c445
-rw-r--r--thirdparty/libvpx/vpx_dsp/arm/loopfilter_neon.c58
-rw-r--r--thirdparty/libvpx/vpx_dsp/arm/vpx_convolve8_avg_neon.c373
-rw-r--r--thirdparty/libvpx/vpx_dsp/arm/vpx_convolve8_neon.c340
-rw-r--r--thirdparty/libvpx/vpx_dsp/arm/vpx_convolve_avg_neon.c147
-rw-r--r--thirdparty/libvpx/vpx_dsp/arm/vpx_convolve_copy_neon.c94
-rw-r--r--thirdparty/libvpx/vpx_dsp/arm/vpx_convolve_neon.c72
-rw-r--r--thirdparty/libvpx/vpx_dsp/bitreader.c103
-rw-r--r--thirdparty/libvpx/vpx_dsp/bitreader.h140
-rw-r--r--thirdparty/libvpx/vpx_dsp/bitreader_buffer.c53
-rw-r--r--thirdparty/libvpx/vpx_dsp/bitreader_buffer.h47
-rw-r--r--thirdparty/libvpx/vpx_dsp/intrapred.c870
-rw-r--r--thirdparty/libvpx/vpx_dsp/inv_txfm.c2518
-rw-r--r--thirdparty/libvpx/vpx_dsp/inv_txfm.h133
-rw-r--r--thirdparty/libvpx/vpx_dsp/loopfilter.c767
-rw-r--r--thirdparty/libvpx/vpx_dsp/prob.c53
-rw-r--r--thirdparty/libvpx/vpx_dsp/prob.h103
-rw-r--r--thirdparty/libvpx/vpx_dsp/txfm_common.h66
-rw-r--r--thirdparty/libvpx/vpx_dsp/vpx_convolve.c612
-rw-r--r--thirdparty/libvpx/vpx_dsp/vpx_convolve.h38
-rw-r--r--thirdparty/libvpx/vpx_dsp/vpx_dsp_common.h69
-rw-r--r--thirdparty/libvpx/vpx_dsp/vpx_dsp_rtcd.c17
-rw-r--r--thirdparty/libvpx/vpx_dsp/vpx_filter.h34
-rw-r--r--thirdparty/libvpx/vpx_dsp/x86/convolve.h274
-rw-r--r--thirdparty/libvpx/vpx_dsp/x86/intrapred_sse2.asm860
-rw-r--r--thirdparty/libvpx/vpx_dsp/x86/intrapred_ssse3.asm871
-rw-r--r--thirdparty/libvpx/vpx_dsp/x86/inv_txfm_sse2.c4046
-rw-r--r--thirdparty/libvpx/vpx_dsp/x86/inv_txfm_sse2.h196
-rw-r--r--thirdparty/libvpx/vpx_dsp/x86/inv_txfm_ssse3_x86_64.asm1793
-rw-r--r--thirdparty/libvpx/vpx_dsp/x86/inv_wht_sse2.asm109
-rw-r--r--thirdparty/libvpx/vpx_dsp/x86/loopfilter_avx2.c979
-rw-r--r--thirdparty/libvpx/vpx_dsp/x86/loopfilter_sse2.c1776
-rw-r--r--thirdparty/libvpx/vpx_dsp/x86/txfm_common_sse2.h29
-rw-r--r--thirdparty/libvpx/vpx_dsp/x86/vpx_asm_stubs.c162
-rw-r--r--thirdparty/libvpx/vpx_dsp/x86/vpx_convolve_copy_sse2.asm228
-rw-r--r--thirdparty/libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c606
-rw-r--r--thirdparty/libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c915
-rw-r--r--thirdparty/libvpx/vpx_dsp/x86/vpx_subpixel_8t_sse2.asm987
-rw-r--r--thirdparty/libvpx/vpx_dsp/x86/vpx_subpixel_8t_ssse3.asm629
-rw-r--r--thirdparty/libvpx/vpx_dsp/x86/vpx_subpixel_bilinear_sse2.asm448
-rw-r--r--thirdparty/libvpx/vpx_dsp/x86/vpx_subpixel_bilinear_ssse3.asm422
62 files changed, 0 insertions, 31028 deletions
diff --git a/thirdparty/libvpx/vpx_dsp/arm/armasm_ms/intrapred_neon_asm.asm b/thirdparty/libvpx/vpx_dsp/arm/armasm_ms/intrapred_neon_asm.asm
deleted file mode 100644
index b2846c410b..0000000000
--- a/thirdparty/libvpx/vpx_dsp/arm/armasm_ms/intrapred_neon_asm.asm
+++ /dev/null
@@ -1,643 +0,0 @@
-; This file was created from a .asm file
-; using the ads2armasm_ms.pl script.
-;
-; Copyright (c) 2014 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
- EXPORT |vpx_v_predictor_4x4_neon|
- EXPORT |vpx_v_predictor_8x8_neon|
- EXPORT |vpx_v_predictor_16x16_neon|
- EXPORT |vpx_v_predictor_32x32_neon|
- EXPORT |vpx_h_predictor_4x4_neon|
- EXPORT |vpx_h_predictor_8x8_neon|
- EXPORT |vpx_h_predictor_16x16_neon|
- EXPORT |vpx_h_predictor_32x32_neon|
- EXPORT |vpx_tm_predictor_4x4_neon|
- EXPORT |vpx_tm_predictor_8x8_neon|
- EXPORT |vpx_tm_predictor_16x16_neon|
- EXPORT |vpx_tm_predictor_32x32_neon|
-
-
-
- AREA |.text|, CODE, READONLY, ALIGN=2
-
-;void vpx_v_predictor_4x4_neon(uint8_t *dst, ptrdiff_t y_stride,
-; const uint8_t *above,
-; const uint8_t *left)
-; r0 uint8_t *dst
-; r1 ptrdiff_t y_stride
-; r2 const uint8_t *above
-; r3 const uint8_t *left
-
-|vpx_v_predictor_4x4_neon| PROC
- vld1.32 {d0[0]}, [r2]
- vst1.32 {d0[0]}, [r0], r1
- vst1.32 {d0[0]}, [r0], r1
- vst1.32 {d0[0]}, [r0], r1
- vst1.32 {d0[0]}, [r0], r1
- bx lr
- ENDP ; |vpx_v_predictor_4x4_neon|
- ALIGN 4
-
-;void vpx_v_predictor_8x8_neon(uint8_t *dst, ptrdiff_t y_stride,
-; const uint8_t *above,
-; const uint8_t *left)
-; r0 uint8_t *dst
-; r1 ptrdiff_t y_stride
-; r2 const uint8_t *above
-; r3 const uint8_t *left
-
-|vpx_v_predictor_8x8_neon| PROC
- vld1.8 {d0}, [r2]
- vst1.8 {d0}, [r0], r1
- vst1.8 {d0}, [r0], r1
- vst1.8 {d0}, [r0], r1
- vst1.8 {d0}, [r0], r1
- vst1.8 {d0}, [r0], r1
- vst1.8 {d0}, [r0], r1
- vst1.8 {d0}, [r0], r1
- vst1.8 {d0}, [r0], r1
- bx lr
- ENDP ; |vpx_v_predictor_8x8_neon|
- ALIGN 4
-
-;void vpx_v_predictor_16x16_neon(uint8_t *dst, ptrdiff_t y_stride,
-; const uint8_t *above,
-; const uint8_t *left)
-; r0 uint8_t *dst
-; r1 ptrdiff_t y_stride
-; r2 const uint8_t *above
-; r3 const uint8_t *left
-
-|vpx_v_predictor_16x16_neon| PROC
- vld1.8 {q0}, [r2]
- vst1.8 {q0}, [r0], r1
- vst1.8 {q0}, [r0], r1
- vst1.8 {q0}, [r0], r1
- vst1.8 {q0}, [r0], r1
- vst1.8 {q0}, [r0], r1
- vst1.8 {q0}, [r0], r1
- vst1.8 {q0}, [r0], r1
- vst1.8 {q0}, [r0], r1
- vst1.8 {q0}, [r0], r1
- vst1.8 {q0}, [r0], r1
- vst1.8 {q0}, [r0], r1
- vst1.8 {q0}, [r0], r1
- vst1.8 {q0}, [r0], r1
- vst1.8 {q0}, [r0], r1
- vst1.8 {q0}, [r0], r1
- vst1.8 {q0}, [r0], r1
- bx lr
- ENDP ; |vpx_v_predictor_16x16_neon|
- ALIGN 4
-
-;void vpx_v_predictor_32x32_neon(uint8_t *dst, ptrdiff_t y_stride,
-; const uint8_t *above,
-; const uint8_t *left)
-; r0 uint8_t *dst
-; r1 ptrdiff_t y_stride
-; r2 const uint8_t *above
-; r3 const uint8_t *left
-
-|vpx_v_predictor_32x32_neon| PROC
- vld1.8 {q0, q1}, [r2]
- mov r2, #2
-loop_v
- vst1.8 {q0, q1}, [r0], r1
- vst1.8 {q0, q1}, [r0], r1
- vst1.8 {q0, q1}, [r0], r1
- vst1.8 {q0, q1}, [r0], r1
- vst1.8 {q0, q1}, [r0], r1
- vst1.8 {q0, q1}, [r0], r1
- vst1.8 {q0, q1}, [r0], r1
- vst1.8 {q0, q1}, [r0], r1
- vst1.8 {q0, q1}, [r0], r1
- vst1.8 {q0, q1}, [r0], r1
- vst1.8 {q0, q1}, [r0], r1
- vst1.8 {q0, q1}, [r0], r1
- vst1.8 {q0, q1}, [r0], r1
- vst1.8 {q0, q1}, [r0], r1
- vst1.8 {q0, q1}, [r0], r1
- vst1.8 {q0, q1}, [r0], r1
- subs r2, r2, #1
- bgt loop_v
- bx lr
- ENDP ; |vpx_v_predictor_32x32_neon|
- ALIGN 4
-
-;void vpx_h_predictor_4x4_neon(uint8_t *dst, ptrdiff_t y_stride,
-; const uint8_t *above,
-; const uint8_t *left)
-; r0 uint8_t *dst
-; r1 ptrdiff_t y_stride
-; r2 const uint8_t *above
-; r3 const uint8_t *left
-
-|vpx_h_predictor_4x4_neon| PROC
- vld1.32 {d1[0]}, [r3]
- vdup.8 d0, d1[0]
- vst1.32 {d0[0]}, [r0], r1
- vdup.8 d0, d1[1]
- vst1.32 {d0[0]}, [r0], r1
- vdup.8 d0, d1[2]
- vst1.32 {d0[0]}, [r0], r1
- vdup.8 d0, d1[3]
- vst1.32 {d0[0]}, [r0], r1
- bx lr
- ENDP ; |vpx_h_predictor_4x4_neon|
- ALIGN 4
-
-;void vpx_h_predictor_8x8_neon(uint8_t *dst, ptrdiff_t y_stride,
-; const uint8_t *above,
-; const uint8_t *left)
-; r0 uint8_t *dst
-; r1 ptrdiff_t y_stride
-; r2 const uint8_t *above
-; r3 const uint8_t *left
-
-|vpx_h_predictor_8x8_neon| PROC
- vld1.64 {d1}, [r3]
- vdup.8 d0, d1[0]
- vst1.64 {d0}, [r0], r1
- vdup.8 d0, d1[1]
- vst1.64 {d0}, [r0], r1
- vdup.8 d0, d1[2]
- vst1.64 {d0}, [r0], r1
- vdup.8 d0, d1[3]
- vst1.64 {d0}, [r0], r1
- vdup.8 d0, d1[4]
- vst1.64 {d0}, [r0], r1
- vdup.8 d0, d1[5]
- vst1.64 {d0}, [r0], r1
- vdup.8 d0, d1[6]
- vst1.64 {d0}, [r0], r1
- vdup.8 d0, d1[7]
- vst1.64 {d0}, [r0], r1
- bx lr
- ENDP ; |vpx_h_predictor_8x8_neon|
- ALIGN 4
-
-;void vpx_h_predictor_16x16_neon(uint8_t *dst, ptrdiff_t y_stride,
-; const uint8_t *above,
-; const uint8_t *left)
-; r0 uint8_t *dst
-; r1 ptrdiff_t y_stride
-; r2 const uint8_t *above
-; r3 const uint8_t *left
-
-|vpx_h_predictor_16x16_neon| PROC
- vld1.8 {q1}, [r3]
- vdup.8 q0, d2[0]
- vst1.8 {q0}, [r0], r1
- vdup.8 q0, d2[1]
- vst1.8 {q0}, [r0], r1
- vdup.8 q0, d2[2]
- vst1.8 {q0}, [r0], r1
- vdup.8 q0, d2[3]
- vst1.8 {q0}, [r0], r1
- vdup.8 q0, d2[4]
- vst1.8 {q0}, [r0], r1
- vdup.8 q0, d2[5]
- vst1.8 {q0}, [r0], r1
- vdup.8 q0, d2[6]
- vst1.8 {q0}, [r0], r1
- vdup.8 q0, d2[7]
- vst1.8 {q0}, [r0], r1
- vdup.8 q0, d3[0]
- vst1.8 {q0}, [r0], r1
- vdup.8 q0, d3[1]
- vst1.8 {q0}, [r0], r1
- vdup.8 q0, d3[2]
- vst1.8 {q0}, [r0], r1
- vdup.8 q0, d3[3]
- vst1.8 {q0}, [r0], r1
- vdup.8 q0, d3[4]
- vst1.8 {q0}, [r0], r1
- vdup.8 q0, d3[5]
- vst1.8 {q0}, [r0], r1
- vdup.8 q0, d3[6]
- vst1.8 {q0}, [r0], r1
- vdup.8 q0, d3[7]
- vst1.8 {q0}, [r0], r1
- bx lr
- ENDP ; |vpx_h_predictor_16x16_neon|
- ALIGN 4
-
-;void vpx_h_predictor_32x32_neon(uint8_t *dst, ptrdiff_t y_stride,
-; const uint8_t *above,
-; const uint8_t *left)
-; r0 uint8_t *dst
-; r1 ptrdiff_t y_stride
-; r2 const uint8_t *above
-; r3 const uint8_t *left
-
-|vpx_h_predictor_32x32_neon| PROC
- sub r1, r1, #16
- mov r2, #2
-loop_h
- vld1.8 {q1}, [r3]!
- vdup.8 q0, d2[0]
- vst1.8 {q0}, [r0]!
- vst1.8 {q0}, [r0], r1
- vdup.8 q0, d2[1]
- vst1.8 {q0}, [r0]!
- vst1.8 {q0}, [r0], r1
- vdup.8 q0, d2[2]
- vst1.8 {q0}, [r0]!
- vst1.8 {q0}, [r0], r1
- vdup.8 q0, d2[3]
- vst1.8 {q0}, [r0]!
- vst1.8 {q0}, [r0], r1
- vdup.8 q0, d2[4]
- vst1.8 {q0}, [r0]!
- vst1.8 {q0}, [r0], r1
- vdup.8 q0, d2[5]
- vst1.8 {q0}, [r0]!
- vst1.8 {q0}, [r0], r1
- vdup.8 q0, d2[6]
- vst1.8 {q0}, [r0]!
- vst1.8 {q0}, [r0], r1
- vdup.8 q0, d2[7]
- vst1.8 {q0}, [r0]!
- vst1.8 {q0}, [r0], r1
- vdup.8 q0, d3[0]
- vst1.8 {q0}, [r0]!
- vst1.8 {q0}, [r0], r1
- vdup.8 q0, d3[1]
- vst1.8 {q0}, [r0]!
- vst1.8 {q0}, [r0], r1
- vdup.8 q0, d3[2]
- vst1.8 {q0}, [r0]!
- vst1.8 {q0}, [r0], r1
- vdup.8 q0, d3[3]
- vst1.8 {q0}, [r0]!
- vst1.8 {q0}, [r0], r1
- vdup.8 q0, d3[4]
- vst1.8 {q0}, [r0]!
- vst1.8 {q0}, [r0], r1
- vdup.8 q0, d3[5]
- vst1.8 {q0}, [r0]!
- vst1.8 {q0}, [r0], r1
- vdup.8 q0, d3[6]
- vst1.8 {q0}, [r0]!
- vst1.8 {q0}, [r0], r1
- vdup.8 q0, d3[7]
- vst1.8 {q0}, [r0]!
- vst1.8 {q0}, [r0], r1
- subs r2, r2, #1
- bgt loop_h
- bx lr
- ENDP ; |vpx_h_predictor_32x32_neon|
- ALIGN 4
-
-;void vpx_tm_predictor_4x4_neon (uint8_t *dst, ptrdiff_t y_stride,
-; const uint8_t *above,
-; const uint8_t *left)
-; r0 uint8_t *dst
-; r1 ptrdiff_t y_stride
-; r2 const uint8_t *above
-; r3 const uint8_t *left
-
-|vpx_tm_predictor_4x4_neon| PROC
- ; Load ytop_left = above[-1];
- sub r12, r2, #1
- vld1.u8 {d0[]}, [r12]
-
- ; Load above 4 pixels
- vld1.32 {d2[0]}, [r2]
-
- ; Compute above - ytop_left
- vsubl.u8 q3, d2, d0
-
- ; Load left row by row and compute left + (above - ytop_left)
- ; 1st row and 2nd row
- vld1.u8 {d2[]}, [r3]!
- vld1.u8 {d4[]}, [r3]!
- vmovl.u8 q1, d2
- vmovl.u8 q2, d4
- vadd.s16 q1, q1, q3
- vadd.s16 q2, q2, q3
- vqmovun.s16 d0, q1
- vqmovun.s16 d1, q2
- vst1.32 {d0[0]}, [r0], r1
- vst1.32 {d1[0]}, [r0], r1
-
- ; 3rd row and 4th row
- vld1.u8 {d2[]}, [r3]!
- vld1.u8 {d4[]}, [r3]
- vmovl.u8 q1, d2
- vmovl.u8 q2, d4
- vadd.s16 q1, q1, q3
- vadd.s16 q2, q2, q3
- vqmovun.s16 d0, q1
- vqmovun.s16 d1, q2
- vst1.32 {d0[0]}, [r0], r1
- vst1.32 {d1[0]}, [r0], r1
- bx lr
- ENDP ; |vpx_tm_predictor_4x4_neon|
- ALIGN 4
-
-;void vpx_tm_predictor_8x8_neon (uint8_t *dst, ptrdiff_t y_stride,
-; const uint8_t *above,
-; const uint8_t *left)
-; r0 uint8_t *dst
-; r1 ptrdiff_t y_stride
-; r2 const uint8_t *above
-; r3 const uint8_t *left
-
-|vpx_tm_predictor_8x8_neon| PROC
- ; Load ytop_left = above[-1];
- sub r12, r2, #1
- vld1.8 {d0[]}, [r12]
-
- ; preload 8 left
- vld1.8 {d30}, [r3]
-
- ; Load above 8 pixels
- vld1.64 {d2}, [r2]
-
- vmovl.u8 q10, d30
-
- ; Compute above - ytop_left
- vsubl.u8 q3, d2, d0
-
- ; Load left row by row and compute left + (above - ytop_left)
- ; 1st row and 2nd row
- vdup.16 q0, d20[0]
- vdup.16 q1, d20[1]
- vadd.s16 q0, q3, q0
- vadd.s16 q1, q3, q1
-
- ; 3rd row and 4th row
- vdup.16 q8, d20[2]
- vdup.16 q9, d20[3]
- vadd.s16 q8, q3, q8
- vadd.s16 q9, q3, q9
-
- vqmovun.s16 d0, q0
- vqmovun.s16 d1, q1
- vqmovun.s16 d2, q8
- vqmovun.s16 d3, q9
-
- vst1.64 {d0}, [r0], r1
- vst1.64 {d1}, [r0], r1
- vst1.64 {d2}, [r0], r1
- vst1.64 {d3}, [r0], r1
-
- ; 5th row and 6th row
- vdup.16 q0, d21[0]
- vdup.16 q1, d21[1]
- vadd.s16 q0, q3, q0
- vadd.s16 q1, q3, q1
-
- ; 7th row and 8th row
- vdup.16 q8, d21[2]
- vdup.16 q9, d21[3]
- vadd.s16 q8, q3, q8
- vadd.s16 q9, q3, q9
-
- vqmovun.s16 d0, q0
- vqmovun.s16 d1, q1
- vqmovun.s16 d2, q8
- vqmovun.s16 d3, q9
-
- vst1.64 {d0}, [r0], r1
- vst1.64 {d1}, [r0], r1
- vst1.64 {d2}, [r0], r1
- vst1.64 {d3}, [r0], r1
-
- bx lr
- ENDP ; |vpx_tm_predictor_8x8_neon|
- ALIGN 4
-
-;void vpx_tm_predictor_16x16_neon (uint8_t *dst, ptrdiff_t y_stride,
-; const uint8_t *above,
-; const uint8_t *left)
-; r0 uint8_t *dst
-; r1 ptrdiff_t y_stride
-; r2 const uint8_t *above
-; r3 const uint8_t *left
-
-|vpx_tm_predictor_16x16_neon| PROC
- ; Load ytop_left = above[-1];
- sub r12, r2, #1
- vld1.8 {d0[]}, [r12]
-
- ; Load above 8 pixels
- vld1.8 {q1}, [r2]
-
- ; preload 8 left into r12
- vld1.8 {d18}, [r3]!
-
- ; Compute above - ytop_left
- vsubl.u8 q2, d2, d0
- vsubl.u8 q3, d3, d0
-
- vmovl.u8 q10, d18
-
- ; Load left row by row and compute left + (above - ytop_left)
- ; Process 8 rows in each single loop and loop 2 times to process 16 rows.
- mov r2, #2
-
-loop_16x16_neon
- ; Process two rows.
- vdup.16 q0, d20[0]
- vdup.16 q8, d20[1]
- vadd.s16 q1, q0, q2
- vadd.s16 q0, q0, q3
- vadd.s16 q11, q8, q2
- vadd.s16 q8, q8, q3
- vqmovun.s16 d2, q1
- vqmovun.s16 d3, q0
- vqmovun.s16 d22, q11
- vqmovun.s16 d23, q8
- vdup.16 q0, d20[2] ; proload next 2 rows data
- vdup.16 q8, d20[3]
- vst1.64 {d2,d3}, [r0], r1
- vst1.64 {d22,d23}, [r0], r1
-
- ; Process two rows.
- vadd.s16 q1, q0, q2
- vadd.s16 q0, q0, q3
- vadd.s16 q11, q8, q2
- vadd.s16 q8, q8, q3
- vqmovun.s16 d2, q1
- vqmovun.s16 d3, q0
- vqmovun.s16 d22, q11
- vqmovun.s16 d23, q8
- vdup.16 q0, d21[0] ; proload next 2 rows data
- vdup.16 q8, d21[1]
- vst1.64 {d2,d3}, [r0], r1
- vst1.64 {d22,d23}, [r0], r1
-
- vadd.s16 q1, q0, q2
- vadd.s16 q0, q0, q3
- vadd.s16 q11, q8, q2
- vadd.s16 q8, q8, q3
- vqmovun.s16 d2, q1
- vqmovun.s16 d3, q0
- vqmovun.s16 d22, q11
- vqmovun.s16 d23, q8
- vdup.16 q0, d21[2] ; proload next 2 rows data
- vdup.16 q8, d21[3]
- vst1.64 {d2,d3}, [r0], r1
- vst1.64 {d22,d23}, [r0], r1
-
-
- vadd.s16 q1, q0, q2
- vadd.s16 q0, q0, q3
- vadd.s16 q11, q8, q2
- vadd.s16 q8, q8, q3
- vqmovun.s16 d2, q1
- vqmovun.s16 d3, q0
- vqmovun.s16 d22, q11
- vqmovun.s16 d23, q8
- vld1.8 {d18}, [r3]! ; preload 8 left into r12
- vmovl.u8 q10, d18
- vst1.64 {d2,d3}, [r0], r1
- vst1.64 {d22,d23}, [r0], r1
-
- subs r2, r2, #1
- bgt loop_16x16_neon
-
- bx lr
- ENDP ; |vpx_tm_predictor_16x16_neon|
- ALIGN 4
-
-;void vpx_tm_predictor_32x32_neon (uint8_t *dst, ptrdiff_t y_stride,
-; const uint8_t *above,
-; const uint8_t *left)
-; r0 uint8_t *dst
-; r1 ptrdiff_t y_stride
-; r2 const uint8_t *above
-; r3 const uint8_t *left
-
-|vpx_tm_predictor_32x32_neon| PROC
- ; Load ytop_left = above[-1];
- sub r12, r2, #1
- vld1.8 {d0[]}, [r12]
-
- ; Load above 32 pixels
- vld1.8 {q1}, [r2]!
- vld1.8 {q2}, [r2]
-
- ; preload 8 left pixels
- vld1.8 {d26}, [r3]!
-
- ; Compute above - ytop_left
- vsubl.u8 q8, d2, d0
- vsubl.u8 q9, d3, d0
- vsubl.u8 q10, d4, d0
- vsubl.u8 q11, d5, d0
-
- vmovl.u8 q3, d26
-
- ; Load left row by row and compute left + (above - ytop_left)
- ; Process 8 rows in each single loop and loop 4 times to process 32 rows.
- mov r2, #4
-
-loop_32x32_neon
- ; Process two rows.
- vdup.16 q0, d6[0]
- vdup.16 q2, d6[1]
- vadd.s16 q12, q0, q8
- vadd.s16 q13, q0, q9
- vadd.s16 q14, q0, q10
- vadd.s16 q15, q0, q11
- vqmovun.s16 d0, q12
- vqmovun.s16 d1, q13
- vadd.s16 q12, q2, q8
- vadd.s16 q13, q2, q9
- vqmovun.s16 d2, q14
- vqmovun.s16 d3, q15
- vadd.s16 q14, q2, q10
- vadd.s16 q15, q2, q11
- vst1.64 {d0-d3}, [r0], r1
- vqmovun.s16 d24, q12
- vqmovun.s16 d25, q13
- vqmovun.s16 d26, q14
- vqmovun.s16 d27, q15
- vdup.16 q1, d6[2]
- vdup.16 q2, d6[3]
- vst1.64 {d24-d27}, [r0], r1
-
- ; Process two rows.
- vadd.s16 q12, q1, q8
- vadd.s16 q13, q1, q9
- vadd.s16 q14, q1, q10
- vadd.s16 q15, q1, q11
- vqmovun.s16 d0, q12
- vqmovun.s16 d1, q13
- vadd.s16 q12, q2, q8
- vadd.s16 q13, q2, q9
- vqmovun.s16 d2, q14
- vqmovun.s16 d3, q15
- vadd.s16 q14, q2, q10
- vadd.s16 q15, q2, q11
- vst1.64 {d0-d3}, [r0], r1
- vqmovun.s16 d24, q12
- vqmovun.s16 d25, q13
- vqmovun.s16 d26, q14
- vqmovun.s16 d27, q15
- vdup.16 q0, d7[0]
- vdup.16 q2, d7[1]
- vst1.64 {d24-d27}, [r0], r1
-
- ; Process two rows.
- vadd.s16 q12, q0, q8
- vadd.s16 q13, q0, q9
- vadd.s16 q14, q0, q10
- vadd.s16 q15, q0, q11
- vqmovun.s16 d0, q12
- vqmovun.s16 d1, q13
- vadd.s16 q12, q2, q8
- vadd.s16 q13, q2, q9
- vqmovun.s16 d2, q14
- vqmovun.s16 d3, q15
- vadd.s16 q14, q2, q10
- vadd.s16 q15, q2, q11
- vst1.64 {d0-d3}, [r0], r1
- vqmovun.s16 d24, q12
- vqmovun.s16 d25, q13
- vqmovun.s16 d26, q14
- vqmovun.s16 d27, q15
- vdup.16 q0, d7[2]
- vdup.16 q2, d7[3]
- vst1.64 {d24-d27}, [r0], r1
-
- ; Process two rows.
- vadd.s16 q12, q0, q8
- vadd.s16 q13, q0, q9
- vadd.s16 q14, q0, q10
- vadd.s16 q15, q0, q11
- vqmovun.s16 d0, q12
- vqmovun.s16 d1, q13
- vadd.s16 q12, q2, q8
- vadd.s16 q13, q2, q9
- vqmovun.s16 d2, q14
- vqmovun.s16 d3, q15
- vadd.s16 q14, q2, q10
- vadd.s16 q15, q2, q11
- vst1.64 {d0-d3}, [r0], r1
- vqmovun.s16 d24, q12
- vqmovun.s16 d25, q13
- vld1.8 {d0}, [r3]! ; preload 8 left pixels
- vqmovun.s16 d26, q14
- vqmovun.s16 d27, q15
- vmovl.u8 q3, d0
- vst1.64 {d24-d27}, [r0], r1
-
- subs r2, r2, #1
- bgt loop_32x32_neon
-
- bx lr
- ENDP ; |vpx_tm_predictor_32x32_neon|
- ALIGN 4
-
- END
diff --git a/thirdparty/libvpx/vpx_dsp/arm/armasm_ms/loopfilter_mb_neon.asm b/thirdparty/libvpx/vpx_dsp/arm/armasm_ms/loopfilter_mb_neon.asm
deleted file mode 100644
index 9c3736faf8..0000000000
--- a/thirdparty/libvpx/vpx_dsp/arm/armasm_ms/loopfilter_mb_neon.asm
+++ /dev/null
@@ -1,641 +0,0 @@
-; This file was created from a .asm file
-; using the ads2armasm_ms.pl script.
-;
-; Copyright (c) 2013 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
- EXPORT |vpx_lpf_horizontal_edge_8_neon|
- EXPORT |vpx_lpf_horizontal_edge_16_neon|
- EXPORT |vpx_lpf_vertical_16_neon|
-
- AREA |.text|, CODE, READONLY, ALIGN=2
-
-; void mb_lpf_horizontal_edge(uint8_t *s, int p,
-; const uint8_t *blimit,
-; const uint8_t *limit,
-; const uint8_t *thresh,
-; int count)
-; r0 uint8_t *s,
-; r1 int p, /* pitch */
-; r2 const uint8_t *blimit,
-; r3 const uint8_t *limit,
-; sp const uint8_t *thresh,
-; r12 int count
-|mb_lpf_horizontal_edge| PROC
- push {r4-r8, lr}
- vpush {d8-d15}
- ldr r4, [sp, #88] ; load thresh
-
-h_count
- vld1.8 {d16[]}, [r2] ; load *blimit
- vld1.8 {d17[]}, [r3] ; load *limit
- vld1.8 {d18[]}, [r4] ; load *thresh
-
- sub r8, r0, r1, lsl #3 ; move src pointer down by 8 lines
-
- vld1.u8 {d0}, [r8@64], r1 ; p7
- vld1.u8 {d1}, [r8@64], r1 ; p6
- vld1.u8 {d2}, [r8@64], r1 ; p5
- vld1.u8 {d3}, [r8@64], r1 ; p4
- vld1.u8 {d4}, [r8@64], r1 ; p3
- vld1.u8 {d5}, [r8@64], r1 ; p2
- vld1.u8 {d6}, [r8@64], r1 ; p1
- vld1.u8 {d7}, [r8@64], r1 ; p0
- vld1.u8 {d8}, [r8@64], r1 ; q0
- vld1.u8 {d9}, [r8@64], r1 ; q1
- vld1.u8 {d10}, [r8@64], r1 ; q2
- vld1.u8 {d11}, [r8@64], r1 ; q3
- vld1.u8 {d12}, [r8@64], r1 ; q4
- vld1.u8 {d13}, [r8@64], r1 ; q5
- vld1.u8 {d14}, [r8@64], r1 ; q6
- vld1.u8 {d15}, [r8@64], r1 ; q7
-
- bl vpx_wide_mbfilter_neon
-
- tst r7, #1
- beq h_mbfilter
-
- ; flat && mask were not set for any of the channels. Just store the values
- ; from filter.
- sub r8, r0, r1, lsl #1
-
- vst1.u8 {d25}, [r8@64], r1 ; store op1
- vst1.u8 {d24}, [r8@64], r1 ; store op0
- vst1.u8 {d23}, [r8@64], r1 ; store oq0
- vst1.u8 {d26}, [r8@64], r1 ; store oq1
-
- b h_next
-
-h_mbfilter
- tst r7, #2
- beq h_wide_mbfilter
-
- ; flat2 was not set for any of the channels. Just store the values from
- ; mbfilter.
- sub r8, r0, r1, lsl #1
- sub r8, r8, r1
-
- vst1.u8 {d18}, [r8@64], r1 ; store op2
- vst1.u8 {d19}, [r8@64], r1 ; store op1
- vst1.u8 {d20}, [r8@64], r1 ; store op0
- vst1.u8 {d21}, [r8@64], r1 ; store oq0
- vst1.u8 {d22}, [r8@64], r1 ; store oq1
- vst1.u8 {d23}, [r8@64], r1 ; store oq2
-
- b h_next
-
-h_wide_mbfilter
- sub r8, r0, r1, lsl #3
- add r8, r8, r1
-
- vst1.u8 {d16}, [r8@64], r1 ; store op6
- vst1.u8 {d24}, [r8@64], r1 ; store op5
- vst1.u8 {d25}, [r8@64], r1 ; store op4
- vst1.u8 {d26}, [r8@64], r1 ; store op3
- vst1.u8 {d27}, [r8@64], r1 ; store op2
- vst1.u8 {d18}, [r8@64], r1 ; store op1
- vst1.u8 {d19}, [r8@64], r1 ; store op0
- vst1.u8 {d20}, [r8@64], r1 ; store oq0
- vst1.u8 {d21}, [r8@64], r1 ; store oq1
- vst1.u8 {d22}, [r8@64], r1 ; store oq2
- vst1.u8 {d23}, [r8@64], r1 ; store oq3
- vst1.u8 {d1}, [r8@64], r1 ; store oq4
- vst1.u8 {d2}, [r8@64], r1 ; store oq5
- vst1.u8 {d3}, [r8@64], r1 ; store oq6
-
-h_next
- add r0, r0, #8
- subs r12, r12, #1
- bne h_count
-
- vpop {d8-d15}
- pop {r4-r8, pc}
-
- ENDP ; |mb_lpf_horizontal_edge|
- ALIGN 4
-
-; void vpx_lpf_horizontal_edge_8_neon(uint8_t *s, int pitch,
-; const uint8_t *blimit,
-; const uint8_t *limit,
-; const uint8_t *thresh)
-; r0 uint8_t *s,
-; r1 int pitch,
-; r2 const uint8_t *blimit,
-; r3 const uint8_t *limit,
-; sp const uint8_t *thresh
-|vpx_lpf_horizontal_edge_8_neon| PROC
- mov r12, #1
- b mb_lpf_horizontal_edge
- ENDP ; |vpx_lpf_horizontal_edge_8_neon|
- ALIGN 4
-
-; void vpx_lpf_horizontal_edge_16_neon(uint8_t *s, int pitch,
-; const uint8_t *blimit,
-; const uint8_t *limit,
-; const uint8_t *thresh)
-; r0 uint8_t *s,
-; r1 int pitch,
-; r2 const uint8_t *blimit,
-; r3 const uint8_t *limit,
-; sp const uint8_t *thresh
-|vpx_lpf_horizontal_edge_16_neon| PROC
- mov r12, #2
- b mb_lpf_horizontal_edge
- ENDP ; |vpx_lpf_horizontal_edge_16_neon|
- ALIGN 4
-
-; void vpx_lpf_vertical_16_neon(uint8_t *s, int p,
-; const uint8_t *blimit,
-; const uint8_t *limit,
-; const uint8_t *thresh)
-; r0 uint8_t *s,
-; r1 int p, /* pitch */
-; r2 const uint8_t *blimit,
-; r3 const uint8_t *limit,
-; sp const uint8_t *thresh,
-|vpx_lpf_vertical_16_neon| PROC
- push {r4-r8, lr}
- vpush {d8-d15}
- ldr r4, [sp, #88] ; load thresh
-
- vld1.8 {d16[]}, [r2] ; load *blimit
- vld1.8 {d17[]}, [r3] ; load *limit
- vld1.8 {d18[]}, [r4] ; load *thresh
-
- sub r8, r0, #8
-
- vld1.8 {d0}, [r8@64], r1
- vld1.8 {d8}, [r0@64], r1
- vld1.8 {d1}, [r8@64], r1
- vld1.8 {d9}, [r0@64], r1
- vld1.8 {d2}, [r8@64], r1
- vld1.8 {d10}, [r0@64], r1
- vld1.8 {d3}, [r8@64], r1
- vld1.8 {d11}, [r0@64], r1
- vld1.8 {d4}, [r8@64], r1
- vld1.8 {d12}, [r0@64], r1
- vld1.8 {d5}, [r8@64], r1
- vld1.8 {d13}, [r0@64], r1
- vld1.8 {d6}, [r8@64], r1
- vld1.8 {d14}, [r0@64], r1
- vld1.8 {d7}, [r8@64], r1
- vld1.8 {d15}, [r0@64], r1
-
- sub r0, r0, r1, lsl #3
-
- vtrn.32 q0, q2
- vtrn.32 q1, q3
- vtrn.32 q4, q6
- vtrn.32 q5, q7
-
- vtrn.16 q0, q1
- vtrn.16 q2, q3
- vtrn.16 q4, q5
- vtrn.16 q6, q7
-
- vtrn.8 d0, d1
- vtrn.8 d2, d3
- vtrn.8 d4, d5
- vtrn.8 d6, d7
-
- vtrn.8 d8, d9
- vtrn.8 d10, d11
- vtrn.8 d12, d13
- vtrn.8 d14, d15
-
- bl vpx_wide_mbfilter_neon
-
- tst r7, #1
- beq v_mbfilter
-
- ; flat && mask were not set for any of the channels. Just store the values
- ; from filter.
- sub r8, r0, #2
-
- vswp d23, d25
-
- vst4.8 {d23[0], d24[0], d25[0], d26[0]}, [r8], r1
- vst4.8 {d23[1], d24[1], d25[1], d26[1]}, [r8], r1
- vst4.8 {d23[2], d24[2], d25[2], d26[2]}, [r8], r1
- vst4.8 {d23[3], d24[3], d25[3], d26[3]}, [r8], r1
- vst4.8 {d23[4], d24[4], d25[4], d26[4]}, [r8], r1
- vst4.8 {d23[5], d24[5], d25[5], d26[5]}, [r8], r1
- vst4.8 {d23[6], d24[6], d25[6], d26[6]}, [r8], r1
- vst4.8 {d23[7], d24[7], d25[7], d26[7]}, [r8], r1
-
- b v_end
-
-v_mbfilter
- tst r7, #2
- beq v_wide_mbfilter
-
- ; flat2 was not set for any of the channels. Just store the values from
- ; mbfilter.
- sub r8, r0, #3
-
- vst3.8 {d18[0], d19[0], d20[0]}, [r8], r1
- vst3.8 {d21[0], d22[0], d23[0]}, [r0], r1
- vst3.8 {d18[1], d19[1], d20[1]}, [r8], r1
- vst3.8 {d21[1], d22[1], d23[1]}, [r0], r1
- vst3.8 {d18[2], d19[2], d20[2]}, [r8], r1
- vst3.8 {d21[2], d22[2], d23[2]}, [r0], r1
- vst3.8 {d18[3], d19[3], d20[3]}, [r8], r1
- vst3.8 {d21[3], d22[3], d23[3]}, [r0], r1
- vst3.8 {d18[4], d19[4], d20[4]}, [r8], r1
- vst3.8 {d21[4], d22[4], d23[4]}, [r0], r1
- vst3.8 {d18[5], d19[5], d20[5]}, [r8], r1
- vst3.8 {d21[5], d22[5], d23[5]}, [r0], r1
- vst3.8 {d18[6], d19[6], d20[6]}, [r8], r1
- vst3.8 {d21[6], d22[6], d23[6]}, [r0], r1
- vst3.8 {d18[7], d19[7], d20[7]}, [r8], r1
- vst3.8 {d21[7], d22[7], d23[7]}, [r0], r1
-
- b v_end
-
-v_wide_mbfilter
- sub r8, r0, #8
-
- vtrn.32 d0, d26
- vtrn.32 d16, d27
- vtrn.32 d24, d18
- vtrn.32 d25, d19
-
- vtrn.16 d0, d24
- vtrn.16 d16, d25
- vtrn.16 d26, d18
- vtrn.16 d27, d19
-
- vtrn.8 d0, d16
- vtrn.8 d24, d25
- vtrn.8 d26, d27
- vtrn.8 d18, d19
-
- vtrn.32 d20, d1
- vtrn.32 d21, d2
- vtrn.32 d22, d3
- vtrn.32 d23, d15
-
- vtrn.16 d20, d22
- vtrn.16 d21, d23
- vtrn.16 d1, d3
- vtrn.16 d2, d15
-
- vtrn.8 d20, d21
- vtrn.8 d22, d23
- vtrn.8 d1, d2
- vtrn.8 d3, d15
-
- vst1.8 {d0}, [r8@64], r1
- vst1.8 {d20}, [r0@64], r1
- vst1.8 {d16}, [r8@64], r1
- vst1.8 {d21}, [r0@64], r1
- vst1.8 {d24}, [r8@64], r1
- vst1.8 {d22}, [r0@64], r1
- vst1.8 {d25}, [r8@64], r1
- vst1.8 {d23}, [r0@64], r1
- vst1.8 {d26}, [r8@64], r1
- vst1.8 {d1}, [r0@64], r1
- vst1.8 {d27}, [r8@64], r1
- vst1.8 {d2}, [r0@64], r1
- vst1.8 {d18}, [r8@64], r1
- vst1.8 {d3}, [r0@64], r1
- vst1.8 {d19}, [r8@64], r1
- vst1.8 {d15}, [r0@64], r1
-
-v_end
- vpop {d8-d15}
- pop {r4-r8, pc}
-
- ENDP ; |vpx_lpf_vertical_16_neon|
- ALIGN 4
-
-; void vpx_wide_mbfilter_neon();
-; This is a helper function for the loopfilters. The invidual functions do the
-; necessary load, transpose (if necessary) and store.
-;
-; r0-r3 PRESERVE
-; d16 blimit
-; d17 limit
-; d18 thresh
-; d0 p7
-; d1 p6
-; d2 p5
-; d3 p4
-; d4 p3
-; d5 p2
-; d6 p1
-; d7 p0
-; d8 q0
-; d9 q1
-; d10 q2
-; d11 q3
-; d12 q4
-; d13 q5
-; d14 q6
-; d15 q7
-|vpx_wide_mbfilter_neon| PROC
- mov r7, #0
-
- ; filter_mask
- vabd.u8 d19, d4, d5 ; abs(p3 - p2)
- vabd.u8 d20, d5, d6 ; abs(p2 - p1)
- vabd.u8 d21, d6, d7 ; abs(p1 - p0)
- vabd.u8 d22, d9, d8 ; abs(q1 - q0)
- vabd.u8 d23, d10, d9 ; abs(q2 - q1)
- vabd.u8 d24, d11, d10 ; abs(q3 - q2)
-
- ; only compare the largest value to limit
- vmax.u8 d19, d19, d20 ; max(abs(p3 - p2), abs(p2 - p1))
- vmax.u8 d20, d21, d22 ; max(abs(p1 - p0), abs(q1 - q0))
- vmax.u8 d23, d23, d24 ; max(abs(q2 - q1), abs(q3 - q2))
- vmax.u8 d19, d19, d20
-
- vabd.u8 d24, d7, d8 ; abs(p0 - q0)
-
- vmax.u8 d19, d19, d23
-
- vabd.u8 d23, d6, d9 ; a = abs(p1 - q1)
- vqadd.u8 d24, d24, d24 ; b = abs(p0 - q0) * 2
-
- ; abs () > limit
- vcge.u8 d19, d17, d19
-
- ; flatmask4
- vabd.u8 d25, d7, d5 ; abs(p0 - p2)
- vabd.u8 d26, d8, d10 ; abs(q0 - q2)
- vabd.u8 d27, d4, d7 ; abs(p3 - p0)
- vabd.u8 d28, d11, d8 ; abs(q3 - q0)
-
- ; only compare the largest value to thresh
- vmax.u8 d25, d25, d26 ; max(abs(p0 - p2), abs(q0 - q2))
- vmax.u8 d26, d27, d28 ; max(abs(p3 - p0), abs(q3 - q0))
- vmax.u8 d25, d25, d26
- vmax.u8 d20, d20, d25
-
- vshr.u8 d23, d23, #1 ; a = a / 2
- vqadd.u8 d24, d24, d23 ; a = b + a
-
- vmov.u8 d30, #1
- vcge.u8 d24, d16, d24 ; (a > blimit * 2 + limit) * -1
-
- vcge.u8 d20, d30, d20 ; flat
-
- vand d19, d19, d24 ; mask
-
- ; hevmask
- vcgt.u8 d21, d21, d18 ; (abs(p1 - p0) > thresh)*-1
- vcgt.u8 d22, d22, d18 ; (abs(q1 - q0) > thresh)*-1
- vorr d21, d21, d22 ; hev
-
- vand d16, d20, d19 ; flat && mask
- vmov r5, r6, d16
-
- ; flatmask5(1, p7, p6, p5, p4, p0, q0, q4, q5, q6, q7)
- vabd.u8 d22, d3, d7 ; abs(p4 - p0)
- vabd.u8 d23, d12, d8 ; abs(q4 - q0)
- vabd.u8 d24, d7, d2 ; abs(p0 - p5)
- vabd.u8 d25, d8, d13 ; abs(q0 - q5)
- vabd.u8 d26, d1, d7 ; abs(p6 - p0)
- vabd.u8 d27, d14, d8 ; abs(q6 - q0)
- vabd.u8 d28, d0, d7 ; abs(p7 - p0)
- vabd.u8 d29, d15, d8 ; abs(q7 - q0)
-
- ; only compare the largest value to thresh
- vmax.u8 d22, d22, d23 ; max(abs(p4 - p0), abs(q4 - q0))
- vmax.u8 d23, d24, d25 ; max(abs(p0 - p5), abs(q0 - q5))
- vmax.u8 d24, d26, d27 ; max(abs(p6 - p0), abs(q6 - q0))
- vmax.u8 d25, d28, d29 ; max(abs(p7 - p0), abs(q7 - q0))
-
- vmax.u8 d26, d22, d23
- vmax.u8 d27, d24, d25
- vmax.u8 d23, d26, d27
-
- vcge.u8 d18, d30, d23 ; flat2
-
- vmov.u8 d22, #0x80
-
- orrs r5, r5, r6 ; Check for 0
- orreq r7, r7, #1 ; Only do filter branch
-
- vand d17, d18, d16 ; flat2 && flat && mask
- vmov r5, r6, d17
-
- ; mbfilter() function
-
- ; filter() function
- ; convert to signed
- veor d23, d8, d22 ; qs0
- veor d24, d7, d22 ; ps0
- veor d25, d6, d22 ; ps1
- veor d26, d9, d22 ; qs1
-
- vmov.u8 d27, #3
-
- vsub.s8 d28, d23, d24 ; ( qs0 - ps0)
- vqsub.s8 d29, d25, d26 ; filter = clamp(ps1-qs1)
- vmull.s8 q15, d28, d27 ; 3 * ( qs0 - ps0)
- vand d29, d29, d21 ; filter &= hev
- vaddw.s8 q15, q15, d29 ; filter + 3 * (qs0 - ps0)
- vmov.u8 d29, #4
-
- ; filter = clamp(filter + 3 * ( qs0 - ps0))
- vqmovn.s16 d28, q15
-
- vand d28, d28, d19 ; filter &= mask
-
- vqadd.s8 d30, d28, d27 ; filter2 = clamp(filter+3)
- vqadd.s8 d29, d28, d29 ; filter1 = clamp(filter+4)
- vshr.s8 d30, d30, #3 ; filter2 >>= 3
- vshr.s8 d29, d29, #3 ; filter1 >>= 3
-
-
- vqadd.s8 d24, d24, d30 ; op0 = clamp(ps0 + filter2)
- vqsub.s8 d23, d23, d29 ; oq0 = clamp(qs0 - filter1)
-
- ; outer tap adjustments: ++filter1 >> 1
- vrshr.s8 d29, d29, #1
- vbic d29, d29, d21 ; filter &= ~hev
-
- vqadd.s8 d25, d25, d29 ; op1 = clamp(ps1 + filter)
- vqsub.s8 d26, d26, d29 ; oq1 = clamp(qs1 - filter)
-
- veor d24, d24, d22 ; *f_op0 = u^0x80
- veor d23, d23, d22 ; *f_oq0 = u^0x80
- veor d25, d25, d22 ; *f_op1 = u^0x80
- veor d26, d26, d22 ; *f_oq1 = u^0x80
-
- tst r7, #1
- bxne lr
-
- orrs r5, r5, r6 ; Check for 0
- orreq r7, r7, #2 ; Only do mbfilter branch
-
- ; mbfilter flat && mask branch
- ; TODO(fgalligan): Can I decrease the cycles shifting to consective d's
- ; and using vibt on the q's?
- vmov.u8 d29, #2
- vaddl.u8 q15, d7, d8 ; op2 = p0 + q0
- vmlal.u8 q15, d4, d27 ; op2 = p0 + q0 + p3 * 3
- vmlal.u8 q15, d5, d29 ; op2 = p0 + q0 + p3 * 3 + p2 * 2
- vaddl.u8 q10, d4, d5
- vaddw.u8 q15, d6 ; op2=p1 + p0 + q0 + p3 * 3 + p2 *2
- vaddl.u8 q14, d6, d9
- vqrshrn.u16 d18, q15, #3 ; r_op2
-
- vsub.i16 q15, q10
- vaddl.u8 q10, d4, d6
- vadd.i16 q15, q14
- vaddl.u8 q14, d7, d10
- vqrshrn.u16 d19, q15, #3 ; r_op1
-
- vsub.i16 q15, q10
- vadd.i16 q15, q14
- vaddl.u8 q14, d8, d11
- vqrshrn.u16 d20, q15, #3 ; r_op0
-
- vsubw.u8 q15, d4 ; oq0 = op0 - p3
- vsubw.u8 q15, d7 ; oq0 -= p0
- vadd.i16 q15, q14
- vaddl.u8 q14, d9, d11
- vqrshrn.u16 d21, q15, #3 ; r_oq0
-
- vsubw.u8 q15, d5 ; oq1 = oq0 - p2
- vsubw.u8 q15, d8 ; oq1 -= q0
- vadd.i16 q15, q14
- vaddl.u8 q14, d10, d11
- vqrshrn.u16 d22, q15, #3 ; r_oq1
-
- vsubw.u8 q15, d6 ; oq2 = oq0 - p1
- vsubw.u8 q15, d9 ; oq2 -= q1
- vadd.i16 q15, q14
- vqrshrn.u16 d27, q15, #3 ; r_oq2
-
- ; Filter does not set op2 or oq2, so use p2 and q2.
- vbif d18, d5, d16 ; t_op2 |= p2 & ~(flat & mask)
- vbif d19, d25, d16 ; t_op1 |= f_op1 & ~(flat & mask)
- vbif d20, d24, d16 ; t_op0 |= f_op0 & ~(flat & mask)
- vbif d21, d23, d16 ; t_oq0 |= f_oq0 & ~(flat & mask)
- vbif d22, d26, d16 ; t_oq1 |= f_oq1 & ~(flat & mask)
-
- vbit d23, d27, d16 ; t_oq2 |= r_oq2 & (flat & mask)
- vbif d23, d10, d16 ; t_oq2 |= q2 & ~(flat & mask)
-
- tst r7, #2
- bxne lr
-
- ; wide_mbfilter flat2 && flat && mask branch
- vmov.u8 d16, #7
- vaddl.u8 q15, d7, d8 ; op6 = p0 + q0
- vaddl.u8 q12, d2, d3
- vaddl.u8 q13, d4, d5
- vaddl.u8 q14, d1, d6
- vmlal.u8 q15, d0, d16 ; op6 += p7 * 3
- vadd.i16 q12, q13
- vadd.i16 q15, q14
- vaddl.u8 q14, d2, d9
- vadd.i16 q15, q12
- vaddl.u8 q12, d0, d1
- vaddw.u8 q15, d1
- vaddl.u8 q13, d0, d2
- vadd.i16 q14, q15, q14
- vqrshrn.u16 d16, q15, #4 ; w_op6
-
- vsub.i16 q15, q14, q12
- vaddl.u8 q14, d3, d10
- vqrshrn.u16 d24, q15, #4 ; w_op5
-
- vsub.i16 q15, q13
- vaddl.u8 q13, d0, d3
- vadd.i16 q15, q14
- vaddl.u8 q14, d4, d11
- vqrshrn.u16 d25, q15, #4 ; w_op4
-
- vadd.i16 q15, q14
- vaddl.u8 q14, d0, d4
- vsub.i16 q15, q13
- vsub.i16 q14, q15, q14
- vqrshrn.u16 d26, q15, #4 ; w_op3
-
- vaddw.u8 q15, q14, d5 ; op2 += p2
- vaddl.u8 q14, d0, d5
- vaddw.u8 q15, d12 ; op2 += q4
- vbif d26, d4, d17 ; op3 |= p3 & ~(f2 & f & m)
- vqrshrn.u16 d27, q15, #4 ; w_op2
-
- vsub.i16 q15, q14
- vaddl.u8 q14, d0, d6
- vaddw.u8 q15, d6 ; op1 += p1
- vaddw.u8 q15, d13 ; op1 += q5
- vbif d27, d18, d17 ; op2 |= t_op2 & ~(f2 & f & m)
- vqrshrn.u16 d18, q15, #4 ; w_op1
-
- vsub.i16 q15, q14
- vaddl.u8 q14, d0, d7
- vaddw.u8 q15, d7 ; op0 += p0
- vaddw.u8 q15, d14 ; op0 += q6
- vbif d18, d19, d17 ; op1 |= t_op1 & ~(f2 & f & m)
- vqrshrn.u16 d19, q15, #4 ; w_op0
-
- vsub.i16 q15, q14
- vaddl.u8 q14, d1, d8
- vaddw.u8 q15, d8 ; oq0 += q0
- vaddw.u8 q15, d15 ; oq0 += q7
- vbif d19, d20, d17 ; op0 |= t_op0 & ~(f2 & f & m)
- vqrshrn.u16 d20, q15, #4 ; w_oq0
-
- vsub.i16 q15, q14
- vaddl.u8 q14, d2, d9
- vaddw.u8 q15, d9 ; oq1 += q1
- vaddl.u8 q4, d10, d15
- vaddw.u8 q15, d15 ; oq1 += q7
- vbif d20, d21, d17 ; oq0 |= t_oq0 & ~(f2 & f & m)
- vqrshrn.u16 d21, q15, #4 ; w_oq1
-
- vsub.i16 q15, q14
- vaddl.u8 q14, d3, d10
- vadd.i16 q15, q4
- vaddl.u8 q4, d11, d15
- vbif d21, d22, d17 ; oq1 |= t_oq1 & ~(f2 & f & m)
- vqrshrn.u16 d22, q15, #4 ; w_oq2
-
- vsub.i16 q15, q14
- vaddl.u8 q14, d4, d11
- vadd.i16 q15, q4
- vaddl.u8 q4, d12, d15
- vbif d22, d23, d17 ; oq2 |= t_oq2 & ~(f2 & f & m)
- vqrshrn.u16 d23, q15, #4 ; w_oq3
-
- vsub.i16 q15, q14
- vaddl.u8 q14, d5, d12
- vadd.i16 q15, q4
- vaddl.u8 q4, d13, d15
- vbif d16, d1, d17 ; op6 |= p6 & ~(f2 & f & m)
- vqrshrn.u16 d1, q15, #4 ; w_oq4
-
- vsub.i16 q15, q14
- vaddl.u8 q14, d6, d13
- vadd.i16 q15, q4
- vaddl.u8 q4, d14, d15
- vbif d24, d2, d17 ; op5 |= p5 & ~(f2 & f & m)
- vqrshrn.u16 d2, q15, #4 ; w_oq5
-
- vsub.i16 q15, q14
- vbif d25, d3, d17 ; op4 |= p4 & ~(f2 & f & m)
- vadd.i16 q15, q4
- vbif d23, d11, d17 ; oq3 |= q3 & ~(f2 & f & m)
- vqrshrn.u16 d3, q15, #4 ; w_oq6
- vbif d1, d12, d17 ; oq4 |= q4 & ~(f2 & f & m)
- vbif d2, d13, d17 ; oq5 |= q5 & ~(f2 & f & m)
- vbif d3, d14, d17 ; oq6 |= q6 & ~(f2 & f & m)
-
- bx lr
- ENDP ; |vpx_wide_mbfilter_neon|
- ALIGN 4
-
- END
diff --git a/thirdparty/libvpx/vpx_dsp/arm/armasm_ms/save_reg_neon.asm b/thirdparty/libvpx/vpx_dsp/arm/armasm_ms/save_reg_neon.asm
deleted file mode 100644
index 4cf9988e65..0000000000
--- a/thirdparty/libvpx/vpx_dsp/arm/armasm_ms/save_reg_neon.asm
+++ /dev/null
@@ -1,39 +0,0 @@
-; This file was created from a .asm file
-; using the ads2armasm_ms.pl script.
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
- EXPORT |vpx_push_neon|
- EXPORT |vpx_pop_neon|
-
-
-
-
- AREA |.text|, CODE, READONLY, ALIGN=2
-
-|vpx_push_neon| PROC
- vst1.i64 {d8, d9, d10, d11}, [r0]!
- vst1.i64 {d12, d13, d14, d15}, [r0]!
- bx lr
-
- ENDP
- ALIGN 4
-
-|vpx_pop_neon| PROC
- vld1.i64 {d8, d9, d10, d11}, [r0]!
- vld1.i64 {d12, d13, d14, d15}, [r0]!
- bx lr
-
- ENDP
- ALIGN 4
-
- END
-
diff --git a/thirdparty/libvpx/vpx_dsp/arm/gas/intrapred_neon_asm.s b/thirdparty/libvpx/vpx_dsp/arm/gas/intrapred_neon_asm.s
deleted file mode 100644
index 3932227fc5..0000000000
--- a/thirdparty/libvpx/vpx_dsp/arm/gas/intrapred_neon_asm.s
+++ /dev/null
@@ -1,658 +0,0 @@
-@ This file was created from a .asm file
-@ using the ads2gas.pl script.
- .equ DO1STROUNDING, 0
-@
-@ Copyright (c) 2014 The WebM project authors. All Rights Reserved.
-@
-@ Use of this source code is governed by a BSD-style license
-@ that can be found in the LICENSE file in the root of the source
-@ tree. An additional intellectual property rights grant can be found
-@ in the file PATENTS. All contributing project authors may
-@ be found in the AUTHORS file in the root of the source tree.
-@
-
- .global vpx_v_predictor_4x4_neon
- .type vpx_v_predictor_4x4_neon, function
- .global vpx_v_predictor_8x8_neon
- .type vpx_v_predictor_8x8_neon, function
- .global vpx_v_predictor_16x16_neon
- .type vpx_v_predictor_16x16_neon, function
- .global vpx_v_predictor_32x32_neon
- .type vpx_v_predictor_32x32_neon, function
- .global vpx_h_predictor_4x4_neon
- .type vpx_h_predictor_4x4_neon, function
- .global vpx_h_predictor_8x8_neon
- .type vpx_h_predictor_8x8_neon, function
- .global vpx_h_predictor_16x16_neon
- .type vpx_h_predictor_16x16_neon, function
- .global vpx_h_predictor_32x32_neon
- .type vpx_h_predictor_32x32_neon, function
- .global vpx_tm_predictor_4x4_neon
- .type vpx_tm_predictor_4x4_neon, function
- .global vpx_tm_predictor_8x8_neon
- .type vpx_tm_predictor_8x8_neon, function
- .global vpx_tm_predictor_16x16_neon
- .type vpx_tm_predictor_16x16_neon, function
- .global vpx_tm_predictor_32x32_neon
- .type vpx_tm_predictor_32x32_neon, function
- .arm
- .eabi_attribute 24, 1 @Tag_ABI_align_needed
- .eabi_attribute 25, 1 @Tag_ABI_align_preserved
-
-.text
-.p2align 2
-
-@void vpx_v_predictor_4x4_neon(uint8_t *dst, ptrdiff_t y_stride,
-@ const uint8_t *above,
-@ const uint8_t *left)
-@ r0 uint8_t *dst
-@ r1 ptrdiff_t y_stride
-@ r2 const uint8_t *above
-@ r3 const uint8_t *left
-
-_vpx_v_predictor_4x4_neon:
- vpx_v_predictor_4x4_neon: @ PROC
- vld1.32 {d0[0]}, [r2]
- vst1.32 {d0[0]}, [r0], r1
- vst1.32 {d0[0]}, [r0], r1
- vst1.32 {d0[0]}, [r0], r1
- vst1.32 {d0[0]}, [r0], r1
- bx lr
- .size vpx_v_predictor_4x4_neon, .-vpx_v_predictor_4x4_neon @ ENDP @ |vpx_v_predictor_4x4_neon|
-
-@void vpx_v_predictor_8x8_neon(uint8_t *dst, ptrdiff_t y_stride,
-@ const uint8_t *above,
-@ const uint8_t *left)
-@ r0 uint8_t *dst
-@ r1 ptrdiff_t y_stride
-@ r2 const uint8_t *above
-@ r3 const uint8_t *left
-
-_vpx_v_predictor_8x8_neon:
- vpx_v_predictor_8x8_neon: @ PROC
- vld1.8 {d0}, [r2]
- vst1.8 {d0}, [r0], r1
- vst1.8 {d0}, [r0], r1
- vst1.8 {d0}, [r0], r1
- vst1.8 {d0}, [r0], r1
- vst1.8 {d0}, [r0], r1
- vst1.8 {d0}, [r0], r1
- vst1.8 {d0}, [r0], r1
- vst1.8 {d0}, [r0], r1
- bx lr
- .size vpx_v_predictor_8x8_neon, .-vpx_v_predictor_8x8_neon @ ENDP @ |vpx_v_predictor_8x8_neon|
-
-@void vpx_v_predictor_16x16_neon(uint8_t *dst, ptrdiff_t y_stride,
-@ const uint8_t *above,
-@ const uint8_t *left)
-@ r0 uint8_t *dst
-@ r1 ptrdiff_t y_stride
-@ r2 const uint8_t *above
-@ r3 const uint8_t *left
-
-_vpx_v_predictor_16x16_neon:
- vpx_v_predictor_16x16_neon: @ PROC
- vld1.8 {q0}, [r2]
- vst1.8 {q0}, [r0], r1
- vst1.8 {q0}, [r0], r1
- vst1.8 {q0}, [r0], r1
- vst1.8 {q0}, [r0], r1
- vst1.8 {q0}, [r0], r1
- vst1.8 {q0}, [r0], r1
- vst1.8 {q0}, [r0], r1
- vst1.8 {q0}, [r0], r1
- vst1.8 {q0}, [r0], r1
- vst1.8 {q0}, [r0], r1
- vst1.8 {q0}, [r0], r1
- vst1.8 {q0}, [r0], r1
- vst1.8 {q0}, [r0], r1
- vst1.8 {q0}, [r0], r1
- vst1.8 {q0}, [r0], r1
- vst1.8 {q0}, [r0], r1
- bx lr
- .size vpx_v_predictor_16x16_neon, .-vpx_v_predictor_16x16_neon @ ENDP @ |vpx_v_predictor_16x16_neon|
-
-@void vpx_v_predictor_32x32_neon(uint8_t *dst, ptrdiff_t y_stride,
-@ const uint8_t *above,
-@ const uint8_t *left)
-@ r0 uint8_t *dst
-@ r1 ptrdiff_t y_stride
-@ r2 const uint8_t *above
-@ r3 const uint8_t *left
-
-_vpx_v_predictor_32x32_neon:
- vpx_v_predictor_32x32_neon: @ PROC
- vld1.8 {q0, q1}, [r2]
- mov r2, #2
-loop_v:
- vst1.8 {q0, q1}, [r0], r1
- vst1.8 {q0, q1}, [r0], r1
- vst1.8 {q0, q1}, [r0], r1
- vst1.8 {q0, q1}, [r0], r1
- vst1.8 {q0, q1}, [r0], r1
- vst1.8 {q0, q1}, [r0], r1
- vst1.8 {q0, q1}, [r0], r1
- vst1.8 {q0, q1}, [r0], r1
- vst1.8 {q0, q1}, [r0], r1
- vst1.8 {q0, q1}, [r0], r1
- vst1.8 {q0, q1}, [r0], r1
- vst1.8 {q0, q1}, [r0], r1
- vst1.8 {q0, q1}, [r0], r1
- vst1.8 {q0, q1}, [r0], r1
- vst1.8 {q0, q1}, [r0], r1
- vst1.8 {q0, q1}, [r0], r1
- subs r2, r2, #1
- bgt loop_v
- bx lr
- .size vpx_v_predictor_32x32_neon, .-vpx_v_predictor_32x32_neon @ ENDP @ |vpx_v_predictor_32x32_neon|
-
-@void vpx_h_predictor_4x4_neon(uint8_t *dst, ptrdiff_t y_stride,
-@ const uint8_t *above,
-@ const uint8_t *left)
-@ r0 uint8_t *dst
-@ r1 ptrdiff_t y_stride
-@ r2 const uint8_t *above
-@ r3 const uint8_t *left
-
-_vpx_h_predictor_4x4_neon:
- vpx_h_predictor_4x4_neon: @ PROC
- vld1.32 {d1[0]}, [r3]
- vdup.8 d0, d1[0]
- vst1.32 {d0[0]}, [r0], r1
- vdup.8 d0, d1[1]
- vst1.32 {d0[0]}, [r0], r1
- vdup.8 d0, d1[2]
- vst1.32 {d0[0]}, [r0], r1
- vdup.8 d0, d1[3]
- vst1.32 {d0[0]}, [r0], r1
- bx lr
- .size vpx_h_predictor_4x4_neon, .-vpx_h_predictor_4x4_neon @ ENDP @ |vpx_h_predictor_4x4_neon|
-
-@void vpx_h_predictor_8x8_neon(uint8_t *dst, ptrdiff_t y_stride,
-@ const uint8_t *above,
-@ const uint8_t *left)
-@ r0 uint8_t *dst
-@ r1 ptrdiff_t y_stride
-@ r2 const uint8_t *above
-@ r3 const uint8_t *left
-
-_vpx_h_predictor_8x8_neon:
- vpx_h_predictor_8x8_neon: @ PROC
- vld1.64 {d1}, [r3]
- vdup.8 d0, d1[0]
- vst1.64 {d0}, [r0], r1
- vdup.8 d0, d1[1]
- vst1.64 {d0}, [r0], r1
- vdup.8 d0, d1[2]
- vst1.64 {d0}, [r0], r1
- vdup.8 d0, d1[3]
- vst1.64 {d0}, [r0], r1
- vdup.8 d0, d1[4]
- vst1.64 {d0}, [r0], r1
- vdup.8 d0, d1[5]
- vst1.64 {d0}, [r0], r1
- vdup.8 d0, d1[6]
- vst1.64 {d0}, [r0], r1
- vdup.8 d0, d1[7]
- vst1.64 {d0}, [r0], r1
- bx lr
- .size vpx_h_predictor_8x8_neon, .-vpx_h_predictor_8x8_neon @ ENDP @ |vpx_h_predictor_8x8_neon|
-
-@void vpx_h_predictor_16x16_neon(uint8_t *dst, ptrdiff_t y_stride,
-@ const uint8_t *above,
-@ const uint8_t *left)
-@ r0 uint8_t *dst
-@ r1 ptrdiff_t y_stride
-@ r2 const uint8_t *above
-@ r3 const uint8_t *left
-
-_vpx_h_predictor_16x16_neon:
- vpx_h_predictor_16x16_neon: @ PROC
- vld1.8 {q1}, [r3]
- vdup.8 q0, d2[0]
- vst1.8 {q0}, [r0], r1
- vdup.8 q0, d2[1]
- vst1.8 {q0}, [r0], r1
- vdup.8 q0, d2[2]
- vst1.8 {q0}, [r0], r1
- vdup.8 q0, d2[3]
- vst1.8 {q0}, [r0], r1
- vdup.8 q0, d2[4]
- vst1.8 {q0}, [r0], r1
- vdup.8 q0, d2[5]
- vst1.8 {q0}, [r0], r1
- vdup.8 q0, d2[6]
- vst1.8 {q0}, [r0], r1
- vdup.8 q0, d2[7]
- vst1.8 {q0}, [r0], r1
- vdup.8 q0, d3[0]
- vst1.8 {q0}, [r0], r1
- vdup.8 q0, d3[1]
- vst1.8 {q0}, [r0], r1
- vdup.8 q0, d3[2]
- vst1.8 {q0}, [r0], r1
- vdup.8 q0, d3[3]
- vst1.8 {q0}, [r0], r1
- vdup.8 q0, d3[4]
- vst1.8 {q0}, [r0], r1
- vdup.8 q0, d3[5]
- vst1.8 {q0}, [r0], r1
- vdup.8 q0, d3[6]
- vst1.8 {q0}, [r0], r1
- vdup.8 q0, d3[7]
- vst1.8 {q0}, [r0], r1
- bx lr
- .size vpx_h_predictor_16x16_neon, .-vpx_h_predictor_16x16_neon @ ENDP @ |vpx_h_predictor_16x16_neon|
-
-@void vpx_h_predictor_32x32_neon(uint8_t *dst, ptrdiff_t y_stride,
-@ const uint8_t *above,
-@ const uint8_t *left)
-@ r0 uint8_t *dst
-@ r1 ptrdiff_t y_stride
-@ r2 const uint8_t *above
-@ r3 const uint8_t *left
-
-_vpx_h_predictor_32x32_neon:
- vpx_h_predictor_32x32_neon: @ PROC
- sub r1, r1, #16
- mov r2, #2
-loop_h:
- vld1.8 {q1}, [r3]!
- vdup.8 q0, d2[0]
- vst1.8 {q0}, [r0]!
- vst1.8 {q0}, [r0], r1
- vdup.8 q0, d2[1]
- vst1.8 {q0}, [r0]!
- vst1.8 {q0}, [r0], r1
- vdup.8 q0, d2[2]
- vst1.8 {q0}, [r0]!
- vst1.8 {q0}, [r0], r1
- vdup.8 q0, d2[3]
- vst1.8 {q0}, [r0]!
- vst1.8 {q0}, [r0], r1
- vdup.8 q0, d2[4]
- vst1.8 {q0}, [r0]!
- vst1.8 {q0}, [r0], r1
- vdup.8 q0, d2[5]
- vst1.8 {q0}, [r0]!
- vst1.8 {q0}, [r0], r1
- vdup.8 q0, d2[6]
- vst1.8 {q0}, [r0]!
- vst1.8 {q0}, [r0], r1
- vdup.8 q0, d2[7]
- vst1.8 {q0}, [r0]!
- vst1.8 {q0}, [r0], r1
- vdup.8 q0, d3[0]
- vst1.8 {q0}, [r0]!
- vst1.8 {q0}, [r0], r1
- vdup.8 q0, d3[1]
- vst1.8 {q0}, [r0]!
- vst1.8 {q0}, [r0], r1
- vdup.8 q0, d3[2]
- vst1.8 {q0}, [r0]!
- vst1.8 {q0}, [r0], r1
- vdup.8 q0, d3[3]
- vst1.8 {q0}, [r0]!
- vst1.8 {q0}, [r0], r1
- vdup.8 q0, d3[4]
- vst1.8 {q0}, [r0]!
- vst1.8 {q0}, [r0], r1
- vdup.8 q0, d3[5]
- vst1.8 {q0}, [r0]!
- vst1.8 {q0}, [r0], r1
- vdup.8 q0, d3[6]
- vst1.8 {q0}, [r0]!
- vst1.8 {q0}, [r0], r1
- vdup.8 q0, d3[7]
- vst1.8 {q0}, [r0]!
- vst1.8 {q0}, [r0], r1
- subs r2, r2, #1
- bgt loop_h
- bx lr
- .size vpx_h_predictor_32x32_neon, .-vpx_h_predictor_32x32_neon @ ENDP @ |vpx_h_predictor_32x32_neon|
-
-@void vpx_tm_predictor_4x4_neon (uint8_t *dst, ptrdiff_t y_stride,
-@ const uint8_t *above,
-@ const uint8_t *left)
-@ r0 uint8_t *dst
-@ r1 ptrdiff_t y_stride
-@ r2 const uint8_t *above
-@ r3 const uint8_t *left
-
-_vpx_tm_predictor_4x4_neon:
- vpx_tm_predictor_4x4_neon: @ PROC
- @ Load ytop_left = above[-1];
- sub r12, r2, #1
- vld1.u8 {d0[]}, [r12]
-
- @ Load above 4 pixels
- vld1.32 {d2[0]}, [r2]
-
- @ Compute above - ytop_left
- vsubl.u8 q3, d2, d0
-
- @ Load left row by row and compute left + (above - ytop_left)
- @ 1st row and 2nd row
- vld1.u8 {d2[]}, [r3]!
- vld1.u8 {d4[]}, [r3]!
- vmovl.u8 q1, d2
- vmovl.u8 q2, d4
- vadd.s16 q1, q1, q3
- vadd.s16 q2, q2, q3
- vqmovun.s16 d0, q1
- vqmovun.s16 d1, q2
- vst1.32 {d0[0]}, [r0], r1
- vst1.32 {d1[0]}, [r0], r1
-
- @ 3rd row and 4th row
- vld1.u8 {d2[]}, [r3]!
- vld1.u8 {d4[]}, [r3]
- vmovl.u8 q1, d2
- vmovl.u8 q2, d4
- vadd.s16 q1, q1, q3
- vadd.s16 q2, q2, q3
- vqmovun.s16 d0, q1
- vqmovun.s16 d1, q2
- vst1.32 {d0[0]}, [r0], r1
- vst1.32 {d1[0]}, [r0], r1
- bx lr
- .size vpx_tm_predictor_4x4_neon, .-vpx_tm_predictor_4x4_neon @ ENDP @ |vpx_tm_predictor_4x4_neon|
-
-@void vpx_tm_predictor_8x8_neon (uint8_t *dst, ptrdiff_t y_stride,
-@ const uint8_t *above,
-@ const uint8_t *left)
-@ r0 uint8_t *dst
-@ r1 ptrdiff_t y_stride
-@ r2 const uint8_t *above
-@ r3 const uint8_t *left
-
-_vpx_tm_predictor_8x8_neon:
- vpx_tm_predictor_8x8_neon: @ PROC
- @ Load ytop_left = above[-1];
- sub r12, r2, #1
- vld1.8 {d0[]}, [r12]
-
- @ preload 8 left
- vld1.8 {d30}, [r3]
-
- @ Load above 8 pixels
- vld1.64 {d2}, [r2]
-
- vmovl.u8 q10, d30
-
- @ Compute above - ytop_left
- vsubl.u8 q3, d2, d0
-
- @ Load left row by row and compute left + (above - ytop_left)
- @ 1st row and 2nd row
- vdup.16 q0, d20[0]
- vdup.16 q1, d20[1]
- vadd.s16 q0, q3, q0
- vadd.s16 q1, q3, q1
-
- @ 3rd row and 4th row
- vdup.16 q8, d20[2]
- vdup.16 q9, d20[3]
- vadd.s16 q8, q3, q8
- vadd.s16 q9, q3, q9
-
- vqmovun.s16 d0, q0
- vqmovun.s16 d1, q1
- vqmovun.s16 d2, q8
- vqmovun.s16 d3, q9
-
- vst1.64 {d0}, [r0], r1
- vst1.64 {d1}, [r0], r1
- vst1.64 {d2}, [r0], r1
- vst1.64 {d3}, [r0], r1
-
- @ 5th row and 6th row
- vdup.16 q0, d21[0]
- vdup.16 q1, d21[1]
- vadd.s16 q0, q3, q0
- vadd.s16 q1, q3, q1
-
- @ 7th row and 8th row
- vdup.16 q8, d21[2]
- vdup.16 q9, d21[3]
- vadd.s16 q8, q3, q8
- vadd.s16 q9, q3, q9
-
- vqmovun.s16 d0, q0
- vqmovun.s16 d1, q1
- vqmovun.s16 d2, q8
- vqmovun.s16 d3, q9
-
- vst1.64 {d0}, [r0], r1
- vst1.64 {d1}, [r0], r1
- vst1.64 {d2}, [r0], r1
- vst1.64 {d3}, [r0], r1
-
- bx lr
- .size vpx_tm_predictor_8x8_neon, .-vpx_tm_predictor_8x8_neon @ ENDP @ |vpx_tm_predictor_8x8_neon|
-
-@void vpx_tm_predictor_16x16_neon (uint8_t *dst, ptrdiff_t y_stride,
-@ const uint8_t *above,
-@ const uint8_t *left)
-@ r0 uint8_t *dst
-@ r1 ptrdiff_t y_stride
-@ r2 const uint8_t *above
-@ r3 const uint8_t *left
-
-_vpx_tm_predictor_16x16_neon:
- vpx_tm_predictor_16x16_neon: @ PROC
- @ Load ytop_left = above[-1];
- sub r12, r2, #1
- vld1.8 {d0[]}, [r12]
-
- @ Load above 8 pixels
- vld1.8 {q1}, [r2]
-
- @ preload 8 left into r12
- vld1.8 {d18}, [r3]!
-
- @ Compute above - ytop_left
- vsubl.u8 q2, d2, d0
- vsubl.u8 q3, d3, d0
-
- vmovl.u8 q10, d18
-
- @ Load left row by row and compute left + (above - ytop_left)
- @ Process 8 rows in each single loop and loop 2 times to process 16 rows.
- mov r2, #2
-
-loop_16x16_neon:
- @ Process two rows.
- vdup.16 q0, d20[0]
- vdup.16 q8, d20[1]
- vadd.s16 q1, q0, q2
- vadd.s16 q0, q0, q3
- vadd.s16 q11, q8, q2
- vadd.s16 q8, q8, q3
- vqmovun.s16 d2, q1
- vqmovun.s16 d3, q0
- vqmovun.s16 d22, q11
- vqmovun.s16 d23, q8
- vdup.16 q0, d20[2] @ proload next 2 rows data
- vdup.16 q8, d20[3]
- vst1.64 {d2,d3}, [r0], r1
- vst1.64 {d22,d23}, [r0], r1
-
- @ Process two rows.
- vadd.s16 q1, q0, q2
- vadd.s16 q0, q0, q3
- vadd.s16 q11, q8, q2
- vadd.s16 q8, q8, q3
- vqmovun.s16 d2, q1
- vqmovun.s16 d3, q0
- vqmovun.s16 d22, q11
- vqmovun.s16 d23, q8
- vdup.16 q0, d21[0] @ proload next 2 rows data
- vdup.16 q8, d21[1]
- vst1.64 {d2,d3}, [r0], r1
- vst1.64 {d22,d23}, [r0], r1
-
- vadd.s16 q1, q0, q2
- vadd.s16 q0, q0, q3
- vadd.s16 q11, q8, q2
- vadd.s16 q8, q8, q3
- vqmovun.s16 d2, q1
- vqmovun.s16 d3, q0
- vqmovun.s16 d22, q11
- vqmovun.s16 d23, q8
- vdup.16 q0, d21[2] @ proload next 2 rows data
- vdup.16 q8, d21[3]
- vst1.64 {d2,d3}, [r0], r1
- vst1.64 {d22,d23}, [r0], r1
-
-
- vadd.s16 q1, q0, q2
- vadd.s16 q0, q0, q3
- vadd.s16 q11, q8, q2
- vadd.s16 q8, q8, q3
- vqmovun.s16 d2, q1
- vqmovun.s16 d3, q0
- vqmovun.s16 d22, q11
- vqmovun.s16 d23, q8
- vld1.8 {d18}, [r3]! @ preload 8 left into r12
- vmovl.u8 q10, d18
- vst1.64 {d2,d3}, [r0], r1
- vst1.64 {d22,d23}, [r0], r1
-
- subs r2, r2, #1
- bgt loop_16x16_neon
-
- bx lr
- .size vpx_tm_predictor_16x16_neon, .-vpx_tm_predictor_16x16_neon @ ENDP @ |vpx_tm_predictor_16x16_neon|
-
-@void vpx_tm_predictor_32x32_neon (uint8_t *dst, ptrdiff_t y_stride,
-@ const uint8_t *above,
-@ const uint8_t *left)
-@ r0 uint8_t *dst
-@ r1 ptrdiff_t y_stride
-@ r2 const uint8_t *above
-@ r3 const uint8_t *left
-
-_vpx_tm_predictor_32x32_neon:
- vpx_tm_predictor_32x32_neon: @ PROC
- @ Load ytop_left = above[-1];
- sub r12, r2, #1
- vld1.8 {d0[]}, [r12]
-
- @ Load above 32 pixels
- vld1.8 {q1}, [r2]!
- vld1.8 {q2}, [r2]
-
- @ preload 8 left pixels
- vld1.8 {d26}, [r3]!
-
- @ Compute above - ytop_left
- vsubl.u8 q8, d2, d0
- vsubl.u8 q9, d3, d0
- vsubl.u8 q10, d4, d0
- vsubl.u8 q11, d5, d0
-
- vmovl.u8 q3, d26
-
- @ Load left row by row and compute left + (above - ytop_left)
- @ Process 8 rows in each single loop and loop 4 times to process 32 rows.
- mov r2, #4
-
-loop_32x32_neon:
- @ Process two rows.
- vdup.16 q0, d6[0]
- vdup.16 q2, d6[1]
- vadd.s16 q12, q0, q8
- vadd.s16 q13, q0, q9
- vadd.s16 q14, q0, q10
- vadd.s16 q15, q0, q11
- vqmovun.s16 d0, q12
- vqmovun.s16 d1, q13
- vadd.s16 q12, q2, q8
- vadd.s16 q13, q2, q9
- vqmovun.s16 d2, q14
- vqmovun.s16 d3, q15
- vadd.s16 q14, q2, q10
- vadd.s16 q15, q2, q11
- vst1.64 {d0-d3}, [r0], r1
- vqmovun.s16 d24, q12
- vqmovun.s16 d25, q13
- vqmovun.s16 d26, q14
- vqmovun.s16 d27, q15
- vdup.16 q1, d6[2]
- vdup.16 q2, d6[3]
- vst1.64 {d24-d27}, [r0], r1
-
- @ Process two rows.
- vadd.s16 q12, q1, q8
- vadd.s16 q13, q1, q9
- vadd.s16 q14, q1, q10
- vadd.s16 q15, q1, q11
- vqmovun.s16 d0, q12
- vqmovun.s16 d1, q13
- vadd.s16 q12, q2, q8
- vadd.s16 q13, q2, q9
- vqmovun.s16 d2, q14
- vqmovun.s16 d3, q15
- vadd.s16 q14, q2, q10
- vadd.s16 q15, q2, q11
- vst1.64 {d0-d3}, [r0], r1
- vqmovun.s16 d24, q12
- vqmovun.s16 d25, q13
- vqmovun.s16 d26, q14
- vqmovun.s16 d27, q15
- vdup.16 q0, d7[0]
- vdup.16 q2, d7[1]
- vst1.64 {d24-d27}, [r0], r1
-
- @ Process two rows.
- vadd.s16 q12, q0, q8
- vadd.s16 q13, q0, q9
- vadd.s16 q14, q0, q10
- vadd.s16 q15, q0, q11
- vqmovun.s16 d0, q12
- vqmovun.s16 d1, q13
- vadd.s16 q12, q2, q8
- vadd.s16 q13, q2, q9
- vqmovun.s16 d2, q14
- vqmovun.s16 d3, q15
- vadd.s16 q14, q2, q10
- vadd.s16 q15, q2, q11
- vst1.64 {d0-d3}, [r0], r1
- vqmovun.s16 d24, q12
- vqmovun.s16 d25, q13
- vqmovun.s16 d26, q14
- vqmovun.s16 d27, q15
- vdup.16 q0, d7[2]
- vdup.16 q2, d7[3]
- vst1.64 {d24-d27}, [r0], r1
-
- @ Process two rows.
- vadd.s16 q12, q0, q8
- vadd.s16 q13, q0, q9
- vadd.s16 q14, q0, q10
- vadd.s16 q15, q0, q11
- vqmovun.s16 d0, q12
- vqmovun.s16 d1, q13
- vadd.s16 q12, q2, q8
- vadd.s16 q13, q2, q9
- vqmovun.s16 d2, q14
- vqmovun.s16 d3, q15
- vadd.s16 q14, q2, q10
- vadd.s16 q15, q2, q11
- vst1.64 {d0-d3}, [r0], r1
- vqmovun.s16 d24, q12
- vqmovun.s16 d25, q13
- vld1.8 {d0}, [r3]! @ preload 8 left pixels
- vqmovun.s16 d26, q14
- vqmovun.s16 d27, q15
- vmovl.u8 q3, d0
- vst1.64 {d24-d27}, [r0], r1
-
- subs r2, r2, #1
- bgt loop_32x32_neon
-
- bx lr
- .size vpx_tm_predictor_32x32_neon, .-vpx_tm_predictor_32x32_neon @ ENDP @ |vpx_tm_predictor_32x32_neon|
-
- .section .note.GNU-stack,"",%progbits
diff --git a/thirdparty/libvpx/vpx_dsp/arm/gas/loopfilter_mb_neon.s b/thirdparty/libvpx/vpx_dsp/arm/gas/loopfilter_mb_neon.s
deleted file mode 100644
index f6b05406fb..0000000000
--- a/thirdparty/libvpx/vpx_dsp/arm/gas/loopfilter_mb_neon.s
+++ /dev/null
@@ -1,647 +0,0 @@
-@ This file was created from a .asm file
-@ using the ads2gas.pl script.
- .equ DO1STROUNDING, 0
-@
-@ Copyright (c) 2013 The WebM project authors. All Rights Reserved.
-@
-@ Use of this source code is governed by a BSD-style license
-@ that can be found in the LICENSE file in the root of the source
-@ tree. An additional intellectual property rights grant can be found
-@ in the file PATENTS. All contributing project authors may
-@ be found in the AUTHORS file in the root of the source tree.
-@
-
- .global vpx_lpf_horizontal_edge_8_neon
- .type vpx_lpf_horizontal_edge_8_neon, function
- .global vpx_lpf_horizontal_edge_16_neon
- .type vpx_lpf_horizontal_edge_16_neon, function
- .global vpx_lpf_vertical_16_neon
- .type vpx_lpf_vertical_16_neon, function
- .arm
-
-.text
-.p2align 2
-
-@ void mb_lpf_horizontal_edge(uint8_t *s, int p,
-@ const uint8_t *blimit,
-@ const uint8_t *limit,
-@ const uint8_t *thresh,
-@ int count)
-@ r0 uint8_t *s,
-@ r1 int p, /* pitch */
-@ r2 const uint8_t *blimit,
-@ r3 const uint8_t *limit,
-@ sp const uint8_t *thresh,
-@ r12 int count
-_mb_lpf_horizontal_edge:
- mb_lpf_horizontal_edge: @ PROC
- push {r4-r8, lr}
- vpush {d8-d15}
- ldr r4, [sp, #88] @ load thresh
-
-h_count:
- vld1.8 {d16[]}, [r2] @ load *blimit
- vld1.8 {d17[]}, [r3] @ load *limit
- vld1.8 {d18[]}, [r4] @ load *thresh
-
- sub r8, r0, r1, lsl #3 @ move src pointer down by 8 lines
-
- vld1.u8 {d0}, [r8,:64], r1 @ p7
- vld1.u8 {d1}, [r8,:64], r1 @ p6
- vld1.u8 {d2}, [r8,:64], r1 @ p5
- vld1.u8 {d3}, [r8,:64], r1 @ p4
- vld1.u8 {d4}, [r8,:64], r1 @ p3
- vld1.u8 {d5}, [r8,:64], r1 @ p2
- vld1.u8 {d6}, [r8,:64], r1 @ p1
- vld1.u8 {d7}, [r8,:64], r1 @ p0
- vld1.u8 {d8}, [r8,:64], r1 @ q0
- vld1.u8 {d9}, [r8,:64], r1 @ q1
- vld1.u8 {d10}, [r8,:64], r1 @ q2
- vld1.u8 {d11}, [r8,:64], r1 @ q3
- vld1.u8 {d12}, [r8,:64], r1 @ q4
- vld1.u8 {d13}, [r8,:64], r1 @ q5
- vld1.u8 {d14}, [r8,:64], r1 @ q6
- vld1.u8 {d15}, [r8,:64], r1 @ q7
-
- bl vpx_wide_mbfilter_neon
-
- tst r7, #1
- beq h_mbfilter
-
- @ flat && mask were not set for any of the channels. Just store the values
- @ from filter.
- sub r8, r0, r1, lsl #1
-
- vst1.u8 {d25}, [r8,:64], r1 @ store op1
- vst1.u8 {d24}, [r8,:64], r1 @ store op0
- vst1.u8 {d23}, [r8,:64], r1 @ store oq0
- vst1.u8 {d26}, [r8,:64], r1 @ store oq1
-
- b h_next
-
-h_mbfilter:
- tst r7, #2
- beq h_wide_mbfilter
-
- @ flat2 was not set for any of the channels. Just store the values from
- @ mbfilter.
- sub r8, r0, r1, lsl #1
- sub r8, r8, r1
-
- vst1.u8 {d18}, [r8,:64], r1 @ store op2
- vst1.u8 {d19}, [r8,:64], r1 @ store op1
- vst1.u8 {d20}, [r8,:64], r1 @ store op0
- vst1.u8 {d21}, [r8,:64], r1 @ store oq0
- vst1.u8 {d22}, [r8,:64], r1 @ store oq1
- vst1.u8 {d23}, [r8,:64], r1 @ store oq2
-
- b h_next
-
-h_wide_mbfilter:
- sub r8, r0, r1, lsl #3
- add r8, r8, r1
-
- vst1.u8 {d16}, [r8,:64], r1 @ store op6
- vst1.u8 {d24}, [r8,:64], r1 @ store op5
- vst1.u8 {d25}, [r8,:64], r1 @ store op4
- vst1.u8 {d26}, [r8,:64], r1 @ store op3
- vst1.u8 {d27}, [r8,:64], r1 @ store op2
- vst1.u8 {d18}, [r8,:64], r1 @ store op1
- vst1.u8 {d19}, [r8,:64], r1 @ store op0
- vst1.u8 {d20}, [r8,:64], r1 @ store oq0
- vst1.u8 {d21}, [r8,:64], r1 @ store oq1
- vst1.u8 {d22}, [r8,:64], r1 @ store oq2
- vst1.u8 {d23}, [r8,:64], r1 @ store oq3
- vst1.u8 {d1}, [r8,:64], r1 @ store oq4
- vst1.u8 {d2}, [r8,:64], r1 @ store oq5
- vst1.u8 {d3}, [r8,:64], r1 @ store oq6
-
-h_next:
- add r0, r0, #8
- subs r12, r12, #1
- bne h_count
-
- vpop {d8-d15}
- pop {r4-r8, pc}
-
- .size mb_lpf_horizontal_edge, .-mb_lpf_horizontal_edge @ ENDP @ |mb_lpf_horizontal_edge|
-
-@ void vpx_lpf_horizontal_edge_8_neon(uint8_t *s, int pitch,
-@ const uint8_t *blimit,
-@ const uint8_t *limit,
-@ const uint8_t *thresh)
-@ r0 uint8_t *s,
-@ r1 int pitch,
-@ r2 const uint8_t *blimit,
-@ r3 const uint8_t *limit,
-@ sp const uint8_t *thresh
-_vpx_lpf_horizontal_edge_8_neon:
- vpx_lpf_horizontal_edge_8_neon: @ PROC
- mov r12, #1
- b mb_lpf_horizontal_edge
- .size vpx_lpf_horizontal_edge_8_neon, .-vpx_lpf_horizontal_edge_8_neon @ ENDP @ |vpx_lpf_horizontal_edge_8_neon|
-
-@ void vpx_lpf_horizontal_edge_16_neon(uint8_t *s, int pitch,
-@ const uint8_t *blimit,
-@ const uint8_t *limit,
-@ const uint8_t *thresh)
-@ r0 uint8_t *s,
-@ r1 int pitch,
-@ r2 const uint8_t *blimit,
-@ r3 const uint8_t *limit,
-@ sp const uint8_t *thresh
-_vpx_lpf_horizontal_edge_16_neon:
- vpx_lpf_horizontal_edge_16_neon: @ PROC
- mov r12, #2
- b mb_lpf_horizontal_edge
- .size vpx_lpf_horizontal_edge_16_neon, .-vpx_lpf_horizontal_edge_16_neon @ ENDP @ |vpx_lpf_horizontal_edge_16_neon|
-
-@ void vpx_lpf_vertical_16_neon(uint8_t *s, int p,
-@ const uint8_t *blimit,
-@ const uint8_t *limit,
-@ const uint8_t *thresh)
-@ r0 uint8_t *s,
-@ r1 int p, /* pitch */
-@ r2 const uint8_t *blimit,
-@ r3 const uint8_t *limit,
-@ sp const uint8_t *thresh,
-_vpx_lpf_vertical_16_neon:
- vpx_lpf_vertical_16_neon: @ PROC
- push {r4-r8, lr}
- vpush {d8-d15}
- ldr r4, [sp, #88] @ load thresh
-
- vld1.8 {d16[]}, [r2] @ load *blimit
- vld1.8 {d17[]}, [r3] @ load *limit
- vld1.8 {d18[]}, [r4] @ load *thresh
-
- sub r8, r0, #8
-
- vld1.8 {d0}, [r8,:64], r1
- vld1.8 {d8}, [r0,:64], r1
- vld1.8 {d1}, [r8,:64], r1
- vld1.8 {d9}, [r0,:64], r1
- vld1.8 {d2}, [r8,:64], r1
- vld1.8 {d10}, [r0,:64], r1
- vld1.8 {d3}, [r8,:64], r1
- vld1.8 {d11}, [r0,:64], r1
- vld1.8 {d4}, [r8,:64], r1
- vld1.8 {d12}, [r0,:64], r1
- vld1.8 {d5}, [r8,:64], r1
- vld1.8 {d13}, [r0,:64], r1
- vld1.8 {d6}, [r8,:64], r1
- vld1.8 {d14}, [r0,:64], r1
- vld1.8 {d7}, [r8,:64], r1
- vld1.8 {d15}, [r0,:64], r1
-
- sub r0, r0, r1, lsl #3
-
- vtrn.32 q0, q2
- vtrn.32 q1, q3
- vtrn.32 q4, q6
- vtrn.32 q5, q7
-
- vtrn.16 q0, q1
- vtrn.16 q2, q3
- vtrn.16 q4, q5
- vtrn.16 q6, q7
-
- vtrn.8 d0, d1
- vtrn.8 d2, d3
- vtrn.8 d4, d5
- vtrn.8 d6, d7
-
- vtrn.8 d8, d9
- vtrn.8 d10, d11
- vtrn.8 d12, d13
- vtrn.8 d14, d15
-
- bl vpx_wide_mbfilter_neon
-
- tst r7, #1
- beq v_mbfilter
-
- @ flat && mask were not set for any of the channels. Just store the values
- @ from filter.
- sub r8, r0, #2
-
- vswp d23, d25
-
- vst4.8 {d23[0], d24[0], d25[0], d26[0]}, [r8], r1
- vst4.8 {d23[1], d24[1], d25[1], d26[1]}, [r8], r1
- vst4.8 {d23[2], d24[2], d25[2], d26[2]}, [r8], r1
- vst4.8 {d23[3], d24[3], d25[3], d26[3]}, [r8], r1
- vst4.8 {d23[4], d24[4], d25[4], d26[4]}, [r8], r1
- vst4.8 {d23[5], d24[5], d25[5], d26[5]}, [r8], r1
- vst4.8 {d23[6], d24[6], d25[6], d26[6]}, [r8], r1
- vst4.8 {d23[7], d24[7], d25[7], d26[7]}, [r8], r1
-
- b v_end
-
-v_mbfilter:
- tst r7, #2
- beq v_wide_mbfilter
-
- @ flat2 was not set for any of the channels. Just store the values from
- @ mbfilter.
- sub r8, r0, #3
-
- vst3.8 {d18[0], d19[0], d20[0]}, [r8], r1
- vst3.8 {d21[0], d22[0], d23[0]}, [r0], r1
- vst3.8 {d18[1], d19[1], d20[1]}, [r8], r1
- vst3.8 {d21[1], d22[1], d23[1]}, [r0], r1
- vst3.8 {d18[2], d19[2], d20[2]}, [r8], r1
- vst3.8 {d21[2], d22[2], d23[2]}, [r0], r1
- vst3.8 {d18[3], d19[3], d20[3]}, [r8], r1
- vst3.8 {d21[3], d22[3], d23[3]}, [r0], r1
- vst3.8 {d18[4], d19[4], d20[4]}, [r8], r1
- vst3.8 {d21[4], d22[4], d23[4]}, [r0], r1
- vst3.8 {d18[5], d19[5], d20[5]}, [r8], r1
- vst3.8 {d21[5], d22[5], d23[5]}, [r0], r1
- vst3.8 {d18[6], d19[6], d20[6]}, [r8], r1
- vst3.8 {d21[6], d22[6], d23[6]}, [r0], r1
- vst3.8 {d18[7], d19[7], d20[7]}, [r8], r1
- vst3.8 {d21[7], d22[7], d23[7]}, [r0], r1
-
- b v_end
-
-v_wide_mbfilter:
- sub r8, r0, #8
-
- vtrn.32 d0, d26
- vtrn.32 d16, d27
- vtrn.32 d24, d18
- vtrn.32 d25, d19
-
- vtrn.16 d0, d24
- vtrn.16 d16, d25
- vtrn.16 d26, d18
- vtrn.16 d27, d19
-
- vtrn.8 d0, d16
- vtrn.8 d24, d25
- vtrn.8 d26, d27
- vtrn.8 d18, d19
-
- vtrn.32 d20, d1
- vtrn.32 d21, d2
- vtrn.32 d22, d3
- vtrn.32 d23, d15
-
- vtrn.16 d20, d22
- vtrn.16 d21, d23
- vtrn.16 d1, d3
- vtrn.16 d2, d15
-
- vtrn.8 d20, d21
- vtrn.8 d22, d23
- vtrn.8 d1, d2
- vtrn.8 d3, d15
-
- vst1.8 {d0}, [r8,:64], r1
- vst1.8 {d20}, [r0,:64], r1
- vst1.8 {d16}, [r8,:64], r1
- vst1.8 {d21}, [r0,:64], r1
- vst1.8 {d24}, [r8,:64], r1
- vst1.8 {d22}, [r0,:64], r1
- vst1.8 {d25}, [r8,:64], r1
- vst1.8 {d23}, [r0,:64], r1
- vst1.8 {d26}, [r8,:64], r1
- vst1.8 {d1}, [r0,:64], r1
- vst1.8 {d27}, [r8,:64], r1
- vst1.8 {d2}, [r0,:64], r1
- vst1.8 {d18}, [r8,:64], r1
- vst1.8 {d3}, [r0,:64], r1
- vst1.8 {d19}, [r8,:64], r1
- vst1.8 {d15}, [r0,:64], r1
-
-v_end:
- vpop {d8-d15}
- pop {r4-r8, pc}
-
- .size vpx_lpf_vertical_16_neon, .-vpx_lpf_vertical_16_neon @ ENDP @ |vpx_lpf_vertical_16_neon|
-
-@ void vpx_wide_mbfilter_neon();
-@ This is a helper function for the loopfilters. The invidual functions do the
-@ necessary load, transpose (if necessary) and store.
-@
-@ r0-r3 PRESERVE
-@ d16 blimit
-@ d17 limit
-@ d18 thresh
-@ d0 p7
-@ d1 p6
-@ d2 p5
-@ d3 p4
-@ d4 p3
-@ d5 p2
-@ d6 p1
-@ d7 p0
-@ d8 q0
-@ d9 q1
-@ d10 q2
-@ d11 q3
-@ d12 q4
-@ d13 q5
-@ d14 q6
-@ d15 q7
-_vpx_wide_mbfilter_neon:
- vpx_wide_mbfilter_neon: @ PROC
- mov r7, #0
-
- @ filter_mask
- vabd.u8 d19, d4, d5 @ abs(p3 - p2)
- vabd.u8 d20, d5, d6 @ abs(p2 - p1)
- vabd.u8 d21, d6, d7 @ abs(p1 - p0)
- vabd.u8 d22, d9, d8 @ abs(q1 - q0)
- vabd.u8 d23, d10, d9 @ abs(q2 - q1)
- vabd.u8 d24, d11, d10 @ abs(q3 - q2)
-
- @ only compare the largest value to limit
- vmax.u8 d19, d19, d20 @ max(abs(p3 - p2), abs(p2 - p1))
- vmax.u8 d20, d21, d22 @ max(abs(p1 - p0), abs(q1 - q0))
- vmax.u8 d23, d23, d24 @ max(abs(q2 - q1), abs(q3 - q2))
- vmax.u8 d19, d19, d20
-
- vabd.u8 d24, d7, d8 @ abs(p0 - q0)
-
- vmax.u8 d19, d19, d23
-
- vabd.u8 d23, d6, d9 @ a = abs(p1 - q1)
- vqadd.u8 d24, d24, d24 @ b = abs(p0 - q0) * 2
-
- @ abs () > limit
- vcge.u8 d19, d17, d19
-
- @ flatmask4
- vabd.u8 d25, d7, d5 @ abs(p0 - p2)
- vabd.u8 d26, d8, d10 @ abs(q0 - q2)
- vabd.u8 d27, d4, d7 @ abs(p3 - p0)
- vabd.u8 d28, d11, d8 @ abs(q3 - q0)
-
- @ only compare the largest value to thresh
- vmax.u8 d25, d25, d26 @ max(abs(p0 - p2), abs(q0 - q2))
- vmax.u8 d26, d27, d28 @ max(abs(p3 - p0), abs(q3 - q0))
- vmax.u8 d25, d25, d26
- vmax.u8 d20, d20, d25
-
- vshr.u8 d23, d23, #1 @ a = a / 2
- vqadd.u8 d24, d24, d23 @ a = b + a
-
- vmov.u8 d30, #1
- vcge.u8 d24, d16, d24 @ (a > blimit * 2 + limit) * -1
-
- vcge.u8 d20, d30, d20 @ flat
-
- vand d19, d19, d24 @ mask
-
- @ hevmask
- vcgt.u8 d21, d21, d18 @ (abs(p1 - p0) > thresh)*-1
- vcgt.u8 d22, d22, d18 @ (abs(q1 - q0) > thresh)*-1
- vorr d21, d21, d22 @ hev
-
- vand d16, d20, d19 @ flat && mask
- vmov r5, r6, d16
-
- @ flatmask5(1, p7, p6, p5, p4, p0, q0, q4, q5, q6, q7)
- vabd.u8 d22, d3, d7 @ abs(p4 - p0)
- vabd.u8 d23, d12, d8 @ abs(q4 - q0)
- vabd.u8 d24, d7, d2 @ abs(p0 - p5)
- vabd.u8 d25, d8, d13 @ abs(q0 - q5)
- vabd.u8 d26, d1, d7 @ abs(p6 - p0)
- vabd.u8 d27, d14, d8 @ abs(q6 - q0)
- vabd.u8 d28, d0, d7 @ abs(p7 - p0)
- vabd.u8 d29, d15, d8 @ abs(q7 - q0)
-
- @ only compare the largest value to thresh
- vmax.u8 d22, d22, d23 @ max(abs(p4 - p0), abs(q4 - q0))
- vmax.u8 d23, d24, d25 @ max(abs(p0 - p5), abs(q0 - q5))
- vmax.u8 d24, d26, d27 @ max(abs(p6 - p0), abs(q6 - q0))
- vmax.u8 d25, d28, d29 @ max(abs(p7 - p0), abs(q7 - q0))
-
- vmax.u8 d26, d22, d23
- vmax.u8 d27, d24, d25
- vmax.u8 d23, d26, d27
-
- vcge.u8 d18, d30, d23 @ flat2
-
- vmov.u8 d22, #0x80
-
- orrs r5, r5, r6 @ Check for 0
- orreq r7, r7, #1 @ Only do filter branch
-
- vand d17, d18, d16 @ flat2 && flat && mask
- vmov r5, r6, d17
-
- @ mbfilter() function
-
- @ filter() function
- @ convert to signed
- veor d23, d8, d22 @ qs0
- veor d24, d7, d22 @ ps0
- veor d25, d6, d22 @ ps1
- veor d26, d9, d22 @ qs1
-
- vmov.u8 d27, #3
-
- vsub.s8 d28, d23, d24 @ ( qs0 - ps0)
- vqsub.s8 d29, d25, d26 @ filter = clamp(ps1-qs1)
- vmull.s8 q15, d28, d27 @ 3 * ( qs0 - ps0)
- vand d29, d29, d21 @ filter &= hev
- vaddw.s8 q15, q15, d29 @ filter + 3 * (qs0 - ps0)
- vmov.u8 d29, #4
-
- @ filter = clamp(filter + 3 * ( qs0 - ps0))
- vqmovn.s16 d28, q15
-
- vand d28, d28, d19 @ filter &= mask
-
- vqadd.s8 d30, d28, d27 @ filter2 = clamp(filter+3)
- vqadd.s8 d29, d28, d29 @ filter1 = clamp(filter+4)
- vshr.s8 d30, d30, #3 @ filter2 >>= 3
- vshr.s8 d29, d29, #3 @ filter1 >>= 3
-
-
- vqadd.s8 d24, d24, d30 @ op0 = clamp(ps0 + filter2)
- vqsub.s8 d23, d23, d29 @ oq0 = clamp(qs0 - filter1)
-
- @ outer tap adjustments: ++filter1 >> 1
- vrshr.s8 d29, d29, #1
- vbic d29, d29, d21 @ filter &= ~hev
-
- vqadd.s8 d25, d25, d29 @ op1 = clamp(ps1 + filter)
- vqsub.s8 d26, d26, d29 @ oq1 = clamp(qs1 - filter)
-
- veor d24, d24, d22 @ *f_op0 = u^0x80
- veor d23, d23, d22 @ *f_oq0 = u^0x80
- veor d25, d25, d22 @ *f_op1 = u^0x80
- veor d26, d26, d22 @ *f_oq1 = u^0x80
-
- tst r7, #1
- bxne lr
-
- orrs r5, r5, r6 @ Check for 0
- orreq r7, r7, #2 @ Only do mbfilter branch
-
- @ mbfilter flat && mask branch
- @ TODO(fgalligan): Can I decrease the cycles shifting to consective d's
- @ and using vibt on the q's?
- vmov.u8 d29, #2
- vaddl.u8 q15, d7, d8 @ op2 = p0 + q0
- vmlal.u8 q15, d4, d27 @ op2 = p0 + q0 + p3 * 3
- vmlal.u8 q15, d5, d29 @ op2 = p0 + q0 + p3 * 3 + p2 * 2
- vaddl.u8 q10, d4, d5
- vaddw.u8 q15, d6 @ op2=p1 + p0 + q0 + p3 * 3 + p2 *2
- vaddl.u8 q14, d6, d9
- vqrshrn.u16 d18, q15, #3 @ r_op2
-
- vsub.i16 q15, q10
- vaddl.u8 q10, d4, d6
- vadd.i16 q15, q14
- vaddl.u8 q14, d7, d10
- vqrshrn.u16 d19, q15, #3 @ r_op1
-
- vsub.i16 q15, q10
- vadd.i16 q15, q14
- vaddl.u8 q14, d8, d11
- vqrshrn.u16 d20, q15, #3 @ r_op0
-
- vsubw.u8 q15, d4 @ oq0 = op0 - p3
- vsubw.u8 q15, d7 @ oq0 -= p0
- vadd.i16 q15, q14
- vaddl.u8 q14, d9, d11
- vqrshrn.u16 d21, q15, #3 @ r_oq0
-
- vsubw.u8 q15, d5 @ oq1 = oq0 - p2
- vsubw.u8 q15, d8 @ oq1 -= q0
- vadd.i16 q15, q14
- vaddl.u8 q14, d10, d11
- vqrshrn.u16 d22, q15, #3 @ r_oq1
-
- vsubw.u8 q15, d6 @ oq2 = oq0 - p1
- vsubw.u8 q15, d9 @ oq2 -= q1
- vadd.i16 q15, q14
- vqrshrn.u16 d27, q15, #3 @ r_oq2
-
- @ Filter does not set op2 or oq2, so use p2 and q2.
- vbif d18, d5, d16 @ t_op2 |= p2 & ~(flat & mask)
- vbif d19, d25, d16 @ t_op1 |= f_op1 & ~(flat & mask)
- vbif d20, d24, d16 @ t_op0 |= f_op0 & ~(flat & mask)
- vbif d21, d23, d16 @ t_oq0 |= f_oq0 & ~(flat & mask)
- vbif d22, d26, d16 @ t_oq1 |= f_oq1 & ~(flat & mask)
-
- vbit d23, d27, d16 @ t_oq2 |= r_oq2 & (flat & mask)
- vbif d23, d10, d16 @ t_oq2 |= q2 & ~(flat & mask)
-
- tst r7, #2
- bxne lr
-
- @ wide_mbfilter flat2 && flat && mask branch
- vmov.u8 d16, #7
- vaddl.u8 q15, d7, d8 @ op6 = p0 + q0
- vaddl.u8 q12, d2, d3
- vaddl.u8 q13, d4, d5
- vaddl.u8 q14, d1, d6
- vmlal.u8 q15, d0, d16 @ op6 += p7 * 3
- vadd.i16 q12, q13
- vadd.i16 q15, q14
- vaddl.u8 q14, d2, d9
- vadd.i16 q15, q12
- vaddl.u8 q12, d0, d1
- vaddw.u8 q15, d1
- vaddl.u8 q13, d0, d2
- vadd.i16 q14, q15, q14
- vqrshrn.u16 d16, q15, #4 @ w_op6
-
- vsub.i16 q15, q14, q12
- vaddl.u8 q14, d3, d10
- vqrshrn.u16 d24, q15, #4 @ w_op5
-
- vsub.i16 q15, q13
- vaddl.u8 q13, d0, d3
- vadd.i16 q15, q14
- vaddl.u8 q14, d4, d11
- vqrshrn.u16 d25, q15, #4 @ w_op4
-
- vadd.i16 q15, q14
- vaddl.u8 q14, d0, d4
- vsub.i16 q15, q13
- vsub.i16 q14, q15, q14
- vqrshrn.u16 d26, q15, #4 @ w_op3
-
- vaddw.u8 q15, q14, d5 @ op2 += p2
- vaddl.u8 q14, d0, d5
- vaddw.u8 q15, d12 @ op2 += q4
- vbif d26, d4, d17 @ op3 |= p3 & ~(f2 & f & m)
- vqrshrn.u16 d27, q15, #4 @ w_op2
-
- vsub.i16 q15, q14
- vaddl.u8 q14, d0, d6
- vaddw.u8 q15, d6 @ op1 += p1
- vaddw.u8 q15, d13 @ op1 += q5
- vbif d27, d18, d17 @ op2 |= t_op2 & ~(f2 & f & m)
- vqrshrn.u16 d18, q15, #4 @ w_op1
-
- vsub.i16 q15, q14
- vaddl.u8 q14, d0, d7
- vaddw.u8 q15, d7 @ op0 += p0
- vaddw.u8 q15, d14 @ op0 += q6
- vbif d18, d19, d17 @ op1 |= t_op1 & ~(f2 & f & m)
- vqrshrn.u16 d19, q15, #4 @ w_op0
-
- vsub.i16 q15, q14
- vaddl.u8 q14, d1, d8
- vaddw.u8 q15, d8 @ oq0 += q0
- vaddw.u8 q15, d15 @ oq0 += q7
- vbif d19, d20, d17 @ op0 |= t_op0 & ~(f2 & f & m)
- vqrshrn.u16 d20, q15, #4 @ w_oq0
-
- vsub.i16 q15, q14
- vaddl.u8 q14, d2, d9
- vaddw.u8 q15, d9 @ oq1 += q1
- vaddl.u8 q4, d10, d15
- vaddw.u8 q15, d15 @ oq1 += q7
- vbif d20, d21, d17 @ oq0 |= t_oq0 & ~(f2 & f & m)
- vqrshrn.u16 d21, q15, #4 @ w_oq1
-
- vsub.i16 q15, q14
- vaddl.u8 q14, d3, d10
- vadd.i16 q15, q4
- vaddl.u8 q4, d11, d15
- vbif d21, d22, d17 @ oq1 |= t_oq1 & ~(f2 & f & m)
- vqrshrn.u16 d22, q15, #4 @ w_oq2
-
- vsub.i16 q15, q14
- vaddl.u8 q14, d4, d11
- vadd.i16 q15, q4
- vaddl.u8 q4, d12, d15
- vbif d22, d23, d17 @ oq2 |= t_oq2 & ~(f2 & f & m)
- vqrshrn.u16 d23, q15, #4 @ w_oq3
-
- vsub.i16 q15, q14
- vaddl.u8 q14, d5, d12
- vadd.i16 q15, q4
- vaddl.u8 q4, d13, d15
- vbif d16, d1, d17 @ op6 |= p6 & ~(f2 & f & m)
- vqrshrn.u16 d1, q15, #4 @ w_oq4
-
- vsub.i16 q15, q14
- vaddl.u8 q14, d6, d13
- vadd.i16 q15, q4
- vaddl.u8 q4, d14, d15
- vbif d24, d2, d17 @ op5 |= p5 & ~(f2 & f & m)
- vqrshrn.u16 d2, q15, #4 @ w_oq5
-
- vsub.i16 q15, q14
- vbif d25, d3, d17 @ op4 |= p4 & ~(f2 & f & m)
- vadd.i16 q15, q4
- vbif d23, d11, d17 @ oq3 |= q3 & ~(f2 & f & m)
- vqrshrn.u16 d3, q15, #4 @ w_oq6
- vbif d1, d12, d17 @ oq4 |= q4 & ~(f2 & f & m)
- vbif d2, d13, d17 @ oq5 |= q5 & ~(f2 & f & m)
- vbif d3, d14, d17 @ oq6 |= q6 & ~(f2 & f & m)
-
- bx lr
- .size vpx_wide_mbfilter_neon, .-vpx_wide_mbfilter_neon @ ENDP @ |vpx_wide_mbfilter_neon|
-
- .section .note.GNU-stack,"",%progbits
diff --git a/thirdparty/libvpx/vpx_dsp/arm/gas/save_reg_neon.s b/thirdparty/libvpx/vpx_dsp/arm/gas/save_reg_neon.s
deleted file mode 100644
index e8852fa0d0..0000000000
--- a/thirdparty/libvpx/vpx_dsp/arm/gas/save_reg_neon.s
+++ /dev/null
@@ -1,44 +0,0 @@
-@ This file was created from a .asm file
-@ using the ads2gas.pl script.
- .equ DO1STROUNDING, 0
-@
-@ Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-@
-@ Use of this source code is governed by a BSD-style license
-@ that can be found in the LICENSE file in the root of the source
-@ tree. An additional intellectual property rights grant can be found
-@ in the file PATENTS. All contributing project authors may
-@ be found in the AUTHORS file in the root of the source tree.
-@
-
-
- .global vpx_push_neon
- .type vpx_push_neon, function
- .global vpx_pop_neon
- .type vpx_pop_neon, function
-
- .arm
- .eabi_attribute 24, 1 @Tag_ABI_align_needed
- .eabi_attribute 25, 1 @Tag_ABI_align_preserved
-
-.text
-.p2align 2
-
-_vpx_push_neon:
- vpx_push_neon: @ PROC
- vst1.i64 {d8, d9, d10, d11}, [r0]!
- vst1.i64 {d12, d13, d14, d15}, [r0]!
- bx lr
-
- .size vpx_push_neon, .-vpx_push_neon @ ENDP
-
-_vpx_pop_neon:
- vpx_pop_neon: @ PROC
- vld1.i64 {d8, d9, d10, d11}, [r0]!
- vld1.i64 {d12, d13, d14, d15}, [r0]!
- bx lr
-
- .size vpx_pop_neon, .-vpx_pop_neon @ ENDP
-
-
- .section .note.GNU-stack,"",%progbits
diff --git a/thirdparty/libvpx/vpx_dsp/arm/gas_apple/intrapred_neon_asm.s b/thirdparty/libvpx/vpx_dsp/arm/gas_apple/intrapred_neon_asm.s
deleted file mode 100644
index 1c527afcff..0000000000
--- a/thirdparty/libvpx/vpx_dsp/arm/gas_apple/intrapred_neon_asm.s
+++ /dev/null
@@ -1,660 +0,0 @@
-@ This file was created from a .asm file
-@ using the ads2gas_apple.pl script.
-
- .set WIDE_REFERENCE, 0
- .set ARCHITECTURE, 5
- .set DO1STROUNDING, 0
- @
- @ Copyright (c) 2014 The WebM project authors. All Rights Reserved.
- @
- @ Use of this source code is governed by a BSD-style license
- @ that can be found in the LICENSE file in the root of the source
- @ tree. An additional intellectual property rights grant can be found
- @ in the file PATENTS. All contributing project authors may
- @ be found in the AUTHORS file in the root of the source tree.
- @
-
- .globl _vpx_v_predictor_4x4_neon
- .globl vpx_v_predictor_4x4_neon
- .globl _vpx_v_predictor_8x8_neon
- .globl vpx_v_predictor_8x8_neon
- .globl _vpx_v_predictor_16x16_neon
- .globl vpx_v_predictor_16x16_neon
- .globl _vpx_v_predictor_32x32_neon
- .globl vpx_v_predictor_32x32_neon
- .globl _vpx_h_predictor_4x4_neon
- .globl vpx_h_predictor_4x4_neon
- .globl _vpx_h_predictor_8x8_neon
- .globl vpx_h_predictor_8x8_neon
- .globl _vpx_h_predictor_16x16_neon
- .globl vpx_h_predictor_16x16_neon
- .globl _vpx_h_predictor_32x32_neon
- .globl vpx_h_predictor_32x32_neon
- .globl _vpx_tm_predictor_4x4_neon
- .globl vpx_tm_predictor_4x4_neon
- .globl _vpx_tm_predictor_8x8_neon
- .globl vpx_tm_predictor_8x8_neon
- .globl _vpx_tm_predictor_16x16_neon
- .globl vpx_tm_predictor_16x16_neon
- .globl _vpx_tm_predictor_32x32_neon
- .globl vpx_tm_predictor_32x32_neon
- @ ARM
- @
- @ PRESERVE8
-
-.text
-.p2align 2
-
- @void vpx_v_predictor_4x4_neon(uint8_t *dst, ptrdiff_t y_stride,
- @ const uint8_t *above,
- @ const uint8_t *left)
- @ r0 uint8_t *dst
- @ r1 ptrdiff_t y_stride
- @ r2 const uint8_t *above
- @ r3 const uint8_t *left
-
-_vpx_v_predictor_4x4_neon:
- vpx_v_predictor_4x4_neon: @
- vld1.32 {d0[0]}, [r2]
- vst1.32 {d0[0]}, [r0], r1
- vst1.32 {d0[0]}, [r0], r1
- vst1.32 {d0[0]}, [r0], r1
- vst1.32 {d0[0]}, [r0], r1
- bx lr
- @ @ |vpx_v_predictor_4x4_neon|
-
- @void vpx_v_predictor_8x8_neon(uint8_t *dst, ptrdiff_t y_stride,
- @ const uint8_t *above,
- @ const uint8_t *left)
- @ r0 uint8_t *dst
- @ r1 ptrdiff_t y_stride
- @ r2 const uint8_t *above
- @ r3 const uint8_t *left
-
-_vpx_v_predictor_8x8_neon:
- vpx_v_predictor_8x8_neon: @
- vld1.8 {d0}, [r2]
- vst1.8 {d0}, [r0], r1
- vst1.8 {d0}, [r0], r1
- vst1.8 {d0}, [r0], r1
- vst1.8 {d0}, [r0], r1
- vst1.8 {d0}, [r0], r1
- vst1.8 {d0}, [r0], r1
- vst1.8 {d0}, [r0], r1
- vst1.8 {d0}, [r0], r1
- bx lr
- @ @ |vpx_v_predictor_8x8_neon|
-
- @void vpx_v_predictor_16x16_neon(uint8_t *dst, ptrdiff_t y_stride,
- @ const uint8_t *above,
- @ const uint8_t *left)
- @ r0 uint8_t *dst
- @ r1 ptrdiff_t y_stride
- @ r2 const uint8_t *above
- @ r3 const uint8_t *left
-
-_vpx_v_predictor_16x16_neon:
- vpx_v_predictor_16x16_neon: @
- vld1.8 {q0}, [r2]
- vst1.8 {q0}, [r0], r1
- vst1.8 {q0}, [r0], r1
- vst1.8 {q0}, [r0], r1
- vst1.8 {q0}, [r0], r1
- vst1.8 {q0}, [r0], r1
- vst1.8 {q0}, [r0], r1
- vst1.8 {q0}, [r0], r1
- vst1.8 {q0}, [r0], r1
- vst1.8 {q0}, [r0], r1
- vst1.8 {q0}, [r0], r1
- vst1.8 {q0}, [r0], r1
- vst1.8 {q0}, [r0], r1
- vst1.8 {q0}, [r0], r1
- vst1.8 {q0}, [r0], r1
- vst1.8 {q0}, [r0], r1
- vst1.8 {q0}, [r0], r1
- bx lr
- @ @ |vpx_v_predictor_16x16_neon|
-
- @void vpx_v_predictor_32x32_neon(uint8_t *dst, ptrdiff_t y_stride,
- @ const uint8_t *above,
- @ const uint8_t *left)
- @ r0 uint8_t *dst
- @ r1 ptrdiff_t y_stride
- @ r2 const uint8_t *above
- @ r3 const uint8_t *left
-
-_vpx_v_predictor_32x32_neon:
- vpx_v_predictor_32x32_neon: @
- vld1.8 {q0, q1}, [r2]
- mov r2, #2
-loop_v:
- vst1.8 {q0, q1}, [r0], r1
- vst1.8 {q0, q1}, [r0], r1
- vst1.8 {q0, q1}, [r0], r1
- vst1.8 {q0, q1}, [r0], r1
- vst1.8 {q0, q1}, [r0], r1
- vst1.8 {q0, q1}, [r0], r1
- vst1.8 {q0, q1}, [r0], r1
- vst1.8 {q0, q1}, [r0], r1
- vst1.8 {q0, q1}, [r0], r1
- vst1.8 {q0, q1}, [r0], r1
- vst1.8 {q0, q1}, [r0], r1
- vst1.8 {q0, q1}, [r0], r1
- vst1.8 {q0, q1}, [r0], r1
- vst1.8 {q0, q1}, [r0], r1
- vst1.8 {q0, q1}, [r0], r1
- vst1.8 {q0, q1}, [r0], r1
- subs r2, r2, #1
- bgt loop_v
- bx lr
- @ @ |vpx_v_predictor_32x32_neon|
-
- @void vpx_h_predictor_4x4_neon(uint8_t *dst, ptrdiff_t y_stride,
- @ const uint8_t *above,
- @ const uint8_t *left)
- @ r0 uint8_t *dst
- @ r1 ptrdiff_t y_stride
- @ r2 const uint8_t *above
- @ r3 const uint8_t *left
-
-_vpx_h_predictor_4x4_neon:
- vpx_h_predictor_4x4_neon: @
- vld1.32 {d1[0]}, [r3]
- vdup.8 d0, d1[0]
- vst1.32 {d0[0]}, [r0], r1
- vdup.8 d0, d1[1]
- vst1.32 {d0[0]}, [r0], r1
- vdup.8 d0, d1[2]
- vst1.32 {d0[0]}, [r0], r1
- vdup.8 d0, d1[3]
- vst1.32 {d0[0]}, [r0], r1
- bx lr
- @ @ |vpx_h_predictor_4x4_neon|
-
- @void vpx_h_predictor_8x8_neon(uint8_t *dst, ptrdiff_t y_stride,
- @ const uint8_t *above,
- @ const uint8_t *left)
- @ r0 uint8_t *dst
- @ r1 ptrdiff_t y_stride
- @ r2 const uint8_t *above
- @ r3 const uint8_t *left
-
-_vpx_h_predictor_8x8_neon:
- vpx_h_predictor_8x8_neon: @
- vld1.64 {d1}, [r3]
- vdup.8 d0, d1[0]
- vst1.64 {d0}, [r0], r1
- vdup.8 d0, d1[1]
- vst1.64 {d0}, [r0], r1
- vdup.8 d0, d1[2]
- vst1.64 {d0}, [r0], r1
- vdup.8 d0, d1[3]
- vst1.64 {d0}, [r0], r1
- vdup.8 d0, d1[4]
- vst1.64 {d0}, [r0], r1
- vdup.8 d0, d1[5]
- vst1.64 {d0}, [r0], r1
- vdup.8 d0, d1[6]
- vst1.64 {d0}, [r0], r1
- vdup.8 d0, d1[7]
- vst1.64 {d0}, [r0], r1
- bx lr
- @ @ |vpx_h_predictor_8x8_neon|
-
- @void vpx_h_predictor_16x16_neon(uint8_t *dst, ptrdiff_t y_stride,
- @ const uint8_t *above,
- @ const uint8_t *left)
- @ r0 uint8_t *dst
- @ r1 ptrdiff_t y_stride
- @ r2 const uint8_t *above
- @ r3 const uint8_t *left
-
-_vpx_h_predictor_16x16_neon:
- vpx_h_predictor_16x16_neon: @
- vld1.8 {q1}, [r3]
- vdup.8 q0, d2[0]
- vst1.8 {q0}, [r0], r1
- vdup.8 q0, d2[1]
- vst1.8 {q0}, [r0], r1
- vdup.8 q0, d2[2]
- vst1.8 {q0}, [r0], r1
- vdup.8 q0, d2[3]
- vst1.8 {q0}, [r0], r1
- vdup.8 q0, d2[4]
- vst1.8 {q0}, [r0], r1
- vdup.8 q0, d2[5]
- vst1.8 {q0}, [r0], r1
- vdup.8 q0, d2[6]
- vst1.8 {q0}, [r0], r1
- vdup.8 q0, d2[7]
- vst1.8 {q0}, [r0], r1
- vdup.8 q0, d3[0]
- vst1.8 {q0}, [r0], r1
- vdup.8 q0, d3[1]
- vst1.8 {q0}, [r0], r1
- vdup.8 q0, d3[2]
- vst1.8 {q0}, [r0], r1
- vdup.8 q0, d3[3]
- vst1.8 {q0}, [r0], r1
- vdup.8 q0, d3[4]
- vst1.8 {q0}, [r0], r1
- vdup.8 q0, d3[5]
- vst1.8 {q0}, [r0], r1
- vdup.8 q0, d3[6]
- vst1.8 {q0}, [r0], r1
- vdup.8 q0, d3[7]
- vst1.8 {q0}, [r0], r1
- bx lr
- @ @ |vpx_h_predictor_16x16_neon|
-
- @void vpx_h_predictor_32x32_neon(uint8_t *dst, ptrdiff_t y_stride,
- @ const uint8_t *above,
- @ const uint8_t *left)
- @ r0 uint8_t *dst
- @ r1 ptrdiff_t y_stride
- @ r2 const uint8_t *above
- @ r3 const uint8_t *left
-
-_vpx_h_predictor_32x32_neon:
- vpx_h_predictor_32x32_neon: @
- sub r1, r1, #16
- mov r2, #2
-loop_h:
- vld1.8 {q1}, [r3]!
- vdup.8 q0, d2[0]
- vst1.8 {q0}, [r0]!
- vst1.8 {q0}, [r0], r1
- vdup.8 q0, d2[1]
- vst1.8 {q0}, [r0]!
- vst1.8 {q0}, [r0], r1
- vdup.8 q0, d2[2]
- vst1.8 {q0}, [r0]!
- vst1.8 {q0}, [r0], r1
- vdup.8 q0, d2[3]
- vst1.8 {q0}, [r0]!
- vst1.8 {q0}, [r0], r1
- vdup.8 q0, d2[4]
- vst1.8 {q0}, [r0]!
- vst1.8 {q0}, [r0], r1
- vdup.8 q0, d2[5]
- vst1.8 {q0}, [r0]!
- vst1.8 {q0}, [r0], r1
- vdup.8 q0, d2[6]
- vst1.8 {q0}, [r0]!
- vst1.8 {q0}, [r0], r1
- vdup.8 q0, d2[7]
- vst1.8 {q0}, [r0]!
- vst1.8 {q0}, [r0], r1
- vdup.8 q0, d3[0]
- vst1.8 {q0}, [r0]!
- vst1.8 {q0}, [r0], r1
- vdup.8 q0, d3[1]
- vst1.8 {q0}, [r0]!
- vst1.8 {q0}, [r0], r1
- vdup.8 q0, d3[2]
- vst1.8 {q0}, [r0]!
- vst1.8 {q0}, [r0], r1
- vdup.8 q0, d3[3]
- vst1.8 {q0}, [r0]!
- vst1.8 {q0}, [r0], r1
- vdup.8 q0, d3[4]
- vst1.8 {q0}, [r0]!
- vst1.8 {q0}, [r0], r1
- vdup.8 q0, d3[5]
- vst1.8 {q0}, [r0]!
- vst1.8 {q0}, [r0], r1
- vdup.8 q0, d3[6]
- vst1.8 {q0}, [r0]!
- vst1.8 {q0}, [r0], r1
- vdup.8 q0, d3[7]
- vst1.8 {q0}, [r0]!
- vst1.8 {q0}, [r0], r1
- subs r2, r2, #1
- bgt loop_h
- bx lr
- @ @ |vpx_h_predictor_32x32_neon|
-
- @void vpx_tm_predictor_4x4_neon (uint8_t *dst, ptrdiff_t y_stride,
- @ const uint8_t *above,
- @ const uint8_t *left)
- @ r0 uint8_t *dst
- @ r1 ptrdiff_t y_stride
- @ r2 const uint8_t *above
- @ r3 const uint8_t *left
-
-_vpx_tm_predictor_4x4_neon:
- vpx_tm_predictor_4x4_neon: @
- @ Load ytop_left = above[-1] @
- sub r12, r2, #1
- vld1.u8 {d0[]}, [r12]
-
- @ Load above 4 pixels
- vld1.32 {d2[0]}, [r2]
-
- @ Compute above - ytop_left
- vsubl.u8 q3, d2, d0
-
- @ Load left row by row and compute left + (above - ytop_left)
- @ 1st row and 2nd row
- vld1.u8 {d2[]}, [r3]!
- vld1.u8 {d4[]}, [r3]!
- vmovl.u8 q1, d2
- vmovl.u8 q2, d4
- vadd.s16 q1, q1, q3
- vadd.s16 q2, q2, q3
- vqmovun.s16 d0, q1
- vqmovun.s16 d1, q2
- vst1.32 {d0[0]}, [r0], r1
- vst1.32 {d1[0]}, [r0], r1
-
- @ 3rd row and 4th row
- vld1.u8 {d2[]}, [r3]!
- vld1.u8 {d4[]}, [r3]
- vmovl.u8 q1, d2
- vmovl.u8 q2, d4
- vadd.s16 q1, q1, q3
- vadd.s16 q2, q2, q3
- vqmovun.s16 d0, q1
- vqmovun.s16 d1, q2
- vst1.32 {d0[0]}, [r0], r1
- vst1.32 {d1[0]}, [r0], r1
- bx lr
- @ @ |vpx_tm_predictor_4x4_neon|
-
- @void vpx_tm_predictor_8x8_neon (uint8_t *dst, ptrdiff_t y_stride,
- @ const uint8_t *above,
- @ const uint8_t *left)
- @ r0 uint8_t *dst
- @ r1 ptrdiff_t y_stride
- @ r2 const uint8_t *above
- @ r3 const uint8_t *left
-
-_vpx_tm_predictor_8x8_neon:
- vpx_tm_predictor_8x8_neon: @
- @ Load ytop_left = above[-1] @
- sub r12, r2, #1
- vld1.8 {d0[]}, [r12]
-
- @ preload 8 left
- vld1.8 {d30}, [r3]
-
- @ Load above 8 pixels
- vld1.64 {d2}, [r2]
-
- vmovl.u8 q10, d30
-
- @ Compute above - ytop_left
- vsubl.u8 q3, d2, d0
-
- @ Load left row by row and compute left + (above - ytop_left)
- @ 1st row and 2nd row
- vdup.16 q0, d20[0]
- vdup.16 q1, d20[1]
- vadd.s16 q0, q3, q0
- vadd.s16 q1, q3, q1
-
- @ 3rd row and 4th row
- vdup.16 q8, d20[2]
- vdup.16 q9, d20[3]
- vadd.s16 q8, q3, q8
- vadd.s16 q9, q3, q9
-
- vqmovun.s16 d0, q0
- vqmovun.s16 d1, q1
- vqmovun.s16 d2, q8
- vqmovun.s16 d3, q9
-
- vst1.64 {d0}, [r0], r1
- vst1.64 {d1}, [r0], r1
- vst1.64 {d2}, [r0], r1
- vst1.64 {d3}, [r0], r1
-
- @ 5th row and 6th row
- vdup.16 q0, d21[0]
- vdup.16 q1, d21[1]
- vadd.s16 q0, q3, q0
- vadd.s16 q1, q3, q1
-
- @ 7th row and 8th row
- vdup.16 q8, d21[2]
- vdup.16 q9, d21[3]
- vadd.s16 q8, q3, q8
- vadd.s16 q9, q3, q9
-
- vqmovun.s16 d0, q0
- vqmovun.s16 d1, q1
- vqmovun.s16 d2, q8
- vqmovun.s16 d3, q9
-
- vst1.64 {d0}, [r0], r1
- vst1.64 {d1}, [r0], r1
- vst1.64 {d2}, [r0], r1
- vst1.64 {d3}, [r0], r1
-
- bx lr
- @ @ |vpx_tm_predictor_8x8_neon|
-
- @void vpx_tm_predictor_16x16_neon (uint8_t *dst, ptrdiff_t y_stride,
- @ const uint8_t *above,
- @ const uint8_t *left)
- @ r0 uint8_t *dst
- @ r1 ptrdiff_t y_stride
- @ r2 const uint8_t *above
- @ r3 const uint8_t *left
-
-_vpx_tm_predictor_16x16_neon:
- vpx_tm_predictor_16x16_neon: @
- @ Load ytop_left = above[-1] @
- sub r12, r2, #1
- vld1.8 {d0[]}, [r12]
-
- @ Load above 8 pixels
- vld1.8 {q1}, [r2]
-
- @ preload 8 left into r12
- vld1.8 {d18}, [r3]!
-
- @ Compute above - ytop_left
- vsubl.u8 q2, d2, d0
- vsubl.u8 q3, d3, d0
-
- vmovl.u8 q10, d18
-
- @ Load left row by row and compute left + (above - ytop_left)
- @ Process 8 rows in each single loop and loop 2 times to process 16 rows.
- mov r2, #2
-
-loop_16x16_neon:
- @ Process two rows.
- vdup.16 q0, d20[0]
- vdup.16 q8, d20[1]
- vadd.s16 q1, q0, q2
- vadd.s16 q0, q0, q3
- vadd.s16 q11, q8, q2
- vadd.s16 q8, q8, q3
- vqmovun.s16 d2, q1
- vqmovun.s16 d3, q0
- vqmovun.s16 d22, q11
- vqmovun.s16 d23, q8
- vdup.16 q0, d20[2] @ proload next 2 rows data
- vdup.16 q8, d20[3]
- vst1.64 {d2,d3}, [r0], r1
- vst1.64 {d22,d23}, [r0], r1
-
- @ Process two rows.
- vadd.s16 q1, q0, q2
- vadd.s16 q0, q0, q3
- vadd.s16 q11, q8, q2
- vadd.s16 q8, q8, q3
- vqmovun.s16 d2, q1
- vqmovun.s16 d3, q0
- vqmovun.s16 d22, q11
- vqmovun.s16 d23, q8
- vdup.16 q0, d21[0] @ proload next 2 rows data
- vdup.16 q8, d21[1]
- vst1.64 {d2,d3}, [r0], r1
- vst1.64 {d22,d23}, [r0], r1
-
- vadd.s16 q1, q0, q2
- vadd.s16 q0, q0, q3
- vadd.s16 q11, q8, q2
- vadd.s16 q8, q8, q3
- vqmovun.s16 d2, q1
- vqmovun.s16 d3, q0
- vqmovun.s16 d22, q11
- vqmovun.s16 d23, q8
- vdup.16 q0, d21[2] @ proload next 2 rows data
- vdup.16 q8, d21[3]
- vst1.64 {d2,d3}, [r0], r1
- vst1.64 {d22,d23}, [r0], r1
-
-
- vadd.s16 q1, q0, q2
- vadd.s16 q0, q0, q3
- vadd.s16 q11, q8, q2
- vadd.s16 q8, q8, q3
- vqmovun.s16 d2, q1
- vqmovun.s16 d3, q0
- vqmovun.s16 d22, q11
- vqmovun.s16 d23, q8
- vld1.8 {d18}, [r3]! @ preload 8 left into r12
- vmovl.u8 q10, d18
- vst1.64 {d2,d3}, [r0], r1
- vst1.64 {d22,d23}, [r0], r1
-
- subs r2, r2, #1
- bgt loop_16x16_neon
-
- bx lr
- @ @ |vpx_tm_predictor_16x16_neon|
-
- @void vpx_tm_predictor_32x32_neon (uint8_t *dst, ptrdiff_t y_stride,
- @ const uint8_t *above,
- @ const uint8_t *left)
- @ r0 uint8_t *dst
- @ r1 ptrdiff_t y_stride
- @ r2 const uint8_t *above
- @ r3 const uint8_t *left
-
-_vpx_tm_predictor_32x32_neon:
- vpx_tm_predictor_32x32_neon: @
- @ Load ytop_left = above[-1] @
- sub r12, r2, #1
- vld1.8 {d0[]}, [r12]
-
- @ Load above 32 pixels
- vld1.8 {q1}, [r2]!
- vld1.8 {q2}, [r2]
-
- @ preload 8 left pixels
- vld1.8 {d26}, [r3]!
-
- @ Compute above - ytop_left
- vsubl.u8 q8, d2, d0
- vsubl.u8 q9, d3, d0
- vsubl.u8 q10, d4, d0
- vsubl.u8 q11, d5, d0
-
- vmovl.u8 q3, d26
-
- @ Load left row by row and compute left + (above - ytop_left)
- @ Process 8 rows in each single loop and loop 4 times to process 32 rows.
- mov r2, #4
-
-loop_32x32_neon:
- @ Process two rows.
- vdup.16 q0, d6[0]
- vdup.16 q2, d6[1]
- vadd.s16 q12, q0, q8
- vadd.s16 q13, q0, q9
- vadd.s16 q14, q0, q10
- vadd.s16 q15, q0, q11
- vqmovun.s16 d0, q12
- vqmovun.s16 d1, q13
- vadd.s16 q12, q2, q8
- vadd.s16 q13, q2, q9
- vqmovun.s16 d2, q14
- vqmovun.s16 d3, q15
- vadd.s16 q14, q2, q10
- vadd.s16 q15, q2, q11
- vst1.64 {d0-d3}, [r0], r1
- vqmovun.s16 d24, q12
- vqmovun.s16 d25, q13
- vqmovun.s16 d26, q14
- vqmovun.s16 d27, q15
- vdup.16 q1, d6[2]
- vdup.16 q2, d6[3]
- vst1.64 {d24-d27}, [r0], r1
-
- @ Process two rows.
- vadd.s16 q12, q1, q8
- vadd.s16 q13, q1, q9
- vadd.s16 q14, q1, q10
- vadd.s16 q15, q1, q11
- vqmovun.s16 d0, q12
- vqmovun.s16 d1, q13
- vadd.s16 q12, q2, q8
- vadd.s16 q13, q2, q9
- vqmovun.s16 d2, q14
- vqmovun.s16 d3, q15
- vadd.s16 q14, q2, q10
- vadd.s16 q15, q2, q11
- vst1.64 {d0-d3}, [r0], r1
- vqmovun.s16 d24, q12
- vqmovun.s16 d25, q13
- vqmovun.s16 d26, q14
- vqmovun.s16 d27, q15
- vdup.16 q0, d7[0]
- vdup.16 q2, d7[1]
- vst1.64 {d24-d27}, [r0], r1
-
- @ Process two rows.
- vadd.s16 q12, q0, q8
- vadd.s16 q13, q0, q9
- vadd.s16 q14, q0, q10
- vadd.s16 q15, q0, q11
- vqmovun.s16 d0, q12
- vqmovun.s16 d1, q13
- vadd.s16 q12, q2, q8
- vadd.s16 q13, q2, q9
- vqmovun.s16 d2, q14
- vqmovun.s16 d3, q15
- vadd.s16 q14, q2, q10
- vadd.s16 q15, q2, q11
- vst1.64 {d0-d3}, [r0], r1
- vqmovun.s16 d24, q12
- vqmovun.s16 d25, q13
- vqmovun.s16 d26, q14
- vqmovun.s16 d27, q15
- vdup.16 q0, d7[2]
- vdup.16 q2, d7[3]
- vst1.64 {d24-d27}, [r0], r1
-
- @ Process two rows.
- vadd.s16 q12, q0, q8
- vadd.s16 q13, q0, q9
- vadd.s16 q14, q0, q10
- vadd.s16 q15, q0, q11
- vqmovun.s16 d0, q12
- vqmovun.s16 d1, q13
- vadd.s16 q12, q2, q8
- vadd.s16 q13, q2, q9
- vqmovun.s16 d2, q14
- vqmovun.s16 d3, q15
- vadd.s16 q14, q2, q10
- vadd.s16 q15, q2, q11
- vst1.64 {d0-d3}, [r0], r1
- vqmovun.s16 d24, q12
- vqmovun.s16 d25, q13
- vld1.8 {d0}, [r3]! @ preload 8 left pixels
- vqmovun.s16 d26, q14
- vqmovun.s16 d27, q15
- vmovl.u8 q3, d0
- vst1.64 {d24-d27}, [r0], r1
-
- subs r2, r2, #1
- bgt loop_32x32_neon
-
- bx lr
- @ @ |vpx_tm_predictor_32x32_neon|
-
diff --git a/thirdparty/libvpx/vpx_dsp/arm/gas_apple/loopfilter_mb_neon.s b/thirdparty/libvpx/vpx_dsp/arm/gas_apple/loopfilter_mb_neon.s
deleted file mode 100644
index 69f7e5207e..0000000000
--- a/thirdparty/libvpx/vpx_dsp/arm/gas_apple/loopfilter_mb_neon.s
+++ /dev/null
@@ -1,649 +0,0 @@
-@ This file was created from a .asm file
-@ using the ads2gas_apple.pl script.
-
- .set WIDE_REFERENCE, 0
- .set ARCHITECTURE, 5
- .set DO1STROUNDING, 0
- @
- @ Copyright (c) 2013 The WebM project authors. All Rights Reserved.
- @
- @ Use of this source code is governed by a BSD-style license
- @ that can be found in the LICENSE file in the root of the source
- @ tree. An additional intellectual property rights grant can be found
- @ in the file PATENTS. All contributing project authors may
- @ be found in the AUTHORS file in the root of the source tree.
- @
-
- .globl _vpx_lpf_horizontal_edge_8_neon
- .globl vpx_lpf_horizontal_edge_8_neon
- .globl _vpx_lpf_horizontal_edge_16_neon
- .globl vpx_lpf_horizontal_edge_16_neon
- .globl _vpx_lpf_vertical_16_neon
- .globl vpx_lpf_vertical_16_neon
- @ ARM
-
-.text
-.p2align 2
-
- @ void mb_lpf_horizontal_edge(uint8_t *s, int p,
- @ const uint8_t *blimit,
- @ const uint8_t *limit,
- @ const uint8_t *thresh,
- @ int count)
- @ r0 uint8_t *s,
- @ r1 int p, /* pitch */
- @ r2 const uint8_t *blimit,
- @ r3 const uint8_t *limit,
- @ sp const uint8_t *thresh,
- @ r12 int count
-_mb_lpf_horizontal_edge:
- mb_lpf_horizontal_edge: @
- push {r4-r8, lr}
- vpush {d8-d15}
- ldr r4, [sp, #88] @ load thresh
-
-h_count:
- vld1.8 {d16[]}, [r2] @ load *blimit
- vld1.8 {d17[]}, [r3] @ load *limit
- vld1.8 {d18[]}, [r4] @ load *thresh
-
- sub r8, r0, r1, lsl #3 @ move src pointer down by 8 lines
-
- vld1.u8 {d0}, [r8,:64], r1 @ p7
- vld1.u8 {d1}, [r8,:64], r1 @ p6
- vld1.u8 {d2}, [r8,:64], r1 @ p5
- vld1.u8 {d3}, [r8,:64], r1 @ p4
- vld1.u8 {d4}, [r8,:64], r1 @ p3
- vld1.u8 {d5}, [r8,:64], r1 @ p2
- vld1.u8 {d6}, [r8,:64], r1 @ p1
- vld1.u8 {d7}, [r8,:64], r1 @ p0
- vld1.u8 {d8}, [r8,:64], r1 @ q0
- vld1.u8 {d9}, [r8,:64], r1 @ q1
- vld1.u8 {d10}, [r8,:64], r1 @ q2
- vld1.u8 {d11}, [r8,:64], r1 @ q3
- vld1.u8 {d12}, [r8,:64], r1 @ q4
- vld1.u8 {d13}, [r8,:64], r1 @ q5
- vld1.u8 {d14}, [r8,:64], r1 @ q6
- vld1.u8 {d15}, [r8,:64], r1 @ q7
-
- bl vpx_wide_mbfilter_neon
-
- tst r7, #1
- beq h_mbfilter
-
- @ flat && mask were not set for any of the channels. Just store the values
- @ from filter.
- sub r8, r0, r1, lsl #1
-
- vst1.u8 {d25}, [r8,:64], r1 @ store op1
- vst1.u8 {d24}, [r8,:64], r1 @ store op0
- vst1.u8 {d23}, [r8,:64], r1 @ store oq0
- vst1.u8 {d26}, [r8,:64], r1 @ store oq1
-
- b h_next
-
-h_mbfilter:
- tst r7, #2
- beq h_wide_mbfilter
-
- @ flat2 was not set for any of the channels. Just store the values from
- @ mbfilter.
- sub r8, r0, r1, lsl #1
- sub r8, r8, r1
-
- vst1.u8 {d18}, [r8,:64], r1 @ store op2
- vst1.u8 {d19}, [r8,:64], r1 @ store op1
- vst1.u8 {d20}, [r8,:64], r1 @ store op0
- vst1.u8 {d21}, [r8,:64], r1 @ store oq0
- vst1.u8 {d22}, [r8,:64], r1 @ store oq1
- vst1.u8 {d23}, [r8,:64], r1 @ store oq2
-
- b h_next
-
-h_wide_mbfilter:
- sub r8, r0, r1, lsl #3
- add r8, r8, r1
-
- vst1.u8 {d16}, [r8,:64], r1 @ store op6
- vst1.u8 {d24}, [r8,:64], r1 @ store op5
- vst1.u8 {d25}, [r8,:64], r1 @ store op4
- vst1.u8 {d26}, [r8,:64], r1 @ store op3
- vst1.u8 {d27}, [r8,:64], r1 @ store op2
- vst1.u8 {d18}, [r8,:64], r1 @ store op1
- vst1.u8 {d19}, [r8,:64], r1 @ store op0
- vst1.u8 {d20}, [r8,:64], r1 @ store oq0
- vst1.u8 {d21}, [r8,:64], r1 @ store oq1
- vst1.u8 {d22}, [r8,:64], r1 @ store oq2
- vst1.u8 {d23}, [r8,:64], r1 @ store oq3
- vst1.u8 {d1}, [r8,:64], r1 @ store oq4
- vst1.u8 {d2}, [r8,:64], r1 @ store oq5
- vst1.u8 {d3}, [r8,:64], r1 @ store oq6
-
-h_next:
- add r0, r0, #8
- subs r12, r12, #1
- bne h_count
-
- vpop {d8-d15}
- pop {r4-r8, pc}
-
- @ @ |mb_lpf_horizontal_edge|
-
- @ void vpx_lpf_horizontal_edge_8_neon(uint8_t *s, int pitch,
- @ const uint8_t *blimit,
- @ const uint8_t *limit,
- @ const uint8_t *thresh)
- @ r0 uint8_t *s,
- @ r1 int pitch,
- @ r2 const uint8_t *blimit,
- @ r3 const uint8_t *limit,
- @ sp const uint8_t *thresh
-_vpx_lpf_horizontal_edge_8_neon:
- vpx_lpf_horizontal_edge_8_neon: @
- mov r12, #1
- b mb_lpf_horizontal_edge
- @ @ |vpx_lpf_horizontal_edge_8_neon|
-
- @ void vpx_lpf_horizontal_edge_16_neon(uint8_t *s, int pitch,
- @ const uint8_t *blimit,
- @ const uint8_t *limit,
- @ const uint8_t *thresh)
- @ r0 uint8_t *s,
- @ r1 int pitch,
- @ r2 const uint8_t *blimit,
- @ r3 const uint8_t *limit,
- @ sp const uint8_t *thresh
-_vpx_lpf_horizontal_edge_16_neon:
- vpx_lpf_horizontal_edge_16_neon: @
- mov r12, #2
- b mb_lpf_horizontal_edge
- @ @ |vpx_lpf_horizontal_edge_16_neon|
-
- @ void vpx_lpf_vertical_16_neon(uint8_t *s, int p,
- @ const uint8_t *blimit,
- @ const uint8_t *limit,
- @ const uint8_t *thresh)
- @ r0 uint8_t *s,
- @ r1 int p, /* pitch */
- @ r2 const uint8_t *blimit,
- @ r3 const uint8_t *limit,
- @ sp const uint8_t *thresh,
-_vpx_lpf_vertical_16_neon:
- vpx_lpf_vertical_16_neon: @
- push {r4-r8, lr}
- vpush {d8-d15}
- ldr r4, [sp, #88] @ load thresh
-
- vld1.8 {d16[]}, [r2] @ load *blimit
- vld1.8 {d17[]}, [r3] @ load *limit
- vld1.8 {d18[]}, [r4] @ load *thresh
-
- sub r8, r0, #8
-
- vld1.8 {d0}, [r8,:64], r1
- vld1.8 {d8}, [r0,:64], r1
- vld1.8 {d1}, [r8,:64], r1
- vld1.8 {d9}, [r0,:64], r1
- vld1.8 {d2}, [r8,:64], r1
- vld1.8 {d10}, [r0,:64], r1
- vld1.8 {d3}, [r8,:64], r1
- vld1.8 {d11}, [r0,:64], r1
- vld1.8 {d4}, [r8,:64], r1
- vld1.8 {d12}, [r0,:64], r1
- vld1.8 {d5}, [r8,:64], r1
- vld1.8 {d13}, [r0,:64], r1
- vld1.8 {d6}, [r8,:64], r1
- vld1.8 {d14}, [r0,:64], r1
- vld1.8 {d7}, [r8,:64], r1
- vld1.8 {d15}, [r0,:64], r1
-
- sub r0, r0, r1, lsl #3
-
- vtrn.32 q0, q2
- vtrn.32 q1, q3
- vtrn.32 q4, q6
- vtrn.32 q5, q7
-
- vtrn.16 q0, q1
- vtrn.16 q2, q3
- vtrn.16 q4, q5
- vtrn.16 q6, q7
-
- vtrn.8 d0, d1
- vtrn.8 d2, d3
- vtrn.8 d4, d5
- vtrn.8 d6, d7
-
- vtrn.8 d8, d9
- vtrn.8 d10, d11
- vtrn.8 d12, d13
- vtrn.8 d14, d15
-
- bl vpx_wide_mbfilter_neon
-
- tst r7, #1
- beq v_mbfilter
-
- @ flat && mask were not set for any of the channels. Just store the values
- @ from filter.
- sub r8, r0, #2
-
- vswp d23, d25
-
- vst4.8 {d23[0], d24[0], d25[0], d26[0]}, [r8], r1
- vst4.8 {d23[1], d24[1], d25[1], d26[1]}, [r8], r1
- vst4.8 {d23[2], d24[2], d25[2], d26[2]}, [r8], r1
- vst4.8 {d23[3], d24[3], d25[3], d26[3]}, [r8], r1
- vst4.8 {d23[4], d24[4], d25[4], d26[4]}, [r8], r1
- vst4.8 {d23[5], d24[5], d25[5], d26[5]}, [r8], r1
- vst4.8 {d23[6], d24[6], d25[6], d26[6]}, [r8], r1
- vst4.8 {d23[7], d24[7], d25[7], d26[7]}, [r8], r1
-
- b v_end
-
-v_mbfilter:
- tst r7, #2
- beq v_wide_mbfilter
-
- @ flat2 was not set for any of the channels. Just store the values from
- @ mbfilter.
- sub r8, r0, #3
-
- vst3.8 {d18[0], d19[0], d20[0]}, [r8], r1
- vst3.8 {d21[0], d22[0], d23[0]}, [r0], r1
- vst3.8 {d18[1], d19[1], d20[1]}, [r8], r1
- vst3.8 {d21[1], d22[1], d23[1]}, [r0], r1
- vst3.8 {d18[2], d19[2], d20[2]}, [r8], r1
- vst3.8 {d21[2], d22[2], d23[2]}, [r0], r1
- vst3.8 {d18[3], d19[3], d20[3]}, [r8], r1
- vst3.8 {d21[3], d22[3], d23[3]}, [r0], r1
- vst3.8 {d18[4], d19[4], d20[4]}, [r8], r1
- vst3.8 {d21[4], d22[4], d23[4]}, [r0], r1
- vst3.8 {d18[5], d19[5], d20[5]}, [r8], r1
- vst3.8 {d21[5], d22[5], d23[5]}, [r0], r1
- vst3.8 {d18[6], d19[6], d20[6]}, [r8], r1
- vst3.8 {d21[6], d22[6], d23[6]}, [r0], r1
- vst3.8 {d18[7], d19[7], d20[7]}, [r8], r1
- vst3.8 {d21[7], d22[7], d23[7]}, [r0], r1
-
- b v_end
-
-v_wide_mbfilter:
- sub r8, r0, #8
-
- vtrn.32 d0, d26
- vtrn.32 d16, d27
- vtrn.32 d24, d18
- vtrn.32 d25, d19
-
- vtrn.16 d0, d24
- vtrn.16 d16, d25
- vtrn.16 d26, d18
- vtrn.16 d27, d19
-
- vtrn.8 d0, d16
- vtrn.8 d24, d25
- vtrn.8 d26, d27
- vtrn.8 d18, d19
-
- vtrn.32 d20, d1
- vtrn.32 d21, d2
- vtrn.32 d22, d3
- vtrn.32 d23, d15
-
- vtrn.16 d20, d22
- vtrn.16 d21, d23
- vtrn.16 d1, d3
- vtrn.16 d2, d15
-
- vtrn.8 d20, d21
- vtrn.8 d22, d23
- vtrn.8 d1, d2
- vtrn.8 d3, d15
-
- vst1.8 {d0}, [r8,:64], r1
- vst1.8 {d20}, [r0,:64], r1
- vst1.8 {d16}, [r8,:64], r1
- vst1.8 {d21}, [r0,:64], r1
- vst1.8 {d24}, [r8,:64], r1
- vst1.8 {d22}, [r0,:64], r1
- vst1.8 {d25}, [r8,:64], r1
- vst1.8 {d23}, [r0,:64], r1
- vst1.8 {d26}, [r8,:64], r1
- vst1.8 {d1}, [r0,:64], r1
- vst1.8 {d27}, [r8,:64], r1
- vst1.8 {d2}, [r0,:64], r1
- vst1.8 {d18}, [r8,:64], r1
- vst1.8 {d3}, [r0,:64], r1
- vst1.8 {d19}, [r8,:64], r1
- vst1.8 {d15}, [r0,:64], r1
-
-v_end:
- vpop {d8-d15}
- pop {r4-r8, pc}
-
- @ @ |vpx_lpf_vertical_16_neon|
-
- @ void vpx_wide_mbfilter_neon() @
- @ This is a helper function for the loopfilters. The invidual functions do the
- @ necessary load, transpose (if necessary) and store.
- @
- @ r0-r3 PRESERVE
- @ d16 blimit
- @ d17 limit
- @ d18 thresh
- @ d0 p7
- @ d1 p6
- @ d2 p5
- @ d3 p4
- @ d4 p3
- @ d5 p2
- @ d6 p1
- @ d7 p0
- @ d8 q0
- @ d9 q1
- @ d10 q2
- @ d11 q3
- @ d12 q4
- @ d13 q5
- @ d14 q6
- @ d15 q7
-_vpx_wide_mbfilter_neon:
- vpx_wide_mbfilter_neon: @
- mov r7, #0
-
- @ filter_mask
- vabd.u8 d19, d4, d5 @ abs(p3 - p2)
- vabd.u8 d20, d5, d6 @ abs(p2 - p1)
- vabd.u8 d21, d6, d7 @ abs(p1 - p0)
- vabd.u8 d22, d9, d8 @ abs(q1 - q0)
- vabd.u8 d23, d10, d9 @ abs(q2 - q1)
- vabd.u8 d24, d11, d10 @ abs(q3 - q2)
-
- @ only compare the largest value to limit
- vmax.u8 d19, d19, d20 @ max(abs(p3 - p2), abs(p2 - p1))
- vmax.u8 d20, d21, d22 @ max(abs(p1 - p0), abs(q1 - q0))
- vmax.u8 d23, d23, d24 @ max(abs(q2 - q1), abs(q3 - q2))
- vmax.u8 d19, d19, d20
-
- vabd.u8 d24, d7, d8 @ abs(p0 - q0)
-
- vmax.u8 d19, d19, d23
-
- vabd.u8 d23, d6, d9 @ a = abs(p1 - q1)
- vqadd.u8 d24, d24, d24 @ b = abs(p0 - q0) * 2
-
- @ abs () > limit
- vcge.u8 d19, d17, d19
-
- @ flatmask4
- vabd.u8 d25, d7, d5 @ abs(p0 - p2)
- vabd.u8 d26, d8, d10 @ abs(q0 - q2)
- vabd.u8 d27, d4, d7 @ abs(p3 - p0)
- vabd.u8 d28, d11, d8 @ abs(q3 - q0)
-
- @ only compare the largest value to thresh
- vmax.u8 d25, d25, d26 @ max(abs(p0 - p2), abs(q0 - q2))
- vmax.u8 d26, d27, d28 @ max(abs(p3 - p0), abs(q3 - q0))
- vmax.u8 d25, d25, d26
- vmax.u8 d20, d20, d25
-
- vshr.u8 d23, d23, #1 @ a = a / 2
- vqadd.u8 d24, d24, d23 @ a = b + a
-
- vmov.u8 d30, #1
- vcge.u8 d24, d16, d24 @ (a > blimit * 2 + limit) * -1
-
- vcge.u8 d20, d30, d20 @ flat
-
- vand d19, d19, d24 @ mask
-
- @ hevmask
- vcgt.u8 d21, d21, d18 @ (abs(p1 - p0) > thresh)*-1
- vcgt.u8 d22, d22, d18 @ (abs(q1 - q0) > thresh)*-1
- vorr d21, d21, d22 @ hev
-
- vand d16, d20, d19 @ flat && mask
- vmov r5, r6, d16
-
- @ flatmask5(1, p7, p6, p5, p4, p0, q0, q4, q5, q6, q7)
- vabd.u8 d22, d3, d7 @ abs(p4 - p0)
- vabd.u8 d23, d12, d8 @ abs(q4 - q0)
- vabd.u8 d24, d7, d2 @ abs(p0 - p5)
- vabd.u8 d25, d8, d13 @ abs(q0 - q5)
- vabd.u8 d26, d1, d7 @ abs(p6 - p0)
- vabd.u8 d27, d14, d8 @ abs(q6 - q0)
- vabd.u8 d28, d0, d7 @ abs(p7 - p0)
- vabd.u8 d29, d15, d8 @ abs(q7 - q0)
-
- @ only compare the largest value to thresh
- vmax.u8 d22, d22, d23 @ max(abs(p4 - p0), abs(q4 - q0))
- vmax.u8 d23, d24, d25 @ max(abs(p0 - p5), abs(q0 - q5))
- vmax.u8 d24, d26, d27 @ max(abs(p6 - p0), abs(q6 - q0))
- vmax.u8 d25, d28, d29 @ max(abs(p7 - p0), abs(q7 - q0))
-
- vmax.u8 d26, d22, d23
- vmax.u8 d27, d24, d25
- vmax.u8 d23, d26, d27
-
- vcge.u8 d18, d30, d23 @ flat2
-
- vmov.u8 d22, #0x80
-
- orrs r5, r5, r6 @ Check for 0
- orreq r7, r7, #1 @ Only do filter branch
-
- vand d17, d18, d16 @ flat2 && flat && mask
- vmov r5, r6, d17
-
- @ mbfilter() function
-
- @ filter() function
- @ convert to signed
- veor d23, d8, d22 @ qs0
- veor d24, d7, d22 @ ps0
- veor d25, d6, d22 @ ps1
- veor d26, d9, d22 @ qs1
-
- vmov.u8 d27, #3
-
- vsub.s8 d28, d23, d24 @ ( qs0 - ps0)
- vqsub.s8 d29, d25, d26 @ filter = clamp(ps1-qs1)
- vmull.s8 q15, d28, d27 @ 3 * ( qs0 - ps0)
- vand d29, d29, d21 @ filter &= hev
- vaddw.s8 q15, q15, d29 @ filter + 3 * (qs0 - ps0)
- vmov.u8 d29, #4
-
- @ filter = clamp(filter + 3 * ( qs0 - ps0))
- vqmovn.s16 d28, q15
-
- vand d28, d28, d19 @ filter &= mask
-
- vqadd.s8 d30, d28, d27 @ filter2 = clamp(filter+3)
- vqadd.s8 d29, d28, d29 @ filter1 = clamp(filter+4)
- vshr.s8 d30, d30, #3 @ filter2 >>= 3
- vshr.s8 d29, d29, #3 @ filter1 >>= 3
-
-
- vqadd.s8 d24, d24, d30 @ op0 = clamp(ps0 + filter2)
- vqsub.s8 d23, d23, d29 @ oq0 = clamp(qs0 - filter1)
-
- @ outer tap adjustments: ++filter1 >> 1
- vrshr.s8 d29, d29, #1
- vbic d29, d29, d21 @ filter &= ~hev
-
- vqadd.s8 d25, d25, d29 @ op1 = clamp(ps1 + filter)
- vqsub.s8 d26, d26, d29 @ oq1 = clamp(qs1 - filter)
-
- veor d24, d24, d22 @ *f_op0 = u^0x80
- veor d23, d23, d22 @ *f_oq0 = u^0x80
- veor d25, d25, d22 @ *f_op1 = u^0x80
- veor d26, d26, d22 @ *f_oq1 = u^0x80
-
- tst r7, #1
- bxne lr
-
- orrs r5, r5, r6 @ Check for 0
- orreq r7, r7, #2 @ Only do mbfilter branch
-
- @ mbfilter flat && mask branch
- @ TODO(fgalligan): Can I decrease the cycles shifting to consective d's
- @ and using vibt on the q's?
- vmov.u8 d29, #2
- vaddl.u8 q15, d7, d8 @ op2 = p0 + q0
- vmlal.u8 q15, d4, d27 @ op2 = p0 + q0 + p3 * 3
- vmlal.u8 q15, d5, d29 @ op2 = p0 + q0 + p3 * 3 + p2 * 2
- vaddl.u8 q10, d4, d5
- vaddw.u8 q15, d6 @ op2=p1 + p0 + q0 + p3 * 3 + p2 *2
- vaddl.u8 q14, d6, d9
- vqrshrn.u16 d18, q15, #3 @ r_op2
-
- vsub.i16 q15, q10
- vaddl.u8 q10, d4, d6
- vadd.i16 q15, q14
- vaddl.u8 q14, d7, d10
- vqrshrn.u16 d19, q15, #3 @ r_op1
-
- vsub.i16 q15, q10
- vadd.i16 q15, q14
- vaddl.u8 q14, d8, d11
- vqrshrn.u16 d20, q15, #3 @ r_op0
-
- vsubw.u8 q15, d4 @ oq0 = op0 - p3
- vsubw.u8 q15, d7 @ oq0 -= p0
- vadd.i16 q15, q14
- vaddl.u8 q14, d9, d11
- vqrshrn.u16 d21, q15, #3 @ r_oq0
-
- vsubw.u8 q15, d5 @ oq1 = oq0 - p2
- vsubw.u8 q15, d8 @ oq1 -= q0
- vadd.i16 q15, q14
- vaddl.u8 q14, d10, d11
- vqrshrn.u16 d22, q15, #3 @ r_oq1
-
- vsubw.u8 q15, d6 @ oq2 = oq0 - p1
- vsubw.u8 q15, d9 @ oq2 -= q1
- vadd.i16 q15, q14
- vqrshrn.u16 d27, q15, #3 @ r_oq2
-
- @ Filter does not set op2 or oq2, so use p2 and q2.
- vbif d18, d5, d16 @ t_op2 |= p2 & ~(flat & mask)
- vbif d19, d25, d16 @ t_op1 |= f_op1 & ~(flat & mask)
- vbif d20, d24, d16 @ t_op0 |= f_op0 & ~(flat & mask)
- vbif d21, d23, d16 @ t_oq0 |= f_oq0 & ~(flat & mask)
- vbif d22, d26, d16 @ t_oq1 |= f_oq1 & ~(flat & mask)
-
- vbit d23, d27, d16 @ t_oq2 |= r_oq2 & (flat & mask)
- vbif d23, d10, d16 @ t_oq2 |= q2 & ~(flat & mask)
-
- tst r7, #2
- bxne lr
-
- @ wide_mbfilter flat2 && flat && mask branch
- vmov.u8 d16, #7
- vaddl.u8 q15, d7, d8 @ op6 = p0 + q0
- vaddl.u8 q12, d2, d3
- vaddl.u8 q13, d4, d5
- vaddl.u8 q14, d1, d6
- vmlal.u8 q15, d0, d16 @ op6 += p7 * 3
- vadd.i16 q12, q13
- vadd.i16 q15, q14
- vaddl.u8 q14, d2, d9
- vadd.i16 q15, q12
- vaddl.u8 q12, d0, d1
- vaddw.u8 q15, d1
- vaddl.u8 q13, d0, d2
- vadd.i16 q14, q15, q14
- vqrshrn.u16 d16, q15, #4 @ w_op6
-
- vsub.i16 q15, q14, q12
- vaddl.u8 q14, d3, d10
- vqrshrn.u16 d24, q15, #4 @ w_op5
-
- vsub.i16 q15, q13
- vaddl.u8 q13, d0, d3
- vadd.i16 q15, q14
- vaddl.u8 q14, d4, d11
- vqrshrn.u16 d25, q15, #4 @ w_op4
-
- vadd.i16 q15, q14
- vaddl.u8 q14, d0, d4
- vsub.i16 q15, q13
- vsub.i16 q14, q15, q14
- vqrshrn.u16 d26, q15, #4 @ w_op3
-
- vaddw.u8 q15, q14, d5 @ op2 += p2
- vaddl.u8 q14, d0, d5
- vaddw.u8 q15, d12 @ op2 += q4
- vbif d26, d4, d17 @ op3 |= p3 & ~(f2 & f & m)
- vqrshrn.u16 d27, q15, #4 @ w_op2
-
- vsub.i16 q15, q14
- vaddl.u8 q14, d0, d6
- vaddw.u8 q15, d6 @ op1 += p1
- vaddw.u8 q15, d13 @ op1 += q5
- vbif d27, d18, d17 @ op2 |= t_op2 & ~(f2 & f & m)
- vqrshrn.u16 d18, q15, #4 @ w_op1
-
- vsub.i16 q15, q14
- vaddl.u8 q14, d0, d7
- vaddw.u8 q15, d7 @ op0 += p0
- vaddw.u8 q15, d14 @ op0 += q6
- vbif d18, d19, d17 @ op1 |= t_op1 & ~(f2 & f & m)
- vqrshrn.u16 d19, q15, #4 @ w_op0
-
- vsub.i16 q15, q14
- vaddl.u8 q14, d1, d8
- vaddw.u8 q15, d8 @ oq0 += q0
- vaddw.u8 q15, d15 @ oq0 += q7
- vbif d19, d20, d17 @ op0 |= t_op0 & ~(f2 & f & m)
- vqrshrn.u16 d20, q15, #4 @ w_oq0
-
- vsub.i16 q15, q14
- vaddl.u8 q14, d2, d9
- vaddw.u8 q15, d9 @ oq1 += q1
- vaddl.u8 q4, d10, d15
- vaddw.u8 q15, d15 @ oq1 += q7
- vbif d20, d21, d17 @ oq0 |= t_oq0 & ~(f2 & f & m)
- vqrshrn.u16 d21, q15, #4 @ w_oq1
-
- vsub.i16 q15, q14
- vaddl.u8 q14, d3, d10
- vadd.i16 q15, q4
- vaddl.u8 q4, d11, d15
- vbif d21, d22, d17 @ oq1 |= t_oq1 & ~(f2 & f & m)
- vqrshrn.u16 d22, q15, #4 @ w_oq2
-
- vsub.i16 q15, q14
- vaddl.u8 q14, d4, d11
- vadd.i16 q15, q4
- vaddl.u8 q4, d12, d15
- vbif d22, d23, d17 @ oq2 |= t_oq2 & ~(f2 & f & m)
- vqrshrn.u16 d23, q15, #4 @ w_oq3
-
- vsub.i16 q15, q14
- vaddl.u8 q14, d5, d12
- vadd.i16 q15, q4
- vaddl.u8 q4, d13, d15
- vbif d16, d1, d17 @ op6 |= p6 & ~(f2 & f & m)
- vqrshrn.u16 d1, q15, #4 @ w_oq4
-
- vsub.i16 q15, q14
- vaddl.u8 q14, d6, d13
- vadd.i16 q15, q4
- vaddl.u8 q4, d14, d15
- vbif d24, d2, d17 @ op5 |= p5 & ~(f2 & f & m)
- vqrshrn.u16 d2, q15, #4 @ w_oq5
-
- vsub.i16 q15, q14
- vbif d25, d3, d17 @ op4 |= p4 & ~(f2 & f & m)
- vadd.i16 q15, q4
- vbif d23, d11, d17 @ oq3 |= q3 & ~(f2 & f & m)
- vqrshrn.u16 d3, q15, #4 @ w_oq6
- vbif d1, d12, d17 @ oq4 |= q4 & ~(f2 & f & m)
- vbif d2, d13, d17 @ oq5 |= q5 & ~(f2 & f & m)
- vbif d3, d14, d17 @ oq6 |= q6 & ~(f2 & f & m)
-
- bx lr
- @ @ |vpx_wide_mbfilter_neon|
-
diff --git a/thirdparty/libvpx/vpx_dsp/arm/gas_apple/save_reg_neon.s b/thirdparty/libvpx/vpx_dsp/arm/gas_apple/save_reg_neon.s
deleted file mode 100644
index f322b698b4..0000000000
--- a/thirdparty/libvpx/vpx_dsp/arm/gas_apple/save_reg_neon.s
+++ /dev/null
@@ -1,46 +0,0 @@
-@ This file was created from a .asm file
-@ using the ads2gas_apple.pl script.
-
- .set WIDE_REFERENCE, 0
- .set ARCHITECTURE, 5
- .set DO1STROUNDING, 0
- @
- @ Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- @
- @ Use of this source code is governed by a BSD-style license
- @ that can be found in the LICENSE file in the root of the source
- @ tree. An additional intellectual property rights grant can be found
- @ in the file PATENTS. All contributing project authors may
- @ be found in the AUTHORS file in the root of the source tree.
- @
-
-
- .globl _vpx_push_neon
- .globl vpx_push_neon
- .globl _vpx_pop_neon
- .globl vpx_pop_neon
-
- @ ARM
- @
- @ PRESERVE8
-
-.text
-.p2align 2
-
-_vpx_push_neon:
- vpx_push_neon: @
- vst1.i64 {d8, d9, d10, d11}, [r0]!
- vst1.i64 {d12, d13, d14, d15}, [r0]!
- bx lr
-
- @
-
-_vpx_pop_neon:
- vpx_pop_neon: @
- vld1.i64 {d8, d9, d10, d11}, [r0]!
- vld1.i64 {d12, d13, d14, d15}, [r0]!
- bx lr
-
- @
-
-
diff --git a/thirdparty/libvpx/vpx_dsp/arm/idct16x16_1_add_neon.c b/thirdparty/libvpx/vpx_dsp/arm/idct16x16_1_add_neon.c
deleted file mode 100644
index f734e48027..0000000000
--- a/thirdparty/libvpx/vpx_dsp/arm/idct16x16_1_add_neon.c
+++ /dev/null
@@ -1,61 +0,0 @@
-/*
- * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include <arm_neon.h>
-
-#include "vpx_dsp/inv_txfm.h"
-#include "vpx_ports/mem.h"
-
-void vpx_idct16x16_1_add_neon(
- int16_t *input,
- uint8_t *dest,
- int dest_stride) {
- uint8x8_t d2u8, d3u8, d30u8, d31u8;
- uint64x1_t d2u64, d3u64, d4u64, d5u64;
- uint16x8_t q0u16, q9u16, q10u16, q11u16, q12u16;
- int16x8_t q0s16;
- uint8_t *d1, *d2;
- int16_t i, j, a1, cospi_16_64 = 11585;
- int16_t out = dct_const_round_shift(input[0] * cospi_16_64);
- out = dct_const_round_shift(out * cospi_16_64);
- a1 = ROUND_POWER_OF_TWO(out, 6);
-
- q0s16 = vdupq_n_s16(a1);
- q0u16 = vreinterpretq_u16_s16(q0s16);
-
- for (d1 = d2 = dest, i = 0; i < 4; i++) {
- for (j = 0; j < 2; j++) {
- d2u64 = vld1_u64((const uint64_t *)d1);
- d3u64 = vld1_u64((const uint64_t *)(d1 + 8));
- d1 += dest_stride;
- d4u64 = vld1_u64((const uint64_t *)d1);
- d5u64 = vld1_u64((const uint64_t *)(d1 + 8));
- d1 += dest_stride;
-
- q9u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d2u64));
- q10u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d3u64));
- q11u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d4u64));
- q12u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d5u64));
-
- d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16));
- d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16));
- d30u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16));
- d31u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16));
-
- vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8));
- vst1_u64((uint64_t *)(d2 + 8), vreinterpret_u64_u8(d3u8));
- d2 += dest_stride;
- vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d30u8));
- vst1_u64((uint64_t *)(d2 + 8), vreinterpret_u64_u8(d31u8));
- d2 += dest_stride;
- }
- }
- return;
-}
diff --git a/thirdparty/libvpx/vpx_dsp/arm/idct16x16_add_neon.c b/thirdparty/libvpx/vpx_dsp/arm/idct16x16_add_neon.c
deleted file mode 100644
index 651ebb21f9..0000000000
--- a/thirdparty/libvpx/vpx_dsp/arm/idct16x16_add_neon.c
+++ /dev/null
@@ -1,1317 +0,0 @@
-/*
- * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include <arm_neon.h>
-
-#include "./vpx_config.h"
-#include "vpx_dsp/txfm_common.h"
-
-static INLINE void TRANSPOSE8X8(
- int16x8_t *q8s16,
- int16x8_t *q9s16,
- int16x8_t *q10s16,
- int16x8_t *q11s16,
- int16x8_t *q12s16,
- int16x8_t *q13s16,
- int16x8_t *q14s16,
- int16x8_t *q15s16) {
- int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16;
- int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16;
- int32x4x2_t q0x2s32, q1x2s32, q2x2s32, q3x2s32;
- int16x8x2_t q0x2s16, q1x2s16, q2x2s16, q3x2s16;
-
- d16s16 = vget_low_s16(*q8s16);
- d17s16 = vget_high_s16(*q8s16);
- d18s16 = vget_low_s16(*q9s16);
- d19s16 = vget_high_s16(*q9s16);
- d20s16 = vget_low_s16(*q10s16);
- d21s16 = vget_high_s16(*q10s16);
- d22s16 = vget_low_s16(*q11s16);
- d23s16 = vget_high_s16(*q11s16);
- d24s16 = vget_low_s16(*q12s16);
- d25s16 = vget_high_s16(*q12s16);
- d26s16 = vget_low_s16(*q13s16);
- d27s16 = vget_high_s16(*q13s16);
- d28s16 = vget_low_s16(*q14s16);
- d29s16 = vget_high_s16(*q14s16);
- d30s16 = vget_low_s16(*q15s16);
- d31s16 = vget_high_s16(*q15s16);
-
- *q8s16 = vcombine_s16(d16s16, d24s16); // vswp d17, d24
- *q9s16 = vcombine_s16(d18s16, d26s16); // vswp d19, d26
- *q10s16 = vcombine_s16(d20s16, d28s16); // vswp d21, d28
- *q11s16 = vcombine_s16(d22s16, d30s16); // vswp d23, d30
- *q12s16 = vcombine_s16(d17s16, d25s16);
- *q13s16 = vcombine_s16(d19s16, d27s16);
- *q14s16 = vcombine_s16(d21s16, d29s16);
- *q15s16 = vcombine_s16(d23s16, d31s16);
-
- q0x2s32 = vtrnq_s32(vreinterpretq_s32_s16(*q8s16),
- vreinterpretq_s32_s16(*q10s16));
- q1x2s32 = vtrnq_s32(vreinterpretq_s32_s16(*q9s16),
- vreinterpretq_s32_s16(*q11s16));
- q2x2s32 = vtrnq_s32(vreinterpretq_s32_s16(*q12s16),
- vreinterpretq_s32_s16(*q14s16));
- q3x2s32 = vtrnq_s32(vreinterpretq_s32_s16(*q13s16),
- vreinterpretq_s32_s16(*q15s16));
-
- q0x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q0x2s32.val[0]), // q8
- vreinterpretq_s16_s32(q1x2s32.val[0])); // q9
- q1x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q0x2s32.val[1]), // q10
- vreinterpretq_s16_s32(q1x2s32.val[1])); // q11
- q2x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q2x2s32.val[0]), // q12
- vreinterpretq_s16_s32(q3x2s32.val[0])); // q13
- q3x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q2x2s32.val[1]), // q14
- vreinterpretq_s16_s32(q3x2s32.val[1])); // q15
-
- *q8s16 = q0x2s16.val[0];
- *q9s16 = q0x2s16.val[1];
- *q10s16 = q1x2s16.val[0];
- *q11s16 = q1x2s16.val[1];
- *q12s16 = q2x2s16.val[0];
- *q13s16 = q2x2s16.val[1];
- *q14s16 = q3x2s16.val[0];
- *q15s16 = q3x2s16.val[1];
- return;
-}
-
-void vpx_idct16x16_256_add_neon_pass1(
- int16_t *in,
- int16_t *out,
- int output_stride) {
- int16x4_t d0s16, d1s16, d2s16, d3s16;
- int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16;
- int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16;
- int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16;
- uint64x1_t d16u64, d17u64, d18u64, d19u64, d20u64, d21u64, d22u64, d23u64;
- uint64x1_t d24u64, d25u64, d26u64, d27u64, d28u64, d29u64, d30u64, d31u64;
- int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16;
- int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16;
- int32x4_t q0s32, q1s32, q2s32, q3s32, q5s32, q6s32, q9s32;
- int32x4_t q10s32, q11s32, q12s32, q13s32, q15s32;
- int16x8x2_t q0x2s16;
-
- q0x2s16 = vld2q_s16(in);
- q8s16 = q0x2s16.val[0];
- in += 16;
- q0x2s16 = vld2q_s16(in);
- q9s16 = q0x2s16.val[0];
- in += 16;
- q0x2s16 = vld2q_s16(in);
- q10s16 = q0x2s16.val[0];
- in += 16;
- q0x2s16 = vld2q_s16(in);
- q11s16 = q0x2s16.val[0];
- in += 16;
- q0x2s16 = vld2q_s16(in);
- q12s16 = q0x2s16.val[0];
- in += 16;
- q0x2s16 = vld2q_s16(in);
- q13s16 = q0x2s16.val[0];
- in += 16;
- q0x2s16 = vld2q_s16(in);
- q14s16 = q0x2s16.val[0];
- in += 16;
- q0x2s16 = vld2q_s16(in);
- q15s16 = q0x2s16.val[0];
-
- TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16,
- &q12s16, &q13s16, &q14s16, &q15s16);
-
- d16s16 = vget_low_s16(q8s16);
- d17s16 = vget_high_s16(q8s16);
- d18s16 = vget_low_s16(q9s16);
- d19s16 = vget_high_s16(q9s16);
- d20s16 = vget_low_s16(q10s16);
- d21s16 = vget_high_s16(q10s16);
- d22s16 = vget_low_s16(q11s16);
- d23s16 = vget_high_s16(q11s16);
- d24s16 = vget_low_s16(q12s16);
- d25s16 = vget_high_s16(q12s16);
- d26s16 = vget_low_s16(q13s16);
- d27s16 = vget_high_s16(q13s16);
- d28s16 = vget_low_s16(q14s16);
- d29s16 = vget_high_s16(q14s16);
- d30s16 = vget_low_s16(q15s16);
- d31s16 = vget_high_s16(q15s16);
-
- // stage 3
- d0s16 = vdup_n_s16(cospi_28_64);
- d1s16 = vdup_n_s16(cospi_4_64);
-
- q2s32 = vmull_s16(d18s16, d0s16);
- q3s32 = vmull_s16(d19s16, d0s16);
- q5s32 = vmull_s16(d18s16, d1s16);
- q6s32 = vmull_s16(d19s16, d1s16);
-
- q2s32 = vmlsl_s16(q2s32, d30s16, d1s16);
- q3s32 = vmlsl_s16(q3s32, d31s16, d1s16);
- q5s32 = vmlal_s16(q5s32, d30s16, d0s16);
- q6s32 = vmlal_s16(q6s32, d31s16, d0s16);
-
- d2s16 = vdup_n_s16(cospi_12_64);
- d3s16 = vdup_n_s16(cospi_20_64);
-
- d8s16 = vqrshrn_n_s32(q2s32, 14);
- d9s16 = vqrshrn_n_s32(q3s32, 14);
- d14s16 = vqrshrn_n_s32(q5s32, 14);
- d15s16 = vqrshrn_n_s32(q6s32, 14);
- q4s16 = vcombine_s16(d8s16, d9s16);
- q7s16 = vcombine_s16(d14s16, d15s16);
-
- q2s32 = vmull_s16(d26s16, d2s16);
- q3s32 = vmull_s16(d27s16, d2s16);
- q9s32 = vmull_s16(d26s16, d3s16);
- q15s32 = vmull_s16(d27s16, d3s16);
-
- q2s32 = vmlsl_s16(q2s32, d22s16, d3s16);
- q3s32 = vmlsl_s16(q3s32, d23s16, d3s16);
- q9s32 = vmlal_s16(q9s32, d22s16, d2s16);
- q15s32 = vmlal_s16(q15s32, d23s16, d2s16);
-
- d10s16 = vqrshrn_n_s32(q2s32, 14);
- d11s16 = vqrshrn_n_s32(q3s32, 14);
- d12s16 = vqrshrn_n_s32(q9s32, 14);
- d13s16 = vqrshrn_n_s32(q15s32, 14);
- q5s16 = vcombine_s16(d10s16, d11s16);
- q6s16 = vcombine_s16(d12s16, d13s16);
-
- // stage 4
- d30s16 = vdup_n_s16(cospi_16_64);
-
- q2s32 = vmull_s16(d16s16, d30s16);
- q11s32 = vmull_s16(d17s16, d30s16);
- q0s32 = vmull_s16(d24s16, d30s16);
- q1s32 = vmull_s16(d25s16, d30s16);
-
- d30s16 = vdup_n_s16(cospi_24_64);
- d31s16 = vdup_n_s16(cospi_8_64);
-
- q3s32 = vaddq_s32(q2s32, q0s32);
- q12s32 = vaddq_s32(q11s32, q1s32);
- q13s32 = vsubq_s32(q2s32, q0s32);
- q1s32 = vsubq_s32(q11s32, q1s32);
-
- d16s16 = vqrshrn_n_s32(q3s32, 14);
- d17s16 = vqrshrn_n_s32(q12s32, 14);
- d18s16 = vqrshrn_n_s32(q13s32, 14);
- d19s16 = vqrshrn_n_s32(q1s32, 14);
- q8s16 = vcombine_s16(d16s16, d17s16);
- q9s16 = vcombine_s16(d18s16, d19s16);
-
- q0s32 = vmull_s16(d20s16, d31s16);
- q1s32 = vmull_s16(d21s16, d31s16);
- q12s32 = vmull_s16(d20s16, d30s16);
- q13s32 = vmull_s16(d21s16, d30s16);
-
- q0s32 = vmlal_s16(q0s32, d28s16, d30s16);
- q1s32 = vmlal_s16(q1s32, d29s16, d30s16);
- q12s32 = vmlsl_s16(q12s32, d28s16, d31s16);
- q13s32 = vmlsl_s16(q13s32, d29s16, d31s16);
-
- d22s16 = vqrshrn_n_s32(q0s32, 14);
- d23s16 = vqrshrn_n_s32(q1s32, 14);
- d20s16 = vqrshrn_n_s32(q12s32, 14);
- d21s16 = vqrshrn_n_s32(q13s32, 14);
- q10s16 = vcombine_s16(d20s16, d21s16);
- q11s16 = vcombine_s16(d22s16, d23s16);
-
- q13s16 = vsubq_s16(q4s16, q5s16);
- q4s16 = vaddq_s16(q4s16, q5s16);
- q14s16 = vsubq_s16(q7s16, q6s16);
- q15s16 = vaddq_s16(q6s16, q7s16);
- d26s16 = vget_low_s16(q13s16);
- d27s16 = vget_high_s16(q13s16);
- d28s16 = vget_low_s16(q14s16);
- d29s16 = vget_high_s16(q14s16);
-
- // stage 5
- q0s16 = vaddq_s16(q8s16, q11s16);
- q1s16 = vaddq_s16(q9s16, q10s16);
- q2s16 = vsubq_s16(q9s16, q10s16);
- q3s16 = vsubq_s16(q8s16, q11s16);
-
- d16s16 = vdup_n_s16(cospi_16_64);
-
- q11s32 = vmull_s16(d26s16, d16s16);
- q12s32 = vmull_s16(d27s16, d16s16);
- q9s32 = vmull_s16(d28s16, d16s16);
- q10s32 = vmull_s16(d29s16, d16s16);
-
- q6s32 = vsubq_s32(q9s32, q11s32);
- q13s32 = vsubq_s32(q10s32, q12s32);
- q9s32 = vaddq_s32(q9s32, q11s32);
- q10s32 = vaddq_s32(q10s32, q12s32);
-
- d10s16 = vqrshrn_n_s32(q6s32, 14);
- d11s16 = vqrshrn_n_s32(q13s32, 14);
- d12s16 = vqrshrn_n_s32(q9s32, 14);
- d13s16 = vqrshrn_n_s32(q10s32, 14);
- q5s16 = vcombine_s16(d10s16, d11s16);
- q6s16 = vcombine_s16(d12s16, d13s16);
-
- // stage 6
- q8s16 = vaddq_s16(q0s16, q15s16);
- q9s16 = vaddq_s16(q1s16, q6s16);
- q10s16 = vaddq_s16(q2s16, q5s16);
- q11s16 = vaddq_s16(q3s16, q4s16);
- q12s16 = vsubq_s16(q3s16, q4s16);
- q13s16 = vsubq_s16(q2s16, q5s16);
- q14s16 = vsubq_s16(q1s16, q6s16);
- q15s16 = vsubq_s16(q0s16, q15s16);
-
- d16u64 = vreinterpret_u64_s16(vget_low_s16(q8s16));
- d17u64 = vreinterpret_u64_s16(vget_high_s16(q8s16));
- d18u64 = vreinterpret_u64_s16(vget_low_s16(q9s16));
- d19u64 = vreinterpret_u64_s16(vget_high_s16(q9s16));
- d20u64 = vreinterpret_u64_s16(vget_low_s16(q10s16));
- d21u64 = vreinterpret_u64_s16(vget_high_s16(q10s16));
- d22u64 = vreinterpret_u64_s16(vget_low_s16(q11s16));
- d23u64 = vreinterpret_u64_s16(vget_high_s16(q11s16));
- d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16));
- d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16));
- d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16));
- d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16));
- d28u64 = vreinterpret_u64_s16(vget_low_s16(q14s16));
- d29u64 = vreinterpret_u64_s16(vget_high_s16(q14s16));
- d30u64 = vreinterpret_u64_s16(vget_low_s16(q15s16));
- d31u64 = vreinterpret_u64_s16(vget_high_s16(q15s16));
-
- // store the data
- output_stride >>= 1; // output_stride / 2, out is int16_t
- vst1_u64((uint64_t *)out, d16u64);
- out += output_stride;
- vst1_u64((uint64_t *)out, d17u64);
- out += output_stride;
- vst1_u64((uint64_t *)out, d18u64);
- out += output_stride;
- vst1_u64((uint64_t *)out, d19u64);
- out += output_stride;
- vst1_u64((uint64_t *)out, d20u64);
- out += output_stride;
- vst1_u64((uint64_t *)out, d21u64);
- out += output_stride;
- vst1_u64((uint64_t *)out, d22u64);
- out += output_stride;
- vst1_u64((uint64_t *)out, d23u64);
- out += output_stride;
- vst1_u64((uint64_t *)out, d24u64);
- out += output_stride;
- vst1_u64((uint64_t *)out, d25u64);
- out += output_stride;
- vst1_u64((uint64_t *)out, d26u64);
- out += output_stride;
- vst1_u64((uint64_t *)out, d27u64);
- out += output_stride;
- vst1_u64((uint64_t *)out, d28u64);
- out += output_stride;
- vst1_u64((uint64_t *)out, d29u64);
- out += output_stride;
- vst1_u64((uint64_t *)out, d30u64);
- out += output_stride;
- vst1_u64((uint64_t *)out, d31u64);
- return;
-}
-
-void vpx_idct16x16_256_add_neon_pass2(
- int16_t *src,
- int16_t *out,
- int16_t *pass1Output,
- int16_t skip_adding,
- uint8_t *dest,
- int dest_stride) {
- uint8_t *d;
- uint8x8_t d12u8, d13u8;
- int16x4_t d0s16, d1s16, d2s16, d3s16, d4s16, d5s16, d6s16, d7s16;
- int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16;
- int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16;
- int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16;
- uint64x1_t d24u64, d25u64, d26u64, d27u64;
- int64x1_t d12s64, d13s64;
- uint16x8_t q2u16, q3u16, q4u16, q5u16, q8u16;
- uint16x8_t q9u16, q12u16, q13u16, q14u16, q15u16;
- int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16;
- int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16;
- int32x4_t q0s32, q1s32, q2s32, q3s32, q4s32, q5s32, q6s32, q8s32, q9s32;
- int32x4_t q10s32, q11s32, q12s32, q13s32;
- int16x8x2_t q0x2s16;
-
- q0x2s16 = vld2q_s16(src);
- q8s16 = q0x2s16.val[0];
- src += 16;
- q0x2s16 = vld2q_s16(src);
- q9s16 = q0x2s16.val[0];
- src += 16;
- q0x2s16 = vld2q_s16(src);
- q10s16 = q0x2s16.val[0];
- src += 16;
- q0x2s16 = vld2q_s16(src);
- q11s16 = q0x2s16.val[0];
- src += 16;
- q0x2s16 = vld2q_s16(src);
- q12s16 = q0x2s16.val[0];
- src += 16;
- q0x2s16 = vld2q_s16(src);
- q13s16 = q0x2s16.val[0];
- src += 16;
- q0x2s16 = vld2q_s16(src);
- q14s16 = q0x2s16.val[0];
- src += 16;
- q0x2s16 = vld2q_s16(src);
- q15s16 = q0x2s16.val[0];
-
- TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16,
- &q12s16, &q13s16, &q14s16, &q15s16);
-
- d16s16 = vget_low_s16(q8s16);
- d17s16 = vget_high_s16(q8s16);
- d18s16 = vget_low_s16(q9s16);
- d19s16 = vget_high_s16(q9s16);
- d20s16 = vget_low_s16(q10s16);
- d21s16 = vget_high_s16(q10s16);
- d22s16 = vget_low_s16(q11s16);
- d23s16 = vget_high_s16(q11s16);
- d24s16 = vget_low_s16(q12s16);
- d25s16 = vget_high_s16(q12s16);
- d26s16 = vget_low_s16(q13s16);
- d27s16 = vget_high_s16(q13s16);
- d28s16 = vget_low_s16(q14s16);
- d29s16 = vget_high_s16(q14s16);
- d30s16 = vget_low_s16(q15s16);
- d31s16 = vget_high_s16(q15s16);
-
- // stage 3
- d12s16 = vdup_n_s16(cospi_30_64);
- d13s16 = vdup_n_s16(cospi_2_64);
-
- q2s32 = vmull_s16(d16s16, d12s16);
- q3s32 = vmull_s16(d17s16, d12s16);
- q1s32 = vmull_s16(d16s16, d13s16);
- q4s32 = vmull_s16(d17s16, d13s16);
-
- q2s32 = vmlsl_s16(q2s32, d30s16, d13s16);
- q3s32 = vmlsl_s16(q3s32, d31s16, d13s16);
- q1s32 = vmlal_s16(q1s32, d30s16, d12s16);
- q4s32 = vmlal_s16(q4s32, d31s16, d12s16);
-
- d0s16 = vqrshrn_n_s32(q2s32, 14);
- d1s16 = vqrshrn_n_s32(q3s32, 14);
- d14s16 = vqrshrn_n_s32(q1s32, 14);
- d15s16 = vqrshrn_n_s32(q4s32, 14);
- q0s16 = vcombine_s16(d0s16, d1s16);
- q7s16 = vcombine_s16(d14s16, d15s16);
-
- d30s16 = vdup_n_s16(cospi_14_64);
- d31s16 = vdup_n_s16(cospi_18_64);
-
- q2s32 = vmull_s16(d24s16, d30s16);
- q3s32 = vmull_s16(d25s16, d30s16);
- q4s32 = vmull_s16(d24s16, d31s16);
- q5s32 = vmull_s16(d25s16, d31s16);
-
- q2s32 = vmlsl_s16(q2s32, d22s16, d31s16);
- q3s32 = vmlsl_s16(q3s32, d23s16, d31s16);
- q4s32 = vmlal_s16(q4s32, d22s16, d30s16);
- q5s32 = vmlal_s16(q5s32, d23s16, d30s16);
-
- d2s16 = vqrshrn_n_s32(q2s32, 14);
- d3s16 = vqrshrn_n_s32(q3s32, 14);
- d12s16 = vqrshrn_n_s32(q4s32, 14);
- d13s16 = vqrshrn_n_s32(q5s32, 14);
- q1s16 = vcombine_s16(d2s16, d3s16);
- q6s16 = vcombine_s16(d12s16, d13s16);
-
- d30s16 = vdup_n_s16(cospi_22_64);
- d31s16 = vdup_n_s16(cospi_10_64);
-
- q11s32 = vmull_s16(d20s16, d30s16);
- q12s32 = vmull_s16(d21s16, d30s16);
- q4s32 = vmull_s16(d20s16, d31s16);
- q5s32 = vmull_s16(d21s16, d31s16);
-
- q11s32 = vmlsl_s16(q11s32, d26s16, d31s16);
- q12s32 = vmlsl_s16(q12s32, d27s16, d31s16);
- q4s32 = vmlal_s16(q4s32, d26s16, d30s16);
- q5s32 = vmlal_s16(q5s32, d27s16, d30s16);
-
- d4s16 = vqrshrn_n_s32(q11s32, 14);
- d5s16 = vqrshrn_n_s32(q12s32, 14);
- d11s16 = vqrshrn_n_s32(q5s32, 14);
- d10s16 = vqrshrn_n_s32(q4s32, 14);
- q2s16 = vcombine_s16(d4s16, d5s16);
- q5s16 = vcombine_s16(d10s16, d11s16);
-
- d30s16 = vdup_n_s16(cospi_6_64);
- d31s16 = vdup_n_s16(cospi_26_64);
-
- q10s32 = vmull_s16(d28s16, d30s16);
- q11s32 = vmull_s16(d29s16, d30s16);
- q12s32 = vmull_s16(d28s16, d31s16);
- q13s32 = vmull_s16(d29s16, d31s16);
-
- q10s32 = vmlsl_s16(q10s32, d18s16, d31s16);
- q11s32 = vmlsl_s16(q11s32, d19s16, d31s16);
- q12s32 = vmlal_s16(q12s32, d18s16, d30s16);
- q13s32 = vmlal_s16(q13s32, d19s16, d30s16);
-
- d6s16 = vqrshrn_n_s32(q10s32, 14);
- d7s16 = vqrshrn_n_s32(q11s32, 14);
- d8s16 = vqrshrn_n_s32(q12s32, 14);
- d9s16 = vqrshrn_n_s32(q13s32, 14);
- q3s16 = vcombine_s16(d6s16, d7s16);
- q4s16 = vcombine_s16(d8s16, d9s16);
-
- // stage 3
- q9s16 = vsubq_s16(q0s16, q1s16);
- q0s16 = vaddq_s16(q0s16, q1s16);
- q10s16 = vsubq_s16(q3s16, q2s16);
- q11s16 = vaddq_s16(q2s16, q3s16);
- q12s16 = vaddq_s16(q4s16, q5s16);
- q13s16 = vsubq_s16(q4s16, q5s16);
- q14s16 = vsubq_s16(q7s16, q6s16);
- q7s16 = vaddq_s16(q6s16, q7s16);
-
- // stage 4
- d18s16 = vget_low_s16(q9s16);
- d19s16 = vget_high_s16(q9s16);
- d20s16 = vget_low_s16(q10s16);
- d21s16 = vget_high_s16(q10s16);
- d26s16 = vget_low_s16(q13s16);
- d27s16 = vget_high_s16(q13s16);
- d28s16 = vget_low_s16(q14s16);
- d29s16 = vget_high_s16(q14s16);
-
- d30s16 = vdup_n_s16(cospi_8_64);
- d31s16 = vdup_n_s16(cospi_24_64);
-
- q2s32 = vmull_s16(d18s16, d31s16);
- q3s32 = vmull_s16(d19s16, d31s16);
- q4s32 = vmull_s16(d28s16, d31s16);
- q5s32 = vmull_s16(d29s16, d31s16);
-
- q2s32 = vmlal_s16(q2s32, d28s16, d30s16);
- q3s32 = vmlal_s16(q3s32, d29s16, d30s16);
- q4s32 = vmlsl_s16(q4s32, d18s16, d30s16);
- q5s32 = vmlsl_s16(q5s32, d19s16, d30s16);
-
- d12s16 = vqrshrn_n_s32(q2s32, 14);
- d13s16 = vqrshrn_n_s32(q3s32, 14);
- d2s16 = vqrshrn_n_s32(q4s32, 14);
- d3s16 = vqrshrn_n_s32(q5s32, 14);
- q1s16 = vcombine_s16(d2s16, d3s16);
- q6s16 = vcombine_s16(d12s16, d13s16);
-
- q3s16 = q11s16;
- q4s16 = q12s16;
-
- d30s16 = vdup_n_s16(-cospi_8_64);
- q11s32 = vmull_s16(d26s16, d30s16);
- q12s32 = vmull_s16(d27s16, d30s16);
- q8s32 = vmull_s16(d20s16, d30s16);
- q9s32 = vmull_s16(d21s16, d30s16);
-
- q11s32 = vmlsl_s16(q11s32, d20s16, d31s16);
- q12s32 = vmlsl_s16(q12s32, d21s16, d31s16);
- q8s32 = vmlal_s16(q8s32, d26s16, d31s16);
- q9s32 = vmlal_s16(q9s32, d27s16, d31s16);
-
- d4s16 = vqrshrn_n_s32(q11s32, 14);
- d5s16 = vqrshrn_n_s32(q12s32, 14);
- d10s16 = vqrshrn_n_s32(q8s32, 14);
- d11s16 = vqrshrn_n_s32(q9s32, 14);
- q2s16 = vcombine_s16(d4s16, d5s16);
- q5s16 = vcombine_s16(d10s16, d11s16);
-
- // stage 5
- q8s16 = vaddq_s16(q0s16, q3s16);
- q9s16 = vaddq_s16(q1s16, q2s16);
- q10s16 = vsubq_s16(q1s16, q2s16);
- q11s16 = vsubq_s16(q0s16, q3s16);
- q12s16 = vsubq_s16(q7s16, q4s16);
- q13s16 = vsubq_s16(q6s16, q5s16);
- q14s16 = vaddq_s16(q6s16, q5s16);
- q15s16 = vaddq_s16(q7s16, q4s16);
-
- // stage 6
- d20s16 = vget_low_s16(q10s16);
- d21s16 = vget_high_s16(q10s16);
- d22s16 = vget_low_s16(q11s16);
- d23s16 = vget_high_s16(q11s16);
- d24s16 = vget_low_s16(q12s16);
- d25s16 = vget_high_s16(q12s16);
- d26s16 = vget_low_s16(q13s16);
- d27s16 = vget_high_s16(q13s16);
-
- d14s16 = vdup_n_s16(cospi_16_64);
-
- q3s32 = vmull_s16(d26s16, d14s16);
- q4s32 = vmull_s16(d27s16, d14s16);
- q0s32 = vmull_s16(d20s16, d14s16);
- q1s32 = vmull_s16(d21s16, d14s16);
-
- q5s32 = vsubq_s32(q3s32, q0s32);
- q6s32 = vsubq_s32(q4s32, q1s32);
- q10s32 = vaddq_s32(q3s32, q0s32);
- q4s32 = vaddq_s32(q4s32, q1s32);
-
- d4s16 = vqrshrn_n_s32(q5s32, 14);
- d5s16 = vqrshrn_n_s32(q6s32, 14);
- d10s16 = vqrshrn_n_s32(q10s32, 14);
- d11s16 = vqrshrn_n_s32(q4s32, 14);
- q2s16 = vcombine_s16(d4s16, d5s16);
- q5s16 = vcombine_s16(d10s16, d11s16);
-
- q0s32 = vmull_s16(d22s16, d14s16);
- q1s32 = vmull_s16(d23s16, d14s16);
- q13s32 = vmull_s16(d24s16, d14s16);
- q6s32 = vmull_s16(d25s16, d14s16);
-
- q10s32 = vsubq_s32(q13s32, q0s32);
- q4s32 = vsubq_s32(q6s32, q1s32);
- q13s32 = vaddq_s32(q13s32, q0s32);
- q6s32 = vaddq_s32(q6s32, q1s32);
-
- d6s16 = vqrshrn_n_s32(q10s32, 14);
- d7s16 = vqrshrn_n_s32(q4s32, 14);
- d8s16 = vqrshrn_n_s32(q13s32, 14);
- d9s16 = vqrshrn_n_s32(q6s32, 14);
- q3s16 = vcombine_s16(d6s16, d7s16);
- q4s16 = vcombine_s16(d8s16, d9s16);
-
- // stage 7
- if (skip_adding != 0) {
- d = dest;
- // load the data in pass1
- q0s16 = vld1q_s16(pass1Output);
- pass1Output += 8;
- q1s16 = vld1q_s16(pass1Output);
- pass1Output += 8;
- d12s64 = vld1_s64((int64_t *)dest);
- dest += dest_stride;
- d13s64 = vld1_s64((int64_t *)dest);
- dest += dest_stride;
-
- q12s16 = vaddq_s16(q0s16, q15s16);
- q13s16 = vaddq_s16(q1s16, q14s16);
- q12s16 = vrshrq_n_s16(q12s16, 6);
- q13s16 = vrshrq_n_s16(q13s16, 6);
- q12u16 = vaddw_u8(vreinterpretq_u16_s16(q12s16),
- vreinterpret_u8_s64(d12s64));
- q13u16 = vaddw_u8(vreinterpretq_u16_s16(q13s16),
- vreinterpret_u8_s64(d13s64));
- d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16));
- d13u8 = vqmovun_s16(vreinterpretq_s16_u16(q13u16));
- vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
- d += dest_stride;
- vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d13u8));
- d += dest_stride;
- q14s16 = vsubq_s16(q1s16, q14s16);
- q15s16 = vsubq_s16(q0s16, q15s16);
-
- q10s16 = vld1q_s16(pass1Output);
- pass1Output += 8;
- q11s16 = vld1q_s16(pass1Output);
- pass1Output += 8;
- d12s64 = vld1_s64((int64_t *)dest);
- dest += dest_stride;
- d13s64 = vld1_s64((int64_t *)dest);
- dest += dest_stride;
- q12s16 = vaddq_s16(q10s16, q5s16);
- q13s16 = vaddq_s16(q11s16, q4s16);
- q12s16 = vrshrq_n_s16(q12s16, 6);
- q13s16 = vrshrq_n_s16(q13s16, 6);
- q12u16 = vaddw_u8(vreinterpretq_u16_s16(q12s16),
- vreinterpret_u8_s64(d12s64));
- q13u16 = vaddw_u8(vreinterpretq_u16_s16(q13s16),
- vreinterpret_u8_s64(d13s64));
- d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16));
- d13u8 = vqmovun_s16(vreinterpretq_s16_u16(q13u16));
- vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
- d += dest_stride;
- vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d13u8));
- d += dest_stride;
- q4s16 = vsubq_s16(q11s16, q4s16);
- q5s16 = vsubq_s16(q10s16, q5s16);
-
- q0s16 = vld1q_s16(pass1Output);
- pass1Output += 8;
- q1s16 = vld1q_s16(pass1Output);
- pass1Output += 8;
- d12s64 = vld1_s64((int64_t *)dest);
- dest += dest_stride;
- d13s64 = vld1_s64((int64_t *)dest);
- dest += dest_stride;
- q12s16 = vaddq_s16(q0s16, q3s16);
- q13s16 = vaddq_s16(q1s16, q2s16);
- q12s16 = vrshrq_n_s16(q12s16, 6);
- q13s16 = vrshrq_n_s16(q13s16, 6);
- q12u16 = vaddw_u8(vreinterpretq_u16_s16(q12s16),
- vreinterpret_u8_s64(d12s64));
- q13u16 = vaddw_u8(vreinterpretq_u16_s16(q13s16),
- vreinterpret_u8_s64(d13s64));
- d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16));
- d13u8 = vqmovun_s16(vreinterpretq_s16_u16(q13u16));
- vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
- d += dest_stride;
- vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d13u8));
- d += dest_stride;
- q2s16 = vsubq_s16(q1s16, q2s16);
- q3s16 = vsubq_s16(q0s16, q3s16);
-
- q10s16 = vld1q_s16(pass1Output);
- pass1Output += 8;
- q11s16 = vld1q_s16(pass1Output);
- d12s64 = vld1_s64((int64_t *)dest);
- dest += dest_stride;
- d13s64 = vld1_s64((int64_t *)dest);
- dest += dest_stride;
- q12s16 = vaddq_s16(q10s16, q9s16);
- q13s16 = vaddq_s16(q11s16, q8s16);
- q12s16 = vrshrq_n_s16(q12s16, 6);
- q13s16 = vrshrq_n_s16(q13s16, 6);
- q12u16 = vaddw_u8(vreinterpretq_u16_s16(q12s16),
- vreinterpret_u8_s64(d12s64));
- q13u16 = vaddw_u8(vreinterpretq_u16_s16(q13s16),
- vreinterpret_u8_s64(d13s64));
- d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16));
- d13u8 = vqmovun_s16(vreinterpretq_s16_u16(q13u16));
- vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
- d += dest_stride;
- vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d13u8));
- d += dest_stride;
- q8s16 = vsubq_s16(q11s16, q8s16);
- q9s16 = vsubq_s16(q10s16, q9s16);
-
- // store the data out 8,9,10,11,12,13,14,15
- d12s64 = vld1_s64((int64_t *)dest);
- dest += dest_stride;
- q8s16 = vrshrq_n_s16(q8s16, 6);
- q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16),
- vreinterpret_u8_s64(d12s64));
- d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16));
- vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
- d += dest_stride;
-
- d12s64 = vld1_s64((int64_t *)dest);
- dest += dest_stride;
- q9s16 = vrshrq_n_s16(q9s16, 6);
- q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16),
- vreinterpret_u8_s64(d12s64));
- d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16));
- vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
- d += dest_stride;
-
- d12s64 = vld1_s64((int64_t *)dest);
- dest += dest_stride;
- q2s16 = vrshrq_n_s16(q2s16, 6);
- q2u16 = vaddw_u8(vreinterpretq_u16_s16(q2s16),
- vreinterpret_u8_s64(d12s64));
- d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q2u16));
- vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
- d += dest_stride;
-
- d12s64 = vld1_s64((int64_t *)dest);
- dest += dest_stride;
- q3s16 = vrshrq_n_s16(q3s16, 6);
- q3u16 = vaddw_u8(vreinterpretq_u16_s16(q3s16),
- vreinterpret_u8_s64(d12s64));
- d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q3u16));
- vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
- d += dest_stride;
-
- d12s64 = vld1_s64((int64_t *)dest);
- dest += dest_stride;
- q4s16 = vrshrq_n_s16(q4s16, 6);
- q4u16 = vaddw_u8(vreinterpretq_u16_s16(q4s16),
- vreinterpret_u8_s64(d12s64));
- d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q4u16));
- vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
- d += dest_stride;
-
- d12s64 = vld1_s64((int64_t *)dest);
- dest += dest_stride;
- q5s16 = vrshrq_n_s16(q5s16, 6);
- q5u16 = vaddw_u8(vreinterpretq_u16_s16(q5s16),
- vreinterpret_u8_s64(d12s64));
- d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q5u16));
- vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
- d += dest_stride;
-
- d12s64 = vld1_s64((int64_t *)dest);
- dest += dest_stride;
- q14s16 = vrshrq_n_s16(q14s16, 6);
- q14u16 = vaddw_u8(vreinterpretq_u16_s16(q14s16),
- vreinterpret_u8_s64(d12s64));
- d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q14u16));
- vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
- d += dest_stride;
-
- d12s64 = vld1_s64((int64_t *)dest);
- q15s16 = vrshrq_n_s16(q15s16, 6);
- q15u16 = vaddw_u8(vreinterpretq_u16_s16(q15s16),
- vreinterpret_u8_s64(d12s64));
- d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q15u16));
- vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
- } else { // skip_adding_dest
- q0s16 = vld1q_s16(pass1Output);
- pass1Output += 8;
- q1s16 = vld1q_s16(pass1Output);
- pass1Output += 8;
- q12s16 = vaddq_s16(q0s16, q15s16);
- q13s16 = vaddq_s16(q1s16, q14s16);
- d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16));
- d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16));
- d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16));
- d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16));
- vst1_u64((uint64_t *)out, d24u64);
- out += 4;
- vst1_u64((uint64_t *)out, d25u64);
- out += 12;
- vst1_u64((uint64_t *)out, d26u64);
- out += 4;
- vst1_u64((uint64_t *)out, d27u64);
- out += 12;
- q14s16 = vsubq_s16(q1s16, q14s16);
- q15s16 = vsubq_s16(q0s16, q15s16);
-
- q10s16 = vld1q_s16(pass1Output);
- pass1Output += 8;
- q11s16 = vld1q_s16(pass1Output);
- pass1Output += 8;
- q12s16 = vaddq_s16(q10s16, q5s16);
- q13s16 = vaddq_s16(q11s16, q4s16);
- d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16));
- d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16));
- d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16));
- d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16));
- vst1_u64((uint64_t *)out, d24u64);
- out += 4;
- vst1_u64((uint64_t *)out, d25u64);
- out += 12;
- vst1_u64((uint64_t *)out, d26u64);
- out += 4;
- vst1_u64((uint64_t *)out, d27u64);
- out += 12;
- q4s16 = vsubq_s16(q11s16, q4s16);
- q5s16 = vsubq_s16(q10s16, q5s16);
-
- q0s16 = vld1q_s16(pass1Output);
- pass1Output += 8;
- q1s16 = vld1q_s16(pass1Output);
- pass1Output += 8;
- q12s16 = vaddq_s16(q0s16, q3s16);
- q13s16 = vaddq_s16(q1s16, q2s16);
- d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16));
- d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16));
- d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16));
- d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16));
- vst1_u64((uint64_t *)out, d24u64);
- out += 4;
- vst1_u64((uint64_t *)out, d25u64);
- out += 12;
- vst1_u64((uint64_t *)out, d26u64);
- out += 4;
- vst1_u64((uint64_t *)out, d27u64);
- out += 12;
- q2s16 = vsubq_s16(q1s16, q2s16);
- q3s16 = vsubq_s16(q0s16, q3s16);
-
- q10s16 = vld1q_s16(pass1Output);
- pass1Output += 8;
- q11s16 = vld1q_s16(pass1Output);
- pass1Output += 8;
- q12s16 = vaddq_s16(q10s16, q9s16);
- q13s16 = vaddq_s16(q11s16, q8s16);
- d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16));
- d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16));
- d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16));
- d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16));
- vst1_u64((uint64_t *)out, d24u64);
- out += 4;
- vst1_u64((uint64_t *)out, d25u64);
- out += 12;
- vst1_u64((uint64_t *)out, d26u64);
- out += 4;
- vst1_u64((uint64_t *)out, d27u64);
- out += 12;
- q8s16 = vsubq_s16(q11s16, q8s16);
- q9s16 = vsubq_s16(q10s16, q9s16);
-
- vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q8s16)));
- out += 4;
- vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q8s16)));
- out += 12;
- vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q9s16)));
- out += 4;
- vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q9s16)));
- out += 12;
- vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q2s16)));
- out += 4;
- vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q2s16)));
- out += 12;
- vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q3s16)));
- out += 4;
- vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q3s16)));
- out += 12;
- vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q4s16)));
- out += 4;
- vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q4s16)));
- out += 12;
- vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q5s16)));
- out += 4;
- vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q5s16)));
- out += 12;
- vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q14s16)));
- out += 4;
- vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q14s16)));
- out += 12;
- vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q15s16)));
- out += 4;
- vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q15s16)));
- }
- return;
-}
-
-void vpx_idct16x16_10_add_neon_pass1(
- int16_t *in,
- int16_t *out,
- int output_stride) {
- int16x4_t d4s16;
- int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16;
- uint64x1_t d4u64, d5u64, d18u64, d19u64, d20u64, d21u64, d22u64, d23u64;
- uint64x1_t d24u64, d25u64, d26u64, d27u64, d28u64, d29u64, d30u64, d31u64;
- int16x8_t q0s16, q1s16, q2s16, q4s16, q5s16, q6s16, q7s16;
- int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16;
- int32x4_t q6s32, q9s32;
- int32x4_t q10s32, q11s32, q12s32, q15s32;
- int16x8x2_t q0x2s16;
-
- q0x2s16 = vld2q_s16(in);
- q8s16 = q0x2s16.val[0];
- in += 16;
- q0x2s16 = vld2q_s16(in);
- q9s16 = q0x2s16.val[0];
- in += 16;
- q0x2s16 = vld2q_s16(in);
- q10s16 = q0x2s16.val[0];
- in += 16;
- q0x2s16 = vld2q_s16(in);
- q11s16 = q0x2s16.val[0];
- in += 16;
- q0x2s16 = vld2q_s16(in);
- q12s16 = q0x2s16.val[0];
- in += 16;
- q0x2s16 = vld2q_s16(in);
- q13s16 = q0x2s16.val[0];
- in += 16;
- q0x2s16 = vld2q_s16(in);
- q14s16 = q0x2s16.val[0];
- in += 16;
- q0x2s16 = vld2q_s16(in);
- q15s16 = q0x2s16.val[0];
-
- TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16,
- &q12s16, &q13s16, &q14s16, &q15s16);
-
- // stage 3
- q0s16 = vdupq_n_s16(cospi_28_64 * 2);
- q1s16 = vdupq_n_s16(cospi_4_64 * 2);
-
- q4s16 = vqrdmulhq_s16(q9s16, q0s16);
- q7s16 = vqrdmulhq_s16(q9s16, q1s16);
-
- // stage 4
- q1s16 = vdupq_n_s16(cospi_16_64 * 2);
- d4s16 = vdup_n_s16(cospi_16_64);
-
- q8s16 = vqrdmulhq_s16(q8s16, q1s16);
-
- d8s16 = vget_low_s16(q4s16);
- d9s16 = vget_high_s16(q4s16);
- d14s16 = vget_low_s16(q7s16);
- d15s16 = vget_high_s16(q7s16);
- q9s32 = vmull_s16(d14s16, d4s16);
- q10s32 = vmull_s16(d15s16, d4s16);
- q12s32 = vmull_s16(d9s16, d4s16);
- q11s32 = vmull_s16(d8s16, d4s16);
-
- q15s32 = vsubq_s32(q10s32, q12s32);
- q6s32 = vsubq_s32(q9s32, q11s32);
- q9s32 = vaddq_s32(q9s32, q11s32);
- q10s32 = vaddq_s32(q10s32, q12s32);
-
- d11s16 = vqrshrn_n_s32(q15s32, 14);
- d10s16 = vqrshrn_n_s32(q6s32, 14);
- d12s16 = vqrshrn_n_s32(q9s32, 14);
- d13s16 = vqrshrn_n_s32(q10s32, 14);
- q5s16 = vcombine_s16(d10s16, d11s16);
- q6s16 = vcombine_s16(d12s16, d13s16);
-
- // stage 6
- q2s16 = vaddq_s16(q8s16, q7s16);
- q9s16 = vaddq_s16(q8s16, q6s16);
- q10s16 = vaddq_s16(q8s16, q5s16);
- q11s16 = vaddq_s16(q8s16, q4s16);
- q12s16 = vsubq_s16(q8s16, q4s16);
- q13s16 = vsubq_s16(q8s16, q5s16);
- q14s16 = vsubq_s16(q8s16, q6s16);
- q15s16 = vsubq_s16(q8s16, q7s16);
-
- d4u64 = vreinterpret_u64_s16(vget_low_s16(q2s16));
- d5u64 = vreinterpret_u64_s16(vget_high_s16(q2s16));
- d18u64 = vreinterpret_u64_s16(vget_low_s16(q9s16));
- d19u64 = vreinterpret_u64_s16(vget_high_s16(q9s16));
- d20u64 = vreinterpret_u64_s16(vget_low_s16(q10s16));
- d21u64 = vreinterpret_u64_s16(vget_high_s16(q10s16));
- d22u64 = vreinterpret_u64_s16(vget_low_s16(q11s16));
- d23u64 = vreinterpret_u64_s16(vget_high_s16(q11s16));
- d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16));
- d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16));
- d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16));
- d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16));
- d28u64 = vreinterpret_u64_s16(vget_low_s16(q14s16));
- d29u64 = vreinterpret_u64_s16(vget_high_s16(q14s16));
- d30u64 = vreinterpret_u64_s16(vget_low_s16(q15s16));
- d31u64 = vreinterpret_u64_s16(vget_high_s16(q15s16));
-
- // store the data
- output_stride >>= 1; // output_stride / 2, out is int16_t
- vst1_u64((uint64_t *)out, d4u64);
- out += output_stride;
- vst1_u64((uint64_t *)out, d5u64);
- out += output_stride;
- vst1_u64((uint64_t *)out, d18u64);
- out += output_stride;
- vst1_u64((uint64_t *)out, d19u64);
- out += output_stride;
- vst1_u64((uint64_t *)out, d20u64);
- out += output_stride;
- vst1_u64((uint64_t *)out, d21u64);
- out += output_stride;
- vst1_u64((uint64_t *)out, d22u64);
- out += output_stride;
- vst1_u64((uint64_t *)out, d23u64);
- out += output_stride;
- vst1_u64((uint64_t *)out, d24u64);
- out += output_stride;
- vst1_u64((uint64_t *)out, d25u64);
- out += output_stride;
- vst1_u64((uint64_t *)out, d26u64);
- out += output_stride;
- vst1_u64((uint64_t *)out, d27u64);
- out += output_stride;
- vst1_u64((uint64_t *)out, d28u64);
- out += output_stride;
- vst1_u64((uint64_t *)out, d29u64);
- out += output_stride;
- vst1_u64((uint64_t *)out, d30u64);
- out += output_stride;
- vst1_u64((uint64_t *)out, d31u64);
- return;
-}
-
-void vpx_idct16x16_10_add_neon_pass2(
- int16_t *src,
- int16_t *out,
- int16_t *pass1Output,
- int16_t skip_adding,
- uint8_t *dest,
- int dest_stride) {
- int16x4_t d0s16, d1s16, d2s16, d3s16, d4s16, d5s16, d6s16, d7s16;
- int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16;
- int16x4_t d20s16, d21s16, d22s16, d23s16;
- int16x4_t d24s16, d25s16, d26s16, d27s16, d30s16, d31s16;
- uint64x1_t d4u64, d5u64, d6u64, d7u64, d8u64, d9u64, d10u64, d11u64;
- uint64x1_t d16u64, d17u64, d18u64, d19u64;
- uint64x1_t d24u64, d25u64, d26u64, d27u64, d28u64, d29u64, d30u64, d31u64;
- int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16;
- int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16;
- int32x4_t q0s32, q1s32, q2s32, q3s32, q4s32, q5s32, q6s32, q8s32, q9s32;
- int32x4_t q10s32, q11s32, q12s32, q13s32;
- int16x8x2_t q0x2s16;
- (void)skip_adding;
- (void)dest;
- (void)dest_stride;
-
- q0x2s16 = vld2q_s16(src);
- q8s16 = q0x2s16.val[0];
- src += 16;
- q0x2s16 = vld2q_s16(src);
- q9s16 = q0x2s16.val[0];
- src += 16;
- q0x2s16 = vld2q_s16(src);
- q10s16 = q0x2s16.val[0];
- src += 16;
- q0x2s16 = vld2q_s16(src);
- q11s16 = q0x2s16.val[0];
- src += 16;
- q0x2s16 = vld2q_s16(src);
- q12s16 = q0x2s16.val[0];
- src += 16;
- q0x2s16 = vld2q_s16(src);
- q13s16 = q0x2s16.val[0];
- src += 16;
- q0x2s16 = vld2q_s16(src);
- q14s16 = q0x2s16.val[0];
- src += 16;
- q0x2s16 = vld2q_s16(src);
- q15s16 = q0x2s16.val[0];
-
- TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16,
- &q12s16, &q13s16, &q14s16, &q15s16);
-
- // stage 3
- q6s16 = vdupq_n_s16(cospi_30_64 * 2);
- q0s16 = vqrdmulhq_s16(q8s16, q6s16);
- q6s16 = vdupq_n_s16(cospi_2_64 * 2);
- q7s16 = vqrdmulhq_s16(q8s16, q6s16);
-
- q15s16 = vdupq_n_s16(-cospi_26_64 * 2);
- q14s16 = vdupq_n_s16(cospi_6_64 * 2);
- q3s16 = vqrdmulhq_s16(q9s16, q15s16);
- q4s16 = vqrdmulhq_s16(q9s16, q14s16);
-
- // stage 4
- d0s16 = vget_low_s16(q0s16);
- d1s16 = vget_high_s16(q0s16);
- d6s16 = vget_low_s16(q3s16);
- d7s16 = vget_high_s16(q3s16);
- d8s16 = vget_low_s16(q4s16);
- d9s16 = vget_high_s16(q4s16);
- d14s16 = vget_low_s16(q7s16);
- d15s16 = vget_high_s16(q7s16);
-
- d30s16 = vdup_n_s16(cospi_8_64);
- d31s16 = vdup_n_s16(cospi_24_64);
-
- q12s32 = vmull_s16(d14s16, d31s16);
- q5s32 = vmull_s16(d15s16, d31s16);
- q2s32 = vmull_s16(d0s16, d31s16);
- q11s32 = vmull_s16(d1s16, d31s16);
-
- q12s32 = vmlsl_s16(q12s32, d0s16, d30s16);
- q5s32 = vmlsl_s16(q5s32, d1s16, d30s16);
- q2s32 = vmlal_s16(q2s32, d14s16, d30s16);
- q11s32 = vmlal_s16(q11s32, d15s16, d30s16);
-
- d2s16 = vqrshrn_n_s32(q12s32, 14);
- d3s16 = vqrshrn_n_s32(q5s32, 14);
- d12s16 = vqrshrn_n_s32(q2s32, 14);
- d13s16 = vqrshrn_n_s32(q11s32, 14);
- q1s16 = vcombine_s16(d2s16, d3s16);
- q6s16 = vcombine_s16(d12s16, d13s16);
-
- d30s16 = vdup_n_s16(-cospi_8_64);
- q10s32 = vmull_s16(d8s16, d30s16);
- q13s32 = vmull_s16(d9s16, d30s16);
- q8s32 = vmull_s16(d6s16, d30s16);
- q9s32 = vmull_s16(d7s16, d30s16);
-
- q10s32 = vmlsl_s16(q10s32, d6s16, d31s16);
- q13s32 = vmlsl_s16(q13s32, d7s16, d31s16);
- q8s32 = vmlal_s16(q8s32, d8s16, d31s16);
- q9s32 = vmlal_s16(q9s32, d9s16, d31s16);
-
- d4s16 = vqrshrn_n_s32(q10s32, 14);
- d5s16 = vqrshrn_n_s32(q13s32, 14);
- d10s16 = vqrshrn_n_s32(q8s32, 14);
- d11s16 = vqrshrn_n_s32(q9s32, 14);
- q2s16 = vcombine_s16(d4s16, d5s16);
- q5s16 = vcombine_s16(d10s16, d11s16);
-
- // stage 5
- q8s16 = vaddq_s16(q0s16, q3s16);
- q9s16 = vaddq_s16(q1s16, q2s16);
- q10s16 = vsubq_s16(q1s16, q2s16);
- q11s16 = vsubq_s16(q0s16, q3s16);
- q12s16 = vsubq_s16(q7s16, q4s16);
- q13s16 = vsubq_s16(q6s16, q5s16);
- q14s16 = vaddq_s16(q6s16, q5s16);
- q15s16 = vaddq_s16(q7s16, q4s16);
-
- // stage 6
- d20s16 = vget_low_s16(q10s16);
- d21s16 = vget_high_s16(q10s16);
- d22s16 = vget_low_s16(q11s16);
- d23s16 = vget_high_s16(q11s16);
- d24s16 = vget_low_s16(q12s16);
- d25s16 = vget_high_s16(q12s16);
- d26s16 = vget_low_s16(q13s16);
- d27s16 = vget_high_s16(q13s16);
-
- d14s16 = vdup_n_s16(cospi_16_64);
- q3s32 = vmull_s16(d26s16, d14s16);
- q4s32 = vmull_s16(d27s16, d14s16);
- q0s32 = vmull_s16(d20s16, d14s16);
- q1s32 = vmull_s16(d21s16, d14s16);
-
- q5s32 = vsubq_s32(q3s32, q0s32);
- q6s32 = vsubq_s32(q4s32, q1s32);
- q0s32 = vaddq_s32(q3s32, q0s32);
- q4s32 = vaddq_s32(q4s32, q1s32);
-
- d4s16 = vqrshrn_n_s32(q5s32, 14);
- d5s16 = vqrshrn_n_s32(q6s32, 14);
- d10s16 = vqrshrn_n_s32(q0s32, 14);
- d11s16 = vqrshrn_n_s32(q4s32, 14);
- q2s16 = vcombine_s16(d4s16, d5s16);
- q5s16 = vcombine_s16(d10s16, d11s16);
-
- q0s32 = vmull_s16(d22s16, d14s16);
- q1s32 = vmull_s16(d23s16, d14s16);
- q13s32 = vmull_s16(d24s16, d14s16);
- q6s32 = vmull_s16(d25s16, d14s16);
-
- q10s32 = vsubq_s32(q13s32, q0s32);
- q4s32 = vsubq_s32(q6s32, q1s32);
- q13s32 = vaddq_s32(q13s32, q0s32);
- q6s32 = vaddq_s32(q6s32, q1s32);
-
- d6s16 = vqrshrn_n_s32(q10s32, 14);
- d7s16 = vqrshrn_n_s32(q4s32, 14);
- d8s16 = vqrshrn_n_s32(q13s32, 14);
- d9s16 = vqrshrn_n_s32(q6s32, 14);
- q3s16 = vcombine_s16(d6s16, d7s16);
- q4s16 = vcombine_s16(d8s16, d9s16);
-
- // stage 7
- q0s16 = vld1q_s16(pass1Output);
- pass1Output += 8;
- q1s16 = vld1q_s16(pass1Output);
- pass1Output += 8;
- q12s16 = vaddq_s16(q0s16, q15s16);
- q13s16 = vaddq_s16(q1s16, q14s16);
- d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16));
- d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16));
- d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16));
- d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16));
- vst1_u64((uint64_t *)out, d24u64);
- out += 4;
- vst1_u64((uint64_t *)out, d25u64);
- out += 12;
- vst1_u64((uint64_t *)out, d26u64);
- out += 4;
- vst1_u64((uint64_t *)out, d27u64);
- out += 12;
- q14s16 = vsubq_s16(q1s16, q14s16);
- q15s16 = vsubq_s16(q0s16, q15s16);
-
- q10s16 = vld1q_s16(pass1Output);
- pass1Output += 8;
- q11s16 = vld1q_s16(pass1Output);
- pass1Output += 8;
- q12s16 = vaddq_s16(q10s16, q5s16);
- q13s16 = vaddq_s16(q11s16, q4s16);
- d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16));
- d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16));
- d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16));
- d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16));
- vst1_u64((uint64_t *)out, d24u64);
- out += 4;
- vst1_u64((uint64_t *)out, d25u64);
- out += 12;
- vst1_u64((uint64_t *)out, d26u64);
- out += 4;
- vst1_u64((uint64_t *)out, d27u64);
- out += 12;
- q4s16 = vsubq_s16(q11s16, q4s16);
- q5s16 = vsubq_s16(q10s16, q5s16);
-
- q0s16 = vld1q_s16(pass1Output);
- pass1Output += 8;
- q1s16 = vld1q_s16(pass1Output);
- pass1Output += 8;
- q12s16 = vaddq_s16(q0s16, q3s16);
- q13s16 = vaddq_s16(q1s16, q2s16);
- d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16));
- d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16));
- d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16));
- d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16));
- vst1_u64((uint64_t *)out, d24u64);
- out += 4;
- vst1_u64((uint64_t *)out, d25u64);
- out += 12;
- vst1_u64((uint64_t *)out, d26u64);
- out += 4;
- vst1_u64((uint64_t *)out, d27u64);
- out += 12;
- q2s16 = vsubq_s16(q1s16, q2s16);
- q3s16 = vsubq_s16(q0s16, q3s16);
-
- q10s16 = vld1q_s16(pass1Output);
- pass1Output += 8;
- q11s16 = vld1q_s16(pass1Output);
- q12s16 = vaddq_s16(q10s16, q9s16);
- q13s16 = vaddq_s16(q11s16, q8s16);
- d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16));
- d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16));
- d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16));
- d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16));
- vst1_u64((uint64_t *)out, d24u64);
- out += 4;
- vst1_u64((uint64_t *)out, d25u64);
- out += 12;
- vst1_u64((uint64_t *)out, d26u64);
- out += 4;
- vst1_u64((uint64_t *)out, d27u64);
- out += 12;
- q8s16 = vsubq_s16(q11s16, q8s16);
- q9s16 = vsubq_s16(q10s16, q9s16);
-
- d4u64 = vreinterpret_u64_s16(vget_low_s16(q2s16));
- d5u64 = vreinterpret_u64_s16(vget_high_s16(q2s16));
- d6u64 = vreinterpret_u64_s16(vget_low_s16(q3s16));
- d7u64 = vreinterpret_u64_s16(vget_high_s16(q3s16));
- d8u64 = vreinterpret_u64_s16(vget_low_s16(q4s16));
- d9u64 = vreinterpret_u64_s16(vget_high_s16(q4s16));
- d10u64 = vreinterpret_u64_s16(vget_low_s16(q5s16));
- d11u64 = vreinterpret_u64_s16(vget_high_s16(q5s16));
- d16u64 = vreinterpret_u64_s16(vget_low_s16(q8s16));
- d17u64 = vreinterpret_u64_s16(vget_high_s16(q8s16));
- d18u64 = vreinterpret_u64_s16(vget_low_s16(q9s16));
- d19u64 = vreinterpret_u64_s16(vget_high_s16(q9s16));
- d28u64 = vreinterpret_u64_s16(vget_low_s16(q14s16));
- d29u64 = vreinterpret_u64_s16(vget_high_s16(q14s16));
- d30u64 = vreinterpret_u64_s16(vget_low_s16(q15s16));
- d31u64 = vreinterpret_u64_s16(vget_high_s16(q15s16));
-
- vst1_u64((uint64_t *)out, d16u64);
- out += 4;
- vst1_u64((uint64_t *)out, d17u64);
- out += 12;
- vst1_u64((uint64_t *)out, d18u64);
- out += 4;
- vst1_u64((uint64_t *)out, d19u64);
- out += 12;
- vst1_u64((uint64_t *)out, d4u64);
- out += 4;
- vst1_u64((uint64_t *)out, d5u64);
- out += 12;
- vst1_u64((uint64_t *)out, d6u64);
- out += 4;
- vst1_u64((uint64_t *)out, d7u64);
- out += 12;
- vst1_u64((uint64_t *)out, d8u64);
- out += 4;
- vst1_u64((uint64_t *)out, d9u64);
- out += 12;
- vst1_u64((uint64_t *)out, d10u64);
- out += 4;
- vst1_u64((uint64_t *)out, d11u64);
- out += 12;
- vst1_u64((uint64_t *)out, d28u64);
- out += 4;
- vst1_u64((uint64_t *)out, d29u64);
- out += 12;
- vst1_u64((uint64_t *)out, d30u64);
- out += 4;
- vst1_u64((uint64_t *)out, d31u64);
- return;
-}
diff --git a/thirdparty/libvpx/vpx_dsp/arm/idct16x16_neon.c b/thirdparty/libvpx/vpx_dsp/arm/idct16x16_neon.c
deleted file mode 100644
index 352979aa16..0000000000
--- a/thirdparty/libvpx/vpx_dsp/arm/idct16x16_neon.c
+++ /dev/null
@@ -1,185 +0,0 @@
-/*
- * Copyright (c) 2013 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "vpx_dsp/vpx_dsp_common.h"
-
-void vpx_idct16x16_256_add_neon_pass1(const int16_t *input,
- int16_t *output,
- int output_stride);
-void vpx_idct16x16_256_add_neon_pass2(const int16_t *src,
- int16_t *output,
- int16_t *pass1Output,
- int16_t skip_adding,
- uint8_t *dest,
- int dest_stride);
-void vpx_idct16x16_10_add_neon_pass1(const int16_t *input,
- int16_t *output,
- int output_stride);
-void vpx_idct16x16_10_add_neon_pass2(const int16_t *src,
- int16_t *output,
- int16_t *pass1Output,
- int16_t skip_adding,
- uint8_t *dest,
- int dest_stride);
-
-#if HAVE_NEON_ASM
-/* For ARM NEON, d8-d15 are callee-saved registers, and need to be saved. */
-extern void vpx_push_neon(int64_t *store);
-extern void vpx_pop_neon(int64_t *store);
-#endif // HAVE_NEON_ASM
-
-void vpx_idct16x16_256_add_neon(const int16_t *input,
- uint8_t *dest, int dest_stride) {
-#if HAVE_NEON_ASM
- int64_t store_reg[8];
-#endif
- int16_t pass1_output[16*16] = {0};
- int16_t row_idct_output[16*16] = {0};
-
-#if HAVE_NEON_ASM
- // save d8-d15 register values.
- vpx_push_neon(store_reg);
-#endif
-
- /* Parallel idct on the upper 8 rows */
- // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the
- // stage 6 result in pass1_output.
- vpx_idct16x16_256_add_neon_pass1(input, pass1_output, 8);
-
- // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines
- // with result in pass1(pass1_output) to calculate final result in stage 7
- // which will be saved into row_idct_output.
- vpx_idct16x16_256_add_neon_pass2(input+1,
- row_idct_output,
- pass1_output,
- 0,
- dest,
- dest_stride);
-
- /* Parallel idct on the lower 8 rows */
- // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the
- // stage 6 result in pass1_output.
- vpx_idct16x16_256_add_neon_pass1(input+8*16, pass1_output, 8);
-
- // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines
- // with result in pass1(pass1_output) to calculate final result in stage 7
- // which will be saved into row_idct_output.
- vpx_idct16x16_256_add_neon_pass2(input+8*16+1,
- row_idct_output+8,
- pass1_output,
- 0,
- dest,
- dest_stride);
-
- /* Parallel idct on the left 8 columns */
- // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the
- // stage 6 result in pass1_output.
- vpx_idct16x16_256_add_neon_pass1(row_idct_output, pass1_output, 8);
-
- // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines
- // with result in pass1(pass1_output) to calculate final result in stage 7.
- // Then add the result to the destination data.
- vpx_idct16x16_256_add_neon_pass2(row_idct_output+1,
- row_idct_output,
- pass1_output,
- 1,
- dest,
- dest_stride);
-
- /* Parallel idct on the right 8 columns */
- // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the
- // stage 6 result in pass1_output.
- vpx_idct16x16_256_add_neon_pass1(row_idct_output+8*16, pass1_output, 8);
-
- // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines
- // with result in pass1(pass1_output) to calculate final result in stage 7.
- // Then add the result to the destination data.
- vpx_idct16x16_256_add_neon_pass2(row_idct_output+8*16+1,
- row_idct_output+8,
- pass1_output,
- 1,
- dest+8,
- dest_stride);
-
-#if HAVE_NEON_ASM
- // restore d8-d15 register values.
- vpx_pop_neon(store_reg);
-#endif
-
- return;
-}
-
-void vpx_idct16x16_10_add_neon(const int16_t *input,
- uint8_t *dest, int dest_stride) {
-#if HAVE_NEON_ASM
- int64_t store_reg[8];
-#endif
- int16_t pass1_output[16*16] = {0};
- int16_t row_idct_output[16*16] = {0};
-
-#if HAVE_NEON_ASM
- // save d8-d15 register values.
- vpx_push_neon(store_reg);
-#endif
-
- /* Parallel idct on the upper 8 rows */
- // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the
- // stage 6 result in pass1_output.
- vpx_idct16x16_10_add_neon_pass1(input, pass1_output, 8);
-
- // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines
- // with result in pass1(pass1_output) to calculate final result in stage 7
- // which will be saved into row_idct_output.
- vpx_idct16x16_10_add_neon_pass2(input+1,
- row_idct_output,
- pass1_output,
- 0,
- dest,
- dest_stride);
-
- /* Skip Parallel idct on the lower 8 rows as they are all 0s */
-
- /* Parallel idct on the left 8 columns */
- // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the
- // stage 6 result in pass1_output.
- vpx_idct16x16_256_add_neon_pass1(row_idct_output, pass1_output, 8);
-
- // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines
- // with result in pass1(pass1_output) to calculate final result in stage 7.
- // Then add the result to the destination data.
- vpx_idct16x16_256_add_neon_pass2(row_idct_output+1,
- row_idct_output,
- pass1_output,
- 1,
- dest,
- dest_stride);
-
- /* Parallel idct on the right 8 columns */
- // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the
- // stage 6 result in pass1_output.
- vpx_idct16x16_256_add_neon_pass1(row_idct_output+8*16, pass1_output, 8);
-
- // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines
- // with result in pass1(pass1_output) to calculate final result in stage 7.
- // Then add the result to the destination data.
- vpx_idct16x16_256_add_neon_pass2(row_idct_output+8*16+1,
- row_idct_output+8,
- pass1_output,
- 1,
- dest+8,
- dest_stride);
-
-#if HAVE_NEON_ASM
- // restore d8-d15 register values.
- vpx_pop_neon(store_reg);
-#endif
-
- return;
-}
diff --git a/thirdparty/libvpx/vpx_dsp/arm/idct32x32_1_add_neon.c b/thirdparty/libvpx/vpx_dsp/arm/idct32x32_1_add_neon.c
deleted file mode 100644
index c25c0c4a5c..0000000000
--- a/thirdparty/libvpx/vpx_dsp/arm/idct32x32_1_add_neon.c
+++ /dev/null
@@ -1,165 +0,0 @@
-/*
- * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include <arm_neon.h>
-
-#include "./vpx_config.h"
-
-#include "vpx_dsp/inv_txfm.h"
-#include "vpx_ports/mem.h"
-
-static INLINE void LD_16x8(
- uint8_t *d,
- int d_stride,
- uint8x16_t *q8u8,
- uint8x16_t *q9u8,
- uint8x16_t *q10u8,
- uint8x16_t *q11u8,
- uint8x16_t *q12u8,
- uint8x16_t *q13u8,
- uint8x16_t *q14u8,
- uint8x16_t *q15u8) {
- *q8u8 = vld1q_u8(d);
- d += d_stride;
- *q9u8 = vld1q_u8(d);
- d += d_stride;
- *q10u8 = vld1q_u8(d);
- d += d_stride;
- *q11u8 = vld1q_u8(d);
- d += d_stride;
- *q12u8 = vld1q_u8(d);
- d += d_stride;
- *q13u8 = vld1q_u8(d);
- d += d_stride;
- *q14u8 = vld1q_u8(d);
- d += d_stride;
- *q15u8 = vld1q_u8(d);
- return;
-}
-
-static INLINE void ADD_DIFF_16x8(
- uint8x16_t qdiffu8,
- uint8x16_t *q8u8,
- uint8x16_t *q9u8,
- uint8x16_t *q10u8,
- uint8x16_t *q11u8,
- uint8x16_t *q12u8,
- uint8x16_t *q13u8,
- uint8x16_t *q14u8,
- uint8x16_t *q15u8) {
- *q8u8 = vqaddq_u8(*q8u8, qdiffu8);
- *q9u8 = vqaddq_u8(*q9u8, qdiffu8);
- *q10u8 = vqaddq_u8(*q10u8, qdiffu8);
- *q11u8 = vqaddq_u8(*q11u8, qdiffu8);
- *q12u8 = vqaddq_u8(*q12u8, qdiffu8);
- *q13u8 = vqaddq_u8(*q13u8, qdiffu8);
- *q14u8 = vqaddq_u8(*q14u8, qdiffu8);
- *q15u8 = vqaddq_u8(*q15u8, qdiffu8);
- return;
-}
-
-static INLINE void SUB_DIFF_16x8(
- uint8x16_t qdiffu8,
- uint8x16_t *q8u8,
- uint8x16_t *q9u8,
- uint8x16_t *q10u8,
- uint8x16_t *q11u8,
- uint8x16_t *q12u8,
- uint8x16_t *q13u8,
- uint8x16_t *q14u8,
- uint8x16_t *q15u8) {
- *q8u8 = vqsubq_u8(*q8u8, qdiffu8);
- *q9u8 = vqsubq_u8(*q9u8, qdiffu8);
- *q10u8 = vqsubq_u8(*q10u8, qdiffu8);
- *q11u8 = vqsubq_u8(*q11u8, qdiffu8);
- *q12u8 = vqsubq_u8(*q12u8, qdiffu8);
- *q13u8 = vqsubq_u8(*q13u8, qdiffu8);
- *q14u8 = vqsubq_u8(*q14u8, qdiffu8);
- *q15u8 = vqsubq_u8(*q15u8, qdiffu8);
- return;
-}
-
-static INLINE void ST_16x8(
- uint8_t *d,
- int d_stride,
- uint8x16_t *q8u8,
- uint8x16_t *q9u8,
- uint8x16_t *q10u8,
- uint8x16_t *q11u8,
- uint8x16_t *q12u8,
- uint8x16_t *q13u8,
- uint8x16_t *q14u8,
- uint8x16_t *q15u8) {
- vst1q_u8(d, *q8u8);
- d += d_stride;
- vst1q_u8(d, *q9u8);
- d += d_stride;
- vst1q_u8(d, *q10u8);
- d += d_stride;
- vst1q_u8(d, *q11u8);
- d += d_stride;
- vst1q_u8(d, *q12u8);
- d += d_stride;
- vst1q_u8(d, *q13u8);
- d += d_stride;
- vst1q_u8(d, *q14u8);
- d += d_stride;
- vst1q_u8(d, *q15u8);
- return;
-}
-
-void vpx_idct32x32_1_add_neon(
- int16_t *input,
- uint8_t *dest,
- int dest_stride) {
- uint8x16_t q0u8, q8u8, q9u8, q10u8, q11u8, q12u8, q13u8, q14u8, q15u8;
- int i, j, dest_stride8;
- uint8_t *d;
- int16_t a1, cospi_16_64 = 11585;
- int16_t out = dct_const_round_shift(input[0] * cospi_16_64);
-
- out = dct_const_round_shift(out * cospi_16_64);
- a1 = ROUND_POWER_OF_TWO(out, 6);
-
- dest_stride8 = dest_stride * 8;
- if (a1 >= 0) { // diff_positive_32_32
- a1 = a1 < 0 ? 0 : a1 > 255 ? 255 : a1;
- q0u8 = vdupq_n_u8(a1);
- for (i = 0; i < 2; i++, dest += 16) { // diff_positive_32_32_loop
- d = dest;
- for (j = 0; j < 4; j++) {
- LD_16x8(d, dest_stride, &q8u8, &q9u8, &q10u8, &q11u8,
- &q12u8, &q13u8, &q14u8, &q15u8);
- ADD_DIFF_16x8(q0u8, &q8u8, &q9u8, &q10u8, &q11u8,
- &q12u8, &q13u8, &q14u8, &q15u8);
- ST_16x8(d, dest_stride, &q8u8, &q9u8, &q10u8, &q11u8,
- &q12u8, &q13u8, &q14u8, &q15u8);
- d += dest_stride8;
- }
- }
- } else { // diff_negative_32_32
- a1 = -a1;
- a1 = a1 < 0 ? 0 : a1 > 255 ? 255 : a1;
- q0u8 = vdupq_n_u8(a1);
- for (i = 0; i < 2; i++, dest += 16) { // diff_negative_32_32_loop
- d = dest;
- for (j = 0; j < 4; j++) {
- LD_16x8(d, dest_stride, &q8u8, &q9u8, &q10u8, &q11u8,
- &q12u8, &q13u8, &q14u8, &q15u8);
- SUB_DIFF_16x8(q0u8, &q8u8, &q9u8, &q10u8, &q11u8,
- &q12u8, &q13u8, &q14u8, &q15u8);
- ST_16x8(d, dest_stride, &q8u8, &q9u8, &q10u8, &q11u8,
- &q12u8, &q13u8, &q14u8, &q15u8);
- d += dest_stride8;
- }
- }
- }
- return;
-}
diff --git a/thirdparty/libvpx/vpx_dsp/arm/idct32x32_add_neon.c b/thirdparty/libvpx/vpx_dsp/arm/idct32x32_add_neon.c
deleted file mode 100644
index 025437eb96..0000000000
--- a/thirdparty/libvpx/vpx_dsp/arm/idct32x32_add_neon.c
+++ /dev/null
@@ -1,719 +0,0 @@
-/*
- * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include <arm_neon.h>
-
-#include "./vpx_config.h"
-#include "vpx_dsp/txfm_common.h"
-
-#define LOAD_FROM_TRANSPOSED(prev, first, second) \
- q14s16 = vld1q_s16(trans_buf + first * 8); \
- q13s16 = vld1q_s16(trans_buf + second * 8);
-
-#define LOAD_FROM_OUTPUT(prev, first, second, qA, qB) \
- qA = vld1q_s16(out + first * 32); \
- qB = vld1q_s16(out + second * 32);
-
-#define STORE_IN_OUTPUT(prev, first, second, qA, qB) \
- vst1q_s16(out + first * 32, qA); \
- vst1q_s16(out + second * 32, qB);
-
-#define STORE_COMBINE_CENTER_RESULTS(r10, r9) \
- __STORE_COMBINE_CENTER_RESULTS(r10, r9, stride, \
- q6s16, q7s16, q8s16, q9s16);
-static INLINE void __STORE_COMBINE_CENTER_RESULTS(
- uint8_t *p1,
- uint8_t *p2,
- int stride,
- int16x8_t q6s16,
- int16x8_t q7s16,
- int16x8_t q8s16,
- int16x8_t q9s16) {
- int16x4_t d8s16, d9s16, d10s16, d11s16;
-
- d8s16 = vld1_s16((int16_t *)p1);
- p1 += stride;
- d11s16 = vld1_s16((int16_t *)p2);
- p2 -= stride;
- d9s16 = vld1_s16((int16_t *)p1);
- d10s16 = vld1_s16((int16_t *)p2);
-
- q7s16 = vrshrq_n_s16(q7s16, 6);
- q8s16 = vrshrq_n_s16(q8s16, 6);
- q9s16 = vrshrq_n_s16(q9s16, 6);
- q6s16 = vrshrq_n_s16(q6s16, 6);
-
- q7s16 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q7s16),
- vreinterpret_u8_s16(d9s16)));
- q8s16 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q8s16),
- vreinterpret_u8_s16(d10s16)));
- q9s16 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q9s16),
- vreinterpret_u8_s16(d11s16)));
- q6s16 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q6s16),
- vreinterpret_u8_s16(d8s16)));
-
- d9s16 = vreinterpret_s16_u8(vqmovun_s16(q7s16));
- d10s16 = vreinterpret_s16_u8(vqmovun_s16(q8s16));
- d11s16 = vreinterpret_s16_u8(vqmovun_s16(q9s16));
- d8s16 = vreinterpret_s16_u8(vqmovun_s16(q6s16));
-
- vst1_s16((int16_t *)p1, d9s16);
- p1 -= stride;
- vst1_s16((int16_t *)p2, d10s16);
- p2 += stride;
- vst1_s16((int16_t *)p1, d8s16);
- vst1_s16((int16_t *)p2, d11s16);
- return;
-}
-
-#define STORE_COMBINE_EXTREME_RESULTS(r7, r6); \
- __STORE_COMBINE_EXTREME_RESULTS(r7, r6, stride, \
- q4s16, q5s16, q6s16, q7s16);
-static INLINE void __STORE_COMBINE_EXTREME_RESULTS(
- uint8_t *p1,
- uint8_t *p2,
- int stride,
- int16x8_t q4s16,
- int16x8_t q5s16,
- int16x8_t q6s16,
- int16x8_t q7s16) {
- int16x4_t d4s16, d5s16, d6s16, d7s16;
-
- d4s16 = vld1_s16((int16_t *)p1);
- p1 += stride;
- d7s16 = vld1_s16((int16_t *)p2);
- p2 -= stride;
- d5s16 = vld1_s16((int16_t *)p1);
- d6s16 = vld1_s16((int16_t *)p2);
-
- q5s16 = vrshrq_n_s16(q5s16, 6);
- q6s16 = vrshrq_n_s16(q6s16, 6);
- q7s16 = vrshrq_n_s16(q7s16, 6);
- q4s16 = vrshrq_n_s16(q4s16, 6);
-
- q5s16 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q5s16),
- vreinterpret_u8_s16(d5s16)));
- q6s16 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q6s16),
- vreinterpret_u8_s16(d6s16)));
- q7s16 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q7s16),
- vreinterpret_u8_s16(d7s16)));
- q4s16 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q4s16),
- vreinterpret_u8_s16(d4s16)));
-
- d5s16 = vreinterpret_s16_u8(vqmovun_s16(q5s16));
- d6s16 = vreinterpret_s16_u8(vqmovun_s16(q6s16));
- d7s16 = vreinterpret_s16_u8(vqmovun_s16(q7s16));
- d4s16 = vreinterpret_s16_u8(vqmovun_s16(q4s16));
-
- vst1_s16((int16_t *)p1, d5s16);
- p1 -= stride;
- vst1_s16((int16_t *)p2, d6s16);
- p2 += stride;
- vst1_s16((int16_t *)p2, d7s16);
- vst1_s16((int16_t *)p1, d4s16);
- return;
-}
-
-#define DO_BUTTERFLY_STD(const_1, const_2, qA, qB) \
- DO_BUTTERFLY(q14s16, q13s16, const_1, const_2, qA, qB);
-static INLINE void DO_BUTTERFLY(
- int16x8_t q14s16,
- int16x8_t q13s16,
- int16_t first_const,
- int16_t second_const,
- int16x8_t *qAs16,
- int16x8_t *qBs16) {
- int16x4_t d30s16, d31s16;
- int32x4_t q8s32, q9s32, q10s32, q11s32, q12s32, q15s32;
- int16x4_t dCs16, dDs16, dAs16, dBs16;
-
- dCs16 = vget_low_s16(q14s16);
- dDs16 = vget_high_s16(q14s16);
- dAs16 = vget_low_s16(q13s16);
- dBs16 = vget_high_s16(q13s16);
-
- d30s16 = vdup_n_s16(first_const);
- d31s16 = vdup_n_s16(second_const);
-
- q8s32 = vmull_s16(dCs16, d30s16);
- q10s32 = vmull_s16(dAs16, d31s16);
- q9s32 = vmull_s16(dDs16, d30s16);
- q11s32 = vmull_s16(dBs16, d31s16);
- q12s32 = vmull_s16(dCs16, d31s16);
-
- q8s32 = vsubq_s32(q8s32, q10s32);
- q9s32 = vsubq_s32(q9s32, q11s32);
-
- q10s32 = vmull_s16(dDs16, d31s16);
- q11s32 = vmull_s16(dAs16, d30s16);
- q15s32 = vmull_s16(dBs16, d30s16);
-
- q11s32 = vaddq_s32(q12s32, q11s32);
- q10s32 = vaddq_s32(q10s32, q15s32);
-
- *qAs16 = vcombine_s16(vqrshrn_n_s32(q8s32, 14),
- vqrshrn_n_s32(q9s32, 14));
- *qBs16 = vcombine_s16(vqrshrn_n_s32(q11s32, 14),
- vqrshrn_n_s32(q10s32, 14));
- return;
-}
-
-static INLINE void idct32_transpose_pair(
- int16_t *input,
- int16_t *t_buf) {
- int16_t *in;
- int i;
- const int stride = 32;
- int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16;
- int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16;
- int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16;
- int32x4x2_t q0x2s32, q1x2s32, q2x2s32, q3x2s32;
- int16x8x2_t q0x2s16, q1x2s16, q2x2s16, q3x2s16;
-
- for (i = 0; i < 4; i++, input += 8) {
- in = input;
- q8s16 = vld1q_s16(in);
- in += stride;
- q9s16 = vld1q_s16(in);
- in += stride;
- q10s16 = vld1q_s16(in);
- in += stride;
- q11s16 = vld1q_s16(in);
- in += stride;
- q12s16 = vld1q_s16(in);
- in += stride;
- q13s16 = vld1q_s16(in);
- in += stride;
- q14s16 = vld1q_s16(in);
- in += stride;
- q15s16 = vld1q_s16(in);
-
- d16s16 = vget_low_s16(q8s16);
- d17s16 = vget_high_s16(q8s16);
- d18s16 = vget_low_s16(q9s16);
- d19s16 = vget_high_s16(q9s16);
- d20s16 = vget_low_s16(q10s16);
- d21s16 = vget_high_s16(q10s16);
- d22s16 = vget_low_s16(q11s16);
- d23s16 = vget_high_s16(q11s16);
- d24s16 = vget_low_s16(q12s16);
- d25s16 = vget_high_s16(q12s16);
- d26s16 = vget_low_s16(q13s16);
- d27s16 = vget_high_s16(q13s16);
- d28s16 = vget_low_s16(q14s16);
- d29s16 = vget_high_s16(q14s16);
- d30s16 = vget_low_s16(q15s16);
- d31s16 = vget_high_s16(q15s16);
-
- q8s16 = vcombine_s16(d16s16, d24s16); // vswp d17, d24
- q9s16 = vcombine_s16(d18s16, d26s16); // vswp d19, d26
- q10s16 = vcombine_s16(d20s16, d28s16); // vswp d21, d28
- q11s16 = vcombine_s16(d22s16, d30s16); // vswp d23, d30
- q12s16 = vcombine_s16(d17s16, d25s16);
- q13s16 = vcombine_s16(d19s16, d27s16);
- q14s16 = vcombine_s16(d21s16, d29s16);
- q15s16 = vcombine_s16(d23s16, d31s16);
-
- q0x2s32 = vtrnq_s32(vreinterpretq_s32_s16(q8s16),
- vreinterpretq_s32_s16(q10s16));
- q1x2s32 = vtrnq_s32(vreinterpretq_s32_s16(q9s16),
- vreinterpretq_s32_s16(q11s16));
- q2x2s32 = vtrnq_s32(vreinterpretq_s32_s16(q12s16),
- vreinterpretq_s32_s16(q14s16));
- q3x2s32 = vtrnq_s32(vreinterpretq_s32_s16(q13s16),
- vreinterpretq_s32_s16(q15s16));
-
- q0x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q0x2s32.val[0]), // q8
- vreinterpretq_s16_s32(q1x2s32.val[0])); // q9
- q1x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q0x2s32.val[1]), // q10
- vreinterpretq_s16_s32(q1x2s32.val[1])); // q11
- q2x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q2x2s32.val[0]), // q12
- vreinterpretq_s16_s32(q3x2s32.val[0])); // q13
- q3x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q2x2s32.val[1]), // q14
- vreinterpretq_s16_s32(q3x2s32.val[1])); // q15
-
- vst1q_s16(t_buf, q0x2s16.val[0]);
- t_buf += 8;
- vst1q_s16(t_buf, q0x2s16.val[1]);
- t_buf += 8;
- vst1q_s16(t_buf, q1x2s16.val[0]);
- t_buf += 8;
- vst1q_s16(t_buf, q1x2s16.val[1]);
- t_buf += 8;
- vst1q_s16(t_buf, q2x2s16.val[0]);
- t_buf += 8;
- vst1q_s16(t_buf, q2x2s16.val[1]);
- t_buf += 8;
- vst1q_s16(t_buf, q3x2s16.val[0]);
- t_buf += 8;
- vst1q_s16(t_buf, q3x2s16.val[1]);
- t_buf += 8;
- }
- return;
-}
-
-static INLINE void idct32_bands_end_1st_pass(
- int16_t *out,
- int16x8_t q2s16,
- int16x8_t q3s16,
- int16x8_t q6s16,
- int16x8_t q7s16,
- int16x8_t q8s16,
- int16x8_t q9s16,
- int16x8_t q10s16,
- int16x8_t q11s16,
- int16x8_t q12s16,
- int16x8_t q13s16,
- int16x8_t q14s16,
- int16x8_t q15s16) {
- int16x8_t q0s16, q1s16, q4s16, q5s16;
-
- STORE_IN_OUTPUT(17, 16, 17, q6s16, q7s16);
- STORE_IN_OUTPUT(17, 14, 15, q8s16, q9s16);
-
- LOAD_FROM_OUTPUT(15, 30, 31, q0s16, q1s16);
- q4s16 = vaddq_s16(q2s16, q1s16);
- q5s16 = vaddq_s16(q3s16, q0s16);
- q6s16 = vsubq_s16(q3s16, q0s16);
- q7s16 = vsubq_s16(q2s16, q1s16);
- STORE_IN_OUTPUT(31, 30, 31, q6s16, q7s16);
- STORE_IN_OUTPUT(31, 0, 1, q4s16, q5s16);
-
- LOAD_FROM_OUTPUT(1, 12, 13, q0s16, q1s16);
- q2s16 = vaddq_s16(q10s16, q1s16);
- q3s16 = vaddq_s16(q11s16, q0s16);
- q4s16 = vsubq_s16(q11s16, q0s16);
- q5s16 = vsubq_s16(q10s16, q1s16);
-
- LOAD_FROM_OUTPUT(13, 18, 19, q0s16, q1s16);
- q8s16 = vaddq_s16(q4s16, q1s16);
- q9s16 = vaddq_s16(q5s16, q0s16);
- q6s16 = vsubq_s16(q5s16, q0s16);
- q7s16 = vsubq_s16(q4s16, q1s16);
- STORE_IN_OUTPUT(19, 18, 19, q6s16, q7s16);
- STORE_IN_OUTPUT(19, 12, 13, q8s16, q9s16);
-
- LOAD_FROM_OUTPUT(13, 28, 29, q0s16, q1s16);
- q4s16 = vaddq_s16(q2s16, q1s16);
- q5s16 = vaddq_s16(q3s16, q0s16);
- q6s16 = vsubq_s16(q3s16, q0s16);
- q7s16 = vsubq_s16(q2s16, q1s16);
- STORE_IN_OUTPUT(29, 28, 29, q6s16, q7s16);
- STORE_IN_OUTPUT(29, 2, 3, q4s16, q5s16);
-
- LOAD_FROM_OUTPUT(3, 10, 11, q0s16, q1s16);
- q2s16 = vaddq_s16(q12s16, q1s16);
- q3s16 = vaddq_s16(q13s16, q0s16);
- q4s16 = vsubq_s16(q13s16, q0s16);
- q5s16 = vsubq_s16(q12s16, q1s16);
-
- LOAD_FROM_OUTPUT(11, 20, 21, q0s16, q1s16);
- q8s16 = vaddq_s16(q4s16, q1s16);
- q9s16 = vaddq_s16(q5s16, q0s16);
- q6s16 = vsubq_s16(q5s16, q0s16);
- q7s16 = vsubq_s16(q4s16, q1s16);
- STORE_IN_OUTPUT(21, 20, 21, q6s16, q7s16);
- STORE_IN_OUTPUT(21, 10, 11, q8s16, q9s16);
-
- LOAD_FROM_OUTPUT(11, 26, 27, q0s16, q1s16);
- q4s16 = vaddq_s16(q2s16, q1s16);
- q5s16 = vaddq_s16(q3s16, q0s16);
- q6s16 = vsubq_s16(q3s16, q0s16);
- q7s16 = vsubq_s16(q2s16, q1s16);
- STORE_IN_OUTPUT(27, 26, 27, q6s16, q7s16);
- STORE_IN_OUTPUT(27, 4, 5, q4s16, q5s16);
-
- LOAD_FROM_OUTPUT(5, 8, 9, q0s16, q1s16);
- q2s16 = vaddq_s16(q14s16, q1s16);
- q3s16 = vaddq_s16(q15s16, q0s16);
- q4s16 = vsubq_s16(q15s16, q0s16);
- q5s16 = vsubq_s16(q14s16, q1s16);
-
- LOAD_FROM_OUTPUT(9, 22, 23, q0s16, q1s16);
- q8s16 = vaddq_s16(q4s16, q1s16);
- q9s16 = vaddq_s16(q5s16, q0s16);
- q6s16 = vsubq_s16(q5s16, q0s16);
- q7s16 = vsubq_s16(q4s16, q1s16);
- STORE_IN_OUTPUT(23, 22, 23, q6s16, q7s16);
- STORE_IN_OUTPUT(23, 8, 9, q8s16, q9s16);
-
- LOAD_FROM_OUTPUT(9, 24, 25, q0s16, q1s16);
- q4s16 = vaddq_s16(q2s16, q1s16);
- q5s16 = vaddq_s16(q3s16, q0s16);
- q6s16 = vsubq_s16(q3s16, q0s16);
- q7s16 = vsubq_s16(q2s16, q1s16);
- STORE_IN_OUTPUT(25, 24, 25, q6s16, q7s16);
- STORE_IN_OUTPUT(25, 6, 7, q4s16, q5s16);
- return;
-}
-
-static INLINE void idct32_bands_end_2nd_pass(
- int16_t *out,
- uint8_t *dest,
- int stride,
- int16x8_t q2s16,
- int16x8_t q3s16,
- int16x8_t q6s16,
- int16x8_t q7s16,
- int16x8_t q8s16,
- int16x8_t q9s16,
- int16x8_t q10s16,
- int16x8_t q11s16,
- int16x8_t q12s16,
- int16x8_t q13s16,
- int16x8_t q14s16,
- int16x8_t q15s16) {
- uint8_t *r6 = dest + 31 * stride;
- uint8_t *r7 = dest/* + 0 * stride*/;
- uint8_t *r9 = dest + 15 * stride;
- uint8_t *r10 = dest + 16 * stride;
- int str2 = stride << 1;
- int16x8_t q0s16, q1s16, q4s16, q5s16;
-
- STORE_COMBINE_CENTER_RESULTS(r10, r9);
- r10 += str2; r9 -= str2;
-
- LOAD_FROM_OUTPUT(17, 30, 31, q0s16, q1s16)
- q4s16 = vaddq_s16(q2s16, q1s16);
- q5s16 = vaddq_s16(q3s16, q0s16);
- q6s16 = vsubq_s16(q3s16, q0s16);
- q7s16 = vsubq_s16(q2s16, q1s16);
- STORE_COMBINE_EXTREME_RESULTS(r7, r6);
- r7 += str2; r6 -= str2;
-
- LOAD_FROM_OUTPUT(31, 12, 13, q0s16, q1s16)
- q2s16 = vaddq_s16(q10s16, q1s16);
- q3s16 = vaddq_s16(q11s16, q0s16);
- q4s16 = vsubq_s16(q11s16, q0s16);
- q5s16 = vsubq_s16(q10s16, q1s16);
-
- LOAD_FROM_OUTPUT(13, 18, 19, q0s16, q1s16)
- q8s16 = vaddq_s16(q4s16, q1s16);
- q9s16 = vaddq_s16(q5s16, q0s16);
- q6s16 = vsubq_s16(q5s16, q0s16);
- q7s16 = vsubq_s16(q4s16, q1s16);
- STORE_COMBINE_CENTER_RESULTS(r10, r9);
- r10 += str2; r9 -= str2;
-
- LOAD_FROM_OUTPUT(19, 28, 29, q0s16, q1s16)
- q4s16 = vaddq_s16(q2s16, q1s16);
- q5s16 = vaddq_s16(q3s16, q0s16);
- q6s16 = vsubq_s16(q3s16, q0s16);
- q7s16 = vsubq_s16(q2s16, q1s16);
- STORE_COMBINE_EXTREME_RESULTS(r7, r6);
- r7 += str2; r6 -= str2;
-
- LOAD_FROM_OUTPUT(29, 10, 11, q0s16, q1s16)
- q2s16 = vaddq_s16(q12s16, q1s16);
- q3s16 = vaddq_s16(q13s16, q0s16);
- q4s16 = vsubq_s16(q13s16, q0s16);
- q5s16 = vsubq_s16(q12s16, q1s16);
-
- LOAD_FROM_OUTPUT(11, 20, 21, q0s16, q1s16)
- q8s16 = vaddq_s16(q4s16, q1s16);
- q9s16 = vaddq_s16(q5s16, q0s16);
- q6s16 = vsubq_s16(q5s16, q0s16);
- q7s16 = vsubq_s16(q4s16, q1s16);
- STORE_COMBINE_CENTER_RESULTS(r10, r9);
- r10 += str2; r9 -= str2;
-
- LOAD_FROM_OUTPUT(21, 26, 27, q0s16, q1s16)
- q4s16 = vaddq_s16(q2s16, q1s16);
- q5s16 = vaddq_s16(q3s16, q0s16);
- q6s16 = vsubq_s16(q3s16, q0s16);
- q7s16 = vsubq_s16(q2s16, q1s16);
- STORE_COMBINE_EXTREME_RESULTS(r7, r6);
- r7 += str2; r6 -= str2;
-
- LOAD_FROM_OUTPUT(27, 8, 9, q0s16, q1s16)
- q2s16 = vaddq_s16(q14s16, q1s16);
- q3s16 = vaddq_s16(q15s16, q0s16);
- q4s16 = vsubq_s16(q15s16, q0s16);
- q5s16 = vsubq_s16(q14s16, q1s16);
-
- LOAD_FROM_OUTPUT(9, 22, 23, q0s16, q1s16)
- q8s16 = vaddq_s16(q4s16, q1s16);
- q9s16 = vaddq_s16(q5s16, q0s16);
- q6s16 = vsubq_s16(q5s16, q0s16);
- q7s16 = vsubq_s16(q4s16, q1s16);
- STORE_COMBINE_CENTER_RESULTS(r10, r9);
-
- LOAD_FROM_OUTPUT(23, 24, 25, q0s16, q1s16)
- q4s16 = vaddq_s16(q2s16, q1s16);
- q5s16 = vaddq_s16(q3s16, q0s16);
- q6s16 = vsubq_s16(q3s16, q0s16);
- q7s16 = vsubq_s16(q2s16, q1s16);
- STORE_COMBINE_EXTREME_RESULTS(r7, r6);
- return;
-}
-
-void vpx_idct32x32_1024_add_neon(
- int16_t *input,
- uint8_t *dest,
- int stride) {
- int i, idct32_pass_loop;
- int16_t trans_buf[32 * 8];
- int16_t pass1[32 * 32];
- int16_t pass2[32 * 32];
- int16_t *out;
- int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16;
- int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16;
-
- for (idct32_pass_loop = 0, out = pass1;
- idct32_pass_loop < 2;
- idct32_pass_loop++,
- input = pass1, // the input of pass2 is the result of pass1
- out = pass2) {
- for (i = 0;
- i < 4; i++,
- input += 32 * 8, out += 8) { // idct32_bands_loop
- idct32_transpose_pair(input, trans_buf);
-
- // -----------------------------------------
- // BLOCK A: 16-19,28-31
- // -----------------------------------------
- // generate 16,17,30,31
- // part of stage 1
- LOAD_FROM_TRANSPOSED(0, 1, 31)
- DO_BUTTERFLY_STD(cospi_31_64, cospi_1_64, &q0s16, &q2s16)
- LOAD_FROM_TRANSPOSED(31, 17, 15)
- DO_BUTTERFLY_STD(cospi_15_64, cospi_17_64, &q1s16, &q3s16)
- // part of stage 2
- q4s16 = vaddq_s16(q0s16, q1s16);
- q13s16 = vsubq_s16(q0s16, q1s16);
- q6s16 = vaddq_s16(q2s16, q3s16);
- q14s16 = vsubq_s16(q2s16, q3s16);
- // part of stage 3
- DO_BUTTERFLY_STD(cospi_28_64, cospi_4_64, &q5s16, &q7s16)
-
- // generate 18,19,28,29
- // part of stage 1
- LOAD_FROM_TRANSPOSED(15, 9, 23)
- DO_BUTTERFLY_STD(cospi_23_64, cospi_9_64, &q0s16, &q2s16)
- LOAD_FROM_TRANSPOSED(23, 25, 7)
- DO_BUTTERFLY_STD(cospi_7_64, cospi_25_64, &q1s16, &q3s16)
- // part of stage 2
- q13s16 = vsubq_s16(q3s16, q2s16);
- q3s16 = vaddq_s16(q3s16, q2s16);
- q14s16 = vsubq_s16(q1s16, q0s16);
- q2s16 = vaddq_s16(q1s16, q0s16);
- // part of stage 3
- DO_BUTTERFLY_STD(-cospi_4_64, -cospi_28_64, &q1s16, &q0s16)
- // part of stage 4
- q8s16 = vaddq_s16(q4s16, q2s16);
- q9s16 = vaddq_s16(q5s16, q0s16);
- q10s16 = vaddq_s16(q7s16, q1s16);
- q15s16 = vaddq_s16(q6s16, q3s16);
- q13s16 = vsubq_s16(q5s16, q0s16);
- q14s16 = vsubq_s16(q7s16, q1s16);
- STORE_IN_OUTPUT(0, 16, 31, q8s16, q15s16)
- STORE_IN_OUTPUT(31, 17, 30, q9s16, q10s16)
- // part of stage 5
- DO_BUTTERFLY_STD(cospi_24_64, cospi_8_64, &q0s16, &q1s16)
- STORE_IN_OUTPUT(30, 29, 18, q1s16, q0s16)
- // part of stage 4
- q13s16 = vsubq_s16(q4s16, q2s16);
- q14s16 = vsubq_s16(q6s16, q3s16);
- // part of stage 5
- DO_BUTTERFLY_STD(cospi_24_64, cospi_8_64, &q4s16, &q6s16)
- STORE_IN_OUTPUT(18, 19, 28, q4s16, q6s16)
-
- // -----------------------------------------
- // BLOCK B: 20-23,24-27
- // -----------------------------------------
- // generate 20,21,26,27
- // part of stage 1
- LOAD_FROM_TRANSPOSED(7, 5, 27)
- DO_BUTTERFLY_STD(cospi_27_64, cospi_5_64, &q0s16, &q2s16)
- LOAD_FROM_TRANSPOSED(27, 21, 11)
- DO_BUTTERFLY_STD(cospi_11_64, cospi_21_64, &q1s16, &q3s16)
- // part of stage 2
- q13s16 = vsubq_s16(q0s16, q1s16);
- q0s16 = vaddq_s16(q0s16, q1s16);
- q14s16 = vsubq_s16(q2s16, q3s16);
- q2s16 = vaddq_s16(q2s16, q3s16);
- // part of stage 3
- DO_BUTTERFLY_STD(cospi_12_64, cospi_20_64, &q1s16, &q3s16)
-
- // generate 22,23,24,25
- // part of stage 1
- LOAD_FROM_TRANSPOSED(11, 13, 19)
- DO_BUTTERFLY_STD(cospi_19_64, cospi_13_64, &q5s16, &q7s16)
- LOAD_FROM_TRANSPOSED(19, 29, 3)
- DO_BUTTERFLY_STD(cospi_3_64, cospi_29_64, &q4s16, &q6s16)
- // part of stage 2
- q14s16 = vsubq_s16(q4s16, q5s16);
- q5s16 = vaddq_s16(q4s16, q5s16);
- q13s16 = vsubq_s16(q6s16, q7s16);
- q6s16 = vaddq_s16(q6s16, q7s16);
- // part of stage 3
- DO_BUTTERFLY_STD(-cospi_20_64, -cospi_12_64, &q4s16, &q7s16)
- // part of stage 4
- q10s16 = vaddq_s16(q7s16, q1s16);
- q11s16 = vaddq_s16(q5s16, q0s16);
- q12s16 = vaddq_s16(q6s16, q2s16);
- q15s16 = vaddq_s16(q4s16, q3s16);
- // part of stage 6
- LOAD_FROM_OUTPUT(28, 16, 17, q14s16, q13s16)
- q8s16 = vaddq_s16(q14s16, q11s16);
- q9s16 = vaddq_s16(q13s16, q10s16);
- q13s16 = vsubq_s16(q13s16, q10s16);
- q11s16 = vsubq_s16(q14s16, q11s16);
- STORE_IN_OUTPUT(17, 17, 16, q9s16, q8s16)
- LOAD_FROM_OUTPUT(16, 30, 31, q14s16, q9s16)
- q8s16 = vsubq_s16(q9s16, q12s16);
- q10s16 = vaddq_s16(q14s16, q15s16);
- q14s16 = vsubq_s16(q14s16, q15s16);
- q12s16 = vaddq_s16(q9s16, q12s16);
- STORE_IN_OUTPUT(31, 30, 31, q10s16, q12s16)
- // part of stage 7
- DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q13s16, &q14s16)
- STORE_IN_OUTPUT(31, 25, 22, q14s16, q13s16)
- q13s16 = q11s16;
- q14s16 = q8s16;
- DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q13s16, &q14s16)
- STORE_IN_OUTPUT(22, 24, 23, q14s16, q13s16)
- // part of stage 4
- q14s16 = vsubq_s16(q5s16, q0s16);
- q13s16 = vsubq_s16(q6s16, q2s16);
- DO_BUTTERFLY_STD(-cospi_8_64, -cospi_24_64, &q5s16, &q6s16);
- q14s16 = vsubq_s16(q7s16, q1s16);
- q13s16 = vsubq_s16(q4s16, q3s16);
- DO_BUTTERFLY_STD(-cospi_8_64, -cospi_24_64, &q0s16, &q1s16);
- // part of stage 6
- LOAD_FROM_OUTPUT(23, 18, 19, q14s16, q13s16)
- q8s16 = vaddq_s16(q14s16, q1s16);
- q9s16 = vaddq_s16(q13s16, q6s16);
- q13s16 = vsubq_s16(q13s16, q6s16);
- q1s16 = vsubq_s16(q14s16, q1s16);
- STORE_IN_OUTPUT(19, 18, 19, q8s16, q9s16)
- LOAD_FROM_OUTPUT(19, 28, 29, q8s16, q9s16)
- q14s16 = vsubq_s16(q8s16, q5s16);
- q10s16 = vaddq_s16(q8s16, q5s16);
- q11s16 = vaddq_s16(q9s16, q0s16);
- q0s16 = vsubq_s16(q9s16, q0s16);
- STORE_IN_OUTPUT(29, 28, 29, q10s16, q11s16)
- // part of stage 7
- DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q13s16, &q14s16)
- STORE_IN_OUTPUT(29, 20, 27, q13s16, q14s16)
- DO_BUTTERFLY(q0s16, q1s16, cospi_16_64, cospi_16_64,
- &q1s16, &q0s16);
- STORE_IN_OUTPUT(27, 21, 26, q1s16, q0s16)
-
- // -----------------------------------------
- // BLOCK C: 8-10,11-15
- // -----------------------------------------
- // generate 8,9,14,15
- // part of stage 2
- LOAD_FROM_TRANSPOSED(3, 2, 30)
- DO_BUTTERFLY_STD(cospi_30_64, cospi_2_64, &q0s16, &q2s16)
- LOAD_FROM_TRANSPOSED(30, 18, 14)
- DO_BUTTERFLY_STD(cospi_14_64, cospi_18_64, &q1s16, &q3s16)
- // part of stage 3
- q13s16 = vsubq_s16(q0s16, q1s16);
- q0s16 = vaddq_s16(q0s16, q1s16);
- q14s16 = vsubq_s16(q2s16, q3s16);
- q2s16 = vaddq_s16(q2s16, q3s16);
- // part of stage 4
- DO_BUTTERFLY_STD(cospi_24_64, cospi_8_64, &q1s16, &q3s16)
-
- // generate 10,11,12,13
- // part of stage 2
- LOAD_FROM_TRANSPOSED(14, 10, 22)
- DO_BUTTERFLY_STD(cospi_22_64, cospi_10_64, &q5s16, &q7s16)
- LOAD_FROM_TRANSPOSED(22, 26, 6)
- DO_BUTTERFLY_STD(cospi_6_64, cospi_26_64, &q4s16, &q6s16)
- // part of stage 3
- q14s16 = vsubq_s16(q4s16, q5s16);
- q5s16 = vaddq_s16(q4s16, q5s16);
- q13s16 = vsubq_s16(q6s16, q7s16);
- q6s16 = vaddq_s16(q6s16, q7s16);
- // part of stage 4
- DO_BUTTERFLY_STD(-cospi_8_64, -cospi_24_64, &q4s16, &q7s16)
- // part of stage 5
- q8s16 = vaddq_s16(q0s16, q5s16);
- q9s16 = vaddq_s16(q1s16, q7s16);
- q13s16 = vsubq_s16(q1s16, q7s16);
- q14s16 = vsubq_s16(q3s16, q4s16);
- q10s16 = vaddq_s16(q3s16, q4s16);
- q15s16 = vaddq_s16(q2s16, q6s16);
- STORE_IN_OUTPUT(26, 8, 15, q8s16, q15s16)
- STORE_IN_OUTPUT(15, 9, 14, q9s16, q10s16)
- // part of stage 6
- DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q1s16, &q3s16)
- STORE_IN_OUTPUT(14, 13, 10, q3s16, q1s16)
- q13s16 = vsubq_s16(q0s16, q5s16);
- q14s16 = vsubq_s16(q2s16, q6s16);
- DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q1s16, &q3s16)
- STORE_IN_OUTPUT(10, 11, 12, q1s16, q3s16)
-
- // -----------------------------------------
- // BLOCK D: 0-3,4-7
- // -----------------------------------------
- // generate 4,5,6,7
- // part of stage 3
- LOAD_FROM_TRANSPOSED(6, 4, 28)
- DO_BUTTERFLY_STD(cospi_28_64, cospi_4_64, &q0s16, &q2s16)
- LOAD_FROM_TRANSPOSED(28, 20, 12)
- DO_BUTTERFLY_STD(cospi_12_64, cospi_20_64, &q1s16, &q3s16)
- // part of stage 4
- q13s16 = vsubq_s16(q0s16, q1s16);
- q0s16 = vaddq_s16(q0s16, q1s16);
- q14s16 = vsubq_s16(q2s16, q3s16);
- q2s16 = vaddq_s16(q2s16, q3s16);
- // part of stage 5
- DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q1s16, &q3s16)
-
- // generate 0,1,2,3
- // part of stage 4
- LOAD_FROM_TRANSPOSED(12, 0, 16)
- DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q5s16, &q7s16)
- LOAD_FROM_TRANSPOSED(16, 8, 24)
- DO_BUTTERFLY_STD(cospi_24_64, cospi_8_64, &q14s16, &q6s16)
- // part of stage 5
- q4s16 = vaddq_s16(q7s16, q6s16);
- q7s16 = vsubq_s16(q7s16, q6s16);
- q6s16 = vsubq_s16(q5s16, q14s16);
- q5s16 = vaddq_s16(q5s16, q14s16);
- // part of stage 6
- q8s16 = vaddq_s16(q4s16, q2s16);
- q9s16 = vaddq_s16(q5s16, q3s16);
- q10s16 = vaddq_s16(q6s16, q1s16);
- q11s16 = vaddq_s16(q7s16, q0s16);
- q12s16 = vsubq_s16(q7s16, q0s16);
- q13s16 = vsubq_s16(q6s16, q1s16);
- q14s16 = vsubq_s16(q5s16, q3s16);
- q15s16 = vsubq_s16(q4s16, q2s16);
- // part of stage 7
- LOAD_FROM_OUTPUT(12, 14, 15, q0s16, q1s16)
- q2s16 = vaddq_s16(q8s16, q1s16);
- q3s16 = vaddq_s16(q9s16, q0s16);
- q4s16 = vsubq_s16(q9s16, q0s16);
- q5s16 = vsubq_s16(q8s16, q1s16);
- LOAD_FROM_OUTPUT(15, 16, 17, q0s16, q1s16)
- q8s16 = vaddq_s16(q4s16, q1s16);
- q9s16 = vaddq_s16(q5s16, q0s16);
- q6s16 = vsubq_s16(q5s16, q0s16);
- q7s16 = vsubq_s16(q4s16, q1s16);
-
- if (idct32_pass_loop == 0) {
- idct32_bands_end_1st_pass(out,
- q2s16, q3s16, q6s16, q7s16, q8s16, q9s16,
- q10s16, q11s16, q12s16, q13s16, q14s16, q15s16);
- } else {
- idct32_bands_end_2nd_pass(out, dest, stride,
- q2s16, q3s16, q6s16, q7s16, q8s16, q9s16,
- q10s16, q11s16, q12s16, q13s16, q14s16, q15s16);
- dest += 8;
- }
- }
- }
- return;
-}
diff --git a/thirdparty/libvpx/vpx_dsp/arm/idct4x4_1_add_neon.c b/thirdparty/libvpx/vpx_dsp/arm/idct4x4_1_add_neon.c
deleted file mode 100644
index ea618700c9..0000000000
--- a/thirdparty/libvpx/vpx_dsp/arm/idct4x4_1_add_neon.c
+++ /dev/null
@@ -1,50 +0,0 @@
-/*
- * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include <arm_neon.h>
-
-#include "vpx_dsp/inv_txfm.h"
-#include "vpx_ports/mem.h"
-
-void vpx_idct4x4_1_add_neon(
- int16_t *input,
- uint8_t *dest,
- int dest_stride) {
- uint8x8_t d6u8;
- uint32x2_t d2u32 = vdup_n_u32(0);
- uint16x8_t q8u16;
- int16x8_t q0s16;
- uint8_t *d1, *d2;
- int16_t i, a1, cospi_16_64 = 11585;
- int16_t out = dct_const_round_shift(input[0] * cospi_16_64);
- out = dct_const_round_shift(out * cospi_16_64);
- a1 = ROUND_POWER_OF_TWO(out, 4);
-
- q0s16 = vdupq_n_s16(a1);
-
- // dc_only_idct_add
- d1 = d2 = dest;
- for (i = 0; i < 2; i++) {
- d2u32 = vld1_lane_u32((const uint32_t *)d1, d2u32, 0);
- d1 += dest_stride;
- d2u32 = vld1_lane_u32((const uint32_t *)d1, d2u32, 1);
- d1 += dest_stride;
-
- q8u16 = vaddw_u8(vreinterpretq_u16_s16(q0s16),
- vreinterpret_u8_u32(d2u32));
- d6u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16));
-
- vst1_lane_u32((uint32_t *)d2, vreinterpret_u32_u8(d6u8), 0);
- d2 += dest_stride;
- vst1_lane_u32((uint32_t *)d2, vreinterpret_u32_u8(d6u8), 1);
- d2 += dest_stride;
- }
- return;
-}
diff --git a/thirdparty/libvpx/vpx_dsp/arm/idct4x4_add_neon.c b/thirdparty/libvpx/vpx_dsp/arm/idct4x4_add_neon.c
deleted file mode 100644
index 3c975c99b7..0000000000
--- a/thirdparty/libvpx/vpx_dsp/arm/idct4x4_add_neon.c
+++ /dev/null
@@ -1,151 +0,0 @@
-/*
- * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include <arm_neon.h>
-
-void vpx_idct4x4_16_add_neon(
- int16_t *input,
- uint8_t *dest,
- int dest_stride) {
- uint8x8_t d26u8, d27u8;
- uint32x2_t d26u32, d27u32;
- uint16x8_t q8u16, q9u16;
- int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16;
- int16x4_t d22s16, d23s16, d24s16, d26s16, d27s16, d28s16, d29s16;
- int16x8_t q8s16, q9s16, q13s16, q14s16;
- int32x4_t q1s32, q13s32, q14s32, q15s32;
- int16x4x2_t d0x2s16, d1x2s16;
- int32x4x2_t q0x2s32;
- uint8_t *d;
- int16_t cospi_8_64 = 15137;
- int16_t cospi_16_64 = 11585;
- int16_t cospi_24_64 = 6270;
-
- d26u32 = d27u32 = vdup_n_u32(0);
-
- q8s16 = vld1q_s16(input);
- q9s16 = vld1q_s16(input + 8);
-
- d16s16 = vget_low_s16(q8s16);
- d17s16 = vget_high_s16(q8s16);
- d18s16 = vget_low_s16(q9s16);
- d19s16 = vget_high_s16(q9s16);
-
- d0x2s16 = vtrn_s16(d16s16, d17s16);
- d1x2s16 = vtrn_s16(d18s16, d19s16);
- q8s16 = vcombine_s16(d0x2s16.val[0], d0x2s16.val[1]);
- q9s16 = vcombine_s16(d1x2s16.val[0], d1x2s16.val[1]);
-
- d20s16 = vdup_n_s16(cospi_8_64);
- d21s16 = vdup_n_s16(cospi_16_64);
-
- q0x2s32 = vtrnq_s32(vreinterpretq_s32_s16(q8s16),
- vreinterpretq_s32_s16(q9s16));
- d16s16 = vget_low_s16(vreinterpretq_s16_s32(q0x2s32.val[0]));
- d17s16 = vget_high_s16(vreinterpretq_s16_s32(q0x2s32.val[0]));
- d18s16 = vget_low_s16(vreinterpretq_s16_s32(q0x2s32.val[1]));
- d19s16 = vget_high_s16(vreinterpretq_s16_s32(q0x2s32.val[1]));
-
- d22s16 = vdup_n_s16(cospi_24_64);
-
- // stage 1
- d23s16 = vadd_s16(d16s16, d18s16);
- d24s16 = vsub_s16(d16s16, d18s16);
-
- q15s32 = vmull_s16(d17s16, d22s16);
- q1s32 = vmull_s16(d17s16, d20s16);
- q13s32 = vmull_s16(d23s16, d21s16);
- q14s32 = vmull_s16(d24s16, d21s16);
-
- q15s32 = vmlsl_s16(q15s32, d19s16, d20s16);
- q1s32 = vmlal_s16(q1s32, d19s16, d22s16);
-
- d26s16 = vqrshrn_n_s32(q13s32, 14);
- d27s16 = vqrshrn_n_s32(q14s32, 14);
- d29s16 = vqrshrn_n_s32(q15s32, 14);
- d28s16 = vqrshrn_n_s32(q1s32, 14);
- q13s16 = vcombine_s16(d26s16, d27s16);
- q14s16 = vcombine_s16(d28s16, d29s16);
-
- // stage 2
- q8s16 = vaddq_s16(q13s16, q14s16);
- q9s16 = vsubq_s16(q13s16, q14s16);
-
- d16s16 = vget_low_s16(q8s16);
- d17s16 = vget_high_s16(q8s16);
- d18s16 = vget_high_s16(q9s16); // vswp d18 d19
- d19s16 = vget_low_s16(q9s16);
-
- d0x2s16 = vtrn_s16(d16s16, d17s16);
- d1x2s16 = vtrn_s16(d18s16, d19s16);
- q8s16 = vcombine_s16(d0x2s16.val[0], d0x2s16.val[1]);
- q9s16 = vcombine_s16(d1x2s16.val[0], d1x2s16.val[1]);
-
- q0x2s32 = vtrnq_s32(vreinterpretq_s32_s16(q8s16),
- vreinterpretq_s32_s16(q9s16));
- d16s16 = vget_low_s16(vreinterpretq_s16_s32(q0x2s32.val[0]));
- d17s16 = vget_high_s16(vreinterpretq_s16_s32(q0x2s32.val[0]));
- d18s16 = vget_low_s16(vreinterpretq_s16_s32(q0x2s32.val[1]));
- d19s16 = vget_high_s16(vreinterpretq_s16_s32(q0x2s32.val[1]));
-
- // do the transform on columns
- // stage 1
- d23s16 = vadd_s16(d16s16, d18s16);
- d24s16 = vsub_s16(d16s16, d18s16);
-
- q15s32 = vmull_s16(d17s16, d22s16);
- q1s32 = vmull_s16(d17s16, d20s16);
- q13s32 = vmull_s16(d23s16, d21s16);
- q14s32 = vmull_s16(d24s16, d21s16);
-
- q15s32 = vmlsl_s16(q15s32, d19s16, d20s16);
- q1s32 = vmlal_s16(q1s32, d19s16, d22s16);
-
- d26s16 = vqrshrn_n_s32(q13s32, 14);
- d27s16 = vqrshrn_n_s32(q14s32, 14);
- d29s16 = vqrshrn_n_s32(q15s32, 14);
- d28s16 = vqrshrn_n_s32(q1s32, 14);
- q13s16 = vcombine_s16(d26s16, d27s16);
- q14s16 = vcombine_s16(d28s16, d29s16);
-
- // stage 2
- q8s16 = vaddq_s16(q13s16, q14s16);
- q9s16 = vsubq_s16(q13s16, q14s16);
-
- q8s16 = vrshrq_n_s16(q8s16, 4);
- q9s16 = vrshrq_n_s16(q9s16, 4);
-
- d = dest;
- d26u32 = vld1_lane_u32((const uint32_t *)d, d26u32, 0);
- d += dest_stride;
- d26u32 = vld1_lane_u32((const uint32_t *)d, d26u32, 1);
- d += dest_stride;
- d27u32 = vld1_lane_u32((const uint32_t *)d, d27u32, 1);
- d += dest_stride;
- d27u32 = vld1_lane_u32((const uint32_t *)d, d27u32, 0);
-
- q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16),
- vreinterpret_u8_u32(d26u32));
- q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16),
- vreinterpret_u8_u32(d27u32));
-
- d26u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16));
- d27u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16));
-
- d = dest;
- vst1_lane_u32((uint32_t *)d, vreinterpret_u32_u8(d26u8), 0);
- d += dest_stride;
- vst1_lane_u32((uint32_t *)d, vreinterpret_u32_u8(d26u8), 1);
- d += dest_stride;
- vst1_lane_u32((uint32_t *)d, vreinterpret_u32_u8(d27u8), 1);
- d += dest_stride;
- vst1_lane_u32((uint32_t *)d, vreinterpret_u32_u8(d27u8), 0);
- return;
-}
diff --git a/thirdparty/libvpx/vpx_dsp/arm/idct8x8_1_add_neon.c b/thirdparty/libvpx/vpx_dsp/arm/idct8x8_1_add_neon.c
deleted file mode 100644
index c1b801fad5..0000000000
--- a/thirdparty/libvpx/vpx_dsp/arm/idct8x8_1_add_neon.c
+++ /dev/null
@@ -1,64 +0,0 @@
-/*
- * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include <arm_neon.h>
-
-#include "vpx_dsp/inv_txfm.h"
-#include "vpx_ports/mem.h"
-
-void vpx_idct8x8_1_add_neon(
- int16_t *input,
- uint8_t *dest,
- int dest_stride) {
- uint8x8_t d2u8, d3u8, d30u8, d31u8;
- uint64x1_t d2u64, d3u64, d4u64, d5u64;
- uint16x8_t q0u16, q9u16, q10u16, q11u16, q12u16;
- int16x8_t q0s16;
- uint8_t *d1, *d2;
- int16_t i, a1, cospi_16_64 = 11585;
- int16_t out = dct_const_round_shift(input[0] * cospi_16_64);
- out = dct_const_round_shift(out * cospi_16_64);
- a1 = ROUND_POWER_OF_TWO(out, 5);
-
- q0s16 = vdupq_n_s16(a1);
- q0u16 = vreinterpretq_u16_s16(q0s16);
-
- d1 = d2 = dest;
- for (i = 0; i < 2; i++) {
- d2u64 = vld1_u64((const uint64_t *)d1);
- d1 += dest_stride;
- d3u64 = vld1_u64((const uint64_t *)d1);
- d1 += dest_stride;
- d4u64 = vld1_u64((const uint64_t *)d1);
- d1 += dest_stride;
- d5u64 = vld1_u64((const uint64_t *)d1);
- d1 += dest_stride;
-
- q9u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d2u64));
- q10u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d3u64));
- q11u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d4u64));
- q12u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d5u64));
-
- d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16));
- d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16));
- d30u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16));
- d31u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16));
-
- vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8));
- d2 += dest_stride;
- vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d3u8));
- d2 += dest_stride;
- vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d30u8));
- d2 += dest_stride;
- vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d31u8));
- d2 += dest_stride;
- }
- return;
-}
diff --git a/thirdparty/libvpx/vpx_dsp/arm/idct8x8_add_neon.c b/thirdparty/libvpx/vpx_dsp/arm/idct8x8_add_neon.c
deleted file mode 100644
index 4b2c2a6f83..0000000000
--- a/thirdparty/libvpx/vpx_dsp/arm/idct8x8_add_neon.c
+++ /dev/null
@@ -1,540 +0,0 @@
-/*
- * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include <arm_neon.h>
-
-#include "./vpx_config.h"
-#include "vpx_dsp/txfm_common.h"
-
-static INLINE void TRANSPOSE8X8(
- int16x8_t *q8s16,
- int16x8_t *q9s16,
- int16x8_t *q10s16,
- int16x8_t *q11s16,
- int16x8_t *q12s16,
- int16x8_t *q13s16,
- int16x8_t *q14s16,
- int16x8_t *q15s16) {
- int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16;
- int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16;
- int32x4x2_t q0x2s32, q1x2s32, q2x2s32, q3x2s32;
- int16x8x2_t q0x2s16, q1x2s16, q2x2s16, q3x2s16;
-
- d16s16 = vget_low_s16(*q8s16);
- d17s16 = vget_high_s16(*q8s16);
- d18s16 = vget_low_s16(*q9s16);
- d19s16 = vget_high_s16(*q9s16);
- d20s16 = vget_low_s16(*q10s16);
- d21s16 = vget_high_s16(*q10s16);
- d22s16 = vget_low_s16(*q11s16);
- d23s16 = vget_high_s16(*q11s16);
- d24s16 = vget_low_s16(*q12s16);
- d25s16 = vget_high_s16(*q12s16);
- d26s16 = vget_low_s16(*q13s16);
- d27s16 = vget_high_s16(*q13s16);
- d28s16 = vget_low_s16(*q14s16);
- d29s16 = vget_high_s16(*q14s16);
- d30s16 = vget_low_s16(*q15s16);
- d31s16 = vget_high_s16(*q15s16);
-
- *q8s16 = vcombine_s16(d16s16, d24s16); // vswp d17, d24
- *q9s16 = vcombine_s16(d18s16, d26s16); // vswp d19, d26
- *q10s16 = vcombine_s16(d20s16, d28s16); // vswp d21, d28
- *q11s16 = vcombine_s16(d22s16, d30s16); // vswp d23, d30
- *q12s16 = vcombine_s16(d17s16, d25s16);
- *q13s16 = vcombine_s16(d19s16, d27s16);
- *q14s16 = vcombine_s16(d21s16, d29s16);
- *q15s16 = vcombine_s16(d23s16, d31s16);
-
- q0x2s32 = vtrnq_s32(vreinterpretq_s32_s16(*q8s16),
- vreinterpretq_s32_s16(*q10s16));
- q1x2s32 = vtrnq_s32(vreinterpretq_s32_s16(*q9s16),
- vreinterpretq_s32_s16(*q11s16));
- q2x2s32 = vtrnq_s32(vreinterpretq_s32_s16(*q12s16),
- vreinterpretq_s32_s16(*q14s16));
- q3x2s32 = vtrnq_s32(vreinterpretq_s32_s16(*q13s16),
- vreinterpretq_s32_s16(*q15s16));
-
- q0x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q0x2s32.val[0]), // q8
- vreinterpretq_s16_s32(q1x2s32.val[0])); // q9
- q1x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q0x2s32.val[1]), // q10
- vreinterpretq_s16_s32(q1x2s32.val[1])); // q11
- q2x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q2x2s32.val[0]), // q12
- vreinterpretq_s16_s32(q3x2s32.val[0])); // q13
- q3x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q2x2s32.val[1]), // q14
- vreinterpretq_s16_s32(q3x2s32.val[1])); // q15
-
- *q8s16 = q0x2s16.val[0];
- *q9s16 = q0x2s16.val[1];
- *q10s16 = q1x2s16.val[0];
- *q11s16 = q1x2s16.val[1];
- *q12s16 = q2x2s16.val[0];
- *q13s16 = q2x2s16.val[1];
- *q14s16 = q3x2s16.val[0];
- *q15s16 = q3x2s16.val[1];
- return;
-}
-
-static INLINE void IDCT8x8_1D(
- int16x8_t *q8s16,
- int16x8_t *q9s16,
- int16x8_t *q10s16,
- int16x8_t *q11s16,
- int16x8_t *q12s16,
- int16x8_t *q13s16,
- int16x8_t *q14s16,
- int16x8_t *q15s16) {
- int16x4_t d0s16, d1s16, d2s16, d3s16;
- int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16;
- int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16;
- int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16;
- int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16;
- int32x4_t q2s32, q3s32, q5s32, q6s32, q8s32, q9s32;
- int32x4_t q10s32, q11s32, q12s32, q13s32, q15s32;
-
- d0s16 = vdup_n_s16(cospi_28_64);
- d1s16 = vdup_n_s16(cospi_4_64);
- d2s16 = vdup_n_s16(cospi_12_64);
- d3s16 = vdup_n_s16(cospi_20_64);
-
- d16s16 = vget_low_s16(*q8s16);
- d17s16 = vget_high_s16(*q8s16);
- d18s16 = vget_low_s16(*q9s16);
- d19s16 = vget_high_s16(*q9s16);
- d20s16 = vget_low_s16(*q10s16);
- d21s16 = vget_high_s16(*q10s16);
- d22s16 = vget_low_s16(*q11s16);
- d23s16 = vget_high_s16(*q11s16);
- d24s16 = vget_low_s16(*q12s16);
- d25s16 = vget_high_s16(*q12s16);
- d26s16 = vget_low_s16(*q13s16);
- d27s16 = vget_high_s16(*q13s16);
- d28s16 = vget_low_s16(*q14s16);
- d29s16 = vget_high_s16(*q14s16);
- d30s16 = vget_low_s16(*q15s16);
- d31s16 = vget_high_s16(*q15s16);
-
- q2s32 = vmull_s16(d18s16, d0s16);
- q3s32 = vmull_s16(d19s16, d0s16);
- q5s32 = vmull_s16(d26s16, d2s16);
- q6s32 = vmull_s16(d27s16, d2s16);
-
- q2s32 = vmlsl_s16(q2s32, d30s16, d1s16);
- q3s32 = vmlsl_s16(q3s32, d31s16, d1s16);
- q5s32 = vmlsl_s16(q5s32, d22s16, d3s16);
- q6s32 = vmlsl_s16(q6s32, d23s16, d3s16);
-
- d8s16 = vqrshrn_n_s32(q2s32, 14);
- d9s16 = vqrshrn_n_s32(q3s32, 14);
- d10s16 = vqrshrn_n_s32(q5s32, 14);
- d11s16 = vqrshrn_n_s32(q6s32, 14);
- q4s16 = vcombine_s16(d8s16, d9s16);
- q5s16 = vcombine_s16(d10s16, d11s16);
-
- q2s32 = vmull_s16(d18s16, d1s16);
- q3s32 = vmull_s16(d19s16, d1s16);
- q9s32 = vmull_s16(d26s16, d3s16);
- q13s32 = vmull_s16(d27s16, d3s16);
-
- q2s32 = vmlal_s16(q2s32, d30s16, d0s16);
- q3s32 = vmlal_s16(q3s32, d31s16, d0s16);
- q9s32 = vmlal_s16(q9s32, d22s16, d2s16);
- q13s32 = vmlal_s16(q13s32, d23s16, d2s16);
-
- d14s16 = vqrshrn_n_s32(q2s32, 14);
- d15s16 = vqrshrn_n_s32(q3s32, 14);
- d12s16 = vqrshrn_n_s32(q9s32, 14);
- d13s16 = vqrshrn_n_s32(q13s32, 14);
- q6s16 = vcombine_s16(d12s16, d13s16);
- q7s16 = vcombine_s16(d14s16, d15s16);
-
- d0s16 = vdup_n_s16(cospi_16_64);
-
- q2s32 = vmull_s16(d16s16, d0s16);
- q3s32 = vmull_s16(d17s16, d0s16);
- q13s32 = vmull_s16(d16s16, d0s16);
- q15s32 = vmull_s16(d17s16, d0s16);
-
- q2s32 = vmlal_s16(q2s32, d24s16, d0s16);
- q3s32 = vmlal_s16(q3s32, d25s16, d0s16);
- q13s32 = vmlsl_s16(q13s32, d24s16, d0s16);
- q15s32 = vmlsl_s16(q15s32, d25s16, d0s16);
-
- d0s16 = vdup_n_s16(cospi_24_64);
- d1s16 = vdup_n_s16(cospi_8_64);
-
- d18s16 = vqrshrn_n_s32(q2s32, 14);
- d19s16 = vqrshrn_n_s32(q3s32, 14);
- d22s16 = vqrshrn_n_s32(q13s32, 14);
- d23s16 = vqrshrn_n_s32(q15s32, 14);
- *q9s16 = vcombine_s16(d18s16, d19s16);
- *q11s16 = vcombine_s16(d22s16, d23s16);
-
- q2s32 = vmull_s16(d20s16, d0s16);
- q3s32 = vmull_s16(d21s16, d0s16);
- q8s32 = vmull_s16(d20s16, d1s16);
- q12s32 = vmull_s16(d21s16, d1s16);
-
- q2s32 = vmlsl_s16(q2s32, d28s16, d1s16);
- q3s32 = vmlsl_s16(q3s32, d29s16, d1s16);
- q8s32 = vmlal_s16(q8s32, d28s16, d0s16);
- q12s32 = vmlal_s16(q12s32, d29s16, d0s16);
-
- d26s16 = vqrshrn_n_s32(q2s32, 14);
- d27s16 = vqrshrn_n_s32(q3s32, 14);
- d30s16 = vqrshrn_n_s32(q8s32, 14);
- d31s16 = vqrshrn_n_s32(q12s32, 14);
- *q13s16 = vcombine_s16(d26s16, d27s16);
- *q15s16 = vcombine_s16(d30s16, d31s16);
-
- q0s16 = vaddq_s16(*q9s16, *q15s16);
- q1s16 = vaddq_s16(*q11s16, *q13s16);
- q2s16 = vsubq_s16(*q11s16, *q13s16);
- q3s16 = vsubq_s16(*q9s16, *q15s16);
-
- *q13s16 = vsubq_s16(q4s16, q5s16);
- q4s16 = vaddq_s16(q4s16, q5s16);
- *q14s16 = vsubq_s16(q7s16, q6s16);
- q7s16 = vaddq_s16(q7s16, q6s16);
- d26s16 = vget_low_s16(*q13s16);
- d27s16 = vget_high_s16(*q13s16);
- d28s16 = vget_low_s16(*q14s16);
- d29s16 = vget_high_s16(*q14s16);
-
- d16s16 = vdup_n_s16(cospi_16_64);
-
- q9s32 = vmull_s16(d28s16, d16s16);
- q10s32 = vmull_s16(d29s16, d16s16);
- q11s32 = vmull_s16(d28s16, d16s16);
- q12s32 = vmull_s16(d29s16, d16s16);
-
- q9s32 = vmlsl_s16(q9s32, d26s16, d16s16);
- q10s32 = vmlsl_s16(q10s32, d27s16, d16s16);
- q11s32 = vmlal_s16(q11s32, d26s16, d16s16);
- q12s32 = vmlal_s16(q12s32, d27s16, d16s16);
-
- d10s16 = vqrshrn_n_s32(q9s32, 14);
- d11s16 = vqrshrn_n_s32(q10s32, 14);
- d12s16 = vqrshrn_n_s32(q11s32, 14);
- d13s16 = vqrshrn_n_s32(q12s32, 14);
- q5s16 = vcombine_s16(d10s16, d11s16);
- q6s16 = vcombine_s16(d12s16, d13s16);
-
- *q8s16 = vaddq_s16(q0s16, q7s16);
- *q9s16 = vaddq_s16(q1s16, q6s16);
- *q10s16 = vaddq_s16(q2s16, q5s16);
- *q11s16 = vaddq_s16(q3s16, q4s16);
- *q12s16 = vsubq_s16(q3s16, q4s16);
- *q13s16 = vsubq_s16(q2s16, q5s16);
- *q14s16 = vsubq_s16(q1s16, q6s16);
- *q15s16 = vsubq_s16(q0s16, q7s16);
- return;
-}
-
-void vpx_idct8x8_64_add_neon(
- int16_t *input,
- uint8_t *dest,
- int dest_stride) {
- uint8_t *d1, *d2;
- uint8x8_t d0u8, d1u8, d2u8, d3u8;
- uint64x1_t d0u64, d1u64, d2u64, d3u64;
- int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16;
- uint16x8_t q8u16, q9u16, q10u16, q11u16;
-
- q8s16 = vld1q_s16(input);
- q9s16 = vld1q_s16(input + 8);
- q10s16 = vld1q_s16(input + 16);
- q11s16 = vld1q_s16(input + 24);
- q12s16 = vld1q_s16(input + 32);
- q13s16 = vld1q_s16(input + 40);
- q14s16 = vld1q_s16(input + 48);
- q15s16 = vld1q_s16(input + 56);
-
- TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16,
- &q12s16, &q13s16, &q14s16, &q15s16);
-
- IDCT8x8_1D(&q8s16, &q9s16, &q10s16, &q11s16,
- &q12s16, &q13s16, &q14s16, &q15s16);
-
- TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16,
- &q12s16, &q13s16, &q14s16, &q15s16);
-
- IDCT8x8_1D(&q8s16, &q9s16, &q10s16, &q11s16,
- &q12s16, &q13s16, &q14s16, &q15s16);
-
- q8s16 = vrshrq_n_s16(q8s16, 5);
- q9s16 = vrshrq_n_s16(q9s16, 5);
- q10s16 = vrshrq_n_s16(q10s16, 5);
- q11s16 = vrshrq_n_s16(q11s16, 5);
- q12s16 = vrshrq_n_s16(q12s16, 5);
- q13s16 = vrshrq_n_s16(q13s16, 5);
- q14s16 = vrshrq_n_s16(q14s16, 5);
- q15s16 = vrshrq_n_s16(q15s16, 5);
-
- d1 = d2 = dest;
-
- d0u64 = vld1_u64((uint64_t *)d1);
- d1 += dest_stride;
- d1u64 = vld1_u64((uint64_t *)d1);
- d1 += dest_stride;
- d2u64 = vld1_u64((uint64_t *)d1);
- d1 += dest_stride;
- d3u64 = vld1_u64((uint64_t *)d1);
- d1 += dest_stride;
-
- q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16),
- vreinterpret_u8_u64(d0u64));
- q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16),
- vreinterpret_u8_u64(d1u64));
- q10u16 = vaddw_u8(vreinterpretq_u16_s16(q10s16),
- vreinterpret_u8_u64(d2u64));
- q11u16 = vaddw_u8(vreinterpretq_u16_s16(q11s16),
- vreinterpret_u8_u64(d3u64));
-
- d0u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16));
- d1u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16));
- d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16));
- d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16));
-
- vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d0u8));
- d2 += dest_stride;
- vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d1u8));
- d2 += dest_stride;
- vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8));
- d2 += dest_stride;
- vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d3u8));
- d2 += dest_stride;
-
- q8s16 = q12s16;
- q9s16 = q13s16;
- q10s16 = q14s16;
- q11s16 = q15s16;
-
- d0u64 = vld1_u64((uint64_t *)d1);
- d1 += dest_stride;
- d1u64 = vld1_u64((uint64_t *)d1);
- d1 += dest_stride;
- d2u64 = vld1_u64((uint64_t *)d1);
- d1 += dest_stride;
- d3u64 = vld1_u64((uint64_t *)d1);
- d1 += dest_stride;
-
- q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16),
- vreinterpret_u8_u64(d0u64));
- q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16),
- vreinterpret_u8_u64(d1u64));
- q10u16 = vaddw_u8(vreinterpretq_u16_s16(q10s16),
- vreinterpret_u8_u64(d2u64));
- q11u16 = vaddw_u8(vreinterpretq_u16_s16(q11s16),
- vreinterpret_u8_u64(d3u64));
-
- d0u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16));
- d1u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16));
- d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16));
- d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16));
-
- vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d0u8));
- d2 += dest_stride;
- vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d1u8));
- d2 += dest_stride;
- vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8));
- d2 += dest_stride;
- vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d3u8));
- d2 += dest_stride;
- return;
-}
-
-void vpx_idct8x8_12_add_neon(
- int16_t *input,
- uint8_t *dest,
- int dest_stride) {
- uint8_t *d1, *d2;
- uint8x8_t d0u8, d1u8, d2u8, d3u8;
- int16x4_t d10s16, d11s16, d12s16, d13s16, d16s16;
- int16x4_t d26s16, d27s16, d28s16, d29s16;
- uint64x1_t d0u64, d1u64, d2u64, d3u64;
- int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16;
- int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16;
- uint16x8_t q8u16, q9u16, q10u16, q11u16;
- int32x4_t q9s32, q10s32, q11s32, q12s32;
-
- q8s16 = vld1q_s16(input);
- q9s16 = vld1q_s16(input + 8);
- q10s16 = vld1q_s16(input + 16);
- q11s16 = vld1q_s16(input + 24);
- q12s16 = vld1q_s16(input + 32);
- q13s16 = vld1q_s16(input + 40);
- q14s16 = vld1q_s16(input + 48);
- q15s16 = vld1q_s16(input + 56);
-
- TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16,
- &q12s16, &q13s16, &q14s16, &q15s16);
-
- // First transform rows
- // stage 1
- q0s16 = vdupq_n_s16(cospi_28_64 * 2);
- q1s16 = vdupq_n_s16(cospi_4_64 * 2);
-
- q4s16 = vqrdmulhq_s16(q9s16, q0s16);
-
- q0s16 = vdupq_n_s16(-cospi_20_64 * 2);
-
- q7s16 = vqrdmulhq_s16(q9s16, q1s16);
-
- q1s16 = vdupq_n_s16(cospi_12_64 * 2);
-
- q5s16 = vqrdmulhq_s16(q11s16, q0s16);
-
- q0s16 = vdupq_n_s16(cospi_16_64 * 2);
-
- q6s16 = vqrdmulhq_s16(q11s16, q1s16);
-
- // stage 2 & stage 3 - even half
- q1s16 = vdupq_n_s16(cospi_24_64 * 2);
-
- q9s16 = vqrdmulhq_s16(q8s16, q0s16);
-
- q0s16 = vdupq_n_s16(cospi_8_64 * 2);
-
- q13s16 = vqrdmulhq_s16(q10s16, q1s16);
-
- q15s16 = vqrdmulhq_s16(q10s16, q0s16);
-
- // stage 3 -odd half
- q0s16 = vaddq_s16(q9s16, q15s16);
- q1s16 = vaddq_s16(q9s16, q13s16);
- q2s16 = vsubq_s16(q9s16, q13s16);
- q3s16 = vsubq_s16(q9s16, q15s16);
-
- // stage 2 - odd half
- q13s16 = vsubq_s16(q4s16, q5s16);
- q4s16 = vaddq_s16(q4s16, q5s16);
- q14s16 = vsubq_s16(q7s16, q6s16);
- q7s16 = vaddq_s16(q7s16, q6s16);
- d26s16 = vget_low_s16(q13s16);
- d27s16 = vget_high_s16(q13s16);
- d28s16 = vget_low_s16(q14s16);
- d29s16 = vget_high_s16(q14s16);
-
- d16s16 = vdup_n_s16(cospi_16_64);
- q9s32 = vmull_s16(d28s16, d16s16);
- q10s32 = vmull_s16(d29s16, d16s16);
- q11s32 = vmull_s16(d28s16, d16s16);
- q12s32 = vmull_s16(d29s16, d16s16);
-
- q9s32 = vmlsl_s16(q9s32, d26s16, d16s16);
- q10s32 = vmlsl_s16(q10s32, d27s16, d16s16);
- q11s32 = vmlal_s16(q11s32, d26s16, d16s16);
- q12s32 = vmlal_s16(q12s32, d27s16, d16s16);
-
- d10s16 = vqrshrn_n_s32(q9s32, 14);
- d11s16 = vqrshrn_n_s32(q10s32, 14);
- d12s16 = vqrshrn_n_s32(q11s32, 14);
- d13s16 = vqrshrn_n_s32(q12s32, 14);
- q5s16 = vcombine_s16(d10s16, d11s16);
- q6s16 = vcombine_s16(d12s16, d13s16);
-
- // stage 4
- q8s16 = vaddq_s16(q0s16, q7s16);
- q9s16 = vaddq_s16(q1s16, q6s16);
- q10s16 = vaddq_s16(q2s16, q5s16);
- q11s16 = vaddq_s16(q3s16, q4s16);
- q12s16 = vsubq_s16(q3s16, q4s16);
- q13s16 = vsubq_s16(q2s16, q5s16);
- q14s16 = vsubq_s16(q1s16, q6s16);
- q15s16 = vsubq_s16(q0s16, q7s16);
-
- TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16,
- &q12s16, &q13s16, &q14s16, &q15s16);
-
- IDCT8x8_1D(&q8s16, &q9s16, &q10s16, &q11s16,
- &q12s16, &q13s16, &q14s16, &q15s16);
-
- q8s16 = vrshrq_n_s16(q8s16, 5);
- q9s16 = vrshrq_n_s16(q9s16, 5);
- q10s16 = vrshrq_n_s16(q10s16, 5);
- q11s16 = vrshrq_n_s16(q11s16, 5);
- q12s16 = vrshrq_n_s16(q12s16, 5);
- q13s16 = vrshrq_n_s16(q13s16, 5);
- q14s16 = vrshrq_n_s16(q14s16, 5);
- q15s16 = vrshrq_n_s16(q15s16, 5);
-
- d1 = d2 = dest;
-
- d0u64 = vld1_u64((uint64_t *)d1);
- d1 += dest_stride;
- d1u64 = vld1_u64((uint64_t *)d1);
- d1 += dest_stride;
- d2u64 = vld1_u64((uint64_t *)d1);
- d1 += dest_stride;
- d3u64 = vld1_u64((uint64_t *)d1);
- d1 += dest_stride;
-
- q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16),
- vreinterpret_u8_u64(d0u64));
- q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16),
- vreinterpret_u8_u64(d1u64));
- q10u16 = vaddw_u8(vreinterpretq_u16_s16(q10s16),
- vreinterpret_u8_u64(d2u64));
- q11u16 = vaddw_u8(vreinterpretq_u16_s16(q11s16),
- vreinterpret_u8_u64(d3u64));
-
- d0u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16));
- d1u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16));
- d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16));
- d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16));
-
- vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d0u8));
- d2 += dest_stride;
- vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d1u8));
- d2 += dest_stride;
- vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8));
- d2 += dest_stride;
- vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d3u8));
- d2 += dest_stride;
-
- q8s16 = q12s16;
- q9s16 = q13s16;
- q10s16 = q14s16;
- q11s16 = q15s16;
-
- d0u64 = vld1_u64((uint64_t *)d1);
- d1 += dest_stride;
- d1u64 = vld1_u64((uint64_t *)d1);
- d1 += dest_stride;
- d2u64 = vld1_u64((uint64_t *)d1);
- d1 += dest_stride;
- d3u64 = vld1_u64((uint64_t *)d1);
- d1 += dest_stride;
-
- q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16),
- vreinterpret_u8_u64(d0u64));
- q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16),
- vreinterpret_u8_u64(d1u64));
- q10u16 = vaddw_u8(vreinterpretq_u16_s16(q10s16),
- vreinterpret_u8_u64(d2u64));
- q11u16 = vaddw_u8(vreinterpretq_u16_s16(q11s16),
- vreinterpret_u8_u64(d3u64));
-
- d0u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16));
- d1u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16));
- d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16));
- d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16));
-
- vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d0u8));
- d2 += dest_stride;
- vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d1u8));
- d2 += dest_stride;
- vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8));
- d2 += dest_stride;
- vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d3u8));
- d2 += dest_stride;
- return;
-}
diff --git a/thirdparty/libvpx/vpx_dsp/arm/intrapred_neon.c b/thirdparty/libvpx/vpx_dsp/arm/intrapred_neon.c
deleted file mode 100644
index 0a376104d2..0000000000
--- a/thirdparty/libvpx/vpx_dsp/arm/intrapred_neon.c
+++ /dev/null
@@ -1,822 +0,0 @@
-/*
- * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include <arm_neon.h>
-
-#include "./vpx_config.h"
-#include "./vpx_dsp_rtcd.h"
-#include "vpx/vpx_integer.h"
-
-//------------------------------------------------------------------------------
-// DC 4x4
-
-// 'do_above' and 'do_left' facilitate branch removal when inlined.
-static INLINE void dc_4x4(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above, const uint8_t *left,
- int do_above, int do_left) {
- uint16x8_t sum_top;
- uint16x8_t sum_left;
- uint8x8_t dc0;
-
- if (do_above) {
- const uint8x8_t A = vld1_u8(above); // top row
- const uint16x4_t p0 = vpaddl_u8(A); // cascading summation of the top
- const uint16x4_t p1 = vpadd_u16(p0, p0);
- sum_top = vcombine_u16(p1, p1);
- }
-
- if (do_left) {
- const uint8x8_t L = vld1_u8(left); // left border
- const uint16x4_t p0 = vpaddl_u8(L); // cascading summation of the left
- const uint16x4_t p1 = vpadd_u16(p0, p0);
- sum_left = vcombine_u16(p1, p1);
- }
-
- if (do_above && do_left) {
- const uint16x8_t sum = vaddq_u16(sum_left, sum_top);
- dc0 = vrshrn_n_u16(sum, 3);
- } else if (do_above) {
- dc0 = vrshrn_n_u16(sum_top, 2);
- } else if (do_left) {
- dc0 = vrshrn_n_u16(sum_left, 2);
- } else {
- dc0 = vdup_n_u8(0x80);
- }
-
- {
- const uint8x8_t dc = vdup_lane_u8(dc0, 0);
- int i;
- for (i = 0; i < 4; ++i) {
- vst1_lane_u32((uint32_t*)(dst + i * stride), vreinterpret_u32_u8(dc), 0);
- }
- }
-}
-
-void vpx_dc_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above, const uint8_t *left) {
- dc_4x4(dst, stride, above, left, 1, 1);
-}
-
-void vpx_dc_left_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above, const uint8_t *left) {
- (void)above;
- dc_4x4(dst, stride, NULL, left, 0, 1);
-}
-
-void vpx_dc_top_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above, const uint8_t *left) {
- (void)left;
- dc_4x4(dst, stride, above, NULL, 1, 0);
-}
-
-void vpx_dc_128_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above, const uint8_t *left) {
- (void)above;
- (void)left;
- dc_4x4(dst, stride, NULL, NULL, 0, 0);
-}
-
-//------------------------------------------------------------------------------
-// DC 8x8
-
-// 'do_above' and 'do_left' facilitate branch removal when inlined.
-static INLINE void dc_8x8(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above, const uint8_t *left,
- int do_above, int do_left) {
- uint16x8_t sum_top;
- uint16x8_t sum_left;
- uint8x8_t dc0;
-
- if (do_above) {
- const uint8x8_t A = vld1_u8(above); // top row
- const uint16x4_t p0 = vpaddl_u8(A); // cascading summation of the top
- const uint16x4_t p1 = vpadd_u16(p0, p0);
- const uint16x4_t p2 = vpadd_u16(p1, p1);
- sum_top = vcombine_u16(p2, p2);
- }
-
- if (do_left) {
- const uint8x8_t L = vld1_u8(left); // left border
- const uint16x4_t p0 = vpaddl_u8(L); // cascading summation of the left
- const uint16x4_t p1 = vpadd_u16(p0, p0);
- const uint16x4_t p2 = vpadd_u16(p1, p1);
- sum_left = vcombine_u16(p2, p2);
- }
-
- if (do_above && do_left) {
- const uint16x8_t sum = vaddq_u16(sum_left, sum_top);
- dc0 = vrshrn_n_u16(sum, 4);
- } else if (do_above) {
- dc0 = vrshrn_n_u16(sum_top, 3);
- } else if (do_left) {
- dc0 = vrshrn_n_u16(sum_left, 3);
- } else {
- dc0 = vdup_n_u8(0x80);
- }
-
- {
- const uint8x8_t dc = vdup_lane_u8(dc0, 0);
- int i;
- for (i = 0; i < 8; ++i) {
- vst1_u32((uint32_t*)(dst + i * stride), vreinterpret_u32_u8(dc));
- }
- }
-}
-
-void vpx_dc_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above, const uint8_t *left) {
- dc_8x8(dst, stride, above, left, 1, 1);
-}
-
-void vpx_dc_left_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above, const uint8_t *left) {
- (void)above;
- dc_8x8(dst, stride, NULL, left, 0, 1);
-}
-
-void vpx_dc_top_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above, const uint8_t *left) {
- (void)left;
- dc_8x8(dst, stride, above, NULL, 1, 0);
-}
-
-void vpx_dc_128_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above, const uint8_t *left) {
- (void)above;
- (void)left;
- dc_8x8(dst, stride, NULL, NULL, 0, 0);
-}
-
-//------------------------------------------------------------------------------
-// DC 16x16
-
-// 'do_above' and 'do_left' facilitate branch removal when inlined.
-static INLINE void dc_16x16(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above, const uint8_t *left,
- int do_above, int do_left) {
- uint16x8_t sum_top;
- uint16x8_t sum_left;
- uint8x8_t dc0;
-
- if (do_above) {
- const uint8x16_t A = vld1q_u8(above); // top row
- const uint16x8_t p0 = vpaddlq_u8(A); // cascading summation of the top
- const uint16x4_t p1 = vadd_u16(vget_low_u16(p0), vget_high_u16(p0));
- const uint16x4_t p2 = vpadd_u16(p1, p1);
- const uint16x4_t p3 = vpadd_u16(p2, p2);
- sum_top = vcombine_u16(p3, p3);
- }
-
- if (do_left) {
- const uint8x16_t L = vld1q_u8(left); // left row
- const uint16x8_t p0 = vpaddlq_u8(L); // cascading summation of the left
- const uint16x4_t p1 = vadd_u16(vget_low_u16(p0), vget_high_u16(p0));
- const uint16x4_t p2 = vpadd_u16(p1, p1);
- const uint16x4_t p3 = vpadd_u16(p2, p2);
- sum_left = vcombine_u16(p3, p3);
- }
-
- if (do_above && do_left) {
- const uint16x8_t sum = vaddq_u16(sum_left, sum_top);
- dc0 = vrshrn_n_u16(sum, 5);
- } else if (do_above) {
- dc0 = vrshrn_n_u16(sum_top, 4);
- } else if (do_left) {
- dc0 = vrshrn_n_u16(sum_left, 4);
- } else {
- dc0 = vdup_n_u8(0x80);
- }
-
- {
- const uint8x16_t dc = vdupq_lane_u8(dc0, 0);
- int i;
- for (i = 0; i < 16; ++i) {
- vst1q_u8(dst + i * stride, dc);
- }
- }
-}
-
-void vpx_dc_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above, const uint8_t *left) {
- dc_16x16(dst, stride, above, left, 1, 1);
-}
-
-void vpx_dc_left_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above,
- const uint8_t *left) {
- (void)above;
- dc_16x16(dst, stride, NULL, left, 0, 1);
-}
-
-void vpx_dc_top_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above,
- const uint8_t *left) {
- (void)left;
- dc_16x16(dst, stride, above, NULL, 1, 0);
-}
-
-void vpx_dc_128_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above,
- const uint8_t *left) {
- (void)above;
- (void)left;
- dc_16x16(dst, stride, NULL, NULL, 0, 0);
-}
-
-//------------------------------------------------------------------------------
-// DC 32x32
-
-// 'do_above' and 'do_left' facilitate branch removal when inlined.
-static INLINE void dc_32x32(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above, const uint8_t *left,
- int do_above, int do_left) {
- uint16x8_t sum_top;
- uint16x8_t sum_left;
- uint8x8_t dc0;
-
- if (do_above) {
- const uint8x16_t A0 = vld1q_u8(above); // top row
- const uint8x16_t A1 = vld1q_u8(above + 16);
- const uint16x8_t p0 = vpaddlq_u8(A0); // cascading summation of the top
- const uint16x8_t p1 = vpaddlq_u8(A1);
- const uint16x8_t p2 = vaddq_u16(p0, p1);
- const uint16x4_t p3 = vadd_u16(vget_low_u16(p2), vget_high_u16(p2));
- const uint16x4_t p4 = vpadd_u16(p3, p3);
- const uint16x4_t p5 = vpadd_u16(p4, p4);
- sum_top = vcombine_u16(p5, p5);
- }
-
- if (do_left) {
- const uint8x16_t L0 = vld1q_u8(left); // left row
- const uint8x16_t L1 = vld1q_u8(left + 16);
- const uint16x8_t p0 = vpaddlq_u8(L0); // cascading summation of the left
- const uint16x8_t p1 = vpaddlq_u8(L1);
- const uint16x8_t p2 = vaddq_u16(p0, p1);
- const uint16x4_t p3 = vadd_u16(vget_low_u16(p2), vget_high_u16(p2));
- const uint16x4_t p4 = vpadd_u16(p3, p3);
- const uint16x4_t p5 = vpadd_u16(p4, p4);
- sum_left = vcombine_u16(p5, p5);
- }
-
- if (do_above && do_left) {
- const uint16x8_t sum = vaddq_u16(sum_left, sum_top);
- dc0 = vrshrn_n_u16(sum, 6);
- } else if (do_above) {
- dc0 = vrshrn_n_u16(sum_top, 5);
- } else if (do_left) {
- dc0 = vrshrn_n_u16(sum_left, 5);
- } else {
- dc0 = vdup_n_u8(0x80);
- }
-
- {
- const uint8x16_t dc = vdupq_lane_u8(dc0, 0);
- int i;
- for (i = 0; i < 32; ++i) {
- vst1q_u8(dst + i * stride, dc);
- vst1q_u8(dst + i * stride + 16, dc);
- }
- }
-}
-
-void vpx_dc_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above, const uint8_t *left) {
- dc_32x32(dst, stride, above, left, 1, 1);
-}
-
-void vpx_dc_left_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above,
- const uint8_t *left) {
- (void)above;
- dc_32x32(dst, stride, NULL, left, 0, 1);
-}
-
-void vpx_dc_top_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above,
- const uint8_t *left) {
- (void)left;
- dc_32x32(dst, stride, above, NULL, 1, 0);
-}
-
-void vpx_dc_128_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above,
- const uint8_t *left) {
- (void)above;
- (void)left;
- dc_32x32(dst, stride, NULL, NULL, 0, 0);
-}
-
-// -----------------------------------------------------------------------------
-
-void vpx_d45_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above, const uint8_t *left) {
- const uint64x1_t A0 = vreinterpret_u64_u8(vld1_u8(above)); // top row
- const uint64x1_t A1 = vshr_n_u64(A0, 8);
- const uint64x1_t A2 = vshr_n_u64(A0, 16);
- const uint8x8_t ABCDEFGH = vreinterpret_u8_u64(A0);
- const uint8x8_t BCDEFGH0 = vreinterpret_u8_u64(A1);
- const uint8x8_t CDEFGH00 = vreinterpret_u8_u64(A2);
- const uint8x8_t avg1 = vhadd_u8(ABCDEFGH, CDEFGH00);
- const uint8x8_t avg2 = vrhadd_u8(avg1, BCDEFGH0);
- const uint64x1_t avg2_u64 = vreinterpret_u64_u8(avg2);
- const uint32x2_t r0 = vreinterpret_u32_u8(avg2);
- const uint32x2_t r1 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 8));
- const uint32x2_t r2 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 16));
- const uint32x2_t r3 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 24));
- (void)left;
- vst1_lane_u32((uint32_t *)(dst + 0 * stride), r0, 0);
- vst1_lane_u32((uint32_t *)(dst + 1 * stride), r1, 0);
- vst1_lane_u32((uint32_t *)(dst + 2 * stride), r2, 0);
- vst1_lane_u32((uint32_t *)(dst + 3 * stride), r3, 0);
- dst[3 * stride + 3] = above[7];
-}
-
-void vpx_d45_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above, const uint8_t *left) {
- static const uint8_t shuffle1[8] = { 1, 2, 3, 4, 5, 6, 7, 7 };
- static const uint8_t shuffle2[8] = { 2, 3, 4, 5, 6, 7, 7, 7 };
- const uint8x8_t sh_12345677 = vld1_u8(shuffle1);
- const uint8x8_t sh_23456777 = vld1_u8(shuffle2);
- const uint8x8_t A0 = vld1_u8(above); // top row
- const uint8x8_t A1 = vtbl1_u8(A0, sh_12345677);
- const uint8x8_t A2 = vtbl1_u8(A0, sh_23456777);
- const uint8x8_t avg1 = vhadd_u8(A0, A2);
- uint8x8_t row = vrhadd_u8(avg1, A1);
- int i;
- (void)left;
- for (i = 0; i < 7; ++i) {
- vst1_u8(dst + i * stride, row);
- row = vtbl1_u8(row, sh_12345677);
- }
- vst1_u8(dst + i * stride, row);
-}
-
-void vpx_d45_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above, const uint8_t *left) {
- const uint8x16_t A0 = vld1q_u8(above); // top row
- const uint8x16_t above_right = vld1q_dup_u8(above + 15);
- const uint8x16_t A1 = vextq_u8(A0, above_right, 1);
- const uint8x16_t A2 = vextq_u8(A0, above_right, 2);
- const uint8x16_t avg1 = vhaddq_u8(A0, A2);
- uint8x16_t row = vrhaddq_u8(avg1, A1);
- int i;
- (void)left;
- for (i = 0; i < 15; ++i) {
- vst1q_u8(dst + i * stride, row);
- row = vextq_u8(row, above_right, 1);
- }
- vst1q_u8(dst + i * stride, row);
-}
-
-// -----------------------------------------------------------------------------
-
-void vpx_d135_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above, const uint8_t *left) {
- const uint8x8_t XABCD_u8 = vld1_u8(above - 1);
- const uint64x1_t XABCD = vreinterpret_u64_u8(XABCD_u8);
- const uint64x1_t ____XABC = vshl_n_u64(XABCD, 32);
- const uint32x2_t zero = vdup_n_u32(0);
- const uint32x2_t IJKL = vld1_lane_u32((const uint32_t *)left, zero, 0);
- const uint8x8_t IJKL_u8 = vreinterpret_u8_u32(IJKL);
- const uint64x1_t LKJI____ = vreinterpret_u64_u8(vrev32_u8(IJKL_u8));
- const uint64x1_t LKJIXABC = vorr_u64(LKJI____, ____XABC);
- const uint8x8_t KJIXABC_ = vreinterpret_u8_u64(vshr_n_u64(LKJIXABC, 8));
- const uint8x8_t JIXABC__ = vreinterpret_u8_u64(vshr_n_u64(LKJIXABC, 16));
- const uint8_t D = vget_lane_u8(XABCD_u8, 4);
- const uint8x8_t JIXABCD_ = vset_lane_u8(D, JIXABC__, 6);
- const uint8x8_t LKJIXABC_u8 = vreinterpret_u8_u64(LKJIXABC);
- const uint8x8_t avg1 = vhadd_u8(JIXABCD_, LKJIXABC_u8);
- const uint8x8_t avg2 = vrhadd_u8(avg1, KJIXABC_);
- const uint64x1_t avg2_u64 = vreinterpret_u64_u8(avg2);
- const uint32x2_t r3 = vreinterpret_u32_u8(avg2);
- const uint32x2_t r2 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 8));
- const uint32x2_t r1 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 16));
- const uint32x2_t r0 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 24));
- vst1_lane_u32((uint32_t *)(dst + 0 * stride), r0, 0);
- vst1_lane_u32((uint32_t *)(dst + 1 * stride), r1, 0);
- vst1_lane_u32((uint32_t *)(dst + 2 * stride), r2, 0);
- vst1_lane_u32((uint32_t *)(dst + 3 * stride), r3, 0);
-}
-
-#if !HAVE_NEON_ASM
-
-void vpx_v_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above, const uint8_t *left) {
- int i;
- uint32x2_t d0u32 = vdup_n_u32(0);
- (void)left;
-
- d0u32 = vld1_lane_u32((const uint32_t *)above, d0u32, 0);
- for (i = 0; i < 4; i++, dst += stride)
- vst1_lane_u32((uint32_t *)dst, d0u32, 0);
-}
-
-void vpx_v_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above, const uint8_t *left) {
- int i;
- uint8x8_t d0u8 = vdup_n_u8(0);
- (void)left;
-
- d0u8 = vld1_u8(above);
- for (i = 0; i < 8; i++, dst += stride)
- vst1_u8(dst, d0u8);
-}
-
-void vpx_v_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above, const uint8_t *left) {
- int i;
- uint8x16_t q0u8 = vdupq_n_u8(0);
- (void)left;
-
- q0u8 = vld1q_u8(above);
- for (i = 0; i < 16; i++, dst += stride)
- vst1q_u8(dst, q0u8);
-}
-
-void vpx_v_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above, const uint8_t *left) {
- int i;
- uint8x16_t q0u8 = vdupq_n_u8(0);
- uint8x16_t q1u8 = vdupq_n_u8(0);
- (void)left;
-
- q0u8 = vld1q_u8(above);
- q1u8 = vld1q_u8(above + 16);
- for (i = 0; i < 32; i++, dst += stride) {
- vst1q_u8(dst, q0u8);
- vst1q_u8(dst + 16, q1u8);
- }
-}
-
-void vpx_h_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above, const uint8_t *left) {
- uint8x8_t d0u8 = vdup_n_u8(0);
- uint32x2_t d1u32 = vdup_n_u32(0);
- (void)above;
-
- d1u32 = vld1_lane_u32((const uint32_t *)left, d1u32, 0);
-
- d0u8 = vdup_lane_u8(vreinterpret_u8_u32(d1u32), 0);
- vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0);
- dst += stride;
- d0u8 = vdup_lane_u8(vreinterpret_u8_u32(d1u32), 1);
- vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0);
- dst += stride;
- d0u8 = vdup_lane_u8(vreinterpret_u8_u32(d1u32), 2);
- vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0);
- dst += stride;
- d0u8 = vdup_lane_u8(vreinterpret_u8_u32(d1u32), 3);
- vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0);
-}
-
-void vpx_h_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above, const uint8_t *left) {
- uint8x8_t d0u8 = vdup_n_u8(0);
- uint64x1_t d1u64 = vdup_n_u64(0);
- (void)above;
-
- d1u64 = vld1_u64((const uint64_t *)left);
-
- d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 0);
- vst1_u8(dst, d0u8);
- dst += stride;
- d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 1);
- vst1_u8(dst, d0u8);
- dst += stride;
- d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 2);
- vst1_u8(dst, d0u8);
- dst += stride;
- d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 3);
- vst1_u8(dst, d0u8);
- dst += stride;
- d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 4);
- vst1_u8(dst, d0u8);
- dst += stride;
- d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 5);
- vst1_u8(dst, d0u8);
- dst += stride;
- d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 6);
- vst1_u8(dst, d0u8);
- dst += stride;
- d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 7);
- vst1_u8(dst, d0u8);
-}
-
-void vpx_h_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above, const uint8_t *left) {
- int j;
- uint8x8_t d2u8 = vdup_n_u8(0);
- uint8x16_t q0u8 = vdupq_n_u8(0);
- uint8x16_t q1u8 = vdupq_n_u8(0);
- (void)above;
-
- q1u8 = vld1q_u8(left);
- d2u8 = vget_low_u8(q1u8);
- for (j = 0; j < 2; j++, d2u8 = vget_high_u8(q1u8)) {
- q0u8 = vdupq_lane_u8(d2u8, 0);
- vst1q_u8(dst, q0u8);
- dst += stride;
- q0u8 = vdupq_lane_u8(d2u8, 1);
- vst1q_u8(dst, q0u8);
- dst += stride;
- q0u8 = vdupq_lane_u8(d2u8, 2);
- vst1q_u8(dst, q0u8);
- dst += stride;
- q0u8 = vdupq_lane_u8(d2u8, 3);
- vst1q_u8(dst, q0u8);
- dst += stride;
- q0u8 = vdupq_lane_u8(d2u8, 4);
- vst1q_u8(dst, q0u8);
- dst += stride;
- q0u8 = vdupq_lane_u8(d2u8, 5);
- vst1q_u8(dst, q0u8);
- dst += stride;
- q0u8 = vdupq_lane_u8(d2u8, 6);
- vst1q_u8(dst, q0u8);
- dst += stride;
- q0u8 = vdupq_lane_u8(d2u8, 7);
- vst1q_u8(dst, q0u8);
- dst += stride;
- }
-}
-
-void vpx_h_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above, const uint8_t *left) {
- int j, k;
- uint8x8_t d2u8 = vdup_n_u8(0);
- uint8x16_t q0u8 = vdupq_n_u8(0);
- uint8x16_t q1u8 = vdupq_n_u8(0);
- (void)above;
-
- for (k = 0; k < 2; k++, left += 16) {
- q1u8 = vld1q_u8(left);
- d2u8 = vget_low_u8(q1u8);
- for (j = 0; j < 2; j++, d2u8 = vget_high_u8(q1u8)) {
- q0u8 = vdupq_lane_u8(d2u8, 0);
- vst1q_u8(dst, q0u8);
- vst1q_u8(dst + 16, q0u8);
- dst += stride;
- q0u8 = vdupq_lane_u8(d2u8, 1);
- vst1q_u8(dst, q0u8);
- vst1q_u8(dst + 16, q0u8);
- dst += stride;
- q0u8 = vdupq_lane_u8(d2u8, 2);
- vst1q_u8(dst, q0u8);
- vst1q_u8(dst + 16, q0u8);
- dst += stride;
- q0u8 = vdupq_lane_u8(d2u8, 3);
- vst1q_u8(dst, q0u8);
- vst1q_u8(dst + 16, q0u8);
- dst += stride;
- q0u8 = vdupq_lane_u8(d2u8, 4);
- vst1q_u8(dst, q0u8);
- vst1q_u8(dst + 16, q0u8);
- dst += stride;
- q0u8 = vdupq_lane_u8(d2u8, 5);
- vst1q_u8(dst, q0u8);
- vst1q_u8(dst + 16, q0u8);
- dst += stride;
- q0u8 = vdupq_lane_u8(d2u8, 6);
- vst1q_u8(dst, q0u8);
- vst1q_u8(dst + 16, q0u8);
- dst += stride;
- q0u8 = vdupq_lane_u8(d2u8, 7);
- vst1q_u8(dst, q0u8);
- vst1q_u8(dst + 16, q0u8);
- dst += stride;
- }
- }
-}
-
-void vpx_tm_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above, const uint8_t *left) {
- int i;
- uint16x8_t q1u16, q3u16;
- int16x8_t q1s16;
- uint8x8_t d0u8 = vdup_n_u8(0);
- uint32x2_t d2u32 = vdup_n_u32(0);
-
- d0u8 = vld1_dup_u8(above - 1);
- d2u32 = vld1_lane_u32((const uint32_t *)above, d2u32, 0);
- q3u16 = vsubl_u8(vreinterpret_u8_u32(d2u32), d0u8);
- for (i = 0; i < 4; i++, dst += stride) {
- q1u16 = vdupq_n_u16((uint16_t)left[i]);
- q1s16 = vaddq_s16(vreinterpretq_s16_u16(q1u16),
- vreinterpretq_s16_u16(q3u16));
- d0u8 = vqmovun_s16(q1s16);
- vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0);
- }
-}
-
-void vpx_tm_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above, const uint8_t *left) {
- int j;
- uint16x8_t q0u16, q3u16, q10u16;
- int16x8_t q0s16;
- uint16x4_t d20u16;
- uint8x8_t d0u8, d2u8, d30u8;
-
- d0u8 = vld1_dup_u8(above - 1);
- d30u8 = vld1_u8(left);
- d2u8 = vld1_u8(above);
- q10u16 = vmovl_u8(d30u8);
- q3u16 = vsubl_u8(d2u8, d0u8);
- d20u16 = vget_low_u16(q10u16);
- for (j = 0; j < 2; j++, d20u16 = vget_high_u16(q10u16)) {
- q0u16 = vdupq_lane_u16(d20u16, 0);
- q0s16 = vaddq_s16(vreinterpretq_s16_u16(q3u16),
- vreinterpretq_s16_u16(q0u16));
- d0u8 = vqmovun_s16(q0s16);
- vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d0u8));
- dst += stride;
- q0u16 = vdupq_lane_u16(d20u16, 1);
- q0s16 = vaddq_s16(vreinterpretq_s16_u16(q3u16),
- vreinterpretq_s16_u16(q0u16));
- d0u8 = vqmovun_s16(q0s16);
- vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d0u8));
- dst += stride;
- q0u16 = vdupq_lane_u16(d20u16, 2);
- q0s16 = vaddq_s16(vreinterpretq_s16_u16(q3u16),
- vreinterpretq_s16_u16(q0u16));
- d0u8 = vqmovun_s16(q0s16);
- vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d0u8));
- dst += stride;
- q0u16 = vdupq_lane_u16(d20u16, 3);
- q0s16 = vaddq_s16(vreinterpretq_s16_u16(q3u16),
- vreinterpretq_s16_u16(q0u16));
- d0u8 = vqmovun_s16(q0s16);
- vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d0u8));
- dst += stride;
- }
-}
-
-void vpx_tm_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above, const uint8_t *left) {
- int j, k;
- uint16x8_t q0u16, q2u16, q3u16, q8u16, q10u16;
- uint8x16_t q0u8, q1u8;
- int16x8_t q0s16, q1s16, q8s16, q11s16;
- uint16x4_t d20u16;
- uint8x8_t d2u8, d3u8, d18u8, d22u8, d23u8;
-
- q0u8 = vld1q_dup_u8(above - 1);
- q1u8 = vld1q_u8(above);
- q2u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q0u8));
- q3u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q0u8));
- for (k = 0; k < 2; k++, left += 8) {
- d18u8 = vld1_u8(left);
- q10u16 = vmovl_u8(d18u8);
- d20u16 = vget_low_u16(q10u16);
- for (j = 0; j < 2; j++, d20u16 = vget_high_u16(q10u16)) {
- q0u16 = vdupq_lane_u16(d20u16, 0);
- q8u16 = vdupq_lane_u16(d20u16, 1);
- q1s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
- vreinterpretq_s16_u16(q2u16));
- q0s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
- vreinterpretq_s16_u16(q3u16));
- q11s16 = vaddq_s16(vreinterpretq_s16_u16(q8u16),
- vreinterpretq_s16_u16(q2u16));
- q8s16 = vaddq_s16(vreinterpretq_s16_u16(q8u16),
- vreinterpretq_s16_u16(q3u16));
- d2u8 = vqmovun_s16(q1s16);
- d3u8 = vqmovun_s16(q0s16);
- d22u8 = vqmovun_s16(q11s16);
- d23u8 = vqmovun_s16(q8s16);
- vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d2u8));
- vst1_u64((uint64_t *)(dst + 8), vreinterpret_u64_u8(d3u8));
- dst += stride;
- vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d22u8));
- vst1_u64((uint64_t *)(dst + 8), vreinterpret_u64_u8(d23u8));
- dst += stride;
-
- q0u16 = vdupq_lane_u16(d20u16, 2);
- q8u16 = vdupq_lane_u16(d20u16, 3);
- q1s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
- vreinterpretq_s16_u16(q2u16));
- q0s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
- vreinterpretq_s16_u16(q3u16));
- q11s16 = vaddq_s16(vreinterpretq_s16_u16(q8u16),
- vreinterpretq_s16_u16(q2u16));
- q8s16 = vaddq_s16(vreinterpretq_s16_u16(q8u16),
- vreinterpretq_s16_u16(q3u16));
- d2u8 = vqmovun_s16(q1s16);
- d3u8 = vqmovun_s16(q0s16);
- d22u8 = vqmovun_s16(q11s16);
- d23u8 = vqmovun_s16(q8s16);
- vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d2u8));
- vst1_u64((uint64_t *)(dst + 8), vreinterpret_u64_u8(d3u8));
- dst += stride;
- vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d22u8));
- vst1_u64((uint64_t *)(dst + 8), vreinterpret_u64_u8(d23u8));
- dst += stride;
- }
- }
-}
-
-void vpx_tm_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above, const uint8_t *left) {
- int j, k;
- uint16x8_t q0u16, q3u16, q8u16, q9u16, q10u16, q11u16;
- uint8x16_t q0u8, q1u8, q2u8;
- int16x8_t q12s16, q13s16, q14s16, q15s16;
- uint16x4_t d6u16;
- uint8x8_t d0u8, d1u8, d2u8, d3u8, d26u8;
-
- q0u8 = vld1q_dup_u8(above - 1);
- q1u8 = vld1q_u8(above);
- q2u8 = vld1q_u8(above + 16);
- q8u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q0u8));
- q9u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q0u8));
- q10u16 = vsubl_u8(vget_low_u8(q2u8), vget_low_u8(q0u8));
- q11u16 = vsubl_u8(vget_high_u8(q2u8), vget_high_u8(q0u8));
- for (k = 0; k < 4; k++, left += 8) {
- d26u8 = vld1_u8(left);
- q3u16 = vmovl_u8(d26u8);
- d6u16 = vget_low_u16(q3u16);
- for (j = 0; j < 2; j++, d6u16 = vget_high_u16(q3u16)) {
- q0u16 = vdupq_lane_u16(d6u16, 0);
- q12s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
- vreinterpretq_s16_u16(q8u16));
- q13s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
- vreinterpretq_s16_u16(q9u16));
- q14s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
- vreinterpretq_s16_u16(q10u16));
- q15s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
- vreinterpretq_s16_u16(q11u16));
- d0u8 = vqmovun_s16(q12s16);
- d1u8 = vqmovun_s16(q13s16);
- d2u8 = vqmovun_s16(q14s16);
- d3u8 = vqmovun_s16(q15s16);
- q0u8 = vcombine_u8(d0u8, d1u8);
- q1u8 = vcombine_u8(d2u8, d3u8);
- vst1q_u64((uint64_t *)dst, vreinterpretq_u64_u8(q0u8));
- vst1q_u64((uint64_t *)(dst + 16), vreinterpretq_u64_u8(q1u8));
- dst += stride;
-
- q0u16 = vdupq_lane_u16(d6u16, 1);
- q12s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
- vreinterpretq_s16_u16(q8u16));
- q13s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
- vreinterpretq_s16_u16(q9u16));
- q14s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
- vreinterpretq_s16_u16(q10u16));
- q15s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
- vreinterpretq_s16_u16(q11u16));
- d0u8 = vqmovun_s16(q12s16);
- d1u8 = vqmovun_s16(q13s16);
- d2u8 = vqmovun_s16(q14s16);
- d3u8 = vqmovun_s16(q15s16);
- q0u8 = vcombine_u8(d0u8, d1u8);
- q1u8 = vcombine_u8(d2u8, d3u8);
- vst1q_u64((uint64_t *)dst, vreinterpretq_u64_u8(q0u8));
- vst1q_u64((uint64_t *)(dst + 16), vreinterpretq_u64_u8(q1u8));
- dst += stride;
-
- q0u16 = vdupq_lane_u16(d6u16, 2);
- q12s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
- vreinterpretq_s16_u16(q8u16));
- q13s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
- vreinterpretq_s16_u16(q9u16));
- q14s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
- vreinterpretq_s16_u16(q10u16));
- q15s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
- vreinterpretq_s16_u16(q11u16));
- d0u8 = vqmovun_s16(q12s16);
- d1u8 = vqmovun_s16(q13s16);
- d2u8 = vqmovun_s16(q14s16);
- d3u8 = vqmovun_s16(q15s16);
- q0u8 = vcombine_u8(d0u8, d1u8);
- q1u8 = vcombine_u8(d2u8, d3u8);
- vst1q_u64((uint64_t *)dst, vreinterpretq_u64_u8(q0u8));
- vst1q_u64((uint64_t *)(dst + 16), vreinterpretq_u64_u8(q1u8));
- dst += stride;
-
- q0u16 = vdupq_lane_u16(d6u16, 3);
- q12s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
- vreinterpretq_s16_u16(q8u16));
- q13s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
- vreinterpretq_s16_u16(q9u16));
- q14s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
- vreinterpretq_s16_u16(q10u16));
- q15s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
- vreinterpretq_s16_u16(q11u16));
- d0u8 = vqmovun_s16(q12s16);
- d1u8 = vqmovun_s16(q13s16);
- d2u8 = vqmovun_s16(q14s16);
- d3u8 = vqmovun_s16(q15s16);
- q0u8 = vcombine_u8(d0u8, d1u8);
- q1u8 = vcombine_u8(d2u8, d3u8);
- vst1q_u64((uint64_t *)dst, vreinterpretq_u64_u8(q0u8));
- vst1q_u64((uint64_t *)(dst + 16), vreinterpretq_u64_u8(q1u8));
- dst += stride;
- }
- }
-}
-#endif // !HAVE_NEON_ASM
diff --git a/thirdparty/libvpx/vpx_dsp/arm/loopfilter_16_neon.c b/thirdparty/libvpx/vpx_dsp/arm/loopfilter_16_neon.c
deleted file mode 100644
index d24e6adc8a..0000000000
--- a/thirdparty/libvpx/vpx_dsp/arm/loopfilter_16_neon.c
+++ /dev/null
@@ -1,179 +0,0 @@
-/*
- * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include <arm_neon.h>
-
-#include "./vpx_dsp_rtcd.h"
-#include "./vpx_config.h"
-#include "vpx/vpx_integer.h"
-
-static INLINE void loop_filter_neon_16(
- uint8x16_t qblimit, // blimit
- uint8x16_t qlimit, // limit
- uint8x16_t qthresh, // thresh
- uint8x16_t q3, // p3
- uint8x16_t q4, // p2
- uint8x16_t q5, // p1
- uint8x16_t q6, // p0
- uint8x16_t q7, // q0
- uint8x16_t q8, // q1
- uint8x16_t q9, // q2
- uint8x16_t q10, // q3
- uint8x16_t *q5r, // p1
- uint8x16_t *q6r, // p0
- uint8x16_t *q7r, // q0
- uint8x16_t *q8r) { // q1
- uint8x16_t q1u8, q2u8, q11u8, q12u8, q13u8, q14u8, q15u8;
- int16x8_t q2s16, q11s16;
- uint16x8_t q4u16;
- int8x16_t q0s8, q1s8, q2s8, q11s8, q12s8, q13s8;
- int8x8_t d2s8, d3s8;
-
- q11u8 = vabdq_u8(q3, q4);
- q12u8 = vabdq_u8(q4, q5);
- q13u8 = vabdq_u8(q5, q6);
- q14u8 = vabdq_u8(q8, q7);
- q3 = vabdq_u8(q9, q8);
- q4 = vabdq_u8(q10, q9);
-
- q11u8 = vmaxq_u8(q11u8, q12u8);
- q12u8 = vmaxq_u8(q13u8, q14u8);
- q3 = vmaxq_u8(q3, q4);
- q15u8 = vmaxq_u8(q11u8, q12u8);
-
- q9 = vabdq_u8(q6, q7);
-
- // vp8_hevmask
- q13u8 = vcgtq_u8(q13u8, qthresh);
- q14u8 = vcgtq_u8(q14u8, qthresh);
- q15u8 = vmaxq_u8(q15u8, q3);
-
- q2u8 = vabdq_u8(q5, q8);
- q9 = vqaddq_u8(q9, q9);
-
- q15u8 = vcgeq_u8(qlimit, q15u8);
-
- // vp8_filter() function
- // convert to signed
- q10 = vdupq_n_u8(0x80);
- q8 = veorq_u8(q8, q10);
- q7 = veorq_u8(q7, q10);
- q6 = veorq_u8(q6, q10);
- q5 = veorq_u8(q5, q10);
-
- q2u8 = vshrq_n_u8(q2u8, 1);
- q9 = vqaddq_u8(q9, q2u8);
-
- q2s16 = vsubl_s8(vget_low_s8(vreinterpretq_s8_u8(q7)),
- vget_low_s8(vreinterpretq_s8_u8(q6)));
- q11s16 = vsubl_s8(vget_high_s8(vreinterpretq_s8_u8(q7)),
- vget_high_s8(vreinterpretq_s8_u8(q6)));
-
- q9 = vcgeq_u8(qblimit, q9);
-
- q1s8 = vqsubq_s8(vreinterpretq_s8_u8(q5),
- vreinterpretq_s8_u8(q8));
-
- q14u8 = vorrq_u8(q13u8, q14u8);
-
- q4u16 = vdupq_n_u16(3);
- q2s16 = vmulq_s16(q2s16, vreinterpretq_s16_u16(q4u16));
- q11s16 = vmulq_s16(q11s16, vreinterpretq_s16_u16(q4u16));
-
- q1u8 = vandq_u8(vreinterpretq_u8_s8(q1s8), q14u8);
- q15u8 = vandq_u8(q15u8, q9);
-
- q1s8 = vreinterpretq_s8_u8(q1u8);
- q2s16 = vaddw_s8(q2s16, vget_low_s8(q1s8));
- q11s16 = vaddw_s8(q11s16, vget_high_s8(q1s8));
-
- q4 = vdupq_n_u8(3);
- q9 = vdupq_n_u8(4);
- // vp8_filter = clamp(vp8_filter + 3 * ( qs0 - ps0))
- d2s8 = vqmovn_s16(q2s16);
- d3s8 = vqmovn_s16(q11s16);
- q1s8 = vcombine_s8(d2s8, d3s8);
- q1u8 = vandq_u8(vreinterpretq_u8_s8(q1s8), q15u8);
- q1s8 = vreinterpretq_s8_u8(q1u8);
-
- q2s8 = vqaddq_s8(q1s8, vreinterpretq_s8_u8(q4));
- q1s8 = vqaddq_s8(q1s8, vreinterpretq_s8_u8(q9));
- q2s8 = vshrq_n_s8(q2s8, 3);
- q1s8 = vshrq_n_s8(q1s8, 3);
-
- q11s8 = vqaddq_s8(vreinterpretq_s8_u8(q6), q2s8);
- q0s8 = vqsubq_s8(vreinterpretq_s8_u8(q7), q1s8);
-
- q1s8 = vrshrq_n_s8(q1s8, 1);
- q1s8 = vbicq_s8(q1s8, vreinterpretq_s8_u8(q14u8));
-
- q13s8 = vqaddq_s8(vreinterpretq_s8_u8(q5), q1s8);
- q12s8 = vqsubq_s8(vreinterpretq_s8_u8(q8), q1s8);
-
- *q8r = veorq_u8(vreinterpretq_u8_s8(q12s8), q10);
- *q7r = veorq_u8(vreinterpretq_u8_s8(q0s8), q10);
- *q6r = veorq_u8(vreinterpretq_u8_s8(q11s8), q10);
- *q5r = veorq_u8(vreinterpretq_u8_s8(q13s8), q10);
- return;
-}
-
-void vpx_lpf_horizontal_4_dual_neon(uint8_t *s, int p /* pitch */,
- const uint8_t *blimit0,
- const uint8_t *limit0,
- const uint8_t *thresh0,
- const uint8_t *blimit1,
- const uint8_t *limit1,
- const uint8_t *thresh1) {
- uint8x8_t dblimit0, dlimit0, dthresh0, dblimit1, dlimit1, dthresh1;
- uint8x16_t qblimit, qlimit, qthresh;
- uint8x16_t q3u8, q4u8, q5u8, q6u8, q7u8, q8u8, q9u8, q10u8;
-
- dblimit0 = vld1_u8(blimit0);
- dlimit0 = vld1_u8(limit0);
- dthresh0 = vld1_u8(thresh0);
- dblimit1 = vld1_u8(blimit1);
- dlimit1 = vld1_u8(limit1);
- dthresh1 = vld1_u8(thresh1);
- qblimit = vcombine_u8(dblimit0, dblimit1);
- qlimit = vcombine_u8(dlimit0, dlimit1);
- qthresh = vcombine_u8(dthresh0, dthresh1);
-
- s -= (p << 2);
-
- q3u8 = vld1q_u8(s);
- s += p;
- q4u8 = vld1q_u8(s);
- s += p;
- q5u8 = vld1q_u8(s);
- s += p;
- q6u8 = vld1q_u8(s);
- s += p;
- q7u8 = vld1q_u8(s);
- s += p;
- q8u8 = vld1q_u8(s);
- s += p;
- q9u8 = vld1q_u8(s);
- s += p;
- q10u8 = vld1q_u8(s);
-
- loop_filter_neon_16(qblimit, qlimit, qthresh,
- q3u8, q4u8, q5u8, q6u8, q7u8, q8u8, q9u8, q10u8,
- &q5u8, &q6u8, &q7u8, &q8u8);
-
- s -= (p * 5);
- vst1q_u8(s, q5u8);
- s += p;
- vst1q_u8(s, q6u8);
- s += p;
- vst1q_u8(s, q7u8);
- s += p;
- vst1q_u8(s, q8u8);
- return;
-}
diff --git a/thirdparty/libvpx/vpx_dsp/arm/loopfilter_4_neon.c b/thirdparty/libvpx/vpx_dsp/arm/loopfilter_4_neon.c
deleted file mode 100644
index 7f3ee70b94..0000000000
--- a/thirdparty/libvpx/vpx_dsp/arm/loopfilter_4_neon.c
+++ /dev/null
@@ -1,266 +0,0 @@
-/*
- * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include <arm_neon.h>
-
-#include "./vpx_dsp_rtcd.h"
-
-static INLINE void loop_filter_neon(
- uint8x8_t dblimit, // flimit
- uint8x8_t dlimit, // limit
- uint8x8_t dthresh, // thresh
- uint8x8_t d3u8, // p3
- uint8x8_t d4u8, // p2
- uint8x8_t d5u8, // p1
- uint8x8_t d6u8, // p0
- uint8x8_t d7u8, // q0
- uint8x8_t d16u8, // q1
- uint8x8_t d17u8, // q2
- uint8x8_t d18u8, // q3
- uint8x8_t *d4ru8, // p1
- uint8x8_t *d5ru8, // p0
- uint8x8_t *d6ru8, // q0
- uint8x8_t *d7ru8) { // q1
- uint8x8_t d19u8, d20u8, d21u8, d22u8, d23u8, d27u8, d28u8;
- int16x8_t q12s16;
- int8x8_t d19s8, d20s8, d21s8, d26s8, d27s8, d28s8;
-
- d19u8 = vabd_u8(d3u8, d4u8);
- d20u8 = vabd_u8(d4u8, d5u8);
- d21u8 = vabd_u8(d5u8, d6u8);
- d22u8 = vabd_u8(d16u8, d7u8);
- d3u8 = vabd_u8(d17u8, d16u8);
- d4u8 = vabd_u8(d18u8, d17u8);
-
- d19u8 = vmax_u8(d19u8, d20u8);
- d20u8 = vmax_u8(d21u8, d22u8);
- d3u8 = vmax_u8(d3u8, d4u8);
- d23u8 = vmax_u8(d19u8, d20u8);
-
- d17u8 = vabd_u8(d6u8, d7u8);
-
- d21u8 = vcgt_u8(d21u8, dthresh);
- d22u8 = vcgt_u8(d22u8, dthresh);
- d23u8 = vmax_u8(d23u8, d3u8);
-
- d28u8 = vabd_u8(d5u8, d16u8);
- d17u8 = vqadd_u8(d17u8, d17u8);
-
- d23u8 = vcge_u8(dlimit, d23u8);
-
- d18u8 = vdup_n_u8(0x80);
- d5u8 = veor_u8(d5u8, d18u8);
- d6u8 = veor_u8(d6u8, d18u8);
- d7u8 = veor_u8(d7u8, d18u8);
- d16u8 = veor_u8(d16u8, d18u8);
-
- d28u8 = vshr_n_u8(d28u8, 1);
- d17u8 = vqadd_u8(d17u8, d28u8);
-
- d19u8 = vdup_n_u8(3);
-
- d28s8 = vsub_s8(vreinterpret_s8_u8(d7u8),
- vreinterpret_s8_u8(d6u8));
-
- d17u8 = vcge_u8(dblimit, d17u8);
-
- d27s8 = vqsub_s8(vreinterpret_s8_u8(d5u8),
- vreinterpret_s8_u8(d16u8));
-
- d22u8 = vorr_u8(d21u8, d22u8);
-
- q12s16 = vmull_s8(d28s8, vreinterpret_s8_u8(d19u8));
-
- d27u8 = vand_u8(vreinterpret_u8_s8(d27s8), d22u8);
- d23u8 = vand_u8(d23u8, d17u8);
-
- q12s16 = vaddw_s8(q12s16, vreinterpret_s8_u8(d27u8));
-
- d17u8 = vdup_n_u8(4);
-
- d27s8 = vqmovn_s16(q12s16);
- d27u8 = vand_u8(vreinterpret_u8_s8(d27s8), d23u8);
- d27s8 = vreinterpret_s8_u8(d27u8);
-
- d28s8 = vqadd_s8(d27s8, vreinterpret_s8_u8(d19u8));
- d27s8 = vqadd_s8(d27s8, vreinterpret_s8_u8(d17u8));
- d28s8 = vshr_n_s8(d28s8, 3);
- d27s8 = vshr_n_s8(d27s8, 3);
-
- d19s8 = vqadd_s8(vreinterpret_s8_u8(d6u8), d28s8);
- d26s8 = vqsub_s8(vreinterpret_s8_u8(d7u8), d27s8);
-
- d27s8 = vrshr_n_s8(d27s8, 1);
- d27s8 = vbic_s8(d27s8, vreinterpret_s8_u8(d22u8));
-
- d21s8 = vqadd_s8(vreinterpret_s8_u8(d5u8), d27s8);
- d20s8 = vqsub_s8(vreinterpret_s8_u8(d16u8), d27s8);
-
- *d4ru8 = veor_u8(vreinterpret_u8_s8(d21s8), d18u8);
- *d5ru8 = veor_u8(vreinterpret_u8_s8(d19s8), d18u8);
- *d6ru8 = veor_u8(vreinterpret_u8_s8(d26s8), d18u8);
- *d7ru8 = veor_u8(vreinterpret_u8_s8(d20s8), d18u8);
- return;
-}
-
-void vpx_lpf_horizontal_4_neon(
- uint8_t *src,
- int pitch,
- const uint8_t *blimit,
- const uint8_t *limit,
- const uint8_t *thresh) {
- int i;
- uint8_t *s, *psrc;
- uint8x8_t dblimit, dlimit, dthresh;
- uint8x8_t d3u8, d4u8, d5u8, d6u8, d7u8, d16u8, d17u8, d18u8;
-
- dblimit = vld1_u8(blimit);
- dlimit = vld1_u8(limit);
- dthresh = vld1_u8(thresh);
-
- psrc = src - (pitch << 2);
- for (i = 0; i < 1; i++) {
- s = psrc + i * 8;
-
- d3u8 = vld1_u8(s);
- s += pitch;
- d4u8 = vld1_u8(s);
- s += pitch;
- d5u8 = vld1_u8(s);
- s += pitch;
- d6u8 = vld1_u8(s);
- s += pitch;
- d7u8 = vld1_u8(s);
- s += pitch;
- d16u8 = vld1_u8(s);
- s += pitch;
- d17u8 = vld1_u8(s);
- s += pitch;
- d18u8 = vld1_u8(s);
-
- loop_filter_neon(dblimit, dlimit, dthresh,
- d3u8, d4u8, d5u8, d6u8, d7u8, d16u8, d17u8, d18u8,
- &d4u8, &d5u8, &d6u8, &d7u8);
-
- s -= (pitch * 5);
- vst1_u8(s, d4u8);
- s += pitch;
- vst1_u8(s, d5u8);
- s += pitch;
- vst1_u8(s, d6u8);
- s += pitch;
- vst1_u8(s, d7u8);
- }
- return;
-}
-
-void vpx_lpf_vertical_4_neon(
- uint8_t *src,
- int pitch,
- const uint8_t *blimit,
- const uint8_t *limit,
- const uint8_t *thresh) {
- int i, pitch8;
- uint8_t *s;
- uint8x8_t dblimit, dlimit, dthresh;
- uint8x8_t d3u8, d4u8, d5u8, d6u8, d7u8, d16u8, d17u8, d18u8;
- uint32x2x2_t d2tmp0, d2tmp1, d2tmp2, d2tmp3;
- uint16x4x2_t d2tmp4, d2tmp5, d2tmp6, d2tmp7;
- uint8x8x2_t d2tmp8, d2tmp9, d2tmp10, d2tmp11;
- uint8x8x4_t d4Result;
-
- dblimit = vld1_u8(blimit);
- dlimit = vld1_u8(limit);
- dthresh = vld1_u8(thresh);
-
- pitch8 = pitch * 8;
- for (i = 0; i < 1; i++, src += pitch8) {
- s = src - (i + 1) * 4;
-
- d3u8 = vld1_u8(s);
- s += pitch;
- d4u8 = vld1_u8(s);
- s += pitch;
- d5u8 = vld1_u8(s);
- s += pitch;
- d6u8 = vld1_u8(s);
- s += pitch;
- d7u8 = vld1_u8(s);
- s += pitch;
- d16u8 = vld1_u8(s);
- s += pitch;
- d17u8 = vld1_u8(s);
- s += pitch;
- d18u8 = vld1_u8(s);
-
- d2tmp0 = vtrn_u32(vreinterpret_u32_u8(d3u8),
- vreinterpret_u32_u8(d7u8));
- d2tmp1 = vtrn_u32(vreinterpret_u32_u8(d4u8),
- vreinterpret_u32_u8(d16u8));
- d2tmp2 = vtrn_u32(vreinterpret_u32_u8(d5u8),
- vreinterpret_u32_u8(d17u8));
- d2tmp3 = vtrn_u32(vreinterpret_u32_u8(d6u8),
- vreinterpret_u32_u8(d18u8));
-
- d2tmp4 = vtrn_u16(vreinterpret_u16_u32(d2tmp0.val[0]),
- vreinterpret_u16_u32(d2tmp2.val[0]));
- d2tmp5 = vtrn_u16(vreinterpret_u16_u32(d2tmp1.val[0]),
- vreinterpret_u16_u32(d2tmp3.val[0]));
- d2tmp6 = vtrn_u16(vreinterpret_u16_u32(d2tmp0.val[1]),
- vreinterpret_u16_u32(d2tmp2.val[1]));
- d2tmp7 = vtrn_u16(vreinterpret_u16_u32(d2tmp1.val[1]),
- vreinterpret_u16_u32(d2tmp3.val[1]));
-
- d2tmp8 = vtrn_u8(vreinterpret_u8_u16(d2tmp4.val[0]),
- vreinterpret_u8_u16(d2tmp5.val[0]));
- d2tmp9 = vtrn_u8(vreinterpret_u8_u16(d2tmp4.val[1]),
- vreinterpret_u8_u16(d2tmp5.val[1]));
- d2tmp10 = vtrn_u8(vreinterpret_u8_u16(d2tmp6.val[0]),
- vreinterpret_u8_u16(d2tmp7.val[0]));
- d2tmp11 = vtrn_u8(vreinterpret_u8_u16(d2tmp6.val[1]),
- vreinterpret_u8_u16(d2tmp7.val[1]));
-
- d3u8 = d2tmp8.val[0];
- d4u8 = d2tmp8.val[1];
- d5u8 = d2tmp9.val[0];
- d6u8 = d2tmp9.val[1];
- d7u8 = d2tmp10.val[0];
- d16u8 = d2tmp10.val[1];
- d17u8 = d2tmp11.val[0];
- d18u8 = d2tmp11.val[1];
-
- loop_filter_neon(dblimit, dlimit, dthresh,
- d3u8, d4u8, d5u8, d6u8, d7u8, d16u8, d17u8, d18u8,
- &d4u8, &d5u8, &d6u8, &d7u8);
-
- d4Result.val[0] = d4u8;
- d4Result.val[1] = d5u8;
- d4Result.val[2] = d6u8;
- d4Result.val[3] = d7u8;
-
- src -= 2;
- vst4_lane_u8(src, d4Result, 0);
- src += pitch;
- vst4_lane_u8(src, d4Result, 1);
- src += pitch;
- vst4_lane_u8(src, d4Result, 2);
- src += pitch;
- vst4_lane_u8(src, d4Result, 3);
- src += pitch;
- vst4_lane_u8(src, d4Result, 4);
- src += pitch;
- vst4_lane_u8(src, d4Result, 5);
- src += pitch;
- vst4_lane_u8(src, d4Result, 6);
- src += pitch;
- vst4_lane_u8(src, d4Result, 7);
- }
- return;
-}
diff --git a/thirdparty/libvpx/vpx_dsp/arm/loopfilter_8_neon.c b/thirdparty/libvpx/vpx_dsp/arm/loopfilter_8_neon.c
deleted file mode 100644
index ec3757380d..0000000000
--- a/thirdparty/libvpx/vpx_dsp/arm/loopfilter_8_neon.c
+++ /dev/null
@@ -1,445 +0,0 @@
-/*
- * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include <arm_neon.h>
-
-#include "./vpx_dsp_rtcd.h"
-
-static INLINE void mbloop_filter_neon(
- uint8x8_t dblimit, // mblimit
- uint8x8_t dlimit, // limit
- uint8x8_t dthresh, // thresh
- uint8x8_t d3u8, // p2
- uint8x8_t d4u8, // p2
- uint8x8_t d5u8, // p1
- uint8x8_t d6u8, // p0
- uint8x8_t d7u8, // q0
- uint8x8_t d16u8, // q1
- uint8x8_t d17u8, // q2
- uint8x8_t d18u8, // q3
- uint8x8_t *d0ru8, // p1
- uint8x8_t *d1ru8, // p1
- uint8x8_t *d2ru8, // p0
- uint8x8_t *d3ru8, // q0
- uint8x8_t *d4ru8, // q1
- uint8x8_t *d5ru8) { // q1
- uint32_t flat;
- uint8x8_t d0u8, d1u8, d2u8, d19u8, d20u8, d21u8, d22u8, d23u8, d24u8;
- uint8x8_t d25u8, d26u8, d27u8, d28u8, d29u8, d30u8, d31u8;
- int16x8_t q15s16;
- uint16x8_t q10u16, q14u16;
- int8x8_t d21s8, d24s8, d25s8, d26s8, d28s8, d29s8, d30s8;
-
- d19u8 = vabd_u8(d3u8, d4u8);
- d20u8 = vabd_u8(d4u8, d5u8);
- d21u8 = vabd_u8(d5u8, d6u8);
- d22u8 = vabd_u8(d16u8, d7u8);
- d23u8 = vabd_u8(d17u8, d16u8);
- d24u8 = vabd_u8(d18u8, d17u8);
-
- d19u8 = vmax_u8(d19u8, d20u8);
- d20u8 = vmax_u8(d21u8, d22u8);
-
- d25u8 = vabd_u8(d6u8, d4u8);
-
- d23u8 = vmax_u8(d23u8, d24u8);
-
- d26u8 = vabd_u8(d7u8, d17u8);
-
- d19u8 = vmax_u8(d19u8, d20u8);
-
- d24u8 = vabd_u8(d6u8, d7u8);
- d27u8 = vabd_u8(d3u8, d6u8);
- d28u8 = vabd_u8(d18u8, d7u8);
-
- d19u8 = vmax_u8(d19u8, d23u8);
-
- d23u8 = vabd_u8(d5u8, d16u8);
- d24u8 = vqadd_u8(d24u8, d24u8);
-
-
- d19u8 = vcge_u8(dlimit, d19u8);
-
-
- d25u8 = vmax_u8(d25u8, d26u8);
- d26u8 = vmax_u8(d27u8, d28u8);
-
- d23u8 = vshr_n_u8(d23u8, 1);
-
- d25u8 = vmax_u8(d25u8, d26u8);
-
- d24u8 = vqadd_u8(d24u8, d23u8);
-
- d20u8 = vmax_u8(d20u8, d25u8);
-
- d23u8 = vdup_n_u8(1);
- d24u8 = vcge_u8(dblimit, d24u8);
-
- d21u8 = vcgt_u8(d21u8, dthresh);
-
- d20u8 = vcge_u8(d23u8, d20u8);
-
- d19u8 = vand_u8(d19u8, d24u8);
-
- d23u8 = vcgt_u8(d22u8, dthresh);
-
- d20u8 = vand_u8(d20u8, d19u8);
-
- d22u8 = vdup_n_u8(0x80);
-
- d23u8 = vorr_u8(d21u8, d23u8);
-
- q10u16 = vcombine_u16(vreinterpret_u16_u8(d20u8),
- vreinterpret_u16_u8(d21u8));
-
- d30u8 = vshrn_n_u16(q10u16, 4);
- flat = vget_lane_u32(vreinterpret_u32_u8(d30u8), 0);
-
- if (flat == 0xffffffff) { // Check for all 1's, power_branch_only
- d27u8 = vdup_n_u8(3);
- d21u8 = vdup_n_u8(2);
- q14u16 = vaddl_u8(d6u8, d7u8);
- q14u16 = vmlal_u8(q14u16, d3u8, d27u8);
- q14u16 = vmlal_u8(q14u16, d4u8, d21u8);
- q14u16 = vaddw_u8(q14u16, d5u8);
- *d0ru8 = vqrshrn_n_u16(q14u16, 3);
-
- q14u16 = vsubw_u8(q14u16, d3u8);
- q14u16 = vsubw_u8(q14u16, d4u8);
- q14u16 = vaddw_u8(q14u16, d5u8);
- q14u16 = vaddw_u8(q14u16, d16u8);
- *d1ru8 = vqrshrn_n_u16(q14u16, 3);
-
- q14u16 = vsubw_u8(q14u16, d3u8);
- q14u16 = vsubw_u8(q14u16, d5u8);
- q14u16 = vaddw_u8(q14u16, d6u8);
- q14u16 = vaddw_u8(q14u16, d17u8);
- *d2ru8 = vqrshrn_n_u16(q14u16, 3);
-
- q14u16 = vsubw_u8(q14u16, d3u8);
- q14u16 = vsubw_u8(q14u16, d6u8);
- q14u16 = vaddw_u8(q14u16, d7u8);
- q14u16 = vaddw_u8(q14u16, d18u8);
- *d3ru8 = vqrshrn_n_u16(q14u16, 3);
-
- q14u16 = vsubw_u8(q14u16, d4u8);
- q14u16 = vsubw_u8(q14u16, d7u8);
- q14u16 = vaddw_u8(q14u16, d16u8);
- q14u16 = vaddw_u8(q14u16, d18u8);
- *d4ru8 = vqrshrn_n_u16(q14u16, 3);
-
- q14u16 = vsubw_u8(q14u16, d5u8);
- q14u16 = vsubw_u8(q14u16, d16u8);
- q14u16 = vaddw_u8(q14u16, d17u8);
- q14u16 = vaddw_u8(q14u16, d18u8);
- *d5ru8 = vqrshrn_n_u16(q14u16, 3);
- } else {
- d21u8 = veor_u8(d7u8, d22u8);
- d24u8 = veor_u8(d6u8, d22u8);
- d25u8 = veor_u8(d5u8, d22u8);
- d26u8 = veor_u8(d16u8, d22u8);
-
- d27u8 = vdup_n_u8(3);
-
- d28s8 = vsub_s8(vreinterpret_s8_u8(d21u8), vreinterpret_s8_u8(d24u8));
- d29s8 = vqsub_s8(vreinterpret_s8_u8(d25u8), vreinterpret_s8_u8(d26u8));
-
- q15s16 = vmull_s8(d28s8, vreinterpret_s8_u8(d27u8));
-
- d29s8 = vand_s8(d29s8, vreinterpret_s8_u8(d23u8));
-
- q15s16 = vaddw_s8(q15s16, d29s8);
-
- d29u8 = vdup_n_u8(4);
-
- d28s8 = vqmovn_s16(q15s16);
-
- d28s8 = vand_s8(d28s8, vreinterpret_s8_u8(d19u8));
-
- d30s8 = vqadd_s8(d28s8, vreinterpret_s8_u8(d27u8));
- d29s8 = vqadd_s8(d28s8, vreinterpret_s8_u8(d29u8));
- d30s8 = vshr_n_s8(d30s8, 3);
- d29s8 = vshr_n_s8(d29s8, 3);
-
- d24s8 = vqadd_s8(vreinterpret_s8_u8(d24u8), d30s8);
- d21s8 = vqsub_s8(vreinterpret_s8_u8(d21u8), d29s8);
-
- d29s8 = vrshr_n_s8(d29s8, 1);
- d29s8 = vbic_s8(d29s8, vreinterpret_s8_u8(d23u8));
-
- d25s8 = vqadd_s8(vreinterpret_s8_u8(d25u8), d29s8);
- d26s8 = vqsub_s8(vreinterpret_s8_u8(d26u8), d29s8);
-
- if (flat == 0) { // filter_branch_only
- *d0ru8 = d4u8;
- *d1ru8 = veor_u8(vreinterpret_u8_s8(d25s8), d22u8);
- *d2ru8 = veor_u8(vreinterpret_u8_s8(d24s8), d22u8);
- *d3ru8 = veor_u8(vreinterpret_u8_s8(d21s8), d22u8);
- *d4ru8 = veor_u8(vreinterpret_u8_s8(d26s8), d22u8);
- *d5ru8 = d17u8;
- return;
- }
-
- d21u8 = veor_u8(vreinterpret_u8_s8(d21s8), d22u8);
- d24u8 = veor_u8(vreinterpret_u8_s8(d24s8), d22u8);
- d25u8 = veor_u8(vreinterpret_u8_s8(d25s8), d22u8);
- d26u8 = veor_u8(vreinterpret_u8_s8(d26s8), d22u8);
-
- d23u8 = vdup_n_u8(2);
- q14u16 = vaddl_u8(d6u8, d7u8);
- q14u16 = vmlal_u8(q14u16, d3u8, d27u8);
- q14u16 = vmlal_u8(q14u16, d4u8, d23u8);
-
- d0u8 = vbsl_u8(d20u8, dblimit, d4u8);
-
- q14u16 = vaddw_u8(q14u16, d5u8);
-
- d1u8 = vbsl_u8(d20u8, dlimit, d25u8);
-
- d30u8 = vqrshrn_n_u16(q14u16, 3);
-
- q14u16 = vsubw_u8(q14u16, d3u8);
- q14u16 = vsubw_u8(q14u16, d4u8);
- q14u16 = vaddw_u8(q14u16, d5u8);
- q14u16 = vaddw_u8(q14u16, d16u8);
-
- d2u8 = vbsl_u8(d20u8, dthresh, d24u8);
-
- d31u8 = vqrshrn_n_u16(q14u16, 3);
-
- q14u16 = vsubw_u8(q14u16, d3u8);
- q14u16 = vsubw_u8(q14u16, d5u8);
- q14u16 = vaddw_u8(q14u16, d6u8);
- q14u16 = vaddw_u8(q14u16, d17u8);
-
- *d0ru8 = vbsl_u8(d20u8, d30u8, d0u8);
-
- d23u8 = vqrshrn_n_u16(q14u16, 3);
-
- q14u16 = vsubw_u8(q14u16, d3u8);
- q14u16 = vsubw_u8(q14u16, d6u8);
- q14u16 = vaddw_u8(q14u16, d7u8);
-
- *d1ru8 = vbsl_u8(d20u8, d31u8, d1u8);
-
- q14u16 = vaddw_u8(q14u16, d18u8);
-
- *d2ru8 = vbsl_u8(d20u8, d23u8, d2u8);
-
- d22u8 = vqrshrn_n_u16(q14u16, 3);
-
- q14u16 = vsubw_u8(q14u16, d4u8);
- q14u16 = vsubw_u8(q14u16, d7u8);
- q14u16 = vaddw_u8(q14u16, d16u8);
-
- d3u8 = vbsl_u8(d20u8, d3u8, d21u8);
-
- q14u16 = vaddw_u8(q14u16, d18u8);
-
- d4u8 = vbsl_u8(d20u8, d4u8, d26u8);
-
- d6u8 = vqrshrn_n_u16(q14u16, 3);
-
- q14u16 = vsubw_u8(q14u16, d5u8);
- q14u16 = vsubw_u8(q14u16, d16u8);
- q14u16 = vaddw_u8(q14u16, d17u8);
- q14u16 = vaddw_u8(q14u16, d18u8);
-
- d5u8 = vbsl_u8(d20u8, d5u8, d17u8);
-
- d7u8 = vqrshrn_n_u16(q14u16, 3);
-
- *d3ru8 = vbsl_u8(d20u8, d22u8, d3u8);
- *d4ru8 = vbsl_u8(d20u8, d6u8, d4u8);
- *d5ru8 = vbsl_u8(d20u8, d7u8, d5u8);
- }
- return;
-}
-
-void vpx_lpf_horizontal_8_neon(
- uint8_t *src,
- int pitch,
- const uint8_t *blimit,
- const uint8_t *limit,
- const uint8_t *thresh) {
- int i;
- uint8_t *s, *psrc;
- uint8x8_t dblimit, dlimit, dthresh;
- uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8;
- uint8x8_t d16u8, d17u8, d18u8;
-
- dblimit = vld1_u8(blimit);
- dlimit = vld1_u8(limit);
- dthresh = vld1_u8(thresh);
-
- psrc = src - (pitch << 2);
- for (i = 0; i < 1; i++) {
- s = psrc + i * 8;
-
- d3u8 = vld1_u8(s);
- s += pitch;
- d4u8 = vld1_u8(s);
- s += pitch;
- d5u8 = vld1_u8(s);
- s += pitch;
- d6u8 = vld1_u8(s);
- s += pitch;
- d7u8 = vld1_u8(s);
- s += pitch;
- d16u8 = vld1_u8(s);
- s += pitch;
- d17u8 = vld1_u8(s);
- s += pitch;
- d18u8 = vld1_u8(s);
-
- mbloop_filter_neon(dblimit, dlimit, dthresh,
- d3u8, d4u8, d5u8, d6u8, d7u8, d16u8, d17u8, d18u8,
- &d0u8, &d1u8, &d2u8, &d3u8, &d4u8, &d5u8);
-
- s -= (pitch * 6);
- vst1_u8(s, d0u8);
- s += pitch;
- vst1_u8(s, d1u8);
- s += pitch;
- vst1_u8(s, d2u8);
- s += pitch;
- vst1_u8(s, d3u8);
- s += pitch;
- vst1_u8(s, d4u8);
- s += pitch;
- vst1_u8(s, d5u8);
- }
- return;
-}
-
-void vpx_lpf_vertical_8_neon(
- uint8_t *src,
- int pitch,
- const uint8_t *blimit,
- const uint8_t *limit,
- const uint8_t *thresh) {
- int i;
- uint8_t *s;
- uint8x8_t dblimit, dlimit, dthresh;
- uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8;
- uint8x8_t d16u8, d17u8, d18u8;
- uint32x2x2_t d2tmp0, d2tmp1, d2tmp2, d2tmp3;
- uint16x4x2_t d2tmp4, d2tmp5, d2tmp6, d2tmp7;
- uint8x8x2_t d2tmp8, d2tmp9, d2tmp10, d2tmp11;
- uint8x8x4_t d4Result;
- uint8x8x2_t d2Result;
-
- dblimit = vld1_u8(blimit);
- dlimit = vld1_u8(limit);
- dthresh = vld1_u8(thresh);
-
- for (i = 0; i < 1; i++) {
- s = src + (i * (pitch << 3)) - 4;
-
- d3u8 = vld1_u8(s);
- s += pitch;
- d4u8 = vld1_u8(s);
- s += pitch;
- d5u8 = vld1_u8(s);
- s += pitch;
- d6u8 = vld1_u8(s);
- s += pitch;
- d7u8 = vld1_u8(s);
- s += pitch;
- d16u8 = vld1_u8(s);
- s += pitch;
- d17u8 = vld1_u8(s);
- s += pitch;
- d18u8 = vld1_u8(s);
-
- d2tmp0 = vtrn_u32(vreinterpret_u32_u8(d3u8),
- vreinterpret_u32_u8(d7u8));
- d2tmp1 = vtrn_u32(vreinterpret_u32_u8(d4u8),
- vreinterpret_u32_u8(d16u8));
- d2tmp2 = vtrn_u32(vreinterpret_u32_u8(d5u8),
- vreinterpret_u32_u8(d17u8));
- d2tmp3 = vtrn_u32(vreinterpret_u32_u8(d6u8),
- vreinterpret_u32_u8(d18u8));
-
- d2tmp4 = vtrn_u16(vreinterpret_u16_u32(d2tmp0.val[0]),
- vreinterpret_u16_u32(d2tmp2.val[0]));
- d2tmp5 = vtrn_u16(vreinterpret_u16_u32(d2tmp1.val[0]),
- vreinterpret_u16_u32(d2tmp3.val[0]));
- d2tmp6 = vtrn_u16(vreinterpret_u16_u32(d2tmp0.val[1]),
- vreinterpret_u16_u32(d2tmp2.val[1]));
- d2tmp7 = vtrn_u16(vreinterpret_u16_u32(d2tmp1.val[1]),
- vreinterpret_u16_u32(d2tmp3.val[1]));
-
- d2tmp8 = vtrn_u8(vreinterpret_u8_u16(d2tmp4.val[0]),
- vreinterpret_u8_u16(d2tmp5.val[0]));
- d2tmp9 = vtrn_u8(vreinterpret_u8_u16(d2tmp4.val[1]),
- vreinterpret_u8_u16(d2tmp5.val[1]));
- d2tmp10 = vtrn_u8(vreinterpret_u8_u16(d2tmp6.val[0]),
- vreinterpret_u8_u16(d2tmp7.val[0]));
- d2tmp11 = vtrn_u8(vreinterpret_u8_u16(d2tmp6.val[1]),
- vreinterpret_u8_u16(d2tmp7.val[1]));
-
- d3u8 = d2tmp8.val[0];
- d4u8 = d2tmp8.val[1];
- d5u8 = d2tmp9.val[0];
- d6u8 = d2tmp9.val[1];
- d7u8 = d2tmp10.val[0];
- d16u8 = d2tmp10.val[1];
- d17u8 = d2tmp11.val[0];
- d18u8 = d2tmp11.val[1];
-
- mbloop_filter_neon(dblimit, dlimit, dthresh,
- d3u8, d4u8, d5u8, d6u8, d7u8, d16u8, d17u8, d18u8,
- &d0u8, &d1u8, &d2u8, &d3u8, &d4u8, &d5u8);
-
- d4Result.val[0] = d0u8;
- d4Result.val[1] = d1u8;
- d4Result.val[2] = d2u8;
- d4Result.val[3] = d3u8;
-
- d2Result.val[0] = d4u8;
- d2Result.val[1] = d5u8;
-
- s = src - 3;
- vst4_lane_u8(s, d4Result, 0);
- s += pitch;
- vst4_lane_u8(s, d4Result, 1);
- s += pitch;
- vst4_lane_u8(s, d4Result, 2);
- s += pitch;
- vst4_lane_u8(s, d4Result, 3);
- s += pitch;
- vst4_lane_u8(s, d4Result, 4);
- s += pitch;
- vst4_lane_u8(s, d4Result, 5);
- s += pitch;
- vst4_lane_u8(s, d4Result, 6);
- s += pitch;
- vst4_lane_u8(s, d4Result, 7);
-
- s = src + 1;
- vst2_lane_u8(s, d2Result, 0);
- s += pitch;
- vst2_lane_u8(s, d2Result, 1);
- s += pitch;
- vst2_lane_u8(s, d2Result, 2);
- s += pitch;
- vst2_lane_u8(s, d2Result, 3);
- s += pitch;
- vst2_lane_u8(s, d2Result, 4);
- s += pitch;
- vst2_lane_u8(s, d2Result, 5);
- s += pitch;
- vst2_lane_u8(s, d2Result, 6);
- s += pitch;
- vst2_lane_u8(s, d2Result, 7);
- }
- return;
-}
diff --git a/thirdparty/libvpx/vpx_dsp/arm/loopfilter_neon.c b/thirdparty/libvpx/vpx_dsp/arm/loopfilter_neon.c
deleted file mode 100644
index aa31f29358..0000000000
--- a/thirdparty/libvpx/vpx_dsp/arm/loopfilter_neon.c
+++ /dev/null
@@ -1,58 +0,0 @@
-/*
- * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include <arm_neon.h>
-
-#include "./vpx_dsp_rtcd.h"
-#include "./vpx_config.h"
-#include "vpx/vpx_integer.h"
-
-void vpx_lpf_vertical_4_dual_neon(uint8_t *s, int p,
- const uint8_t *blimit0,
- const uint8_t *limit0,
- const uint8_t *thresh0,
- const uint8_t *blimit1,
- const uint8_t *limit1,
- const uint8_t *thresh1) {
- vpx_lpf_vertical_4_neon(s, p, blimit0, limit0, thresh0);
- vpx_lpf_vertical_4_neon(s + 8 * p, p, blimit1, limit1, thresh1);
-}
-
-#if HAVE_NEON_ASM
-void vpx_lpf_horizontal_8_dual_neon(uint8_t *s, int p /* pitch */,
- const uint8_t *blimit0,
- const uint8_t *limit0,
- const uint8_t *thresh0,
- const uint8_t *blimit1,
- const uint8_t *limit1,
- const uint8_t *thresh1) {
- vpx_lpf_horizontal_8_neon(s, p, blimit0, limit0, thresh0);
- vpx_lpf_horizontal_8_neon(s + 8, p, blimit1, limit1, thresh1);
-}
-
-void vpx_lpf_vertical_8_dual_neon(uint8_t *s, int p,
- const uint8_t *blimit0,
- const uint8_t *limit0,
- const uint8_t *thresh0,
- const uint8_t *blimit1,
- const uint8_t *limit1,
- const uint8_t *thresh1) {
- vpx_lpf_vertical_8_neon(s, p, blimit0, limit0, thresh0);
- vpx_lpf_vertical_8_neon(s + 8 * p, p, blimit1, limit1, thresh1);
-}
-
-void vpx_lpf_vertical_16_dual_neon(uint8_t *s, int p,
- const uint8_t *blimit,
- const uint8_t *limit,
- const uint8_t *thresh) {
- vpx_lpf_vertical_16_neon(s, p, blimit, limit, thresh);
- vpx_lpf_vertical_16_neon(s + 8 * p, p, blimit, limit, thresh);
-}
-#endif // HAVE_NEON_ASM
diff --git a/thirdparty/libvpx/vpx_dsp/arm/vpx_convolve8_avg_neon.c b/thirdparty/libvpx/vpx_dsp/arm/vpx_convolve8_avg_neon.c
deleted file mode 100644
index 8632250138..0000000000
--- a/thirdparty/libvpx/vpx_dsp/arm/vpx_convolve8_avg_neon.c
+++ /dev/null
@@ -1,373 +0,0 @@
-/*
- * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include <arm_neon.h>
-#include <assert.h>
-
-#include "./vpx_config.h"
-#include "./vpx_dsp_rtcd.h"
-#include "vpx/vpx_integer.h"
-#include "vpx_ports/mem.h"
-
-static INLINE int32x4_t MULTIPLY_BY_Q0(
- int16x4_t dsrc0,
- int16x4_t dsrc1,
- int16x4_t dsrc2,
- int16x4_t dsrc3,
- int16x4_t dsrc4,
- int16x4_t dsrc5,
- int16x4_t dsrc6,
- int16x4_t dsrc7,
- int16x8_t q0s16) {
- int32x4_t qdst;
- int16x4_t d0s16, d1s16;
-
- d0s16 = vget_low_s16(q0s16);
- d1s16 = vget_high_s16(q0s16);
-
- qdst = vmull_lane_s16(dsrc0, d0s16, 0);
- qdst = vmlal_lane_s16(qdst, dsrc1, d0s16, 1);
- qdst = vmlal_lane_s16(qdst, dsrc2, d0s16, 2);
- qdst = vmlal_lane_s16(qdst, dsrc3, d0s16, 3);
- qdst = vmlal_lane_s16(qdst, dsrc4, d1s16, 0);
- qdst = vmlal_lane_s16(qdst, dsrc5, d1s16, 1);
- qdst = vmlal_lane_s16(qdst, dsrc6, d1s16, 2);
- qdst = vmlal_lane_s16(qdst, dsrc7, d1s16, 3);
- return qdst;
-}
-
-void vpx_convolve8_avg_horiz_neon(
- const uint8_t *src,
- ptrdiff_t src_stride,
- uint8_t *dst,
- ptrdiff_t dst_stride,
- const int16_t *filter_x,
- int x_step_q4,
- const int16_t *filter_y, // unused
- int y_step_q4, // unused
- int w,
- int h) {
- int width;
- const uint8_t *s;
- uint8_t *d;
- uint8x8_t d2u8, d3u8, d24u8, d25u8, d26u8, d27u8, d28u8, d29u8;
- uint32x2_t d2u32, d3u32, d6u32, d7u32, d28u32, d29u32, d30u32, d31u32;
- uint8x16_t q1u8, q3u8, q12u8, q13u8, q14u8, q15u8;
- int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d22s16, d23s16;
- int16x4_t d24s16, d25s16, d26s16, d27s16;
- uint16x4_t d2u16, d3u16, d4u16, d5u16, d16u16, d17u16, d18u16, d19u16;
- int16x8_t q0s16;
- uint16x8_t q1u16, q2u16, q8u16, q9u16, q10u16, q11u16, q12u16, q13u16;
- int32x4_t q1s32, q2s32, q14s32, q15s32;
- uint16x8x2_t q0x2u16;
- uint8x8x2_t d0x2u8, d1x2u8;
- uint32x2x2_t d0x2u32;
- uint16x4x2_t d0x2u16, d1x2u16;
- uint32x4x2_t q0x2u32;
-
- assert(x_step_q4 == 16);
-
- q0s16 = vld1q_s16(filter_x);
-
- src -= 3; // adjust for taps
- for (; h > 0; h -= 4) { // loop_horiz_v
- s = src;
- d24u8 = vld1_u8(s);
- s += src_stride;
- d25u8 = vld1_u8(s);
- s += src_stride;
- d26u8 = vld1_u8(s);
- s += src_stride;
- d27u8 = vld1_u8(s);
-
- q12u8 = vcombine_u8(d24u8, d25u8);
- q13u8 = vcombine_u8(d26u8, d27u8);
-
- q0x2u16 = vtrnq_u16(vreinterpretq_u16_u8(q12u8),
- vreinterpretq_u16_u8(q13u8));
- d24u8 = vreinterpret_u8_u16(vget_low_u16(q0x2u16.val[0]));
- d25u8 = vreinterpret_u8_u16(vget_high_u16(q0x2u16.val[0]));
- d26u8 = vreinterpret_u8_u16(vget_low_u16(q0x2u16.val[1]));
- d27u8 = vreinterpret_u8_u16(vget_high_u16(q0x2u16.val[1]));
- d0x2u8 = vtrn_u8(d24u8, d25u8);
- d1x2u8 = vtrn_u8(d26u8, d27u8);
-
- __builtin_prefetch(src + src_stride * 4);
- __builtin_prefetch(src + src_stride * 5);
-
- q8u16 = vmovl_u8(d0x2u8.val[0]);
- q9u16 = vmovl_u8(d0x2u8.val[1]);
- q10u16 = vmovl_u8(d1x2u8.val[0]);
- q11u16 = vmovl_u8(d1x2u8.val[1]);
-
- src += 7;
- d16u16 = vget_low_u16(q8u16);
- d17u16 = vget_high_u16(q8u16);
- d18u16 = vget_low_u16(q9u16);
- d19u16 = vget_high_u16(q9u16);
- q8u16 = vcombine_u16(d16u16, d18u16); // vswp 17 18
- q9u16 = vcombine_u16(d17u16, d19u16);
-
- d20s16 = vreinterpret_s16_u16(vget_low_u16(q10u16));
- d23s16 = vreinterpret_s16_u16(vget_high_u16(q10u16)); // vmov 23 21
- for (width = w;
- width > 0;
- width -= 4, src += 4, dst += 4) { // loop_horiz
- s = src;
- d28u32 = vld1_dup_u32((const uint32_t *)s);
- s += src_stride;
- d29u32 = vld1_dup_u32((const uint32_t *)s);
- s += src_stride;
- d31u32 = vld1_dup_u32((const uint32_t *)s);
- s += src_stride;
- d30u32 = vld1_dup_u32((const uint32_t *)s);
-
- __builtin_prefetch(src + 64);
-
- d0x2u16 = vtrn_u16(vreinterpret_u16_u32(d28u32),
- vreinterpret_u16_u32(d31u32));
- d1x2u16 = vtrn_u16(vreinterpret_u16_u32(d29u32),
- vreinterpret_u16_u32(d30u32));
- d0x2u8 = vtrn_u8(vreinterpret_u8_u16(d0x2u16.val[0]), // d28
- vreinterpret_u8_u16(d1x2u16.val[0])); // d29
- d1x2u8 = vtrn_u8(vreinterpret_u8_u16(d0x2u16.val[1]), // d31
- vreinterpret_u8_u16(d1x2u16.val[1])); // d30
-
- __builtin_prefetch(src + 64 + src_stride);
-
- q14u8 = vcombine_u8(d0x2u8.val[0], d0x2u8.val[1]);
- q15u8 = vcombine_u8(d1x2u8.val[1], d1x2u8.val[0]);
- q0x2u32 = vtrnq_u32(vreinterpretq_u32_u8(q14u8),
- vreinterpretq_u32_u8(q15u8));
-
- d28u8 = vreinterpret_u8_u32(vget_low_u32(q0x2u32.val[0]));
- d29u8 = vreinterpret_u8_u32(vget_high_u32(q0x2u32.val[0]));
- q12u16 = vmovl_u8(d28u8);
- q13u16 = vmovl_u8(d29u8);
-
- __builtin_prefetch(src + 64 + src_stride * 2);
-
- d = dst;
- d6u32 = vld1_lane_u32((const uint32_t *)d, d6u32, 0);
- d += dst_stride;
- d7u32 = vld1_lane_u32((const uint32_t *)d, d7u32, 0);
- d += dst_stride;
- d6u32 = vld1_lane_u32((const uint32_t *)d, d6u32, 1);
- d += dst_stride;
- d7u32 = vld1_lane_u32((const uint32_t *)d, d7u32, 1);
-
- d16s16 = vreinterpret_s16_u16(vget_low_u16(q8u16));
- d17s16 = vreinterpret_s16_u16(vget_high_u16(q8u16));
- d18s16 = vreinterpret_s16_u16(vget_low_u16(q9u16));
- d19s16 = vreinterpret_s16_u16(vget_high_u16(q9u16));
- d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
- d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
- d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
- d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
- d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));
-
- q1s32 = MULTIPLY_BY_Q0(d16s16, d17s16, d20s16, d22s16,
- d18s16, d19s16, d23s16, d24s16, q0s16);
- q2s32 = MULTIPLY_BY_Q0(d17s16, d20s16, d22s16, d18s16,
- d19s16, d23s16, d24s16, d26s16, q0s16);
- q14s32 = MULTIPLY_BY_Q0(d20s16, d22s16, d18s16, d19s16,
- d23s16, d24s16, d26s16, d27s16, q0s16);
- q15s32 = MULTIPLY_BY_Q0(d22s16, d18s16, d19s16, d23s16,
- d24s16, d26s16, d27s16, d25s16, q0s16);
-
- __builtin_prefetch(src + 64 + src_stride * 3);
-
- d2u16 = vqrshrun_n_s32(q1s32, 7);
- d3u16 = vqrshrun_n_s32(q2s32, 7);
- d4u16 = vqrshrun_n_s32(q14s32, 7);
- d5u16 = vqrshrun_n_s32(q15s32, 7);
-
- q1u16 = vcombine_u16(d2u16, d3u16);
- q2u16 = vcombine_u16(d4u16, d5u16);
-
- d2u8 = vqmovn_u16(q1u16);
- d3u8 = vqmovn_u16(q2u16);
-
- d0x2u16 = vtrn_u16(vreinterpret_u16_u8(d2u8),
- vreinterpret_u16_u8(d3u8));
- d0x2u32 = vtrn_u32(vreinterpret_u32_u16(d0x2u16.val[0]),
- vreinterpret_u32_u16(d0x2u16.val[1]));
- d0x2u8 = vtrn_u8(vreinterpret_u8_u32(d0x2u32.val[0]),
- vreinterpret_u8_u32(d0x2u32.val[1]));
-
- q1u8 = vcombine_u8(d0x2u8.val[0], d0x2u8.val[1]);
- q3u8 = vreinterpretq_u8_u32(vcombine_u32(d6u32, d7u32));
-
- q1u8 = vrhaddq_u8(q1u8, q3u8);
-
- d2u32 = vreinterpret_u32_u8(vget_low_u8(q1u8));
- d3u32 = vreinterpret_u32_u8(vget_high_u8(q1u8));
-
- d = dst;
- vst1_lane_u32((uint32_t *)d, d2u32, 0);
- d += dst_stride;
- vst1_lane_u32((uint32_t *)d, d3u32, 0);
- d += dst_stride;
- vst1_lane_u32((uint32_t *)d, d2u32, 1);
- d += dst_stride;
- vst1_lane_u32((uint32_t *)d, d3u32, 1);
-
- q8u16 = q9u16;
- d20s16 = d23s16;
- q11u16 = q12u16;
- q9u16 = q13u16;
- d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
- }
- src += src_stride * 4 - w - 7;
- dst += dst_stride * 4 - w;
- }
- return;
-}
-
-void vpx_convolve8_avg_vert_neon(
- const uint8_t *src,
- ptrdiff_t src_stride,
- uint8_t *dst,
- ptrdiff_t dst_stride,
- const int16_t *filter_x, // unused
- int x_step_q4, // unused
- const int16_t *filter_y,
- int y_step_q4,
- int w,
- int h) {
- int height;
- const uint8_t *s;
- uint8_t *d;
- uint8x8_t d2u8, d3u8;
- uint32x2_t d2u32, d3u32, d6u32, d7u32;
- uint32x2_t d16u32, d18u32, d20u32, d22u32, d24u32, d26u32;
- uint8x16_t q1u8, q3u8;
- int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16;
- int16x4_t d24s16, d25s16, d26s16, d27s16;
- uint16x4_t d2u16, d3u16, d4u16, d5u16;
- int16x8_t q0s16;
- uint16x8_t q1u16, q2u16, q8u16, q9u16, q10u16, q11u16, q12u16, q13u16;
- int32x4_t q1s32, q2s32, q14s32, q15s32;
-
- assert(y_step_q4 == 16);
-
- src -= src_stride * 3;
- q0s16 = vld1q_s16(filter_y);
- for (; w > 0; w -= 4, src += 4, dst += 4) { // loop_vert_h
- s = src;
- d16u32 = vld1_lane_u32((const uint32_t *)s, d16u32, 0);
- s += src_stride;
- d16u32 = vld1_lane_u32((const uint32_t *)s, d16u32, 1);
- s += src_stride;
- d18u32 = vld1_lane_u32((const uint32_t *)s, d18u32, 0);
- s += src_stride;
- d18u32 = vld1_lane_u32((const uint32_t *)s, d18u32, 1);
- s += src_stride;
- d20u32 = vld1_lane_u32((const uint32_t *)s, d20u32, 0);
- s += src_stride;
- d20u32 = vld1_lane_u32((const uint32_t *)s, d20u32, 1);
- s += src_stride;
- d22u32 = vld1_lane_u32((const uint32_t *)s, d22u32, 0);
- s += src_stride;
-
- q8u16 = vmovl_u8(vreinterpret_u8_u32(d16u32));
- q9u16 = vmovl_u8(vreinterpret_u8_u32(d18u32));
- q10u16 = vmovl_u8(vreinterpret_u8_u32(d20u32));
- q11u16 = vmovl_u8(vreinterpret_u8_u32(d22u32));
-
- d18s16 = vreinterpret_s16_u16(vget_low_u16(q9u16));
- d19s16 = vreinterpret_s16_u16(vget_high_u16(q9u16));
- d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
- d = dst;
- for (height = h; height > 0; height -= 4) { // loop_vert
- d24u32 = vld1_lane_u32((const uint32_t *)s, d24u32, 0);
- s += src_stride;
- d26u32 = vld1_lane_u32((const uint32_t *)s, d26u32, 0);
- s += src_stride;
- d26u32 = vld1_lane_u32((const uint32_t *)s, d26u32, 1);
- s += src_stride;
- d24u32 = vld1_lane_u32((const uint32_t *)s, d24u32, 1);
- s += src_stride;
-
- q12u16 = vmovl_u8(vreinterpret_u8_u32(d24u32));
- q13u16 = vmovl_u8(vreinterpret_u8_u32(d26u32));
-
- d6u32 = vld1_lane_u32((const uint32_t *)d, d6u32, 0);
- d += dst_stride;
- d6u32 = vld1_lane_u32((const uint32_t *)d, d6u32, 1);
- d += dst_stride;
- d7u32 = vld1_lane_u32((const uint32_t *)d, d7u32, 0);
- d += dst_stride;
- d7u32 = vld1_lane_u32((const uint32_t *)d, d7u32, 1);
- d -= dst_stride * 3;
-
- d16s16 = vreinterpret_s16_u16(vget_low_u16(q8u16));
- d17s16 = vreinterpret_s16_u16(vget_high_u16(q8u16));
- d20s16 = vreinterpret_s16_u16(vget_low_u16(q10u16));
- d21s16 = vreinterpret_s16_u16(vget_high_u16(q10u16));
- d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
- d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
- d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
- d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));
-
- __builtin_prefetch(s);
- __builtin_prefetch(s + src_stride);
- q1s32 = MULTIPLY_BY_Q0(d16s16, d17s16, d18s16, d19s16,
- d20s16, d21s16, d22s16, d24s16, q0s16);
- __builtin_prefetch(s + src_stride * 2);
- __builtin_prefetch(s + src_stride * 3);
- q2s32 = MULTIPLY_BY_Q0(d17s16, d18s16, d19s16, d20s16,
- d21s16, d22s16, d24s16, d26s16, q0s16);
- __builtin_prefetch(d);
- __builtin_prefetch(d + dst_stride);
- q14s32 = MULTIPLY_BY_Q0(d18s16, d19s16, d20s16, d21s16,
- d22s16, d24s16, d26s16, d27s16, q0s16);
- __builtin_prefetch(d + dst_stride * 2);
- __builtin_prefetch(d + dst_stride * 3);
- q15s32 = MULTIPLY_BY_Q0(d19s16, d20s16, d21s16, d22s16,
- d24s16, d26s16, d27s16, d25s16, q0s16);
-
- d2u16 = vqrshrun_n_s32(q1s32, 7);
- d3u16 = vqrshrun_n_s32(q2s32, 7);
- d4u16 = vqrshrun_n_s32(q14s32, 7);
- d5u16 = vqrshrun_n_s32(q15s32, 7);
-
- q1u16 = vcombine_u16(d2u16, d3u16);
- q2u16 = vcombine_u16(d4u16, d5u16);
-
- d2u8 = vqmovn_u16(q1u16);
- d3u8 = vqmovn_u16(q2u16);
-
- q1u8 = vcombine_u8(d2u8, d3u8);
- q3u8 = vreinterpretq_u8_u32(vcombine_u32(d6u32, d7u32));
-
- q1u8 = vrhaddq_u8(q1u8, q3u8);
-
- d2u32 = vreinterpret_u32_u8(vget_low_u8(q1u8));
- d3u32 = vreinterpret_u32_u8(vget_high_u8(q1u8));
-
- vst1_lane_u32((uint32_t *)d, d2u32, 0);
- d += dst_stride;
- vst1_lane_u32((uint32_t *)d, d2u32, 1);
- d += dst_stride;
- vst1_lane_u32((uint32_t *)d, d3u32, 0);
- d += dst_stride;
- vst1_lane_u32((uint32_t *)d, d3u32, 1);
- d += dst_stride;
-
- q8u16 = q10u16;
- d18s16 = d22s16;
- d19s16 = d24s16;
- q10u16 = q13u16;
- d22s16 = d25s16;
- }
- }
- return;
-}
diff --git a/thirdparty/libvpx/vpx_dsp/arm/vpx_convolve8_neon.c b/thirdparty/libvpx/vpx_dsp/arm/vpx_convolve8_neon.c
deleted file mode 100644
index 9bd715e2c6..0000000000
--- a/thirdparty/libvpx/vpx_dsp/arm/vpx_convolve8_neon.c
+++ /dev/null
@@ -1,340 +0,0 @@
-/*
- * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include <arm_neon.h>
-#include <assert.h>
-
-#include "./vpx_config.h"
-#include "./vpx_dsp_rtcd.h"
-#include "vpx/vpx_integer.h"
-#include "vpx_ports/mem.h"
-
-static INLINE int32x4_t MULTIPLY_BY_Q0(
- int16x4_t dsrc0,
- int16x4_t dsrc1,
- int16x4_t dsrc2,
- int16x4_t dsrc3,
- int16x4_t dsrc4,
- int16x4_t dsrc5,
- int16x4_t dsrc6,
- int16x4_t dsrc7,
- int16x8_t q0s16) {
- int32x4_t qdst;
- int16x4_t d0s16, d1s16;
-
- d0s16 = vget_low_s16(q0s16);
- d1s16 = vget_high_s16(q0s16);
-
- qdst = vmull_lane_s16(dsrc0, d0s16, 0);
- qdst = vmlal_lane_s16(qdst, dsrc1, d0s16, 1);
- qdst = vmlal_lane_s16(qdst, dsrc2, d0s16, 2);
- qdst = vmlal_lane_s16(qdst, dsrc3, d0s16, 3);
- qdst = vmlal_lane_s16(qdst, dsrc4, d1s16, 0);
- qdst = vmlal_lane_s16(qdst, dsrc5, d1s16, 1);
- qdst = vmlal_lane_s16(qdst, dsrc6, d1s16, 2);
- qdst = vmlal_lane_s16(qdst, dsrc7, d1s16, 3);
- return qdst;
-}
-
-void vpx_convolve8_horiz_neon(
- const uint8_t *src,
- ptrdiff_t src_stride,
- uint8_t *dst,
- ptrdiff_t dst_stride,
- const int16_t *filter_x,
- int x_step_q4,
- const int16_t *filter_y, // unused
- int y_step_q4, // unused
- int w,
- int h) {
- int width;
- const uint8_t *s, *psrc;
- uint8_t *d, *pdst;
- uint8x8_t d2u8, d3u8, d24u8, d25u8, d26u8, d27u8, d28u8, d29u8;
- uint32x2_t d2u32, d3u32, d28u32, d29u32, d30u32, d31u32;
- uint8x16_t q12u8, q13u8, q14u8, q15u8;
- int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d22s16, d23s16;
- int16x4_t d24s16, d25s16, d26s16, d27s16;
- uint16x4_t d2u16, d3u16, d4u16, d5u16, d16u16, d17u16, d18u16, d19u16;
- int16x8_t q0s16;
- uint16x8_t q1u16, q2u16, q8u16, q9u16, q10u16, q11u16, q12u16, q13u16;
- int32x4_t q1s32, q2s32, q14s32, q15s32;
- uint16x8x2_t q0x2u16;
- uint8x8x2_t d0x2u8, d1x2u8;
- uint32x2x2_t d0x2u32;
- uint16x4x2_t d0x2u16, d1x2u16;
- uint32x4x2_t q0x2u32;
-
- assert(x_step_q4 == 16);
-
- q0s16 = vld1q_s16(filter_x);
-
- src -= 3; // adjust for taps
- for (; h > 0; h -= 4,
- src += src_stride * 4,
- dst += dst_stride * 4) { // loop_horiz_v
- s = src;
- d24u8 = vld1_u8(s);
- s += src_stride;
- d25u8 = vld1_u8(s);
- s += src_stride;
- d26u8 = vld1_u8(s);
- s += src_stride;
- d27u8 = vld1_u8(s);
-
- q12u8 = vcombine_u8(d24u8, d25u8);
- q13u8 = vcombine_u8(d26u8, d27u8);
-
- q0x2u16 = vtrnq_u16(vreinterpretq_u16_u8(q12u8),
- vreinterpretq_u16_u8(q13u8));
- d24u8 = vreinterpret_u8_u16(vget_low_u16(q0x2u16.val[0]));
- d25u8 = vreinterpret_u8_u16(vget_high_u16(q0x2u16.val[0]));
- d26u8 = vreinterpret_u8_u16(vget_low_u16(q0x2u16.val[1]));
- d27u8 = vreinterpret_u8_u16(vget_high_u16(q0x2u16.val[1]));
- d0x2u8 = vtrn_u8(d24u8, d25u8);
- d1x2u8 = vtrn_u8(d26u8, d27u8);
-
- __builtin_prefetch(src + src_stride * 4);
- __builtin_prefetch(src + src_stride * 5);
- __builtin_prefetch(src + src_stride * 6);
-
- q8u16 = vmovl_u8(d0x2u8.val[0]);
- q9u16 = vmovl_u8(d0x2u8.val[1]);
- q10u16 = vmovl_u8(d1x2u8.val[0]);
- q11u16 = vmovl_u8(d1x2u8.val[1]);
-
- d16u16 = vget_low_u16(q8u16);
- d17u16 = vget_high_u16(q8u16);
- d18u16 = vget_low_u16(q9u16);
- d19u16 = vget_high_u16(q9u16);
- q8u16 = vcombine_u16(d16u16, d18u16); // vswp 17 18
- q9u16 = vcombine_u16(d17u16, d19u16);
-
- d20s16 = vreinterpret_s16_u16(vget_low_u16(q10u16));
- d23s16 = vreinterpret_s16_u16(vget_high_u16(q10u16)); // vmov 23 21
- for (width = w, psrc = src + 7, pdst = dst;
- width > 0;
- width -= 4, psrc += 4, pdst += 4) { // loop_horiz
- s = psrc;
- d28u32 = vld1_dup_u32((const uint32_t *)s);
- s += src_stride;
- d29u32 = vld1_dup_u32((const uint32_t *)s);
- s += src_stride;
- d31u32 = vld1_dup_u32((const uint32_t *)s);
- s += src_stride;
- d30u32 = vld1_dup_u32((const uint32_t *)s);
-
- __builtin_prefetch(psrc + 64);
-
- d0x2u16 = vtrn_u16(vreinterpret_u16_u32(d28u32),
- vreinterpret_u16_u32(d31u32));
- d1x2u16 = vtrn_u16(vreinterpret_u16_u32(d29u32),
- vreinterpret_u16_u32(d30u32));
- d0x2u8 = vtrn_u8(vreinterpret_u8_u16(d0x2u16.val[0]), // d28
- vreinterpret_u8_u16(d1x2u16.val[0])); // d29
- d1x2u8 = vtrn_u8(vreinterpret_u8_u16(d0x2u16.val[1]), // d31
- vreinterpret_u8_u16(d1x2u16.val[1])); // d30
-
- __builtin_prefetch(psrc + 64 + src_stride);
-
- q14u8 = vcombine_u8(d0x2u8.val[0], d0x2u8.val[1]);
- q15u8 = vcombine_u8(d1x2u8.val[1], d1x2u8.val[0]);
- q0x2u32 = vtrnq_u32(vreinterpretq_u32_u8(q14u8),
- vreinterpretq_u32_u8(q15u8));
-
- d28u8 = vreinterpret_u8_u32(vget_low_u32(q0x2u32.val[0]));
- d29u8 = vreinterpret_u8_u32(vget_high_u32(q0x2u32.val[0]));
- q12u16 = vmovl_u8(d28u8);
- q13u16 = vmovl_u8(d29u8);
-
- __builtin_prefetch(psrc + 64 + src_stride * 2);
-
- d16s16 = vreinterpret_s16_u16(vget_low_u16(q8u16));
- d17s16 = vreinterpret_s16_u16(vget_high_u16(q8u16));
- d18s16 = vreinterpret_s16_u16(vget_low_u16(q9u16));
- d19s16 = vreinterpret_s16_u16(vget_high_u16(q9u16));
- d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
- d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
- d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
- d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
- d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));
-
- q1s32 = MULTIPLY_BY_Q0(d16s16, d17s16, d20s16, d22s16,
- d18s16, d19s16, d23s16, d24s16, q0s16);
- q2s32 = MULTIPLY_BY_Q0(d17s16, d20s16, d22s16, d18s16,
- d19s16, d23s16, d24s16, d26s16, q0s16);
- q14s32 = MULTIPLY_BY_Q0(d20s16, d22s16, d18s16, d19s16,
- d23s16, d24s16, d26s16, d27s16, q0s16);
- q15s32 = MULTIPLY_BY_Q0(d22s16, d18s16, d19s16, d23s16,
- d24s16, d26s16, d27s16, d25s16, q0s16);
-
- __builtin_prefetch(psrc + 60 + src_stride * 3);
-
- d2u16 = vqrshrun_n_s32(q1s32, 7);
- d3u16 = vqrshrun_n_s32(q2s32, 7);
- d4u16 = vqrshrun_n_s32(q14s32, 7);
- d5u16 = vqrshrun_n_s32(q15s32, 7);
-
- q1u16 = vcombine_u16(d2u16, d3u16);
- q2u16 = vcombine_u16(d4u16, d5u16);
-
- d2u8 = vqmovn_u16(q1u16);
- d3u8 = vqmovn_u16(q2u16);
-
- d0x2u16 = vtrn_u16(vreinterpret_u16_u8(d2u8),
- vreinterpret_u16_u8(d3u8));
- d0x2u32 = vtrn_u32(vreinterpret_u32_u16(d0x2u16.val[0]),
- vreinterpret_u32_u16(d0x2u16.val[1]));
- d0x2u8 = vtrn_u8(vreinterpret_u8_u32(d0x2u32.val[0]),
- vreinterpret_u8_u32(d0x2u32.val[1]));
-
- d2u32 = vreinterpret_u32_u8(d0x2u8.val[0]);
- d3u32 = vreinterpret_u32_u8(d0x2u8.val[1]);
-
- d = pdst;
- vst1_lane_u32((uint32_t *)d, d2u32, 0);
- d += dst_stride;
- vst1_lane_u32((uint32_t *)d, d3u32, 0);
- d += dst_stride;
- vst1_lane_u32((uint32_t *)d, d2u32, 1);
- d += dst_stride;
- vst1_lane_u32((uint32_t *)d, d3u32, 1);
-
- q8u16 = q9u16;
- d20s16 = d23s16;
- q11u16 = q12u16;
- q9u16 = q13u16;
- d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
- }
- }
- return;
-}
-
-void vpx_convolve8_vert_neon(
- const uint8_t *src,
- ptrdiff_t src_stride,
- uint8_t *dst,
- ptrdiff_t dst_stride,
- const int16_t *filter_x, // unused
- int x_step_q4, // unused
- const int16_t *filter_y,
- int y_step_q4,
- int w,
- int h) {
- int height;
- const uint8_t *s;
- uint8_t *d;
- uint32x2_t d2u32, d3u32;
- uint32x2_t d16u32, d18u32, d20u32, d22u32, d24u32, d26u32;
- int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16;
- int16x4_t d24s16, d25s16, d26s16, d27s16;
- uint16x4_t d2u16, d3u16, d4u16, d5u16;
- int16x8_t q0s16;
- uint16x8_t q1u16, q2u16, q8u16, q9u16, q10u16, q11u16, q12u16, q13u16;
- int32x4_t q1s32, q2s32, q14s32, q15s32;
-
- assert(y_step_q4 == 16);
-
- src -= src_stride * 3;
- q0s16 = vld1q_s16(filter_y);
- for (; w > 0; w -= 4, src += 4, dst += 4) { // loop_vert_h
- s = src;
- d16u32 = vld1_lane_u32((const uint32_t *)s, d16u32, 0);
- s += src_stride;
- d16u32 = vld1_lane_u32((const uint32_t *)s, d16u32, 1);
- s += src_stride;
- d18u32 = vld1_lane_u32((const uint32_t *)s, d18u32, 0);
- s += src_stride;
- d18u32 = vld1_lane_u32((const uint32_t *)s, d18u32, 1);
- s += src_stride;
- d20u32 = vld1_lane_u32((const uint32_t *)s, d20u32, 0);
- s += src_stride;
- d20u32 = vld1_lane_u32((const uint32_t *)s, d20u32, 1);
- s += src_stride;
- d22u32 = vld1_lane_u32((const uint32_t *)s, d22u32, 0);
- s += src_stride;
-
- q8u16 = vmovl_u8(vreinterpret_u8_u32(d16u32));
- q9u16 = vmovl_u8(vreinterpret_u8_u32(d18u32));
- q10u16 = vmovl_u8(vreinterpret_u8_u32(d20u32));
- q11u16 = vmovl_u8(vreinterpret_u8_u32(d22u32));
-
- d18s16 = vreinterpret_s16_u16(vget_low_u16(q9u16));
- d19s16 = vreinterpret_s16_u16(vget_high_u16(q9u16));
- d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
- d = dst;
- for (height = h; height > 0; height -= 4) { // loop_vert
- d24u32 = vld1_lane_u32((const uint32_t *)s, d24u32, 0);
- s += src_stride;
- d26u32 = vld1_lane_u32((const uint32_t *)s, d26u32, 0);
- s += src_stride;
- d26u32 = vld1_lane_u32((const uint32_t *)s, d26u32, 1);
- s += src_stride;
- d24u32 = vld1_lane_u32((const uint32_t *)s, d24u32, 1);
- s += src_stride;
-
- q12u16 = vmovl_u8(vreinterpret_u8_u32(d24u32));
- q13u16 = vmovl_u8(vreinterpret_u8_u32(d26u32));
-
- d16s16 = vreinterpret_s16_u16(vget_low_u16(q8u16));
- d17s16 = vreinterpret_s16_u16(vget_high_u16(q8u16));
- d20s16 = vreinterpret_s16_u16(vget_low_u16(q10u16));
- d21s16 = vreinterpret_s16_u16(vget_high_u16(q10u16));
- d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
- d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
- d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
- d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));
-
- __builtin_prefetch(d);
- __builtin_prefetch(d + dst_stride);
- q1s32 = MULTIPLY_BY_Q0(d16s16, d17s16, d18s16, d19s16,
- d20s16, d21s16, d22s16, d24s16, q0s16);
- __builtin_prefetch(d + dst_stride * 2);
- __builtin_prefetch(d + dst_stride * 3);
- q2s32 = MULTIPLY_BY_Q0(d17s16, d18s16, d19s16, d20s16,
- d21s16, d22s16, d24s16, d26s16, q0s16);
- __builtin_prefetch(s);
- __builtin_prefetch(s + src_stride);
- q14s32 = MULTIPLY_BY_Q0(d18s16, d19s16, d20s16, d21s16,
- d22s16, d24s16, d26s16, d27s16, q0s16);
- __builtin_prefetch(s + src_stride * 2);
- __builtin_prefetch(s + src_stride * 3);
- q15s32 = MULTIPLY_BY_Q0(d19s16, d20s16, d21s16, d22s16,
- d24s16, d26s16, d27s16, d25s16, q0s16);
-
- d2u16 = vqrshrun_n_s32(q1s32, 7);
- d3u16 = vqrshrun_n_s32(q2s32, 7);
- d4u16 = vqrshrun_n_s32(q14s32, 7);
- d5u16 = vqrshrun_n_s32(q15s32, 7);
-
- q1u16 = vcombine_u16(d2u16, d3u16);
- q2u16 = vcombine_u16(d4u16, d5u16);
-
- d2u32 = vreinterpret_u32_u8(vqmovn_u16(q1u16));
- d3u32 = vreinterpret_u32_u8(vqmovn_u16(q2u16));
-
- vst1_lane_u32((uint32_t *)d, d2u32, 0);
- d += dst_stride;
- vst1_lane_u32((uint32_t *)d, d2u32, 1);
- d += dst_stride;
- vst1_lane_u32((uint32_t *)d, d3u32, 0);
- d += dst_stride;
- vst1_lane_u32((uint32_t *)d, d3u32, 1);
- d += dst_stride;
-
- q8u16 = q10u16;
- d18s16 = d22s16;
- d19s16 = d24s16;
- q10u16 = q13u16;
- d22s16 = d25s16;
- }
- }
- return;
-}
diff --git a/thirdparty/libvpx/vpx_dsp/arm/vpx_convolve_avg_neon.c b/thirdparty/libvpx/vpx_dsp/arm/vpx_convolve_avg_neon.c
deleted file mode 100644
index dc58a332f8..0000000000
--- a/thirdparty/libvpx/vpx_dsp/arm/vpx_convolve_avg_neon.c
+++ /dev/null
@@ -1,147 +0,0 @@
-/*
- * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include <arm_neon.h>
-
-#include "./vpx_dsp_rtcd.h"
-#include "vpx/vpx_integer.h"
-
-void vpx_convolve_avg_neon(
- const uint8_t *src, // r0
- ptrdiff_t src_stride, // r1
- uint8_t *dst, // r2
- ptrdiff_t dst_stride, // r3
- const int16_t *filter_x,
- int filter_x_stride,
- const int16_t *filter_y,
- int filter_y_stride,
- int w,
- int h) {
- uint8_t *d;
- uint8x8_t d0u8, d1u8, d2u8, d3u8;
- uint32x2_t d0u32, d2u32;
- uint8x16_t q0u8, q1u8, q2u8, q3u8, q8u8, q9u8, q10u8, q11u8;
- (void)filter_x; (void)filter_x_stride;
- (void)filter_y; (void)filter_y_stride;
-
- d = dst;
- if (w > 32) { // avg64
- for (; h > 0; h -= 1) {
- q0u8 = vld1q_u8(src);
- q1u8 = vld1q_u8(src + 16);
- q2u8 = vld1q_u8(src + 32);
- q3u8 = vld1q_u8(src + 48);
- src += src_stride;
- q8u8 = vld1q_u8(d);
- q9u8 = vld1q_u8(d + 16);
- q10u8 = vld1q_u8(d + 32);
- q11u8 = vld1q_u8(d + 48);
- d += dst_stride;
-
- q0u8 = vrhaddq_u8(q0u8, q8u8);
- q1u8 = vrhaddq_u8(q1u8, q9u8);
- q2u8 = vrhaddq_u8(q2u8, q10u8);
- q3u8 = vrhaddq_u8(q3u8, q11u8);
-
- vst1q_u8(dst, q0u8);
- vst1q_u8(dst + 16, q1u8);
- vst1q_u8(dst + 32, q2u8);
- vst1q_u8(dst + 48, q3u8);
- dst += dst_stride;
- }
- } else if (w == 32) { // avg32
- for (; h > 0; h -= 2) {
- q0u8 = vld1q_u8(src);
- q1u8 = vld1q_u8(src + 16);
- src += src_stride;
- q2u8 = vld1q_u8(src);
- q3u8 = vld1q_u8(src + 16);
- src += src_stride;
- q8u8 = vld1q_u8(d);
- q9u8 = vld1q_u8(d + 16);
- d += dst_stride;
- q10u8 = vld1q_u8(d);
- q11u8 = vld1q_u8(d + 16);
- d += dst_stride;
-
- q0u8 = vrhaddq_u8(q0u8, q8u8);
- q1u8 = vrhaddq_u8(q1u8, q9u8);
- q2u8 = vrhaddq_u8(q2u8, q10u8);
- q3u8 = vrhaddq_u8(q3u8, q11u8);
-
- vst1q_u8(dst, q0u8);
- vst1q_u8(dst + 16, q1u8);
- dst += dst_stride;
- vst1q_u8(dst, q2u8);
- vst1q_u8(dst + 16, q3u8);
- dst += dst_stride;
- }
- } else if (w > 8) { // avg16
- for (; h > 0; h -= 2) {
- q0u8 = vld1q_u8(src);
- src += src_stride;
- q1u8 = vld1q_u8(src);
- src += src_stride;
- q2u8 = vld1q_u8(d);
- d += dst_stride;
- q3u8 = vld1q_u8(d);
- d += dst_stride;
-
- q0u8 = vrhaddq_u8(q0u8, q2u8);
- q1u8 = vrhaddq_u8(q1u8, q3u8);
-
- vst1q_u8(dst, q0u8);
- dst += dst_stride;
- vst1q_u8(dst, q1u8);
- dst += dst_stride;
- }
- } else if (w == 8) { // avg8
- for (; h > 0; h -= 2) {
- d0u8 = vld1_u8(src);
- src += src_stride;
- d1u8 = vld1_u8(src);
- src += src_stride;
- d2u8 = vld1_u8(d);
- d += dst_stride;
- d3u8 = vld1_u8(d);
- d += dst_stride;
-
- q0u8 = vcombine_u8(d0u8, d1u8);
- q1u8 = vcombine_u8(d2u8, d3u8);
- q0u8 = vrhaddq_u8(q0u8, q1u8);
-
- vst1_u8(dst, vget_low_u8(q0u8));
- dst += dst_stride;
- vst1_u8(dst, vget_high_u8(q0u8));
- dst += dst_stride;
- }
- } else { // avg4
- for (; h > 0; h -= 2) {
- d0u32 = vld1_lane_u32((const uint32_t *)src, d0u32, 0);
- src += src_stride;
- d0u32 = vld1_lane_u32((const uint32_t *)src, d0u32, 1);
- src += src_stride;
- d2u32 = vld1_lane_u32((const uint32_t *)d, d2u32, 0);
- d += dst_stride;
- d2u32 = vld1_lane_u32((const uint32_t *)d, d2u32, 1);
- d += dst_stride;
-
- d0u8 = vrhadd_u8(vreinterpret_u8_u32(d0u32),
- vreinterpret_u8_u32(d2u32));
-
- d0u32 = vreinterpret_u32_u8(d0u8);
- vst1_lane_u32((uint32_t *)dst, d0u32, 0);
- dst += dst_stride;
- vst1_lane_u32((uint32_t *)dst, d0u32, 1);
- dst += dst_stride;
- }
- }
- return;
-}
diff --git a/thirdparty/libvpx/vpx_dsp/arm/vpx_convolve_copy_neon.c b/thirdparty/libvpx/vpx_dsp/arm/vpx_convolve_copy_neon.c
deleted file mode 100644
index d8fb97a861..0000000000
--- a/thirdparty/libvpx/vpx_dsp/arm/vpx_convolve_copy_neon.c
+++ /dev/null
@@ -1,94 +0,0 @@
-/*
- * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include <arm_neon.h>
-
-#include "./vpx_dsp_rtcd.h"
-#include "vpx/vpx_integer.h"
-
-void vpx_convolve_copy_neon(
- const uint8_t *src, // r0
- ptrdiff_t src_stride, // r1
- uint8_t *dst, // r2
- ptrdiff_t dst_stride, // r3
- const int16_t *filter_x,
- int filter_x_stride,
- const int16_t *filter_y,
- int filter_y_stride,
- int w,
- int h) {
- uint8x8_t d0u8, d2u8;
- uint8x16_t q0u8, q1u8, q2u8, q3u8;
- (void)filter_x; (void)filter_x_stride;
- (void)filter_y; (void)filter_y_stride;
-
- if (w > 32) { // copy64
- for (; h > 0; h--) {
- q0u8 = vld1q_u8(src);
- q1u8 = vld1q_u8(src + 16);
- q2u8 = vld1q_u8(src + 32);
- q3u8 = vld1q_u8(src + 48);
- src += src_stride;
-
- vst1q_u8(dst, q0u8);
- vst1q_u8(dst + 16, q1u8);
- vst1q_u8(dst + 32, q2u8);
- vst1q_u8(dst + 48, q3u8);
- dst += dst_stride;
- }
- } else if (w == 32) { // copy32
- for (; h > 0; h -= 2) {
- q0u8 = vld1q_u8(src);
- q1u8 = vld1q_u8(src + 16);
- src += src_stride;
- q2u8 = vld1q_u8(src);
- q3u8 = vld1q_u8(src + 16);
- src += src_stride;
-
- vst1q_u8(dst, q0u8);
- vst1q_u8(dst + 16, q1u8);
- dst += dst_stride;
- vst1q_u8(dst, q2u8);
- vst1q_u8(dst + 16, q3u8);
- dst += dst_stride;
- }
- } else if (w > 8) { // copy16
- for (; h > 0; h -= 2) {
- q0u8 = vld1q_u8(src);
- src += src_stride;
- q1u8 = vld1q_u8(src);
- src += src_stride;
-
- vst1q_u8(dst, q0u8);
- dst += dst_stride;
- vst1q_u8(dst, q1u8);
- dst += dst_stride;
- }
- } else if (w == 8) { // copy8
- for (; h > 0; h -= 2) {
- d0u8 = vld1_u8(src);
- src += src_stride;
- d2u8 = vld1_u8(src);
- src += src_stride;
-
- vst1_u8(dst, d0u8);
- dst += dst_stride;
- vst1_u8(dst, d2u8);
- dst += dst_stride;
- }
- } else { // copy4
- for (; h > 0; h--) {
- *(uint32_t *)dst = *(const uint32_t *)src;
- src += src_stride;
- dst += dst_stride;
- }
- }
- return;
-}
diff --git a/thirdparty/libvpx/vpx_dsp/arm/vpx_convolve_neon.c b/thirdparty/libvpx/vpx_dsp/arm/vpx_convolve_neon.c
deleted file mode 100644
index 1506ce6203..0000000000
--- a/thirdparty/libvpx/vpx_dsp/arm/vpx_convolve_neon.c
+++ /dev/null
@@ -1,72 +0,0 @@
-/*
- * Copyright (c) 2013 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include <assert.h>
-
-#include "./vpx_dsp_rtcd.h"
-#include "vpx_dsp/vpx_dsp_common.h"
-#include "vpx_ports/mem.h"
-
-void vpx_convolve8_neon(const uint8_t *src, ptrdiff_t src_stride,
- uint8_t *dst, ptrdiff_t dst_stride,
- const int16_t *filter_x, int x_step_q4,
- const int16_t *filter_y, int y_step_q4,
- int w, int h) {
- /* Given our constraints: w <= 64, h <= 64, taps == 8 we can reduce the
- * maximum buffer size to 64 * 64 + 7 (+ 1 to make it divisible by 4).
- */
- DECLARE_ALIGNED(8, uint8_t, temp[64 * 72]);
-
- // Account for the vertical phase needing 3 lines prior and 4 lines post
- int intermediate_height = h + 7;
-
- assert(y_step_q4 == 16);
- assert(x_step_q4 == 16);
-
- /* Filter starting 3 lines back. The neon implementation will ignore the
- * given height and filter a multiple of 4 lines. Since this goes in to
- * the temp buffer which has lots of extra room and is subsequently discarded
- * this is safe if somewhat less than ideal.
- */
- vpx_convolve8_horiz_neon(src - src_stride * 3, src_stride,
- temp, 64,
- filter_x, x_step_q4, filter_y, y_step_q4,
- w, intermediate_height);
-
- /* Step into the temp buffer 3 lines to get the actual frame data */
- vpx_convolve8_vert_neon(temp + 64 * 3, 64,
- dst, dst_stride,
- filter_x, x_step_q4, filter_y, y_step_q4,
- w, h);
-}
-
-void vpx_convolve8_avg_neon(const uint8_t *src, ptrdiff_t src_stride,
- uint8_t *dst, ptrdiff_t dst_stride,
- const int16_t *filter_x, int x_step_q4,
- const int16_t *filter_y, int y_step_q4,
- int w, int h) {
- DECLARE_ALIGNED(8, uint8_t, temp[64 * 72]);
- int intermediate_height = h + 7;
-
- assert(y_step_q4 == 16);
- assert(x_step_q4 == 16);
-
- /* This implementation has the same issues as above. In addition, we only want
- * to average the values after both passes.
- */
- vpx_convolve8_horiz_neon(src - src_stride * 3, src_stride,
- temp, 64,
- filter_x, x_step_q4, filter_y, y_step_q4,
- w, intermediate_height);
- vpx_convolve8_avg_vert_neon(temp + 64 * 3,
- 64, dst, dst_stride,
- filter_x, x_step_q4, filter_y, y_step_q4,
- w, h);
-}
diff --git a/thirdparty/libvpx/vpx_dsp/bitreader.c b/thirdparty/libvpx/vpx_dsp/bitreader.c
deleted file mode 100644
index 8140e78e70..0000000000
--- a/thirdparty/libvpx/vpx_dsp/bitreader.c
+++ /dev/null
@@ -1,103 +0,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-#include <stdlib.h>
-
-#include "./vpx_config.h"
-
-#include "vpx_dsp/bitreader.h"
-#include "vpx_dsp/prob.h"
-#include "vpx_dsp/vpx_dsp_common.h"
-#include "vpx_ports/mem.h"
-#include "vpx_mem/vpx_mem.h"
-#include "vpx_util/endian_inl.h"
-
-int vpx_reader_init(vpx_reader *r,
- const uint8_t *buffer,
- size_t size,
- vpx_decrypt_cb decrypt_cb,
- void *decrypt_state) {
- if (size && !buffer) {
- return 1;
- } else {
- r->buffer_end = buffer + size;
- r->buffer = buffer;
- r->value = 0;
- r->count = -8;
- r->range = 255;
- r->decrypt_cb = decrypt_cb;
- r->decrypt_state = decrypt_state;
- vpx_reader_fill(r);
- return vpx_read_bit(r) != 0; // marker bit
- }
-}
-
-void vpx_reader_fill(vpx_reader *r) {
- const uint8_t *const buffer_end = r->buffer_end;
- const uint8_t *buffer = r->buffer;
- const uint8_t *buffer_start = buffer;
- BD_VALUE value = r->value;
- int count = r->count;
- const size_t bytes_left = buffer_end - buffer;
- const size_t bits_left = bytes_left * CHAR_BIT;
- int shift = BD_VALUE_SIZE - CHAR_BIT - (count + CHAR_BIT);
-
- if (r->decrypt_cb) {
- size_t n = VPXMIN(sizeof(r->clear_buffer), bytes_left);
- r->decrypt_cb(r->decrypt_state, buffer, r->clear_buffer, (int)n);
- buffer = r->clear_buffer;
- buffer_start = r->clear_buffer;
- }
- if (bits_left > BD_VALUE_SIZE) {
- const int bits = (shift & 0xfffffff8) + CHAR_BIT;
- BD_VALUE nv;
- BD_VALUE big_endian_values;
- memcpy(&big_endian_values, buffer, sizeof(BD_VALUE));
-#if SIZE_MAX == 0xffffffffffffffffULL
- big_endian_values = HToBE64(big_endian_values);
-#else
- big_endian_values = HToBE32(big_endian_values);
-#endif
- nv = big_endian_values >> (BD_VALUE_SIZE - bits);
- count += bits;
- buffer += (bits >> 3);
- value = r->value | (nv << (shift & 0x7));
- } else {
- const int bits_over = (int)(shift + CHAR_BIT - (int)bits_left);
- int loop_end = 0;
- if (bits_over >= 0) {
- count += LOTS_OF_BITS;
- loop_end = bits_over;
- }
-
- if (bits_over < 0 || bits_left) {
- while (shift >= loop_end) {
- count += CHAR_BIT;
- value |= (BD_VALUE)*buffer++ << shift;
- shift -= CHAR_BIT;
- }
- }
- }
-
- // NOTE: Variable 'buffer' may not relate to 'r->buffer' after decryption,
- // so we increase 'r->buffer' by the amount that 'buffer' moved, rather than
- // assign 'buffer' to 'r->buffer'.
- r->buffer += buffer - buffer_start;
- r->value = value;
- r->count = count;
-}
-
-const uint8_t *vpx_reader_find_end(vpx_reader *r) {
- // Find the end of the coded buffer
- while (r->count > CHAR_BIT && r->count < BD_VALUE_SIZE) {
- r->count -= CHAR_BIT;
- r->buffer--;
- }
- return r->buffer;
-}
diff --git a/thirdparty/libvpx/vpx_dsp/bitreader.h b/thirdparty/libvpx/vpx_dsp/bitreader.h
deleted file mode 100644
index 9a441b4107..0000000000
--- a/thirdparty/libvpx/vpx_dsp/bitreader.h
+++ /dev/null
@@ -1,140 +0,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#ifndef VPX_DSP_BITREADER_H_
-#define VPX_DSP_BITREADER_H_
-
-#include <stddef.h>
-#include <limits.h>
-
-#include "./vpx_config.h"
-#include "vpx_ports/mem.h"
-#include "vpx/vp8dx.h"
-#include "vpx/vpx_integer.h"
-#include "vpx_dsp/prob.h"
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-typedef size_t BD_VALUE;
-
-#define BD_VALUE_SIZE ((int)sizeof(BD_VALUE) * CHAR_BIT)
-
-// This is meant to be a large, positive constant that can still be efficiently
-// loaded as an immediate (on platforms like ARM, for example).
-// Even relatively modest values like 100 would work fine.
-#define LOTS_OF_BITS 0x40000000
-
-typedef struct {
- // Be careful when reordering this struct, it may impact the cache negatively.
- BD_VALUE value;
- unsigned int range;
- int count;
- const uint8_t *buffer_end;
- const uint8_t *buffer;
- vpx_decrypt_cb decrypt_cb;
- void *decrypt_state;
- uint8_t clear_buffer[sizeof(BD_VALUE) + 1];
-} vpx_reader;
-
-int vpx_reader_init(vpx_reader *r,
- const uint8_t *buffer,
- size_t size,
- vpx_decrypt_cb decrypt_cb,
- void *decrypt_state);
-
-void vpx_reader_fill(vpx_reader *r);
-
-const uint8_t *vpx_reader_find_end(vpx_reader *r);
-
-static INLINE int vpx_reader_has_error(vpx_reader *r) {
- // Check if we have reached the end of the buffer.
- //
- // Variable 'count' stores the number of bits in the 'value' buffer, minus
- // 8. The top byte is part of the algorithm, and the remainder is buffered
- // to be shifted into it. So if count == 8, the top 16 bits of 'value' are
- // occupied, 8 for the algorithm and 8 in the buffer.
- //
- // When reading a byte from the user's buffer, count is filled with 8 and
- // one byte is filled into the value buffer. When we reach the end of the
- // data, count is additionally filled with LOTS_OF_BITS. So when
- // count == LOTS_OF_BITS - 1, the user's data has been exhausted.
- //
- // 1 if we have tried to decode bits after the end of stream was encountered.
- // 0 No error.
- return r->count > BD_VALUE_SIZE && r->count < LOTS_OF_BITS;
-}
-
-static INLINE int vpx_read(vpx_reader *r, int prob) {
- unsigned int bit = 0;
- BD_VALUE value;
- BD_VALUE bigsplit;
- int count;
- unsigned int range;
- unsigned int split = (r->range * prob + (256 - prob)) >> CHAR_BIT;
-
- if (r->count < 0)
- vpx_reader_fill(r);
-
- value = r->value;
- count = r->count;
-
- bigsplit = (BD_VALUE)split << (BD_VALUE_SIZE - CHAR_BIT);
-
- range = split;
-
- if (value >= bigsplit) {
- range = r->range - split;
- value = value - bigsplit;
- bit = 1;
- }
-
- {
- register int shift = vpx_norm[range];
- range <<= shift;
- value <<= shift;
- count -= shift;
- }
- r->value = value;
- r->count = count;
- r->range = range;
-
- return bit;
-}
-
-static INLINE int vpx_read_bit(vpx_reader *r) {
- return vpx_read(r, 128); // vpx_prob_half
-}
-
-static INLINE int vpx_read_literal(vpx_reader *r, int bits) {
- int literal = 0, bit;
-
- for (bit = bits - 1; bit >= 0; bit--)
- literal |= vpx_read_bit(r) << bit;
-
- return literal;
-}
-
-static INLINE int vpx_read_tree(vpx_reader *r, const vpx_tree_index *tree,
- const vpx_prob *probs) {
- vpx_tree_index i = 0;
-
- while ((i = tree[i + vpx_read(r, probs[i >> 1])]) > 0)
- continue;
-
- return -i;
-}
-
-#ifdef __cplusplus
-} // extern "C"
-#endif
-
-#endif // VPX_DSP_BITREADER_H_
diff --git a/thirdparty/libvpx/vpx_dsp/bitreader_buffer.c b/thirdparty/libvpx/vpx_dsp/bitreader_buffer.c
deleted file mode 100644
index d7b55cf9f4..0000000000
--- a/thirdparty/libvpx/vpx_dsp/bitreader_buffer.c
+++ /dev/null
@@ -1,53 +0,0 @@
-/*
- * Copyright (c) 2013 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-#include "./vpx_config.h"
-#include "./bitreader_buffer.h"
-
-size_t vpx_rb_bytes_read(struct vpx_read_bit_buffer *rb) {
- return (rb->bit_offset + 7) >> 3;
-}
-
-int vpx_rb_read_bit(struct vpx_read_bit_buffer *rb) {
- const size_t off = rb->bit_offset;
- const size_t p = off >> 3;
- const int q = 7 - (int)(off & 0x7);
- if (rb->bit_buffer + p < rb->bit_buffer_end) {
- const int bit = (rb->bit_buffer[p] >> q) & 1;
- rb->bit_offset = off + 1;
- return bit;
- } else {
- rb->error_handler(rb->error_handler_data);
- return 0;
- }
-}
-
-int vpx_rb_read_literal(struct vpx_read_bit_buffer *rb, int bits) {
- int value = 0, bit;
- for (bit = bits - 1; bit >= 0; bit--)
- value |= vpx_rb_read_bit(rb) << bit;
- return value;
-}
-
-int vpx_rb_read_signed_literal(struct vpx_read_bit_buffer *rb,
- int bits) {
- const int value = vpx_rb_read_literal(rb, bits);
- return vpx_rb_read_bit(rb) ? -value : value;
-}
-
-int vpx_rb_read_inv_signed_literal(struct vpx_read_bit_buffer *rb,
- int bits) {
-#if CONFIG_MISC_FIXES
- const int nbits = sizeof(unsigned) * 8 - bits - 1;
- const unsigned value = (unsigned)vpx_rb_read_literal(rb, bits + 1) << nbits;
- return ((int) value) >> nbits;
-#else
- return vpx_rb_read_signed_literal(rb, bits);
-#endif
-}
diff --git a/thirdparty/libvpx/vpx_dsp/bitreader_buffer.h b/thirdparty/libvpx/vpx_dsp/bitreader_buffer.h
deleted file mode 100644
index 8a48a95ed1..0000000000
--- a/thirdparty/libvpx/vpx_dsp/bitreader_buffer.h
+++ /dev/null
@@ -1,47 +0,0 @@
-/*
- * Copyright (c) 2013 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#ifndef VPX_DSP_BITREADER_BUFFER_H_
-#define VPX_DSP_BITREADER_BUFFER_H_
-
-#include <limits.h>
-
-#include "vpx/vpx_integer.h"
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-typedef void (*vpx_rb_error_handler)(void *data);
-
-struct vpx_read_bit_buffer {
- const uint8_t *bit_buffer;
- const uint8_t *bit_buffer_end;
- size_t bit_offset;
-
- void *error_handler_data;
- vpx_rb_error_handler error_handler;
-};
-
-size_t vpx_rb_bytes_read(struct vpx_read_bit_buffer *rb);
-
-int vpx_rb_read_bit(struct vpx_read_bit_buffer *rb);
-
-int vpx_rb_read_literal(struct vpx_read_bit_buffer *rb, int bits);
-
-int vpx_rb_read_signed_literal(struct vpx_read_bit_buffer *rb, int bits);
-
-int vpx_rb_read_inv_signed_literal(struct vpx_read_bit_buffer *rb, int bits);
-
-#ifdef __cplusplus
-} // extern "C"
-#endif
-
-#endif // VPX_DSP_BITREADER_BUFFER_H_
diff --git a/thirdparty/libvpx/vpx_dsp/intrapred.c b/thirdparty/libvpx/vpx_dsp/intrapred.c
deleted file mode 100644
index cc4a74bd26..0000000000
--- a/thirdparty/libvpx/vpx_dsp/intrapred.c
+++ /dev/null
@@ -1,870 +0,0 @@
-/*
- * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "./vpx_config.h"
-#include "./vpx_dsp_rtcd.h"
-
-#include "vpx_dsp/vpx_dsp_common.h"
-#include "vpx_mem/vpx_mem.h"
-
-#define DST(x, y) dst[(x) + (y) * stride]
-#define AVG3(a, b, c) (((a) + 2 * (b) + (c) + 2) >> 2)
-#define AVG2(a, b) (((a) + (b) + 1) >> 1)
-
-static INLINE void d207_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
- const uint8_t *above, const uint8_t *left) {
- int r, c;
- (void) above;
- // first column
- for (r = 0; r < bs - 1; ++r)
- dst[r * stride] = AVG2(left[r], left[r + 1]);
- dst[(bs - 1) * stride] = left[bs - 1];
- dst++;
-
- // second column
- for (r = 0; r < bs - 2; ++r)
- dst[r * stride] = AVG3(left[r], left[r + 1], left[r + 2]);
- dst[(bs - 2) * stride] = AVG3(left[bs - 2], left[bs - 1], left[bs - 1]);
- dst[(bs - 1) * stride] = left[bs - 1];
- dst++;
-
- // rest of last row
- for (c = 0; c < bs - 2; ++c)
- dst[(bs - 1) * stride + c] = left[bs - 1];
-
- for (r = bs - 2; r >= 0; --r)
- for (c = 0; c < bs - 2; ++c)
- dst[r * stride + c] = dst[(r + 1) * stride + c - 2];
-}
-
-#if CONFIG_MISC_FIXES
-static INLINE void d207e_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
- const uint8_t *above, const uint8_t *left) {
- int r, c;
- (void) above;
-
- for (r = 0; r < bs; ++r) {
- for (c = 0; c < bs; ++c) {
- dst[c] = c & 1 ? AVG3(left[(c >> 1) + r], left[(c >> 1) + r + 1],
- left[(c >> 1) + r + 2])
- : AVG2(left[(c >> 1) + r], left[(c >> 1) + r + 1]);
- }
- dst += stride;
- }
-}
-#endif // CONFIG_MISC_FIXES
-
-static INLINE void d63_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
- const uint8_t *above, const uint8_t *left) {
- int r, c;
- int size;
- (void)left;
- for (c = 0; c < bs; ++c) {
- dst[c] = AVG2(above[c], above[c + 1]);
- dst[stride + c] = AVG3(above[c], above[c + 1], above[c + 2]);
- }
- for (r = 2, size = bs - 2; r < bs; r += 2, --size) {
- memcpy(dst + (r + 0) * stride, dst + (r >> 1), size);
- memset(dst + (r + 0) * stride + size, above[bs - 1], bs - size);
- memcpy(dst + (r + 1) * stride, dst + stride + (r >> 1), size);
- memset(dst + (r + 1) * stride + size, above[bs - 1], bs - size);
- }
-}
-
-#if CONFIG_MISC_FIXES
-static INLINE void d63e_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
- const uint8_t *above, const uint8_t *left) {
- int r, c;
- (void) left;
- for (r = 0; r < bs; ++r) {
- for (c = 0; c < bs; ++c) {
- dst[c] = r & 1 ? AVG3(above[(r >> 1) + c], above[(r >> 1) + c + 1],
- above[(r >> 1) + c + 2])
- : AVG2(above[(r >> 1) + c], above[(r >> 1) + c + 1]);
- }
- dst += stride;
- }
-}
-#endif // CONFIG_MISC_FIXES
-
-static INLINE void d45_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
- const uint8_t *above, const uint8_t *left) {
- const uint8_t above_right = above[bs - 1];
- const uint8_t *const dst_row0 = dst;
- int x, size;
- (void)left;
-
- for (x = 0; x < bs - 1; ++x) {
- dst[x] = AVG3(above[x], above[x + 1], above[x + 2]);
- }
- dst[bs - 1] = above_right;
- dst += stride;
- for (x = 1, size = bs - 2; x < bs; ++x, --size) {
- memcpy(dst, dst_row0 + x, size);
- memset(dst + size, above_right, x + 1);
- dst += stride;
- }
-}
-
-#if CONFIG_MISC_FIXES
-static INLINE void d45e_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
- const uint8_t *above, const uint8_t *left) {
- int r, c;
- (void) left;
- for (r = 0; r < bs; ++r) {
- for (c = 0; c < bs; ++c) {
- dst[c] = AVG3(above[r + c], above[r + c + 1],
- above[r + c + 1 + (r + c + 2 < bs * 2)]);
- }
- dst += stride;
- }
-}
-#endif // CONFIG_MISC_FIXES
-
-static INLINE void d117_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
- const uint8_t *above, const uint8_t *left) {
- int r, c;
-
- // first row
- for (c = 0; c < bs; c++)
- dst[c] = AVG2(above[c - 1], above[c]);
- dst += stride;
-
- // second row
- dst[0] = AVG3(left[0], above[-1], above[0]);
- for (c = 1; c < bs; c++)
- dst[c] = AVG3(above[c - 2], above[c - 1], above[c]);
- dst += stride;
-
- // the rest of first col
- dst[0] = AVG3(above[-1], left[0], left[1]);
- for (r = 3; r < bs; ++r)
- dst[(r - 2) * stride] = AVG3(left[r - 3], left[r - 2], left[r - 1]);
-
- // the rest of the block
- for (r = 2; r < bs; ++r) {
- for (c = 1; c < bs; c++)
- dst[c] = dst[-2 * stride + c - 1];
- dst += stride;
- }
-}
-
-static INLINE void d135_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
- const uint8_t *above, const uint8_t *left) {
- int i;
-#if defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ > 7
- // silence a spurious -Warray-bounds warning, possibly related to:
- // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=56273
- uint8_t border[69];
-#else
- uint8_t border[32 + 32 - 1]; // outer border from bottom-left to top-right
-#endif
-
- // dst(bs, bs - 2)[0], i.e., border starting at bottom-left
- for (i = 0; i < bs - 2; ++i) {
- border[i] = AVG3(left[bs - 3 - i], left[bs - 2 - i], left[bs - 1 - i]);
- }
- border[bs - 2] = AVG3(above[-1], left[0], left[1]);
- border[bs - 1] = AVG3(left[0], above[-1], above[0]);
- border[bs - 0] = AVG3(above[-1], above[0], above[1]);
- // dst[0][2, size), i.e., remaining top border ascending
- for (i = 0; i < bs - 2; ++i) {
- border[bs + 1 + i] = AVG3(above[i], above[i + 1], above[i + 2]);
- }
-
- for (i = 0; i < bs; ++i) {
- memcpy(dst + i * stride, border + bs - 1 - i, bs);
- }
-}
-
-static INLINE void d153_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
- const uint8_t *above, const uint8_t *left) {
- int r, c;
- dst[0] = AVG2(above[-1], left[0]);
- for (r = 1; r < bs; r++)
- dst[r * stride] = AVG2(left[r - 1], left[r]);
- dst++;
-
- dst[0] = AVG3(left[0], above[-1], above[0]);
- dst[stride] = AVG3(above[-1], left[0], left[1]);
- for (r = 2; r < bs; r++)
- dst[r * stride] = AVG3(left[r - 2], left[r - 1], left[r]);
- dst++;
-
- for (c = 0; c < bs - 2; c++)
- dst[c] = AVG3(above[c - 1], above[c], above[c + 1]);
- dst += stride;
-
- for (r = 1; r < bs; ++r) {
- for (c = 0; c < bs - 2; c++)
- dst[c] = dst[-stride + c - 2];
- dst += stride;
- }
-}
-
-static INLINE void v_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
- const uint8_t *above, const uint8_t *left) {
- int r;
- (void) left;
-
- for (r = 0; r < bs; r++) {
- memcpy(dst, above, bs);
- dst += stride;
- }
-}
-
-static INLINE void h_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
- const uint8_t *above, const uint8_t *left) {
- int r;
- (void) above;
-
- for (r = 0; r < bs; r++) {
- memset(dst, left[r], bs);
- dst += stride;
- }
-}
-
-static INLINE void tm_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
- const uint8_t *above, const uint8_t *left) {
- int r, c;
- int ytop_left = above[-1];
-
- for (r = 0; r < bs; r++) {
- for (c = 0; c < bs; c++)
- dst[c] = clip_pixel(left[r] + above[c] - ytop_left);
- dst += stride;
- }
-}
-
-static INLINE void dc_128_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
- const uint8_t *above, const uint8_t *left) {
- int r;
- (void) above;
- (void) left;
-
- for (r = 0; r < bs; r++) {
- memset(dst, 128, bs);
- dst += stride;
- }
-}
-
-static INLINE void dc_left_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
- const uint8_t *above,
- const uint8_t *left) {
- int i, r, expected_dc, sum = 0;
- (void) above;
-
- for (i = 0; i < bs; i++)
- sum += left[i];
- expected_dc = (sum + (bs >> 1)) / bs;
-
- for (r = 0; r < bs; r++) {
- memset(dst, expected_dc, bs);
- dst += stride;
- }
-}
-
-static INLINE void dc_top_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
- const uint8_t *above, const uint8_t *left) {
- int i, r, expected_dc, sum = 0;
- (void) left;
-
- for (i = 0; i < bs; i++)
- sum += above[i];
- expected_dc = (sum + (bs >> 1)) / bs;
-
- for (r = 0; r < bs; r++) {
- memset(dst, expected_dc, bs);
- dst += stride;
- }
-}
-
-static INLINE void dc_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
- const uint8_t *above, const uint8_t *left) {
- int i, r, expected_dc, sum = 0;
- const int count = 2 * bs;
-
- for (i = 0; i < bs; i++) {
- sum += above[i];
- sum += left[i];
- }
-
- expected_dc = (sum + (count >> 1)) / count;
-
- for (r = 0; r < bs; r++) {
- memset(dst, expected_dc, bs);
- dst += stride;
- }
-}
-
-void vpx_he_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above, const uint8_t *left) {
- const int H = above[-1];
- const int I = left[0];
- const int J = left[1];
- const int K = left[2];
- const int L = left[3];
-
- memset(dst + stride * 0, AVG3(H, I, J), 4);
- memset(dst + stride * 1, AVG3(I, J, K), 4);
- memset(dst + stride * 2, AVG3(J, K, L), 4);
- memset(dst + stride * 3, AVG3(K, L, L), 4);
-}
-
-void vpx_ve_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above, const uint8_t *left) {
- const int H = above[-1];
- const int I = above[0];
- const int J = above[1];
- const int K = above[2];
- const int L = above[3];
- const int M = above[4];
- (void)left;
-
- dst[0] = AVG3(H, I, J);
- dst[1] = AVG3(I, J, K);
- dst[2] = AVG3(J, K, L);
- dst[3] = AVG3(K, L, M);
- memcpy(dst + stride * 1, dst, 4);
- memcpy(dst + stride * 2, dst, 4);
- memcpy(dst + stride * 3, dst, 4);
-}
-
-void vpx_d207_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above, const uint8_t *left) {
- const int I = left[0];
- const int J = left[1];
- const int K = left[2];
- const int L = left[3];
- (void)above;
- DST(0, 0) = AVG2(I, J);
- DST(2, 0) = DST(0, 1) = AVG2(J, K);
- DST(2, 1) = DST(0, 2) = AVG2(K, L);
- DST(1, 0) = AVG3(I, J, K);
- DST(3, 0) = DST(1, 1) = AVG3(J, K, L);
- DST(3, 1) = DST(1, 2) = AVG3(K, L, L);
- DST(3, 2) = DST(2, 2) =
- DST(0, 3) = DST(1, 3) = DST(2, 3) = DST(3, 3) = L;
-}
-
-void vpx_d63_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above, const uint8_t *left) {
- const int A = above[0];
- const int B = above[1];
- const int C = above[2];
- const int D = above[3];
- const int E = above[4];
- const int F = above[5];
- const int G = above[6];
- (void)left;
- DST(0, 0) = AVG2(A, B);
- DST(1, 0) = DST(0, 2) = AVG2(B, C);
- DST(2, 0) = DST(1, 2) = AVG2(C, D);
- DST(3, 0) = DST(2, 2) = AVG2(D, E);
- DST(3, 2) = AVG2(E, F); // differs from vp8
-
- DST(0, 1) = AVG3(A, B, C);
- DST(1, 1) = DST(0, 3) = AVG3(B, C, D);
- DST(2, 1) = DST(1, 3) = AVG3(C, D, E);
- DST(3, 1) = DST(2, 3) = AVG3(D, E, F);
- DST(3, 3) = AVG3(E, F, G); // differs from vp8
-}
-
-void vpx_d63f_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above, const uint8_t *left) {
- const int A = above[0];
- const int B = above[1];
- const int C = above[2];
- const int D = above[3];
- const int E = above[4];
- const int F = above[5];
- const int G = above[6];
- const int H = above[7];
- (void)left;
- DST(0, 0) = AVG2(A, B);
- DST(1, 0) = DST(0, 2) = AVG2(B, C);
- DST(2, 0) = DST(1, 2) = AVG2(C, D);
- DST(3, 0) = DST(2, 2) = AVG2(D, E);
- DST(3, 2) = AVG3(E, F, G);
-
- DST(0, 1) = AVG3(A, B, C);
- DST(1, 1) = DST(0, 3) = AVG3(B, C, D);
- DST(2, 1) = DST(1, 3) = AVG3(C, D, E);
- DST(3, 1) = DST(2, 3) = AVG3(D, E, F);
- DST(3, 3) = AVG3(F, G, H);
-}
-
-void vpx_d45_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above, const uint8_t *left) {
- const int A = above[0];
- const int B = above[1];
- const int C = above[2];
- const int D = above[3];
- const int E = above[4];
- const int F = above[5];
- const int G = above[6];
- const int H = above[7];
- (void)stride;
- (void)left;
- DST(0, 0) = AVG3(A, B, C);
- DST(1, 0) = DST(0, 1) = AVG3(B, C, D);
- DST(2, 0) = DST(1, 1) = DST(0, 2) = AVG3(C, D, E);
- DST(3, 0) = DST(2, 1) = DST(1, 2) = DST(0, 3) = AVG3(D, E, F);
- DST(3, 1) = DST(2, 2) = DST(1, 3) = AVG3(E, F, G);
- DST(3, 2) = DST(2, 3) = AVG3(F, G, H);
- DST(3, 3) = H; // differs from vp8
-}
-
-void vpx_d45e_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above, const uint8_t *left) {
- const int A = above[0];
- const int B = above[1];
- const int C = above[2];
- const int D = above[3];
- const int E = above[4];
- const int F = above[5];
- const int G = above[6];
- const int H = above[7];
- (void)stride;
- (void)left;
- DST(0, 0) = AVG3(A, B, C);
- DST(1, 0) = DST(0, 1) = AVG3(B, C, D);
- DST(2, 0) = DST(1, 1) = DST(0, 2) = AVG3(C, D, E);
- DST(3, 0) = DST(2, 1) = DST(1, 2) = DST(0, 3) = AVG3(D, E, F);
- DST(3, 1) = DST(2, 2) = DST(1, 3) = AVG3(E, F, G);
- DST(3, 2) = DST(2, 3) = AVG3(F, G, H);
- DST(3, 3) = AVG3(G, H, H);
-}
-
-void vpx_d117_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above, const uint8_t *left) {
- const int I = left[0];
- const int J = left[1];
- const int K = left[2];
- const int X = above[-1];
- const int A = above[0];
- const int B = above[1];
- const int C = above[2];
- const int D = above[3];
- DST(0, 0) = DST(1, 2) = AVG2(X, A);
- DST(1, 0) = DST(2, 2) = AVG2(A, B);
- DST(2, 0) = DST(3, 2) = AVG2(B, C);
- DST(3, 0) = AVG2(C, D);
-
- DST(0, 3) = AVG3(K, J, I);
- DST(0, 2) = AVG3(J, I, X);
- DST(0, 1) = DST(1, 3) = AVG3(I, X, A);
- DST(1, 1) = DST(2, 3) = AVG3(X, A, B);
- DST(2, 1) = DST(3, 3) = AVG3(A, B, C);
- DST(3, 1) = AVG3(B, C, D);
-}
-
-void vpx_d135_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above, const uint8_t *left) {
- const int I = left[0];
- const int J = left[1];
- const int K = left[2];
- const int L = left[3];
- const int X = above[-1];
- const int A = above[0];
- const int B = above[1];
- const int C = above[2];
- const int D = above[3];
- (void)stride;
- DST(0, 3) = AVG3(J, K, L);
- DST(1, 3) = DST(0, 2) = AVG3(I, J, K);
- DST(2, 3) = DST(1, 2) = DST(0, 1) = AVG3(X, I, J);
- DST(3, 3) = DST(2, 2) = DST(1, 1) = DST(0, 0) = AVG3(A, X, I);
- DST(3, 2) = DST(2, 1) = DST(1, 0) = AVG3(B, A, X);
- DST(3, 1) = DST(2, 0) = AVG3(C, B, A);
- DST(3, 0) = AVG3(D, C, B);
-}
-
-void vpx_d153_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above, const uint8_t *left) {
- const int I = left[0];
- const int J = left[1];
- const int K = left[2];
- const int L = left[3];
- const int X = above[-1];
- const int A = above[0];
- const int B = above[1];
- const int C = above[2];
-
- DST(0, 0) = DST(2, 1) = AVG2(I, X);
- DST(0, 1) = DST(2, 2) = AVG2(J, I);
- DST(0, 2) = DST(2, 3) = AVG2(K, J);
- DST(0, 3) = AVG2(L, K);
-
- DST(3, 0) = AVG3(A, B, C);
- DST(2, 0) = AVG3(X, A, B);
- DST(1, 0) = DST(3, 1) = AVG3(I, X, A);
- DST(1, 1) = DST(3, 2) = AVG3(J, I, X);
- DST(1, 2) = DST(3, 3) = AVG3(K, J, I);
- DST(1, 3) = AVG3(L, K, J);
-}
-
-#if CONFIG_VP9_HIGHBITDEPTH
-static INLINE void highbd_d207_predictor(uint16_t *dst, ptrdiff_t stride,
- int bs, const uint16_t *above,
- const uint16_t *left, int bd) {
- int r, c;
- (void) above;
- (void) bd;
-
- // First column.
- for (r = 0; r < bs - 1; ++r) {
- dst[r * stride] = AVG2(left[r], left[r + 1]);
- }
- dst[(bs - 1) * stride] = left[bs - 1];
- dst++;
-
- // Second column.
- for (r = 0; r < bs - 2; ++r) {
- dst[r * stride] = AVG3(left[r], left[r + 1], left[r + 2]);
- }
- dst[(bs - 2) * stride] = AVG3(left[bs - 2], left[bs - 1], left[bs - 1]);
- dst[(bs - 1) * stride] = left[bs - 1];
- dst++;
-
- // Rest of last row.
- for (c = 0; c < bs - 2; ++c)
- dst[(bs - 1) * stride + c] = left[bs - 1];
-
- for (r = bs - 2; r >= 0; --r) {
- for (c = 0; c < bs - 2; ++c)
- dst[r * stride + c] = dst[(r + 1) * stride + c - 2];
- }
-}
-
-#if CONFIG_MISC_FIXES
-static INLINE void highbd_d207e_predictor(uint16_t *dst, ptrdiff_t stride,
- int bs, const uint16_t *above,
- const uint16_t *left, int bd) {
- int r, c;
- (void) above;
- (void) bd;
-
- for (r = 0; r < bs; ++r) {
- for (c = 0; c < bs; ++c) {
- dst[c] = c & 1 ? AVG3(left[(c >> 1) + r], left[(c >> 1) + r + 1],
- left[(c >> 1) + r + 2])
- : AVG2(left[(c >> 1) + r], left[(c >> 1) + r + 1]);
- }
- dst += stride;
- }
-}
-#endif // CONFIG_MISC_FIXES
-
-static INLINE void highbd_d63_predictor(uint16_t *dst, ptrdiff_t stride,
- int bs, const uint16_t *above,
- const uint16_t *left, int bd) {
- int r, c;
- (void) left;
- (void) bd;
- for (r = 0; r < bs; ++r) {
- for (c = 0; c < bs; ++c) {
- dst[c] = r & 1 ? AVG3(above[(r >> 1) + c], above[(r >> 1) + c + 1],
- above[(r >> 1) + c + 2])
- : AVG2(above[(r >> 1) + c], above[(r >> 1) + c + 1]);
- }
- dst += stride;
- }
-}
-
-#define highbd_d63e_predictor highbd_d63_predictor
-
-static INLINE void highbd_d45_predictor(uint16_t *dst, ptrdiff_t stride, int bs,
- const uint16_t *above,
- const uint16_t *left, int bd) {
- int r, c;
- (void) left;
- (void) bd;
- for (r = 0; r < bs; ++r) {
- for (c = 0; c < bs; ++c) {
- dst[c] = r + c + 2 < bs * 2 ? AVG3(above[r + c], above[r + c + 1],
- above[r + c + 2])
- : above[bs * 2 - 1];
- }
- dst += stride;
- }
-}
-
-#if CONFIG_MISC_FIXES
-static INLINE void highbd_d45e_predictor(uint16_t *dst, ptrdiff_t stride,
- int bs, const uint16_t *above,
- const uint16_t *left, int bd) {
- int r, c;
- (void) left;
- (void) bd;
- for (r = 0; r < bs; ++r) {
- for (c = 0; c < bs; ++c) {
- dst[c] = AVG3(above[r + c], above[r + c + 1],
- above[r + c + 1 + (r + c + 2 < bs * 2)]);
- }
- dst += stride;
- }
-}
-#endif // CONFIG_MISC_FIXES
-
-static INLINE void highbd_d117_predictor(uint16_t *dst, ptrdiff_t stride,
- int bs, const uint16_t *above,
- const uint16_t *left, int bd) {
- int r, c;
- (void) bd;
-
- // first row
- for (c = 0; c < bs; c++)
- dst[c] = AVG2(above[c - 1], above[c]);
- dst += stride;
-
- // second row
- dst[0] = AVG3(left[0], above[-1], above[0]);
- for (c = 1; c < bs; c++)
- dst[c] = AVG3(above[c - 2], above[c - 1], above[c]);
- dst += stride;
-
- // the rest of first col
- dst[0] = AVG3(above[-1], left[0], left[1]);
- for (r = 3; r < bs; ++r)
- dst[(r - 2) * stride] = AVG3(left[r - 3], left[r - 2], left[r - 1]);
-
- // the rest of the block
- for (r = 2; r < bs; ++r) {
- for (c = 1; c < bs; c++)
- dst[c] = dst[-2 * stride + c - 1];
- dst += stride;
- }
-}
-
-static INLINE void highbd_d135_predictor(uint16_t *dst, ptrdiff_t stride,
- int bs, const uint16_t *above,
- const uint16_t *left, int bd) {
- int r, c;
- (void) bd;
- dst[0] = AVG3(left[0], above[-1], above[0]);
- for (c = 1; c < bs; c++)
- dst[c] = AVG3(above[c - 2], above[c - 1], above[c]);
-
- dst[stride] = AVG3(above[-1], left[0], left[1]);
- for (r = 2; r < bs; ++r)
- dst[r * stride] = AVG3(left[r - 2], left[r - 1], left[r]);
-
- dst += stride;
- for (r = 1; r < bs; ++r) {
- for (c = 1; c < bs; c++)
- dst[c] = dst[-stride + c - 1];
- dst += stride;
- }
-}
-
-static INLINE void highbd_d153_predictor(uint16_t *dst, ptrdiff_t stride,
- int bs, const uint16_t *above,
- const uint16_t *left, int bd) {
- int r, c;
- (void) bd;
- dst[0] = AVG2(above[-1], left[0]);
- for (r = 1; r < bs; r++)
- dst[r * stride] = AVG2(left[r - 1], left[r]);
- dst++;
-
- dst[0] = AVG3(left[0], above[-1], above[0]);
- dst[stride] = AVG3(above[-1], left[0], left[1]);
- for (r = 2; r < bs; r++)
- dst[r * stride] = AVG3(left[r - 2], left[r - 1], left[r]);
- dst++;
-
- for (c = 0; c < bs - 2; c++)
- dst[c] = AVG3(above[c - 1], above[c], above[c + 1]);
- dst += stride;
-
- for (r = 1; r < bs; ++r) {
- for (c = 0; c < bs - 2; c++)
- dst[c] = dst[-stride + c - 2];
- dst += stride;
- }
-}
-
-static INLINE void highbd_v_predictor(uint16_t *dst, ptrdiff_t stride,
- int bs, const uint16_t *above,
- const uint16_t *left, int bd) {
- int r;
- (void) left;
- (void) bd;
- for (r = 0; r < bs; r++) {
- memcpy(dst, above, bs * sizeof(uint16_t));
- dst += stride;
- }
-}
-
-static INLINE void highbd_h_predictor(uint16_t *dst, ptrdiff_t stride,
- int bs, const uint16_t *above,
- const uint16_t *left, int bd) {
- int r;
- (void) above;
- (void) bd;
- for (r = 0; r < bs; r++) {
- vpx_memset16(dst, left[r], bs);
- dst += stride;
- }
-}
-
-static INLINE void highbd_tm_predictor(uint16_t *dst, ptrdiff_t stride,
- int bs, const uint16_t *above,
- const uint16_t *left, int bd) {
- int r, c;
- int ytop_left = above[-1];
- (void) bd;
-
- for (r = 0; r < bs; r++) {
- for (c = 0; c < bs; c++)
- dst[c] = clip_pixel_highbd(left[r] + above[c] - ytop_left, bd);
- dst += stride;
- }
-}
-
-static INLINE void highbd_dc_128_predictor(uint16_t *dst, ptrdiff_t stride,
- int bs, const uint16_t *above,
- const uint16_t *left, int bd) {
- int r;
- (void) above;
- (void) left;
-
- for (r = 0; r < bs; r++) {
- vpx_memset16(dst, 128 << (bd - 8), bs);
- dst += stride;
- }
-}
-
-static INLINE void highbd_dc_left_predictor(uint16_t *dst, ptrdiff_t stride,
- int bs, const uint16_t *above,
- const uint16_t *left, int bd) {
- int i, r, expected_dc, sum = 0;
- (void) above;
- (void) bd;
-
- for (i = 0; i < bs; i++)
- sum += left[i];
- expected_dc = (sum + (bs >> 1)) / bs;
-
- for (r = 0; r < bs; r++) {
- vpx_memset16(dst, expected_dc, bs);
- dst += stride;
- }
-}
-
-static INLINE void highbd_dc_top_predictor(uint16_t *dst, ptrdiff_t stride,
- int bs, const uint16_t *above,
- const uint16_t *left, int bd) {
- int i, r, expected_dc, sum = 0;
- (void) left;
- (void) bd;
-
- for (i = 0; i < bs; i++)
- sum += above[i];
- expected_dc = (sum + (bs >> 1)) / bs;
-
- for (r = 0; r < bs; r++) {
- vpx_memset16(dst, expected_dc, bs);
- dst += stride;
- }
-}
-
-static INLINE void highbd_dc_predictor(uint16_t *dst, ptrdiff_t stride,
- int bs, const uint16_t *above,
- const uint16_t *left, int bd) {
- int i, r, expected_dc, sum = 0;
- const int count = 2 * bs;
- (void) bd;
-
- for (i = 0; i < bs; i++) {
- sum += above[i];
- sum += left[i];
- }
-
- expected_dc = (sum + (count >> 1)) / count;
-
- for (r = 0; r < bs; r++) {
- vpx_memset16(dst, expected_dc, bs);
- dst += stride;
- }
-}
-#endif // CONFIG_VP9_HIGHBITDEPTH
-
-// This serves as a wrapper function, so that all the prediction functions
-// can be unified and accessed as a pointer array. Note that the boundary
-// above and left are not necessarily used all the time.
-#define intra_pred_sized(type, size) \
- void vpx_##type##_predictor_##size##x##size##_c(uint8_t *dst, \
- ptrdiff_t stride, \
- const uint8_t *above, \
- const uint8_t *left) { \
- type##_predictor(dst, stride, size, above, left); \
- }
-
-#if CONFIG_VP9_HIGHBITDEPTH
-#define intra_pred_highbd_sized(type, size) \
- void vpx_highbd_##type##_predictor_##size##x##size##_c( \
- uint16_t *dst, ptrdiff_t stride, const uint16_t *above, \
- const uint16_t *left, int bd) { \
- highbd_##type##_predictor(dst, stride, size, above, left, bd); \
- }
-
-#define intra_pred_allsizes(type) \
- intra_pred_sized(type, 4) \
- intra_pred_sized(type, 8) \
- intra_pred_sized(type, 16) \
- intra_pred_sized(type, 32) \
- intra_pred_highbd_sized(type, 4) \
- intra_pred_highbd_sized(type, 8) \
- intra_pred_highbd_sized(type, 16) \
- intra_pred_highbd_sized(type, 32)
-
-#define intra_pred_no_4x4(type) \
- intra_pred_sized(type, 8) \
- intra_pred_sized(type, 16) \
- intra_pred_sized(type, 32) \
- intra_pred_highbd_sized(type, 4) \
- intra_pred_highbd_sized(type, 8) \
- intra_pred_highbd_sized(type, 16) \
- intra_pred_highbd_sized(type, 32)
-
-#else
-#define intra_pred_allsizes(type) \
- intra_pred_sized(type, 4) \
- intra_pred_sized(type, 8) \
- intra_pred_sized(type, 16) \
- intra_pred_sized(type, 32)
-
-#define intra_pred_no_4x4(type) \
- intra_pred_sized(type, 8) \
- intra_pred_sized(type, 16) \
- intra_pred_sized(type, 32)
-#endif // CONFIG_VP9_HIGHBITDEPTH
-
-intra_pred_no_4x4(d207)
-intra_pred_no_4x4(d63)
-intra_pred_no_4x4(d45)
-#if CONFIG_MISC_FIXES
-intra_pred_allsizes(d207e)
-intra_pred_allsizes(d63e)
-intra_pred_no_4x4(d45e)
-#endif
-intra_pred_no_4x4(d117)
-intra_pred_no_4x4(d135)
-intra_pred_no_4x4(d153)
-intra_pred_allsizes(v)
-intra_pred_allsizes(h)
-intra_pred_allsizes(tm)
-intra_pred_allsizes(dc_128)
-intra_pred_allsizes(dc_left)
-intra_pred_allsizes(dc_top)
-intra_pred_allsizes(dc)
-#undef intra_pred_allsizes
diff --git a/thirdparty/libvpx/vpx_dsp/inv_txfm.c b/thirdparty/libvpx/vpx_dsp/inv_txfm.c
deleted file mode 100644
index e18d31d7aa..0000000000
--- a/thirdparty/libvpx/vpx_dsp/inv_txfm.c
+++ /dev/null
@@ -1,2518 +0,0 @@
-/*
- * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include <math.h>
-#include <string.h>
-
-#include "./vpx_dsp_rtcd.h"
-#include "vpx_dsp/inv_txfm.h"
-
-void vpx_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
-/* 4-point reversible, orthonormal inverse Walsh-Hadamard in 3.5 adds,
- 0.5 shifts per pixel. */
- int i;
- tran_low_t output[16];
- tran_high_t a1, b1, c1, d1, e1;
- const tran_low_t *ip = input;
- tran_low_t *op = output;
-
- for (i = 0; i < 4; i++) {
- a1 = ip[0] >> UNIT_QUANT_SHIFT;
- c1 = ip[1] >> UNIT_QUANT_SHIFT;
- d1 = ip[2] >> UNIT_QUANT_SHIFT;
- b1 = ip[3] >> UNIT_QUANT_SHIFT;
- a1 += c1;
- d1 -= b1;
- e1 = (a1 - d1) >> 1;
- b1 = e1 - b1;
- c1 = e1 - c1;
- a1 -= b1;
- d1 += c1;
- op[0] = WRAPLOW(a1);
- op[1] = WRAPLOW(b1);
- op[2] = WRAPLOW(c1);
- op[3] = WRAPLOW(d1);
- ip += 4;
- op += 4;
- }
-
- ip = output;
- for (i = 0; i < 4; i++) {
- a1 = ip[4 * 0];
- c1 = ip[4 * 1];
- d1 = ip[4 * 2];
- b1 = ip[4 * 3];
- a1 += c1;
- d1 -= b1;
- e1 = (a1 - d1) >> 1;
- b1 = e1 - b1;
- c1 = e1 - c1;
- a1 -= b1;
- d1 += c1;
- dest[stride * 0] = clip_pixel_add(dest[stride * 0], WRAPLOW(a1));
- dest[stride * 1] = clip_pixel_add(dest[stride * 1], WRAPLOW(b1));
- dest[stride * 2] = clip_pixel_add(dest[stride * 2], WRAPLOW(c1));
- dest[stride * 3] = clip_pixel_add(dest[stride * 3], WRAPLOW(d1));
-
- ip++;
- dest++;
- }
-}
-
-void vpx_iwht4x4_1_add_c(const tran_low_t *in, uint8_t *dest, int dest_stride) {
- int i;
- tran_high_t a1, e1;
- tran_low_t tmp[4];
- const tran_low_t *ip = in;
- tran_low_t *op = tmp;
-
- a1 = ip[0] >> UNIT_QUANT_SHIFT;
- e1 = a1 >> 1;
- a1 -= e1;
- op[0] = WRAPLOW(a1);
- op[1] = op[2] = op[3] = WRAPLOW(e1);
-
- ip = tmp;
- for (i = 0; i < 4; i++) {
- e1 = ip[0] >> 1;
- a1 = ip[0] - e1;
- dest[dest_stride * 0] = clip_pixel_add(dest[dest_stride * 0], a1);
- dest[dest_stride * 1] = clip_pixel_add(dest[dest_stride * 1], e1);
- dest[dest_stride * 2] = clip_pixel_add(dest[dest_stride * 2], e1);
- dest[dest_stride * 3] = clip_pixel_add(dest[dest_stride * 3], e1);
- ip++;
- dest++;
- }
-}
-
-void idct4_c(const tran_low_t *input, tran_low_t *output) {
- tran_low_t step[4];
- tran_high_t temp1, temp2;
- // stage 1
- temp1 = (input[0] + input[2]) * cospi_16_64;
- temp2 = (input[0] - input[2]) * cospi_16_64;
- step[0] = WRAPLOW(dct_const_round_shift(temp1));
- step[1] = WRAPLOW(dct_const_round_shift(temp2));
- temp1 = input[1] * cospi_24_64 - input[3] * cospi_8_64;
- temp2 = input[1] * cospi_8_64 + input[3] * cospi_24_64;
- step[2] = WRAPLOW(dct_const_round_shift(temp1));
- step[3] = WRAPLOW(dct_const_round_shift(temp2));
-
- // stage 2
- output[0] = WRAPLOW(step[0] + step[3]);
- output[1] = WRAPLOW(step[1] + step[2]);
- output[2] = WRAPLOW(step[1] - step[2]);
- output[3] = WRAPLOW(step[0] - step[3]);
-}
-
-void vpx_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
- tran_low_t out[4 * 4];
- tran_low_t *outptr = out;
- int i, j;
- tran_low_t temp_in[4], temp_out[4];
-
- // Rows
- for (i = 0; i < 4; ++i) {
- idct4_c(input, outptr);
- input += 4;
- outptr += 4;
- }
-
- // Columns
- for (i = 0; i < 4; ++i) {
- for (j = 0; j < 4; ++j)
- temp_in[j] = out[j * 4 + i];
- idct4_c(temp_in, temp_out);
- for (j = 0; j < 4; ++j) {
- dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
- ROUND_POWER_OF_TWO(temp_out[j], 4));
- }
- }
-}
-
-void vpx_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest,
- int dest_stride) {
- int i;
- tran_high_t a1;
- tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64));
- out = WRAPLOW(dct_const_round_shift(out * cospi_16_64));
- a1 = ROUND_POWER_OF_TWO(out, 4);
-
- for (i = 0; i < 4; i++) {
- dest[0] = clip_pixel_add(dest[0], a1);
- dest[1] = clip_pixel_add(dest[1], a1);
- dest[2] = clip_pixel_add(dest[2], a1);
- dest[3] = clip_pixel_add(dest[3], a1);
- dest += dest_stride;
- }
-}
-
-void idct8_c(const tran_low_t *input, tran_low_t *output) {
- tran_low_t step1[8], step2[8];
- tran_high_t temp1, temp2;
- // stage 1
- step1[0] = input[0];
- step1[2] = input[4];
- step1[1] = input[2];
- step1[3] = input[6];
- temp1 = input[1] * cospi_28_64 - input[7] * cospi_4_64;
- temp2 = input[1] * cospi_4_64 + input[7] * cospi_28_64;
- step1[4] = WRAPLOW(dct_const_round_shift(temp1));
- step1[7] = WRAPLOW(dct_const_round_shift(temp2));
- temp1 = input[5] * cospi_12_64 - input[3] * cospi_20_64;
- temp2 = input[5] * cospi_20_64 + input[3] * cospi_12_64;
- step1[5] = WRAPLOW(dct_const_round_shift(temp1));
- step1[6] = WRAPLOW(dct_const_round_shift(temp2));
-
- // stage 2
- temp1 = (step1[0] + step1[2]) * cospi_16_64;
- temp2 = (step1[0] - step1[2]) * cospi_16_64;
- step2[0] = WRAPLOW(dct_const_round_shift(temp1));
- step2[1] = WRAPLOW(dct_const_round_shift(temp2));
- temp1 = step1[1] * cospi_24_64 - step1[3] * cospi_8_64;
- temp2 = step1[1] * cospi_8_64 + step1[3] * cospi_24_64;
- step2[2] = WRAPLOW(dct_const_round_shift(temp1));
- step2[3] = WRAPLOW(dct_const_round_shift(temp2));
- step2[4] = WRAPLOW(step1[4] + step1[5]);
- step2[5] = WRAPLOW(step1[4] - step1[5]);
- step2[6] = WRAPLOW(-step1[6] + step1[7]);
- step2[7] = WRAPLOW(step1[6] + step1[7]);
-
- // stage 3
- step1[0] = WRAPLOW(step2[0] + step2[3]);
- step1[1] = WRAPLOW(step2[1] + step2[2]);
- step1[2] = WRAPLOW(step2[1] - step2[2]);
- step1[3] = WRAPLOW(step2[0] - step2[3]);
- step1[4] = step2[4];
- temp1 = (step2[6] - step2[5]) * cospi_16_64;
- temp2 = (step2[5] + step2[6]) * cospi_16_64;
- step1[5] = WRAPLOW(dct_const_round_shift(temp1));
- step1[6] = WRAPLOW(dct_const_round_shift(temp2));
- step1[7] = step2[7];
-
- // stage 4
- output[0] = WRAPLOW(step1[0] + step1[7]);
- output[1] = WRAPLOW(step1[1] + step1[6]);
- output[2] = WRAPLOW(step1[2] + step1[5]);
- output[3] = WRAPLOW(step1[3] + step1[4]);
- output[4] = WRAPLOW(step1[3] - step1[4]);
- output[5] = WRAPLOW(step1[2] - step1[5]);
- output[6] = WRAPLOW(step1[1] - step1[6]);
- output[7] = WRAPLOW(step1[0] - step1[7]);
-}
-
-void vpx_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
- tran_low_t out[8 * 8];
- tran_low_t *outptr = out;
- int i, j;
- tran_low_t temp_in[8], temp_out[8];
-
- // First transform rows
- for (i = 0; i < 8; ++i) {
- idct8_c(input, outptr);
- input += 8;
- outptr += 8;
- }
-
- // Then transform columns
- for (i = 0; i < 8; ++i) {
- for (j = 0; j < 8; ++j)
- temp_in[j] = out[j * 8 + i];
- idct8_c(temp_in, temp_out);
- for (j = 0; j < 8; ++j) {
- dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
- ROUND_POWER_OF_TWO(temp_out[j], 5));
- }
- }
-}
-
-void vpx_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
- int i, j;
- tran_high_t a1;
- tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64));
- out = WRAPLOW(dct_const_round_shift(out * cospi_16_64));
- a1 = ROUND_POWER_OF_TWO(out, 5);
- for (j = 0; j < 8; ++j) {
- for (i = 0; i < 8; ++i)
- dest[i] = clip_pixel_add(dest[i], a1);
- dest += stride;
- }
-}
-
-void iadst4_c(const tran_low_t *input, tran_low_t *output) {
- tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;
-
- tran_low_t x0 = input[0];
- tran_low_t x1 = input[1];
- tran_low_t x2 = input[2];
- tran_low_t x3 = input[3];
-
- if (!(x0 | x1 | x2 | x3)) {
- output[0] = output[1] = output[2] = output[3] = 0;
- return;
- }
-
- s0 = sinpi_1_9 * x0;
- s1 = sinpi_2_9 * x0;
- s2 = sinpi_3_9 * x1;
- s3 = sinpi_4_9 * x2;
- s4 = sinpi_1_9 * x2;
- s5 = sinpi_2_9 * x3;
- s6 = sinpi_4_9 * x3;
- s7 = WRAPLOW(x0 - x2 + x3);
-
- s0 = s0 + s3 + s5;
- s1 = s1 - s4 - s6;
- s3 = s2;
- s2 = sinpi_3_9 * s7;
-
- // 1-D transform scaling factor is sqrt(2).
- // The overall dynamic range is 14b (input) + 14b (multiplication scaling)
- // + 1b (addition) = 29b.
- // Hence the output bit depth is 15b.
- output[0] = WRAPLOW(dct_const_round_shift(s0 + s3));
- output[1] = WRAPLOW(dct_const_round_shift(s1 + s3));
- output[2] = WRAPLOW(dct_const_round_shift(s2));
- output[3] = WRAPLOW(dct_const_round_shift(s0 + s1 - s3));
-}
-
-void iadst8_c(const tran_low_t *input, tran_low_t *output) {
- int s0, s1, s2, s3, s4, s5, s6, s7;
-
- tran_high_t x0 = input[7];
- tran_high_t x1 = input[0];
- tran_high_t x2 = input[5];
- tran_high_t x3 = input[2];
- tran_high_t x4 = input[3];
- tran_high_t x5 = input[4];
- tran_high_t x6 = input[1];
- tran_high_t x7 = input[6];
-
- if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7)) {
- output[0] = output[1] = output[2] = output[3] = output[4]
- = output[5] = output[6] = output[7] = 0;
- return;
- }
-
- // stage 1
- s0 = (int)(cospi_2_64 * x0 + cospi_30_64 * x1);
- s1 = (int)(cospi_30_64 * x0 - cospi_2_64 * x1);
- s2 = (int)(cospi_10_64 * x2 + cospi_22_64 * x3);
- s3 = (int)(cospi_22_64 * x2 - cospi_10_64 * x3);
- s4 = (int)(cospi_18_64 * x4 + cospi_14_64 * x5);
- s5 = (int)(cospi_14_64 * x4 - cospi_18_64 * x5);
- s6 = (int)(cospi_26_64 * x6 + cospi_6_64 * x7);
- s7 = (int)(cospi_6_64 * x6 - cospi_26_64 * x7);
-
- x0 = WRAPLOW(dct_const_round_shift(s0 + s4));
- x1 = WRAPLOW(dct_const_round_shift(s1 + s5));
- x2 = WRAPLOW(dct_const_round_shift(s2 + s6));
- x3 = WRAPLOW(dct_const_round_shift(s3 + s7));
- x4 = WRAPLOW(dct_const_round_shift(s0 - s4));
- x5 = WRAPLOW(dct_const_round_shift(s1 - s5));
- x6 = WRAPLOW(dct_const_round_shift(s2 - s6));
- x7 = WRAPLOW(dct_const_round_shift(s3 - s7));
-
- // stage 2
- s0 = (int)x0;
- s1 = (int)x1;
- s2 = (int)x2;
- s3 = (int)x3;
- s4 = (int)(cospi_8_64 * x4 + cospi_24_64 * x5);
- s5 = (int)(cospi_24_64 * x4 - cospi_8_64 * x5);
- s6 = (int)(-cospi_24_64 * x6 + cospi_8_64 * x7);
- s7 = (int)(cospi_8_64 * x6 + cospi_24_64 * x7);
-
- x0 = WRAPLOW(s0 + s2);
- x1 = WRAPLOW(s1 + s3);
- x2 = WRAPLOW(s0 - s2);
- x3 = WRAPLOW(s1 - s3);
- x4 = WRAPLOW(dct_const_round_shift(s4 + s6));
- x5 = WRAPLOW(dct_const_round_shift(s5 + s7));
- x6 = WRAPLOW(dct_const_round_shift(s4 - s6));
- x7 = WRAPLOW(dct_const_round_shift(s5 - s7));
-
- // stage 3
- s2 = (int)(cospi_16_64 * (x2 + x3));
- s3 = (int)(cospi_16_64 * (x2 - x3));
- s6 = (int)(cospi_16_64 * (x6 + x7));
- s7 = (int)(cospi_16_64 * (x6 - x7));
-
- x2 = WRAPLOW(dct_const_round_shift(s2));
- x3 = WRAPLOW(dct_const_round_shift(s3));
- x6 = WRAPLOW(dct_const_round_shift(s6));
- x7 = WRAPLOW(dct_const_round_shift(s7));
-
- output[0] = WRAPLOW(x0);
- output[1] = WRAPLOW(-x4);
- output[2] = WRAPLOW(x6);
- output[3] = WRAPLOW(-x2);
- output[4] = WRAPLOW(x3);
- output[5] = WRAPLOW(-x7);
- output[6] = WRAPLOW(x5);
- output[7] = WRAPLOW(-x1);
-}
-
-void vpx_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
- tran_low_t out[8 * 8] = { 0 };
- tran_low_t *outptr = out;
- int i, j;
- tran_low_t temp_in[8], temp_out[8];
-
- // First transform rows
- // only first 4 row has non-zero coefs
- for (i = 0; i < 4; ++i) {
- idct8_c(input, outptr);
- input += 8;
- outptr += 8;
- }
-
- // Then transform columns
- for (i = 0; i < 8; ++i) {
- for (j = 0; j < 8; ++j)
- temp_in[j] = out[j * 8 + i];
- idct8_c(temp_in, temp_out);
- for (j = 0; j < 8; ++j) {
- dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
- ROUND_POWER_OF_TWO(temp_out[j], 5));
- }
- }
-}
-
-void idct16_c(const tran_low_t *input, tran_low_t *output) {
- tran_low_t step1[16], step2[16];
- tran_high_t temp1, temp2;
-
- // stage 1
- step1[0] = input[0/2];
- step1[1] = input[16/2];
- step1[2] = input[8/2];
- step1[3] = input[24/2];
- step1[4] = input[4/2];
- step1[5] = input[20/2];
- step1[6] = input[12/2];
- step1[7] = input[28/2];
- step1[8] = input[2/2];
- step1[9] = input[18/2];
- step1[10] = input[10/2];
- step1[11] = input[26/2];
- step1[12] = input[6/2];
- step1[13] = input[22/2];
- step1[14] = input[14/2];
- step1[15] = input[30/2];
-
- // stage 2
- step2[0] = step1[0];
- step2[1] = step1[1];
- step2[2] = step1[2];
- step2[3] = step1[3];
- step2[4] = step1[4];
- step2[5] = step1[5];
- step2[6] = step1[6];
- step2[7] = step1[7];
-
- temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
- temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
- step2[8] = WRAPLOW(dct_const_round_shift(temp1));
- step2[15] = WRAPLOW(dct_const_round_shift(temp2));
-
- temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
- temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
- step2[9] = WRAPLOW(dct_const_round_shift(temp1));
- step2[14] = WRAPLOW(dct_const_round_shift(temp2));
-
- temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
- temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
- step2[10] = WRAPLOW(dct_const_round_shift(temp1));
- step2[13] = WRAPLOW(dct_const_round_shift(temp2));
-
- temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
- temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
- step2[11] = WRAPLOW(dct_const_round_shift(temp1));
- step2[12] = WRAPLOW(dct_const_round_shift(temp2));
-
- // stage 3
- step1[0] = step2[0];
- step1[1] = step2[1];
- step1[2] = step2[2];
- step1[3] = step2[3];
-
- temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
- temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
- step1[4] = WRAPLOW(dct_const_round_shift(temp1));
- step1[7] = WRAPLOW(dct_const_round_shift(temp2));
- temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
- temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
- step1[5] = WRAPLOW(dct_const_round_shift(temp1));
- step1[6] = WRAPLOW(dct_const_round_shift(temp2));
-
- step1[8] = WRAPLOW(step2[8] + step2[9]);
- step1[9] = WRAPLOW(step2[8] - step2[9]);
- step1[10] = WRAPLOW(-step2[10] + step2[11]);
- step1[11] = WRAPLOW(step2[10] + step2[11]);
- step1[12] = WRAPLOW(step2[12] + step2[13]);
- step1[13] = WRAPLOW(step2[12] - step2[13]);
- step1[14] = WRAPLOW(-step2[14] + step2[15]);
- step1[15] = WRAPLOW(step2[14] + step2[15]);
-
- // stage 4
- temp1 = (step1[0] + step1[1]) * cospi_16_64;
- temp2 = (step1[0] - step1[1]) * cospi_16_64;
- step2[0] = WRAPLOW(dct_const_round_shift(temp1));
- step2[1] = WRAPLOW(dct_const_round_shift(temp2));
- temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
- temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
- step2[2] = WRAPLOW(dct_const_round_shift(temp1));
- step2[3] = WRAPLOW(dct_const_round_shift(temp2));
- step2[4] = WRAPLOW(step1[4] + step1[5]);
- step2[5] = WRAPLOW(step1[4] - step1[5]);
- step2[6] = WRAPLOW(-step1[6] + step1[7]);
- step2[7] = WRAPLOW(step1[6] + step1[7]);
-
- step2[8] = step1[8];
- step2[15] = step1[15];
- temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
- temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
- step2[9] = WRAPLOW(dct_const_round_shift(temp1));
- step2[14] = WRAPLOW(dct_const_round_shift(temp2));
- temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
- temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
- step2[10] = WRAPLOW(dct_const_round_shift(temp1));
- step2[13] = WRAPLOW(dct_const_round_shift(temp2));
- step2[11] = step1[11];
- step2[12] = step1[12];
-
- // stage 5
- step1[0] = WRAPLOW(step2[0] + step2[3]);
- step1[1] = WRAPLOW(step2[1] + step2[2]);
- step1[2] = WRAPLOW(step2[1] - step2[2]);
- step1[3] = WRAPLOW(step2[0] - step2[3]);
- step1[4] = step2[4];
- temp1 = (step2[6] - step2[5]) * cospi_16_64;
- temp2 = (step2[5] + step2[6]) * cospi_16_64;
- step1[5] = WRAPLOW(dct_const_round_shift(temp1));
- step1[6] = WRAPLOW(dct_const_round_shift(temp2));
- step1[7] = step2[7];
-
- step1[8] = WRAPLOW(step2[8] + step2[11]);
- step1[9] = WRAPLOW(step2[9] + step2[10]);
- step1[10] = WRAPLOW(step2[9] - step2[10]);
- step1[11] = WRAPLOW(step2[8] - step2[11]);
- step1[12] = WRAPLOW(-step2[12] + step2[15]);
- step1[13] = WRAPLOW(-step2[13] + step2[14]);
- step1[14] = WRAPLOW(step2[13] + step2[14]);
- step1[15] = WRAPLOW(step2[12] + step2[15]);
-
- // stage 6
- step2[0] = WRAPLOW(step1[0] + step1[7]);
- step2[1] = WRAPLOW(step1[1] + step1[6]);
- step2[2] = WRAPLOW(step1[2] + step1[5]);
- step2[3] = WRAPLOW(step1[3] + step1[4]);
- step2[4] = WRAPLOW(step1[3] - step1[4]);
- step2[5] = WRAPLOW(step1[2] - step1[5]);
- step2[6] = WRAPLOW(step1[1] - step1[6]);
- step2[7] = WRAPLOW(step1[0] - step1[7]);
- step2[8] = step1[8];
- step2[9] = step1[9];
- temp1 = (-step1[10] + step1[13]) * cospi_16_64;
- temp2 = (step1[10] + step1[13]) * cospi_16_64;
- step2[10] = WRAPLOW(dct_const_round_shift(temp1));
- step2[13] = WRAPLOW(dct_const_round_shift(temp2));
- temp1 = (-step1[11] + step1[12]) * cospi_16_64;
- temp2 = (step1[11] + step1[12]) * cospi_16_64;
- step2[11] = WRAPLOW(dct_const_round_shift(temp1));
- step2[12] = WRAPLOW(dct_const_round_shift(temp2));
- step2[14] = step1[14];
- step2[15] = step1[15];
-
- // stage 7
- output[0] = WRAPLOW(step2[0] + step2[15]);
- output[1] = WRAPLOW(step2[1] + step2[14]);
- output[2] = WRAPLOW(step2[2] + step2[13]);
- output[3] = WRAPLOW(step2[3] + step2[12]);
- output[4] = WRAPLOW(step2[4] + step2[11]);
- output[5] = WRAPLOW(step2[5] + step2[10]);
- output[6] = WRAPLOW(step2[6] + step2[9]);
- output[7] = WRAPLOW(step2[7] + step2[8]);
- output[8] = WRAPLOW(step2[7] - step2[8]);
- output[9] = WRAPLOW(step2[6] - step2[9]);
- output[10] = WRAPLOW(step2[5] - step2[10]);
- output[11] = WRAPLOW(step2[4] - step2[11]);
- output[12] = WRAPLOW(step2[3] - step2[12]);
- output[13] = WRAPLOW(step2[2] - step2[13]);
- output[14] = WRAPLOW(step2[1] - step2[14]);
- output[15] = WRAPLOW(step2[0] - step2[15]);
-}
-
-void vpx_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest,
- int stride) {
- tran_low_t out[16 * 16];
- tran_low_t *outptr = out;
- int i, j;
- tran_low_t temp_in[16], temp_out[16];
-
- // First transform rows
- for (i = 0; i < 16; ++i) {
- idct16_c(input, outptr);
- input += 16;
- outptr += 16;
- }
-
- // Then transform columns
- for (i = 0; i < 16; ++i) {
- for (j = 0; j < 16; ++j)
- temp_in[j] = out[j * 16 + i];
- idct16_c(temp_in, temp_out);
- for (j = 0; j < 16; ++j) {
- dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
- ROUND_POWER_OF_TWO(temp_out[j], 6));
- }
- }
-}
-
-void iadst16_c(const tran_low_t *input, tran_low_t *output) {
- tran_high_t s0, s1, s2, s3, s4, s5, s6, s7, s8;
- tran_high_t s9, s10, s11, s12, s13, s14, s15;
-
- tran_high_t x0 = input[15];
- tran_high_t x1 = input[0];
- tran_high_t x2 = input[13];
- tran_high_t x3 = input[2];
- tran_high_t x4 = input[11];
- tran_high_t x5 = input[4];
- tran_high_t x6 = input[9];
- tran_high_t x7 = input[6];
- tran_high_t x8 = input[7];
- tran_high_t x9 = input[8];
- tran_high_t x10 = input[5];
- tran_high_t x11 = input[10];
- tran_high_t x12 = input[3];
- tran_high_t x13 = input[12];
- tran_high_t x14 = input[1];
- tran_high_t x15 = input[14];
-
- if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8
- | x9 | x10 | x11 | x12 | x13 | x14 | x15)) {
- output[0] = output[1] = output[2] = output[3] = output[4]
- = output[5] = output[6] = output[7] = output[8]
- = output[9] = output[10] = output[11] = output[12]
- = output[13] = output[14] = output[15] = 0;
- return;
- }
-
- // stage 1
- s0 = x0 * cospi_1_64 + x1 * cospi_31_64;
- s1 = x0 * cospi_31_64 - x1 * cospi_1_64;
- s2 = x2 * cospi_5_64 + x3 * cospi_27_64;
- s3 = x2 * cospi_27_64 - x3 * cospi_5_64;
- s4 = x4 * cospi_9_64 + x5 * cospi_23_64;
- s5 = x4 * cospi_23_64 - x5 * cospi_9_64;
- s6 = x6 * cospi_13_64 + x7 * cospi_19_64;
- s7 = x6 * cospi_19_64 - x7 * cospi_13_64;
- s8 = x8 * cospi_17_64 + x9 * cospi_15_64;
- s9 = x8 * cospi_15_64 - x9 * cospi_17_64;
- s10 = x10 * cospi_21_64 + x11 * cospi_11_64;
- s11 = x10 * cospi_11_64 - x11 * cospi_21_64;
- s12 = x12 * cospi_25_64 + x13 * cospi_7_64;
- s13 = x12 * cospi_7_64 - x13 * cospi_25_64;
- s14 = x14 * cospi_29_64 + x15 * cospi_3_64;
- s15 = x14 * cospi_3_64 - x15 * cospi_29_64;
-
- x0 = WRAPLOW(dct_const_round_shift(s0 + s8));
- x1 = WRAPLOW(dct_const_round_shift(s1 + s9));
- x2 = WRAPLOW(dct_const_round_shift(s2 + s10));
- x3 = WRAPLOW(dct_const_round_shift(s3 + s11));
- x4 = WRAPLOW(dct_const_round_shift(s4 + s12));
- x5 = WRAPLOW(dct_const_round_shift(s5 + s13));
- x6 = WRAPLOW(dct_const_round_shift(s6 + s14));
- x7 = WRAPLOW(dct_const_round_shift(s7 + s15));
- x8 = WRAPLOW(dct_const_round_shift(s0 - s8));
- x9 = WRAPLOW(dct_const_round_shift(s1 - s9));
- x10 = WRAPLOW(dct_const_round_shift(s2 - s10));
- x11 = WRAPLOW(dct_const_round_shift(s3 - s11));
- x12 = WRAPLOW(dct_const_round_shift(s4 - s12));
- x13 = WRAPLOW(dct_const_round_shift(s5 - s13));
- x14 = WRAPLOW(dct_const_round_shift(s6 - s14));
- x15 = WRAPLOW(dct_const_round_shift(s7 - s15));
-
- // stage 2
- s0 = x0;
- s1 = x1;
- s2 = x2;
- s3 = x3;
- s4 = x4;
- s5 = x5;
- s6 = x6;
- s7 = x7;
- s8 = x8 * cospi_4_64 + x9 * cospi_28_64;
- s9 = x8 * cospi_28_64 - x9 * cospi_4_64;
- s10 = x10 * cospi_20_64 + x11 * cospi_12_64;
- s11 = x10 * cospi_12_64 - x11 * cospi_20_64;
- s12 = - x12 * cospi_28_64 + x13 * cospi_4_64;
- s13 = x12 * cospi_4_64 + x13 * cospi_28_64;
- s14 = - x14 * cospi_12_64 + x15 * cospi_20_64;
- s15 = x14 * cospi_20_64 + x15 * cospi_12_64;
-
- x0 = WRAPLOW(s0 + s4);
- x1 = WRAPLOW(s1 + s5);
- x2 = WRAPLOW(s2 + s6);
- x3 = WRAPLOW(s3 + s7);
- x4 = WRAPLOW(s0 - s4);
- x5 = WRAPLOW(s1 - s5);
- x6 = WRAPLOW(s2 - s6);
- x7 = WRAPLOW(s3 - s7);
- x8 = WRAPLOW(dct_const_round_shift(s8 + s12));
- x9 = WRAPLOW(dct_const_round_shift(s9 + s13));
- x10 = WRAPLOW(dct_const_round_shift(s10 + s14));
- x11 = WRAPLOW(dct_const_round_shift(s11 + s15));
- x12 = WRAPLOW(dct_const_round_shift(s8 - s12));
- x13 = WRAPLOW(dct_const_round_shift(s9 - s13));
- x14 = WRAPLOW(dct_const_round_shift(s10 - s14));
- x15 = WRAPLOW(dct_const_round_shift(s11 - s15));
-
- // stage 3
- s0 = x0;
- s1 = x1;
- s2 = x2;
- s3 = x3;
- s4 = x4 * cospi_8_64 + x5 * cospi_24_64;
- s5 = x4 * cospi_24_64 - x5 * cospi_8_64;
- s6 = - x6 * cospi_24_64 + x7 * cospi_8_64;
- s7 = x6 * cospi_8_64 + x7 * cospi_24_64;
- s8 = x8;
- s9 = x9;
- s10 = x10;
- s11 = x11;
- s12 = x12 * cospi_8_64 + x13 * cospi_24_64;
- s13 = x12 * cospi_24_64 - x13 * cospi_8_64;
- s14 = - x14 * cospi_24_64 + x15 * cospi_8_64;
- s15 = x14 * cospi_8_64 + x15 * cospi_24_64;
-
- x0 = WRAPLOW(s0 + s2);
- x1 = WRAPLOW(s1 + s3);
- x2 = WRAPLOW(s0 - s2);
- x3 = WRAPLOW(s1 - s3);
- x4 = WRAPLOW(dct_const_round_shift(s4 + s6));
- x5 = WRAPLOW(dct_const_round_shift(s5 + s7));
- x6 = WRAPLOW(dct_const_round_shift(s4 - s6));
- x7 = WRAPLOW(dct_const_round_shift(s5 - s7));
- x8 = WRAPLOW(s8 + s10);
- x9 = WRAPLOW(s9 + s11);
- x10 = WRAPLOW(s8 - s10);
- x11 = WRAPLOW(s9 - s11);
- x12 = WRAPLOW(dct_const_round_shift(s12 + s14));
- x13 = WRAPLOW(dct_const_round_shift(s13 + s15));
- x14 = WRAPLOW(dct_const_round_shift(s12 - s14));
- x15 = WRAPLOW(dct_const_round_shift(s13 - s15));
-
- // stage 4
- s2 = (- cospi_16_64) * (x2 + x3);
- s3 = cospi_16_64 * (x2 - x3);
- s6 = cospi_16_64 * (x6 + x7);
- s7 = cospi_16_64 * (- x6 + x7);
- s10 = cospi_16_64 * (x10 + x11);
- s11 = cospi_16_64 * (- x10 + x11);
- s14 = (- cospi_16_64) * (x14 + x15);
- s15 = cospi_16_64 * (x14 - x15);
-
- x2 = WRAPLOW(dct_const_round_shift(s2));
- x3 = WRAPLOW(dct_const_round_shift(s3));
- x6 = WRAPLOW(dct_const_round_shift(s6));
- x7 = WRAPLOW(dct_const_round_shift(s7));
- x10 = WRAPLOW(dct_const_round_shift(s10));
- x11 = WRAPLOW(dct_const_round_shift(s11));
- x14 = WRAPLOW(dct_const_round_shift(s14));
- x15 = WRAPLOW(dct_const_round_shift(s15));
-
- output[0] = WRAPLOW(x0);
- output[1] = WRAPLOW(-x8);
- output[2] = WRAPLOW(x12);
- output[3] = WRAPLOW(-x4);
- output[4] = WRAPLOW(x6);
- output[5] = WRAPLOW(x14);
- output[6] = WRAPLOW(x10);
- output[7] = WRAPLOW(x2);
- output[8] = WRAPLOW(x3);
- output[9] = WRAPLOW(x11);
- output[10] = WRAPLOW(x15);
- output[11] = WRAPLOW(x7);
- output[12] = WRAPLOW(x5);
- output[13] = WRAPLOW(-x13);
- output[14] = WRAPLOW(x9);
- output[15] = WRAPLOW(-x1);
-}
-
-void vpx_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest,
- int stride) {
- tran_low_t out[16 * 16] = { 0 };
- tran_low_t *outptr = out;
- int i, j;
- tran_low_t temp_in[16], temp_out[16];
-
- // First transform rows. Since all non-zero dct coefficients are in
- // upper-left 4x4 area, we only need to calculate first 4 rows here.
- for (i = 0; i < 4; ++i) {
- idct16_c(input, outptr);
- input += 16;
- outptr += 16;
- }
-
- // Then transform columns
- for (i = 0; i < 16; ++i) {
- for (j = 0; j < 16; ++j)
- temp_in[j] = out[j*16 + i];
- idct16_c(temp_in, temp_out);
- for (j = 0; j < 16; ++j) {
- dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
- ROUND_POWER_OF_TWO(temp_out[j], 6));
- }
- }
-}
-
-void vpx_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
- int i, j;
- tran_high_t a1;
- tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64));
- out = WRAPLOW(dct_const_round_shift(out * cospi_16_64));
- a1 = ROUND_POWER_OF_TWO(out, 6);
- for (j = 0; j < 16; ++j) {
- for (i = 0; i < 16; ++i)
- dest[i] = clip_pixel_add(dest[i], a1);
- dest += stride;
- }
-}
-
-void idct32_c(const tran_low_t *input, tran_low_t *output) {
- tran_low_t step1[32], step2[32];
- tran_high_t temp1, temp2;
-
- // stage 1
- step1[0] = input[0];
- step1[1] = input[16];
- step1[2] = input[8];
- step1[3] = input[24];
- step1[4] = input[4];
- step1[5] = input[20];
- step1[6] = input[12];
- step1[7] = input[28];
- step1[8] = input[2];
- step1[9] = input[18];
- step1[10] = input[10];
- step1[11] = input[26];
- step1[12] = input[6];
- step1[13] = input[22];
- step1[14] = input[14];
- step1[15] = input[30];
-
- temp1 = input[1] * cospi_31_64 - input[31] * cospi_1_64;
- temp2 = input[1] * cospi_1_64 + input[31] * cospi_31_64;
- step1[16] = WRAPLOW(dct_const_round_shift(temp1));
- step1[31] = WRAPLOW(dct_const_round_shift(temp2));
-
- temp1 = input[17] * cospi_15_64 - input[15] * cospi_17_64;
- temp2 = input[17] * cospi_17_64 + input[15] * cospi_15_64;
- step1[17] = WRAPLOW(dct_const_round_shift(temp1));
- step1[30] = WRAPLOW(dct_const_round_shift(temp2));
-
- temp1 = input[9] * cospi_23_64 - input[23] * cospi_9_64;
- temp2 = input[9] * cospi_9_64 + input[23] * cospi_23_64;
- step1[18] = WRAPLOW(dct_const_round_shift(temp1));
- step1[29] = WRAPLOW(dct_const_round_shift(temp2));
-
- temp1 = input[25] * cospi_7_64 - input[7] * cospi_25_64;
- temp2 = input[25] * cospi_25_64 + input[7] * cospi_7_64;
- step1[19] = WRAPLOW(dct_const_round_shift(temp1));
- step1[28] = WRAPLOW(dct_const_round_shift(temp2));
-
- temp1 = input[5] * cospi_27_64 - input[27] * cospi_5_64;
- temp2 = input[5] * cospi_5_64 + input[27] * cospi_27_64;
- step1[20] = WRAPLOW(dct_const_round_shift(temp1));
- step1[27] = WRAPLOW(dct_const_round_shift(temp2));
-
- temp1 = input[21] * cospi_11_64 - input[11] * cospi_21_64;
- temp2 = input[21] * cospi_21_64 + input[11] * cospi_11_64;
- step1[21] = WRAPLOW(dct_const_round_shift(temp1));
- step1[26] = WRAPLOW(dct_const_round_shift(temp2));
-
- temp1 = input[13] * cospi_19_64 - input[19] * cospi_13_64;
- temp2 = input[13] * cospi_13_64 + input[19] * cospi_19_64;
- step1[22] = WRAPLOW(dct_const_round_shift(temp1));
- step1[25] = WRAPLOW(dct_const_round_shift(temp2));
-
- temp1 = input[29] * cospi_3_64 - input[3] * cospi_29_64;
- temp2 = input[29] * cospi_29_64 + input[3] * cospi_3_64;
- step1[23] = WRAPLOW(dct_const_round_shift(temp1));
- step1[24] = WRAPLOW(dct_const_round_shift(temp2));
-
- // stage 2
- step2[0] = step1[0];
- step2[1] = step1[1];
- step2[2] = step1[2];
- step2[3] = step1[3];
- step2[4] = step1[4];
- step2[5] = step1[5];
- step2[6] = step1[6];
- step2[7] = step1[7];
-
- temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
- temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
- step2[8] = WRAPLOW(dct_const_round_shift(temp1));
- step2[15] = WRAPLOW(dct_const_round_shift(temp2));
-
- temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
- temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
- step2[9] = WRAPLOW(dct_const_round_shift(temp1));
- step2[14] = WRAPLOW(dct_const_round_shift(temp2));
-
- temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
- temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
- step2[10] = WRAPLOW(dct_const_round_shift(temp1));
- step2[13] = WRAPLOW(dct_const_round_shift(temp2));
-
- temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
- temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
- step2[11] = WRAPLOW(dct_const_round_shift(temp1));
- step2[12] = WRAPLOW(dct_const_round_shift(temp2));
-
- step2[16] = WRAPLOW(step1[16] + step1[17]);
- step2[17] = WRAPLOW(step1[16] - step1[17]);
- step2[18] = WRAPLOW(-step1[18] + step1[19]);
- step2[19] = WRAPLOW(step1[18] + step1[19]);
- step2[20] = WRAPLOW(step1[20] + step1[21]);
- step2[21] = WRAPLOW(step1[20] - step1[21]);
- step2[22] = WRAPLOW(-step1[22] + step1[23]);
- step2[23] = WRAPLOW(step1[22] + step1[23]);
- step2[24] = WRAPLOW(step1[24] + step1[25]);
- step2[25] = WRAPLOW(step1[24] - step1[25]);
- step2[26] = WRAPLOW(-step1[26] + step1[27]);
- step2[27] = WRAPLOW(step1[26] + step1[27]);
- step2[28] = WRAPLOW(step1[28] + step1[29]);
- step2[29] = WRAPLOW(step1[28] - step1[29]);
- step2[30] = WRAPLOW(-step1[30] + step1[31]);
- step2[31] = WRAPLOW(step1[30] + step1[31]);
-
- // stage 3
- step1[0] = step2[0];
- step1[1] = step2[1];
- step1[2] = step2[2];
- step1[3] = step2[3];
-
- temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
- temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
- step1[4] = WRAPLOW(dct_const_round_shift(temp1));
- step1[7] = WRAPLOW(dct_const_round_shift(temp2));
- temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
- temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
- step1[5] = WRAPLOW(dct_const_round_shift(temp1));
- step1[6] = WRAPLOW(dct_const_round_shift(temp2));
-
- step1[8] = WRAPLOW(step2[8] + step2[9]);
- step1[9] = WRAPLOW(step2[8] - step2[9]);
- step1[10] = WRAPLOW(-step2[10] + step2[11]);
- step1[11] = WRAPLOW(step2[10] + step2[11]);
- step1[12] = WRAPLOW(step2[12] + step2[13]);
- step1[13] = WRAPLOW(step2[12] - step2[13]);
- step1[14] = WRAPLOW(-step2[14] + step2[15]);
- step1[15] = WRAPLOW(step2[14] + step2[15]);
-
- step1[16] = step2[16];
- step1[31] = step2[31];
- temp1 = -step2[17] * cospi_4_64 + step2[30] * cospi_28_64;
- temp2 = step2[17] * cospi_28_64 + step2[30] * cospi_4_64;
- step1[17] = WRAPLOW(dct_const_round_shift(temp1));
- step1[30] = WRAPLOW(dct_const_round_shift(temp2));
- temp1 = -step2[18] * cospi_28_64 - step2[29] * cospi_4_64;
- temp2 = -step2[18] * cospi_4_64 + step2[29] * cospi_28_64;
- step1[18] = WRAPLOW(dct_const_round_shift(temp1));
- step1[29] = WRAPLOW(dct_const_round_shift(temp2));
- step1[19] = step2[19];
- step1[20] = step2[20];
- temp1 = -step2[21] * cospi_20_64 + step2[26] * cospi_12_64;
- temp2 = step2[21] * cospi_12_64 + step2[26] * cospi_20_64;
- step1[21] = WRAPLOW(dct_const_round_shift(temp1));
- step1[26] = WRAPLOW(dct_const_round_shift(temp2));
- temp1 = -step2[22] * cospi_12_64 - step2[25] * cospi_20_64;
- temp2 = -step2[22] * cospi_20_64 + step2[25] * cospi_12_64;
- step1[22] = WRAPLOW(dct_const_round_shift(temp1));
- step1[25] = WRAPLOW(dct_const_round_shift(temp2));
- step1[23] = step2[23];
- step1[24] = step2[24];
- step1[27] = step2[27];
- step1[28] = step2[28];
-
- // stage 4
- temp1 = (step1[0] + step1[1]) * cospi_16_64;
- temp2 = (step1[0] - step1[1]) * cospi_16_64;
- step2[0] = WRAPLOW(dct_const_round_shift(temp1));
- step2[1] = WRAPLOW(dct_const_round_shift(temp2));
- temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
- temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
- step2[2] = WRAPLOW(dct_const_round_shift(temp1));
- step2[3] = WRAPLOW(dct_const_round_shift(temp2));
- step2[4] = WRAPLOW(step1[4] + step1[5]);
- step2[5] = WRAPLOW(step1[4] - step1[5]);
- step2[6] = WRAPLOW(-step1[6] + step1[7]);
- step2[7] = WRAPLOW(step1[6] + step1[7]);
-
- step2[8] = step1[8];
- step2[15] = step1[15];
- temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
- temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
- step2[9] = WRAPLOW(dct_const_round_shift(temp1));
- step2[14] = WRAPLOW(dct_const_round_shift(temp2));
- temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
- temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
- step2[10] = WRAPLOW(dct_const_round_shift(temp1));
- step2[13] = WRAPLOW(dct_const_round_shift(temp2));
- step2[11] = step1[11];
- step2[12] = step1[12];
-
- step2[16] = WRAPLOW(step1[16] + step1[19]);
- step2[17] = WRAPLOW(step1[17] + step1[18]);
- step2[18] = WRAPLOW(step1[17] - step1[18]);
- step2[19] = WRAPLOW(step1[16] - step1[19]);
- step2[20] = WRAPLOW(-step1[20] + step1[23]);
- step2[21] = WRAPLOW(-step1[21] + step1[22]);
- step2[22] = WRAPLOW(step1[21] + step1[22]);
- step2[23] = WRAPLOW(step1[20] + step1[23]);
-
- step2[24] = WRAPLOW(step1[24] + step1[27]);
- step2[25] = WRAPLOW(step1[25] + step1[26]);
- step2[26] = WRAPLOW(step1[25] - step1[26]);
- step2[27] = WRAPLOW(step1[24] - step1[27]);
- step2[28] = WRAPLOW(-step1[28] + step1[31]);
- step2[29] = WRAPLOW(-step1[29] + step1[30]);
- step2[30] = WRAPLOW(step1[29] + step1[30]);
- step2[31] = WRAPLOW(step1[28] + step1[31]);
-
- // stage 5
- step1[0] = WRAPLOW(step2[0] + step2[3]);
- step1[1] = WRAPLOW(step2[1] + step2[2]);
- step1[2] = WRAPLOW(step2[1] - step2[2]);
- step1[3] = WRAPLOW(step2[0] - step2[3]);
- step1[4] = step2[4];
- temp1 = (step2[6] - step2[5]) * cospi_16_64;
- temp2 = (step2[5] + step2[6]) * cospi_16_64;
- step1[5] = WRAPLOW(dct_const_round_shift(temp1));
- step1[6] = WRAPLOW(dct_const_round_shift(temp2));
- step1[7] = step2[7];
-
- step1[8] = WRAPLOW(step2[8] + step2[11]);
- step1[9] = WRAPLOW(step2[9] + step2[10]);
- step1[10] = WRAPLOW(step2[9] - step2[10]);
- step1[11] = WRAPLOW(step2[8] - step2[11]);
- step1[12] = WRAPLOW(-step2[12] + step2[15]);
- step1[13] = WRAPLOW(-step2[13] + step2[14]);
- step1[14] = WRAPLOW(step2[13] + step2[14]);
- step1[15] = WRAPLOW(step2[12] + step2[15]);
-
- step1[16] = step2[16];
- step1[17] = step2[17];
- temp1 = -step2[18] * cospi_8_64 + step2[29] * cospi_24_64;
- temp2 = step2[18] * cospi_24_64 + step2[29] * cospi_8_64;
- step1[18] = WRAPLOW(dct_const_round_shift(temp1));
- step1[29] = WRAPLOW(dct_const_round_shift(temp2));
- temp1 = -step2[19] * cospi_8_64 + step2[28] * cospi_24_64;
- temp2 = step2[19] * cospi_24_64 + step2[28] * cospi_8_64;
- step1[19] = WRAPLOW(dct_const_round_shift(temp1));
- step1[28] = WRAPLOW(dct_const_round_shift(temp2));
- temp1 = -step2[20] * cospi_24_64 - step2[27] * cospi_8_64;
- temp2 = -step2[20] * cospi_8_64 + step2[27] * cospi_24_64;
- step1[20] = WRAPLOW(dct_const_round_shift(temp1));
- step1[27] = WRAPLOW(dct_const_round_shift(temp2));
- temp1 = -step2[21] * cospi_24_64 - step2[26] * cospi_8_64;
- temp2 = -step2[21] * cospi_8_64 + step2[26] * cospi_24_64;
- step1[21] = WRAPLOW(dct_const_round_shift(temp1));
- step1[26] = WRAPLOW(dct_const_round_shift(temp2));
- step1[22] = step2[22];
- step1[23] = step2[23];
- step1[24] = step2[24];
- step1[25] = step2[25];
- step1[30] = step2[30];
- step1[31] = step2[31];
-
- // stage 6
- step2[0] = WRAPLOW(step1[0] + step1[7]);
- step2[1] = WRAPLOW(step1[1] + step1[6]);
- step2[2] = WRAPLOW(step1[2] + step1[5]);
- step2[3] = WRAPLOW(step1[3] + step1[4]);
- step2[4] = WRAPLOW(step1[3] - step1[4]);
- step2[5] = WRAPLOW(step1[2] - step1[5]);
- step2[6] = WRAPLOW(step1[1] - step1[6]);
- step2[7] = WRAPLOW(step1[0] - step1[7]);
- step2[8] = step1[8];
- step2[9] = step1[9];
- temp1 = (-step1[10] + step1[13]) * cospi_16_64;
- temp2 = (step1[10] + step1[13]) * cospi_16_64;
- step2[10] = WRAPLOW(dct_const_round_shift(temp1));
- step2[13] = WRAPLOW(dct_const_round_shift(temp2));
- temp1 = (-step1[11] + step1[12]) * cospi_16_64;
- temp2 = (step1[11] + step1[12]) * cospi_16_64;
- step2[11] = WRAPLOW(dct_const_round_shift(temp1));
- step2[12] = WRAPLOW(dct_const_round_shift(temp2));
- step2[14] = step1[14];
- step2[15] = step1[15];
-
- step2[16] = WRAPLOW(step1[16] + step1[23]);
- step2[17] = WRAPLOW(step1[17] + step1[22]);
- step2[18] = WRAPLOW(step1[18] + step1[21]);
- step2[19] = WRAPLOW(step1[19] + step1[20]);
- step2[20] = WRAPLOW(step1[19] - step1[20]);
- step2[21] = WRAPLOW(step1[18] - step1[21]);
- step2[22] = WRAPLOW(step1[17] - step1[22]);
- step2[23] = WRAPLOW(step1[16] - step1[23]);
-
- step2[24] = WRAPLOW(-step1[24] + step1[31]);
- step2[25] = WRAPLOW(-step1[25] + step1[30]);
- step2[26] = WRAPLOW(-step1[26] + step1[29]);
- step2[27] = WRAPLOW(-step1[27] + step1[28]);
- step2[28] = WRAPLOW(step1[27] + step1[28]);
- step2[29] = WRAPLOW(step1[26] + step1[29]);
- step2[30] = WRAPLOW(step1[25] + step1[30]);
- step2[31] = WRAPLOW(step1[24] + step1[31]);
-
- // stage 7
- step1[0] = WRAPLOW(step2[0] + step2[15]);
- step1[1] = WRAPLOW(step2[1] + step2[14]);
- step1[2] = WRAPLOW(step2[2] + step2[13]);
- step1[3] = WRAPLOW(step2[3] + step2[12]);
- step1[4] = WRAPLOW(step2[4] + step2[11]);
- step1[5] = WRAPLOW(step2[5] + step2[10]);
- step1[6] = WRAPLOW(step2[6] + step2[9]);
- step1[7] = WRAPLOW(step2[7] + step2[8]);
- step1[8] = WRAPLOW(step2[7] - step2[8]);
- step1[9] = WRAPLOW(step2[6] - step2[9]);
- step1[10] = WRAPLOW(step2[5] - step2[10]);
- step1[11] = WRAPLOW(step2[4] - step2[11]);
- step1[12] = WRAPLOW(step2[3] - step2[12]);
- step1[13] = WRAPLOW(step2[2] - step2[13]);
- step1[14] = WRAPLOW(step2[1] - step2[14]);
- step1[15] = WRAPLOW(step2[0] - step2[15]);
-
- step1[16] = step2[16];
- step1[17] = step2[17];
- step1[18] = step2[18];
- step1[19] = step2[19];
- temp1 = (-step2[20] + step2[27]) * cospi_16_64;
- temp2 = (step2[20] + step2[27]) * cospi_16_64;
- step1[20] = WRAPLOW(dct_const_round_shift(temp1));
- step1[27] = WRAPLOW(dct_const_round_shift(temp2));
- temp1 = (-step2[21] + step2[26]) * cospi_16_64;
- temp2 = (step2[21] + step2[26]) * cospi_16_64;
- step1[21] = WRAPLOW(dct_const_round_shift(temp1));
- step1[26] = WRAPLOW(dct_const_round_shift(temp2));
- temp1 = (-step2[22] + step2[25]) * cospi_16_64;
- temp2 = (step2[22] + step2[25]) * cospi_16_64;
- step1[22] = WRAPLOW(dct_const_round_shift(temp1));
- step1[25] = WRAPLOW(dct_const_round_shift(temp2));
- temp1 = (-step2[23] + step2[24]) * cospi_16_64;
- temp2 = (step2[23] + step2[24]) * cospi_16_64;
- step1[23] = WRAPLOW(dct_const_round_shift(temp1));
- step1[24] = WRAPLOW(dct_const_round_shift(temp2));
- step1[28] = step2[28];
- step1[29] = step2[29];
- step1[30] = step2[30];
- step1[31] = step2[31];
-
- // final stage
- output[0] = WRAPLOW(step1[0] + step1[31]);
- output[1] = WRAPLOW(step1[1] + step1[30]);
- output[2] = WRAPLOW(step1[2] + step1[29]);
- output[3] = WRAPLOW(step1[3] + step1[28]);
- output[4] = WRAPLOW(step1[4] + step1[27]);
- output[5] = WRAPLOW(step1[5] + step1[26]);
- output[6] = WRAPLOW(step1[6] + step1[25]);
- output[7] = WRAPLOW(step1[7] + step1[24]);
- output[8] = WRAPLOW(step1[8] + step1[23]);
- output[9] = WRAPLOW(step1[9] + step1[22]);
- output[10] = WRAPLOW(step1[10] + step1[21]);
- output[11] = WRAPLOW(step1[11] + step1[20]);
- output[12] = WRAPLOW(step1[12] + step1[19]);
- output[13] = WRAPLOW(step1[13] + step1[18]);
- output[14] = WRAPLOW(step1[14] + step1[17]);
- output[15] = WRAPLOW(step1[15] + step1[16]);
- output[16] = WRAPLOW(step1[15] - step1[16]);
- output[17] = WRAPLOW(step1[14] - step1[17]);
- output[18] = WRAPLOW(step1[13] - step1[18]);
- output[19] = WRAPLOW(step1[12] - step1[19]);
- output[20] = WRAPLOW(step1[11] - step1[20]);
- output[21] = WRAPLOW(step1[10] - step1[21]);
- output[22] = WRAPLOW(step1[9] - step1[22]);
- output[23] = WRAPLOW(step1[8] - step1[23]);
- output[24] = WRAPLOW(step1[7] - step1[24]);
- output[25] = WRAPLOW(step1[6] - step1[25]);
- output[26] = WRAPLOW(step1[5] - step1[26]);
- output[27] = WRAPLOW(step1[4] - step1[27]);
- output[28] = WRAPLOW(step1[3] - step1[28]);
- output[29] = WRAPLOW(step1[2] - step1[29]);
- output[30] = WRAPLOW(step1[1] - step1[30]);
- output[31] = WRAPLOW(step1[0] - step1[31]);
-}
-
-void vpx_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest,
- int stride) {
- tran_low_t out[32 * 32];
- tran_low_t *outptr = out;
- int i, j;
- tran_low_t temp_in[32], temp_out[32];
-
- // Rows
- for (i = 0; i < 32; ++i) {
- int16_t zero_coeff[16];
- for (j = 0; j < 16; ++j)
- zero_coeff[j] = input[2 * j] | input[2 * j + 1];
- for (j = 0; j < 8; ++j)
- zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
- for (j = 0; j < 4; ++j)
- zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
- for (j = 0; j < 2; ++j)
- zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
-
- if (zero_coeff[0] | zero_coeff[1])
- idct32_c(input, outptr);
- else
- memset(outptr, 0, sizeof(tran_low_t) * 32);
- input += 32;
- outptr += 32;
- }
-
- // Columns
- for (i = 0; i < 32; ++i) {
- for (j = 0; j < 32; ++j)
- temp_in[j] = out[j * 32 + i];
- idct32_c(temp_in, temp_out);
- for (j = 0; j < 32; ++j) {
- dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
- ROUND_POWER_OF_TWO(temp_out[j], 6));
- }
- }
-}
-
-void vpx_idct32x32_135_add_c(const tran_low_t *input, uint8_t *dest,
- int stride) {
- tran_low_t out[32 * 32] = {0};
- tran_low_t *outptr = out;
- int i, j;
- tran_low_t temp_in[32], temp_out[32];
-
- // Rows
- // only upper-left 16x16 has non-zero coeff
- for (i = 0; i < 16; ++i) {
- idct32_c(input, outptr);
- input += 32;
- outptr += 32;
- }
-
- // Columns
- for (i = 0; i < 32; ++i) {
- for (j = 0; j < 32; ++j)
- temp_in[j] = out[j * 32 + i];
- idct32_c(temp_in, temp_out);
- for (j = 0; j < 32; ++j) {
- dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
- ROUND_POWER_OF_TWO(temp_out[j], 6));
- }
- }
-}
-
-void vpx_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest,
- int stride) {
- tran_low_t out[32 * 32] = {0};
- tran_low_t *outptr = out;
- int i, j;
- tran_low_t temp_in[32], temp_out[32];
-
- // Rows
- // only upper-left 8x8 has non-zero coeff
- for (i = 0; i < 8; ++i) {
- idct32_c(input, outptr);
- input += 32;
- outptr += 32;
- }
-
- // Columns
- for (i = 0; i < 32; ++i) {
- for (j = 0; j < 32; ++j)
- temp_in[j] = out[j * 32 + i];
- idct32_c(temp_in, temp_out);
- for (j = 0; j < 32; ++j) {
- dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
- ROUND_POWER_OF_TWO(temp_out[j], 6));
- }
- }
-}
-
-void vpx_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
- int i, j;
- tran_high_t a1;
-
- tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64));
- out = WRAPLOW(dct_const_round_shift(out * cospi_16_64));
- a1 = ROUND_POWER_OF_TWO(out, 6);
-
- for (j = 0; j < 32; ++j) {
- for (i = 0; i < 32; ++i)
- dest[i] = clip_pixel_add(dest[i], a1);
- dest += stride;
- }
-}
-
-#if CONFIG_VP9_HIGHBITDEPTH
-void vpx_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest8,
- int stride, int bd) {
- /* 4-point reversible, orthonormal inverse Walsh-Hadamard in 3.5 adds,
- 0.5 shifts per pixel. */
- int i;
- tran_low_t output[16];
- tran_high_t a1, b1, c1, d1, e1;
- const tran_low_t *ip = input;
- tran_low_t *op = output;
- uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
-
- for (i = 0; i < 4; i++) {
- a1 = ip[0] >> UNIT_QUANT_SHIFT;
- c1 = ip[1] >> UNIT_QUANT_SHIFT;
- d1 = ip[2] >> UNIT_QUANT_SHIFT;
- b1 = ip[3] >> UNIT_QUANT_SHIFT;
- a1 += c1;
- d1 -= b1;
- e1 = (a1 - d1) >> 1;
- b1 = e1 - b1;
- c1 = e1 - c1;
- a1 -= b1;
- d1 += c1;
- op[0] = HIGHBD_WRAPLOW(a1, bd);
- op[1] = HIGHBD_WRAPLOW(b1, bd);
- op[2] = HIGHBD_WRAPLOW(c1, bd);
- op[3] = HIGHBD_WRAPLOW(d1, bd);
- ip += 4;
- op += 4;
- }
-
- ip = output;
- for (i = 0; i < 4; i++) {
- a1 = ip[4 * 0];
- c1 = ip[4 * 1];
- d1 = ip[4 * 2];
- b1 = ip[4 * 3];
- a1 += c1;
- d1 -= b1;
- e1 = (a1 - d1) >> 1;
- b1 = e1 - b1;
- c1 = e1 - c1;
- a1 -= b1;
- d1 += c1;
- dest[stride * 0] = highbd_clip_pixel_add(dest[stride * 0],
- HIGHBD_WRAPLOW(a1, bd), bd);
- dest[stride * 1] = highbd_clip_pixel_add(dest[stride * 1],
- HIGHBD_WRAPLOW(b1, bd), bd);
- dest[stride * 2] = highbd_clip_pixel_add(dest[stride * 2],
- HIGHBD_WRAPLOW(c1, bd), bd);
- dest[stride * 3] = highbd_clip_pixel_add(dest[stride * 3],
- HIGHBD_WRAPLOW(d1, bd), bd);
-
- ip++;
- dest++;
- }
-}
-
-void vpx_highbd_iwht4x4_1_add_c(const tran_low_t *in, uint8_t *dest8,
- int dest_stride, int bd) {
- int i;
- tran_high_t a1, e1;
- tran_low_t tmp[4];
- const tran_low_t *ip = in;
- tran_low_t *op = tmp;
- uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
- (void) bd;
-
- a1 = ip[0] >> UNIT_QUANT_SHIFT;
- e1 = a1 >> 1;
- a1 -= e1;
- op[0] = HIGHBD_WRAPLOW(a1, bd);
- op[1] = op[2] = op[3] = HIGHBD_WRAPLOW(e1, bd);
-
- ip = tmp;
- for (i = 0; i < 4; i++) {
- e1 = ip[0] >> 1;
- a1 = ip[0] - e1;
- dest[dest_stride * 0] = highbd_clip_pixel_add(
- dest[dest_stride * 0], a1, bd);
- dest[dest_stride * 1] = highbd_clip_pixel_add(
- dest[dest_stride * 1], e1, bd);
- dest[dest_stride * 2] = highbd_clip_pixel_add(
- dest[dest_stride * 2], e1, bd);
- dest[dest_stride * 3] = highbd_clip_pixel_add(
- dest[dest_stride * 3], e1, bd);
- ip++;
- dest++;
- }
-}
-
-void vpx_highbd_idct4_c(const tran_low_t *input, tran_low_t *output, int bd) {
- tran_low_t step[4];
- tran_high_t temp1, temp2;
- (void) bd;
- // stage 1
- temp1 = (input[0] + input[2]) * cospi_16_64;
- temp2 = (input[0] - input[2]) * cospi_16_64;
- step[0] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
- step[1] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
- temp1 = input[1] * cospi_24_64 - input[3] * cospi_8_64;
- temp2 = input[1] * cospi_8_64 + input[3] * cospi_24_64;
- step[2] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
- step[3] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
-
- // stage 2
- output[0] = HIGHBD_WRAPLOW(step[0] + step[3], bd);
- output[1] = HIGHBD_WRAPLOW(step[1] + step[2], bd);
- output[2] = HIGHBD_WRAPLOW(step[1] - step[2], bd);
- output[3] = HIGHBD_WRAPLOW(step[0] - step[3], bd);
-}
-
-void vpx_highbd_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest8,
- int stride, int bd) {
- tran_low_t out[4 * 4];
- tran_low_t *outptr = out;
- int i, j;
- tran_low_t temp_in[4], temp_out[4];
- uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
-
- // Rows
- for (i = 0; i < 4; ++i) {
- vpx_highbd_idct4_c(input, outptr, bd);
- input += 4;
- outptr += 4;
- }
-
- // Columns
- for (i = 0; i < 4; ++i) {
- for (j = 0; j < 4; ++j)
- temp_in[j] = out[j * 4 + i];
- vpx_highbd_idct4_c(temp_in, temp_out, bd);
- for (j = 0; j < 4; ++j) {
- dest[j * stride + i] = highbd_clip_pixel_add(
- dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 4), bd);
- }
- }
-}
-
-void vpx_highbd_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest8,
- int dest_stride, int bd) {
- int i;
- tran_high_t a1;
- tran_low_t out = HIGHBD_WRAPLOW(
- highbd_dct_const_round_shift(input[0] * cospi_16_64), bd);
- uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
-
- out = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(out * cospi_16_64), bd);
- a1 = ROUND_POWER_OF_TWO(out, 4);
-
- for (i = 0; i < 4; i++) {
- dest[0] = highbd_clip_pixel_add(dest[0], a1, bd);
- dest[1] = highbd_clip_pixel_add(dest[1], a1, bd);
- dest[2] = highbd_clip_pixel_add(dest[2], a1, bd);
- dest[3] = highbd_clip_pixel_add(dest[3], a1, bd);
- dest += dest_stride;
- }
-}
-
-void vpx_highbd_idct8_c(const tran_low_t *input, tran_low_t *output, int bd) {
- tran_low_t step1[8], step2[8];
- tran_high_t temp1, temp2;
- // stage 1
- step1[0] = input[0];
- step1[2] = input[4];
- step1[1] = input[2];
- step1[3] = input[6];
- temp1 = input[1] * cospi_28_64 - input[7] * cospi_4_64;
- temp2 = input[1] * cospi_4_64 + input[7] * cospi_28_64;
- step1[4] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
- step1[7] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
- temp1 = input[5] * cospi_12_64 - input[3] * cospi_20_64;
- temp2 = input[5] * cospi_20_64 + input[3] * cospi_12_64;
- step1[5] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
- step1[6] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
-
- // stage 2 & stage 3 - even half
- vpx_highbd_idct4_c(step1, step1, bd);
-
- // stage 2 - odd half
- step2[4] = HIGHBD_WRAPLOW(step1[4] + step1[5], bd);
- step2[5] = HIGHBD_WRAPLOW(step1[4] - step1[5], bd);
- step2[6] = HIGHBD_WRAPLOW(-step1[6] + step1[7], bd);
- step2[7] = HIGHBD_WRAPLOW(step1[6] + step1[7], bd);
-
- // stage 3 - odd half
- step1[4] = step2[4];
- temp1 = (step2[6] - step2[5]) * cospi_16_64;
- temp2 = (step2[5] + step2[6]) * cospi_16_64;
- step1[5] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
- step1[6] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
- step1[7] = step2[7];
-
- // stage 4
- output[0] = HIGHBD_WRAPLOW(step1[0] + step1[7], bd);
- output[1] = HIGHBD_WRAPLOW(step1[1] + step1[6], bd);
- output[2] = HIGHBD_WRAPLOW(step1[2] + step1[5], bd);
- output[3] = HIGHBD_WRAPLOW(step1[3] + step1[4], bd);
- output[4] = HIGHBD_WRAPLOW(step1[3] - step1[4], bd);
- output[5] = HIGHBD_WRAPLOW(step1[2] - step1[5], bd);
- output[6] = HIGHBD_WRAPLOW(step1[1] - step1[6], bd);
- output[7] = HIGHBD_WRAPLOW(step1[0] - step1[7], bd);
-}
-
-void vpx_highbd_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest8,
- int stride, int bd) {
- tran_low_t out[8 * 8];
- tran_low_t *outptr = out;
- int i, j;
- tran_low_t temp_in[8], temp_out[8];
- uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
-
- // First transform rows.
- for (i = 0; i < 8; ++i) {
- vpx_highbd_idct8_c(input, outptr, bd);
- input += 8;
- outptr += 8;
- }
-
- // Then transform columns.
- for (i = 0; i < 8; ++i) {
- for (j = 0; j < 8; ++j)
- temp_in[j] = out[j * 8 + i];
- vpx_highbd_idct8_c(temp_in, temp_out, bd);
- for (j = 0; j < 8; ++j) {
- dest[j * stride + i] = highbd_clip_pixel_add(
- dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd);
- }
- }
-}
-
-void vpx_highbd_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest8,
- int stride, int bd) {
- int i, j;
- tran_high_t a1;
- tran_low_t out = HIGHBD_WRAPLOW(
- highbd_dct_const_round_shift(input[0] * cospi_16_64), bd);
- uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
- out = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(out * cospi_16_64), bd);
- a1 = ROUND_POWER_OF_TWO(out, 5);
- for (j = 0; j < 8; ++j) {
- for (i = 0; i < 8; ++i)
- dest[i] = highbd_clip_pixel_add(dest[i], a1, bd);
- dest += stride;
- }
-}
-
-void vpx_highbd_iadst4_c(const tran_low_t *input, tran_low_t *output, int bd) {
- tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;
-
- tran_low_t x0 = input[0];
- tran_low_t x1 = input[1];
- tran_low_t x2 = input[2];
- tran_low_t x3 = input[3];
- (void) bd;
-
- if (!(x0 | x1 | x2 | x3)) {
- memset(output, 0, 4 * sizeof(*output));
- return;
- }
-
- s0 = sinpi_1_9 * x0;
- s1 = sinpi_2_9 * x0;
- s2 = sinpi_3_9 * x1;
- s3 = sinpi_4_9 * x2;
- s4 = sinpi_1_9 * x2;
- s5 = sinpi_2_9 * x3;
- s6 = sinpi_4_9 * x3;
- s7 = (tran_high_t)HIGHBD_WRAPLOW(x0 - x2 + x3, bd);
-
- s0 = s0 + s3 + s5;
- s1 = s1 - s4 - s6;
- s3 = s2;
- s2 = sinpi_3_9 * s7;
-
- // 1-D transform scaling factor is sqrt(2).
- // The overall dynamic range is 14b (input) + 14b (multiplication scaling)
- // + 1b (addition) = 29b.
- // Hence the output bit depth is 15b.
- output[0] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s0 + s3), bd);
- output[1] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s1 + s3), bd);
- output[2] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s2), bd);
- output[3] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s0 + s1 - s3), bd);
-}
-
-void vpx_highbd_iadst8_c(const tran_low_t *input, tran_low_t *output, int bd) {
- tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;
-
- tran_low_t x0 = input[7];
- tran_low_t x1 = input[0];
- tran_low_t x2 = input[5];
- tran_low_t x3 = input[2];
- tran_low_t x4 = input[3];
- tran_low_t x5 = input[4];
- tran_low_t x6 = input[1];
- tran_low_t x7 = input[6];
- (void) bd;
-
- if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7)) {
- memset(output, 0, 8 * sizeof(*output));
- return;
- }
-
- // stage 1
- s0 = cospi_2_64 * x0 + cospi_30_64 * x1;
- s1 = cospi_30_64 * x0 - cospi_2_64 * x1;
- s2 = cospi_10_64 * x2 + cospi_22_64 * x3;
- s3 = cospi_22_64 * x2 - cospi_10_64 * x3;
- s4 = cospi_18_64 * x4 + cospi_14_64 * x5;
- s5 = cospi_14_64 * x4 - cospi_18_64 * x5;
- s6 = cospi_26_64 * x6 + cospi_6_64 * x7;
- s7 = cospi_6_64 * x6 - cospi_26_64 * x7;
-
- x0 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s0 + s4), bd);
- x1 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s1 + s5), bd);
- x2 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s2 + s6), bd);
- x3 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s3 + s7), bd);
- x4 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s0 - s4), bd);
- x5 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s1 - s5), bd);
- x6 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s2 - s6), bd);
- x7 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s3 - s7), bd);
-
- // stage 2
- s0 = x0;
- s1 = x1;
- s2 = x2;
- s3 = x3;
- s4 = cospi_8_64 * x4 + cospi_24_64 * x5;
- s5 = cospi_24_64 * x4 - cospi_8_64 * x5;
- s6 = -cospi_24_64 * x6 + cospi_8_64 * x7;
- s7 = cospi_8_64 * x6 + cospi_24_64 * x7;
-
- x0 = HIGHBD_WRAPLOW(s0 + s2, bd);
- x1 = HIGHBD_WRAPLOW(s1 + s3, bd);
- x2 = HIGHBD_WRAPLOW(s0 - s2, bd);
- x3 = HIGHBD_WRAPLOW(s1 - s3, bd);
- x4 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s4 + s6), bd);
- x5 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s5 + s7), bd);
- x6 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s4 - s6), bd);
- x7 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s5 - s7), bd);
-
- // stage 3
- s2 = cospi_16_64 * (x2 + x3);
- s3 = cospi_16_64 * (x2 - x3);
- s6 = cospi_16_64 * (x6 + x7);
- s7 = cospi_16_64 * (x6 - x7);
-
- x2 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s2), bd);
- x3 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s3), bd);
- x6 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s6), bd);
- x7 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s7), bd);
-
- output[0] = HIGHBD_WRAPLOW(x0, bd);
- output[1] = HIGHBD_WRAPLOW(-x4, bd);
- output[2] = HIGHBD_WRAPLOW(x6, bd);
- output[3] = HIGHBD_WRAPLOW(-x2, bd);
- output[4] = HIGHBD_WRAPLOW(x3, bd);
- output[5] = HIGHBD_WRAPLOW(-x7, bd);
- output[6] = HIGHBD_WRAPLOW(x5, bd);
- output[7] = HIGHBD_WRAPLOW(-x1, bd);
-}
-
-void vpx_highbd_idct8x8_10_add_c(const tran_low_t *input, uint8_t *dest8,
- int stride, int bd) {
- tran_low_t out[8 * 8] = { 0 };
- tran_low_t *outptr = out;
- int i, j;
- tran_low_t temp_in[8], temp_out[8];
- uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
-
- // First transform rows.
- // Only first 4 row has non-zero coefs.
- for (i = 0; i < 4; ++i) {
- vpx_highbd_idct8_c(input, outptr, bd);
- input += 8;
- outptr += 8;
- }
- // Then transform columns.
- for (i = 0; i < 8; ++i) {
- for (j = 0; j < 8; ++j)
- temp_in[j] = out[j * 8 + i];
- vpx_highbd_idct8_c(temp_in, temp_out, bd);
- for (j = 0; j < 8; ++j) {
- dest[j * stride + i] = highbd_clip_pixel_add(
- dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd);
- }
- }
-}
-
-void vpx_highbd_idct16_c(const tran_low_t *input, tran_low_t *output, int bd) {
- tran_low_t step1[16], step2[16];
- tran_high_t temp1, temp2;
- (void) bd;
-
- // stage 1
- step1[0] = input[0/2];
- step1[1] = input[16/2];
- step1[2] = input[8/2];
- step1[3] = input[24/2];
- step1[4] = input[4/2];
- step1[5] = input[20/2];
- step1[6] = input[12/2];
- step1[7] = input[28/2];
- step1[8] = input[2/2];
- step1[9] = input[18/2];
- step1[10] = input[10/2];
- step1[11] = input[26/2];
- step1[12] = input[6/2];
- step1[13] = input[22/2];
- step1[14] = input[14/2];
- step1[15] = input[30/2];
-
- // stage 2
- step2[0] = step1[0];
- step2[1] = step1[1];
- step2[2] = step1[2];
- step2[3] = step1[3];
- step2[4] = step1[4];
- step2[5] = step1[5];
- step2[6] = step1[6];
- step2[7] = step1[7];
-
- temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
- temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
- step2[8] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
- step2[15] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
-
- temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
- temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
- step2[9] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
- step2[14] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
-
- temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
- temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
- step2[10] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
- step2[13] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
-
- temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
- temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
- step2[11] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
- step2[12] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
-
- // stage 3
- step1[0] = step2[0];
- step1[1] = step2[1];
- step1[2] = step2[2];
- step1[3] = step2[3];
-
- temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
- temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
- step1[4] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
- step1[7] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
- temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
- temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
- step1[5] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
- step1[6] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
-
- step1[8] = HIGHBD_WRAPLOW(step2[8] + step2[9], bd);
- step1[9] = HIGHBD_WRAPLOW(step2[8] - step2[9], bd);
- step1[10] = HIGHBD_WRAPLOW(-step2[10] + step2[11], bd);
- step1[11] = HIGHBD_WRAPLOW(step2[10] + step2[11], bd);
- step1[12] = HIGHBD_WRAPLOW(step2[12] + step2[13], bd);
- step1[13] = HIGHBD_WRAPLOW(step2[12] - step2[13], bd);
- step1[14] = HIGHBD_WRAPLOW(-step2[14] + step2[15], bd);
- step1[15] = HIGHBD_WRAPLOW(step2[14] + step2[15], bd);
-
- // stage 4
- temp1 = (step1[0] + step1[1]) * cospi_16_64;
- temp2 = (step1[0] - step1[1]) * cospi_16_64;
- step2[0] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
- step2[1] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
- temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
- temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
- step2[2] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
- step2[3] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
- step2[4] = HIGHBD_WRAPLOW(step1[4] + step1[5], bd);
- step2[5] = HIGHBD_WRAPLOW(step1[4] - step1[5], bd);
- step2[6] = HIGHBD_WRAPLOW(-step1[6] + step1[7], bd);
- step2[7] = HIGHBD_WRAPLOW(step1[6] + step1[7], bd);
-
- step2[8] = step1[8];
- step2[15] = step1[15];
- temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
- temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
- step2[9] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
- step2[14] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
- temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
- temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
- step2[10] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
- step2[13] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
- step2[11] = step1[11];
- step2[12] = step1[12];
-
- // stage 5
- step1[0] = HIGHBD_WRAPLOW(step2[0] + step2[3], bd);
- step1[1] = HIGHBD_WRAPLOW(step2[1] + step2[2], bd);
- step1[2] = HIGHBD_WRAPLOW(step2[1] - step2[2], bd);
- step1[3] = HIGHBD_WRAPLOW(step2[0] - step2[3], bd);
- step1[4] = step2[4];
- temp1 = (step2[6] - step2[5]) * cospi_16_64;
- temp2 = (step2[5] + step2[6]) * cospi_16_64;
- step1[5] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
- step1[6] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
- step1[7] = step2[7];
-
- step1[8] = HIGHBD_WRAPLOW(step2[8] + step2[11], bd);
- step1[9] = HIGHBD_WRAPLOW(step2[9] + step2[10], bd);
- step1[10] = HIGHBD_WRAPLOW(step2[9] - step2[10], bd);
- step1[11] = HIGHBD_WRAPLOW(step2[8] - step2[11], bd);
- step1[12] = HIGHBD_WRAPLOW(-step2[12] + step2[15], bd);
- step1[13] = HIGHBD_WRAPLOW(-step2[13] + step2[14], bd);
- step1[14] = HIGHBD_WRAPLOW(step2[13] + step2[14], bd);
- step1[15] = HIGHBD_WRAPLOW(step2[12] + step2[15], bd);
-
- // stage 6
- step2[0] = HIGHBD_WRAPLOW(step1[0] + step1[7], bd);
- step2[1] = HIGHBD_WRAPLOW(step1[1] + step1[6], bd);
- step2[2] = HIGHBD_WRAPLOW(step1[2] + step1[5], bd);
- step2[3] = HIGHBD_WRAPLOW(step1[3] + step1[4], bd);
- step2[4] = HIGHBD_WRAPLOW(step1[3] - step1[4], bd);
- step2[5] = HIGHBD_WRAPLOW(step1[2] - step1[5], bd);
- step2[6] = HIGHBD_WRAPLOW(step1[1] - step1[6], bd);
- step2[7] = HIGHBD_WRAPLOW(step1[0] - step1[7], bd);
- step2[8] = step1[8];
- step2[9] = step1[9];
- temp1 = (-step1[10] + step1[13]) * cospi_16_64;
- temp2 = (step1[10] + step1[13]) * cospi_16_64;
- step2[10] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
- step2[13] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
- temp1 = (-step1[11] + step1[12]) * cospi_16_64;
- temp2 = (step1[11] + step1[12]) * cospi_16_64;
- step2[11] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
- step2[12] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
- step2[14] = step1[14];
- step2[15] = step1[15];
-
- // stage 7
- output[0] = HIGHBD_WRAPLOW(step2[0] + step2[15], bd);
- output[1] = HIGHBD_WRAPLOW(step2[1] + step2[14], bd);
- output[2] = HIGHBD_WRAPLOW(step2[2] + step2[13], bd);
- output[3] = HIGHBD_WRAPLOW(step2[3] + step2[12], bd);
- output[4] = HIGHBD_WRAPLOW(step2[4] + step2[11], bd);
- output[5] = HIGHBD_WRAPLOW(step2[5] + step2[10], bd);
- output[6] = HIGHBD_WRAPLOW(step2[6] + step2[9], bd);
- output[7] = HIGHBD_WRAPLOW(step2[7] + step2[8], bd);
- output[8] = HIGHBD_WRAPLOW(step2[7] - step2[8], bd);
- output[9] = HIGHBD_WRAPLOW(step2[6] - step2[9], bd);
- output[10] = HIGHBD_WRAPLOW(step2[5] - step2[10], bd);
- output[11] = HIGHBD_WRAPLOW(step2[4] - step2[11], bd);
- output[12] = HIGHBD_WRAPLOW(step2[3] - step2[12], bd);
- output[13] = HIGHBD_WRAPLOW(step2[2] - step2[13], bd);
- output[14] = HIGHBD_WRAPLOW(step2[1] - step2[14], bd);
- output[15] = HIGHBD_WRAPLOW(step2[0] - step2[15], bd);
-}
-
-void vpx_highbd_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest8,
- int stride, int bd) {
- tran_low_t out[16 * 16];
- tran_low_t *outptr = out;
- int i, j;
- tran_low_t temp_in[16], temp_out[16];
- uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
-
- // First transform rows.
- for (i = 0; i < 16; ++i) {
- vpx_highbd_idct16_c(input, outptr, bd);
- input += 16;
- outptr += 16;
- }
-
- // Then transform columns.
- for (i = 0; i < 16; ++i) {
- for (j = 0; j < 16; ++j)
- temp_in[j] = out[j * 16 + i];
- vpx_highbd_idct16_c(temp_in, temp_out, bd);
- for (j = 0; j < 16; ++j) {
- dest[j * stride + i] = highbd_clip_pixel_add(
- dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
- }
- }
-}
-
-void vpx_highbd_iadst16_c(const tran_low_t *input, tran_low_t *output, int bd) {
- tran_high_t s0, s1, s2, s3, s4, s5, s6, s7, s8;
- tran_high_t s9, s10, s11, s12, s13, s14, s15;
-
- tran_low_t x0 = input[15];
- tran_low_t x1 = input[0];
- tran_low_t x2 = input[13];
- tran_low_t x3 = input[2];
- tran_low_t x4 = input[11];
- tran_low_t x5 = input[4];
- tran_low_t x6 = input[9];
- tran_low_t x7 = input[6];
- tran_low_t x8 = input[7];
- tran_low_t x9 = input[8];
- tran_low_t x10 = input[5];
- tran_low_t x11 = input[10];
- tran_low_t x12 = input[3];
- tran_low_t x13 = input[12];
- tran_low_t x14 = input[1];
- tran_low_t x15 = input[14];
- (void) bd;
-
- if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8
- | x9 | x10 | x11 | x12 | x13 | x14 | x15)) {
- memset(output, 0, 16 * sizeof(*output));
- return;
- }
-
- // stage 1
- s0 = x0 * cospi_1_64 + x1 * cospi_31_64;
- s1 = x0 * cospi_31_64 - x1 * cospi_1_64;
- s2 = x2 * cospi_5_64 + x3 * cospi_27_64;
- s3 = x2 * cospi_27_64 - x3 * cospi_5_64;
- s4 = x4 * cospi_9_64 + x5 * cospi_23_64;
- s5 = x4 * cospi_23_64 - x5 * cospi_9_64;
- s6 = x6 * cospi_13_64 + x7 * cospi_19_64;
- s7 = x6 * cospi_19_64 - x7 * cospi_13_64;
- s8 = x8 * cospi_17_64 + x9 * cospi_15_64;
- s9 = x8 * cospi_15_64 - x9 * cospi_17_64;
- s10 = x10 * cospi_21_64 + x11 * cospi_11_64;
- s11 = x10 * cospi_11_64 - x11 * cospi_21_64;
- s12 = x12 * cospi_25_64 + x13 * cospi_7_64;
- s13 = x12 * cospi_7_64 - x13 * cospi_25_64;
- s14 = x14 * cospi_29_64 + x15 * cospi_3_64;
- s15 = x14 * cospi_3_64 - x15 * cospi_29_64;
-
- x0 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s0 + s8), bd);
- x1 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s1 + s9), bd);
- x2 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s2 + s10), bd);
- x3 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s3 + s11), bd);
- x4 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s4 + s12), bd);
- x5 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s5 + s13), bd);
- x6 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s6 + s14), bd);
- x7 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s7 + s15), bd);
- x8 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s0 - s8), bd);
- x9 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s1 - s9), bd);
- x10 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s2 - s10), bd);
- x11 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s3 - s11), bd);
- x12 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s4 - s12), bd);
- x13 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s5 - s13), bd);
- x14 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s6 - s14), bd);
- x15 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s7 - s15), bd);
-
- // stage 2
- s0 = x0;
- s1 = x1;
- s2 = x2;
- s3 = x3;
- s4 = x4;
- s5 = x5;
- s6 = x6;
- s7 = x7;
- s8 = x8 * cospi_4_64 + x9 * cospi_28_64;
- s9 = x8 * cospi_28_64 - x9 * cospi_4_64;
- s10 = x10 * cospi_20_64 + x11 * cospi_12_64;
- s11 = x10 * cospi_12_64 - x11 * cospi_20_64;
- s12 = -x12 * cospi_28_64 + x13 * cospi_4_64;
- s13 = x12 * cospi_4_64 + x13 * cospi_28_64;
- s14 = -x14 * cospi_12_64 + x15 * cospi_20_64;
- s15 = x14 * cospi_20_64 + x15 * cospi_12_64;
-
- x0 = HIGHBD_WRAPLOW(s0 + s4, bd);
- x1 = HIGHBD_WRAPLOW(s1 + s5, bd);
- x2 = HIGHBD_WRAPLOW(s2 + s6, bd);
- x3 = HIGHBD_WRAPLOW(s3 + s7, bd);
- x4 = HIGHBD_WRAPLOW(s0 - s4, bd);
- x5 = HIGHBD_WRAPLOW(s1 - s5, bd);
- x6 = HIGHBD_WRAPLOW(s2 - s6, bd);
- x7 = HIGHBD_WRAPLOW(s3 - s7, bd);
- x8 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s8 + s12), bd);
- x9 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s9 + s13), bd);
- x10 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s10 + s14), bd);
- x11 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s11 + s15), bd);
- x12 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s8 - s12), bd);
- x13 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s9 - s13), bd);
- x14 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s10 - s14), bd);
- x15 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s11 - s15), bd);
-
- // stage 3
- s0 = x0;
- s1 = x1;
- s2 = x2;
- s3 = x3;
- s4 = x4 * cospi_8_64 + x5 * cospi_24_64;
- s5 = x4 * cospi_24_64 - x5 * cospi_8_64;
- s6 = -x6 * cospi_24_64 + x7 * cospi_8_64;
- s7 = x6 * cospi_8_64 + x7 * cospi_24_64;
- s8 = x8;
- s9 = x9;
- s10 = x10;
- s11 = x11;
- s12 = x12 * cospi_8_64 + x13 * cospi_24_64;
- s13 = x12 * cospi_24_64 - x13 * cospi_8_64;
- s14 = -x14 * cospi_24_64 + x15 * cospi_8_64;
- s15 = x14 * cospi_8_64 + x15 * cospi_24_64;
-
- x0 = HIGHBD_WRAPLOW(s0 + s2, bd);
- x1 = HIGHBD_WRAPLOW(s1 + s3, bd);
- x2 = HIGHBD_WRAPLOW(s0 - s2, bd);
- x3 = HIGHBD_WRAPLOW(s1 - s3, bd);
- x4 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s4 + s6), bd);
- x5 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s5 + s7), bd);
- x6 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s4 - s6), bd);
- x7 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s5 - s7), bd);
- x8 = HIGHBD_WRAPLOW(s8 + s10, bd);
- x9 = HIGHBD_WRAPLOW(s9 + s11, bd);
- x10 = HIGHBD_WRAPLOW(s8 - s10, bd);
- x11 = HIGHBD_WRAPLOW(s9 - s11, bd);
- x12 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s12 + s14), bd);
- x13 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s13 + s15), bd);
- x14 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s12 - s14), bd);
- x15 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s13 - s15), bd);
-
- // stage 4
- s2 = (- cospi_16_64) * (x2 + x3);
- s3 = cospi_16_64 * (x2 - x3);
- s6 = cospi_16_64 * (x6 + x7);
- s7 = cospi_16_64 * (-x6 + x7);
- s10 = cospi_16_64 * (x10 + x11);
- s11 = cospi_16_64 * (-x10 + x11);
- s14 = (- cospi_16_64) * (x14 + x15);
- s15 = cospi_16_64 * (x14 - x15);
-
- x2 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s2), bd);
- x3 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s3), bd);
- x6 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s6), bd);
- x7 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s7), bd);
- x10 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s10), bd);
- x11 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s11), bd);
- x14 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s14), bd);
- x15 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s15), bd);
-
- output[0] = HIGHBD_WRAPLOW(x0, bd);
- output[1] = HIGHBD_WRAPLOW(-x8, bd);
- output[2] = HIGHBD_WRAPLOW(x12, bd);
- output[3] = HIGHBD_WRAPLOW(-x4, bd);
- output[4] = HIGHBD_WRAPLOW(x6, bd);
- output[5] = HIGHBD_WRAPLOW(x14, bd);
- output[6] = HIGHBD_WRAPLOW(x10, bd);
- output[7] = HIGHBD_WRAPLOW(x2, bd);
- output[8] = HIGHBD_WRAPLOW(x3, bd);
- output[9] = HIGHBD_WRAPLOW(x11, bd);
- output[10] = HIGHBD_WRAPLOW(x15, bd);
- output[11] = HIGHBD_WRAPLOW(x7, bd);
- output[12] = HIGHBD_WRAPLOW(x5, bd);
- output[13] = HIGHBD_WRAPLOW(-x13, bd);
- output[14] = HIGHBD_WRAPLOW(x9, bd);
- output[15] = HIGHBD_WRAPLOW(-x1, bd);
-}
-
-void vpx_highbd_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest8,
- int stride, int bd) {
- tran_low_t out[16 * 16] = { 0 };
- tran_low_t *outptr = out;
- int i, j;
- tran_low_t temp_in[16], temp_out[16];
- uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
-
- // First transform rows. Since all non-zero dct coefficients are in
- // upper-left 4x4 area, we only need to calculate first 4 rows here.
- for (i = 0; i < 4; ++i) {
- vpx_highbd_idct16_c(input, outptr, bd);
- input += 16;
- outptr += 16;
- }
-
- // Then transform columns.
- for (i = 0; i < 16; ++i) {
- for (j = 0; j < 16; ++j)
- temp_in[j] = out[j*16 + i];
- vpx_highbd_idct16_c(temp_in, temp_out, bd);
- for (j = 0; j < 16; ++j) {
- dest[j * stride + i] = highbd_clip_pixel_add(
- dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
- }
- }
-}
-
-void vpx_highbd_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest8,
- int stride, int bd) {
- int i, j;
- tran_high_t a1;
- tran_low_t out = HIGHBD_WRAPLOW(
- highbd_dct_const_round_shift(input[0] * cospi_16_64), bd);
- uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
-
- out = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(out * cospi_16_64), bd);
- a1 = ROUND_POWER_OF_TWO(out, 6);
- for (j = 0; j < 16; ++j) {
- for (i = 0; i < 16; ++i)
- dest[i] = highbd_clip_pixel_add(dest[i], a1, bd);
- dest += stride;
- }
-}
-
-static void highbd_idct32_c(const tran_low_t *input,
- tran_low_t *output, int bd) {
- tran_low_t step1[32], step2[32];
- tran_high_t temp1, temp2;
- (void) bd;
-
- // stage 1
- step1[0] = input[0];
- step1[1] = input[16];
- step1[2] = input[8];
- step1[3] = input[24];
- step1[4] = input[4];
- step1[5] = input[20];
- step1[6] = input[12];
- step1[7] = input[28];
- step1[8] = input[2];
- step1[9] = input[18];
- step1[10] = input[10];
- step1[11] = input[26];
- step1[12] = input[6];
- step1[13] = input[22];
- step1[14] = input[14];
- step1[15] = input[30];
-
- temp1 = input[1] * cospi_31_64 - input[31] * cospi_1_64;
- temp2 = input[1] * cospi_1_64 + input[31] * cospi_31_64;
- step1[16] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
- step1[31] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
-
- temp1 = input[17] * cospi_15_64 - input[15] * cospi_17_64;
- temp2 = input[17] * cospi_17_64 + input[15] * cospi_15_64;
- step1[17] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
- step1[30] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
-
- temp1 = input[9] * cospi_23_64 - input[23] * cospi_9_64;
- temp2 = input[9] * cospi_9_64 + input[23] * cospi_23_64;
- step1[18] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
- step1[29] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
-
- temp1 = input[25] * cospi_7_64 - input[7] * cospi_25_64;
- temp2 = input[25] * cospi_25_64 + input[7] * cospi_7_64;
- step1[19] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
- step1[28] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
-
- temp1 = input[5] * cospi_27_64 - input[27] * cospi_5_64;
- temp2 = input[5] * cospi_5_64 + input[27] * cospi_27_64;
- step1[20] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
- step1[27] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
-
- temp1 = input[21] * cospi_11_64 - input[11] * cospi_21_64;
- temp2 = input[21] * cospi_21_64 + input[11] * cospi_11_64;
- step1[21] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
- step1[26] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
-
- temp1 = input[13] * cospi_19_64 - input[19] * cospi_13_64;
- temp2 = input[13] * cospi_13_64 + input[19] * cospi_19_64;
- step1[22] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
- step1[25] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
-
- temp1 = input[29] * cospi_3_64 - input[3] * cospi_29_64;
- temp2 = input[29] * cospi_29_64 + input[3] * cospi_3_64;
- step1[23] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
- step1[24] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
-
- // stage 2
- step2[0] = step1[0];
- step2[1] = step1[1];
- step2[2] = step1[2];
- step2[3] = step1[3];
- step2[4] = step1[4];
- step2[5] = step1[5];
- step2[6] = step1[6];
- step2[7] = step1[7];
-
- temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
- temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
- step2[8] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
- step2[15] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
-
- temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
- temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
- step2[9] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
- step2[14] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
-
- temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
- temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
- step2[10] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
- step2[13] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
-
- temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
- temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
- step2[11] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
- step2[12] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
-
- step2[16] = HIGHBD_WRAPLOW(step1[16] + step1[17], bd);
- step2[17] = HIGHBD_WRAPLOW(step1[16] - step1[17], bd);
- step2[18] = HIGHBD_WRAPLOW(-step1[18] + step1[19], bd);
- step2[19] = HIGHBD_WRAPLOW(step1[18] + step1[19], bd);
- step2[20] = HIGHBD_WRAPLOW(step1[20] + step1[21], bd);
- step2[21] = HIGHBD_WRAPLOW(step1[20] - step1[21], bd);
- step2[22] = HIGHBD_WRAPLOW(-step1[22] + step1[23], bd);
- step2[23] = HIGHBD_WRAPLOW(step1[22] + step1[23], bd);
- step2[24] = HIGHBD_WRAPLOW(step1[24] + step1[25], bd);
- step2[25] = HIGHBD_WRAPLOW(step1[24] - step1[25], bd);
- step2[26] = HIGHBD_WRAPLOW(-step1[26] + step1[27], bd);
- step2[27] = HIGHBD_WRAPLOW(step1[26] + step1[27], bd);
- step2[28] = HIGHBD_WRAPLOW(step1[28] + step1[29], bd);
- step2[29] = HIGHBD_WRAPLOW(step1[28] - step1[29], bd);
- step2[30] = HIGHBD_WRAPLOW(-step1[30] + step1[31], bd);
- step2[31] = HIGHBD_WRAPLOW(step1[30] + step1[31], bd);
-
- // stage 3
- step1[0] = step2[0];
- step1[1] = step2[1];
- step1[2] = step2[2];
- step1[3] = step2[3];
-
- temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
- temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
- step1[4] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
- step1[7] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
- temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
- temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
- step1[5] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
- step1[6] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
-
- step1[8] = HIGHBD_WRAPLOW(step2[8] + step2[9], bd);
- step1[9] = HIGHBD_WRAPLOW(step2[8] - step2[9], bd);
- step1[10] = HIGHBD_WRAPLOW(-step2[10] + step2[11], bd);
- step1[11] = HIGHBD_WRAPLOW(step2[10] + step2[11], bd);
- step1[12] = HIGHBD_WRAPLOW(step2[12] + step2[13], bd);
- step1[13] = HIGHBD_WRAPLOW(step2[12] - step2[13], bd);
- step1[14] = HIGHBD_WRAPLOW(-step2[14] + step2[15], bd);
- step1[15] = HIGHBD_WRAPLOW(step2[14] + step2[15], bd);
-
- step1[16] = step2[16];
- step1[31] = step2[31];
- temp1 = -step2[17] * cospi_4_64 + step2[30] * cospi_28_64;
- temp2 = step2[17] * cospi_28_64 + step2[30] * cospi_4_64;
- step1[17] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
- step1[30] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
- temp1 = -step2[18] * cospi_28_64 - step2[29] * cospi_4_64;
- temp2 = -step2[18] * cospi_4_64 + step2[29] * cospi_28_64;
- step1[18] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
- step1[29] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
- step1[19] = step2[19];
- step1[20] = step2[20];
- temp1 = -step2[21] * cospi_20_64 + step2[26] * cospi_12_64;
- temp2 = step2[21] * cospi_12_64 + step2[26] * cospi_20_64;
- step1[21] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
- step1[26] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
- temp1 = -step2[22] * cospi_12_64 - step2[25] * cospi_20_64;
- temp2 = -step2[22] * cospi_20_64 + step2[25] * cospi_12_64;
- step1[22] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
- step1[25] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
- step1[23] = step2[23];
- step1[24] = step2[24];
- step1[27] = step2[27];
- step1[28] = step2[28];
-
- // stage 4
- temp1 = (step1[0] + step1[1]) * cospi_16_64;
- temp2 = (step1[0] - step1[1]) * cospi_16_64;
- step2[0] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
- step2[1] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
- temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
- temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
- step2[2] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
- step2[3] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
- step2[4] = HIGHBD_WRAPLOW(step1[4] + step1[5], bd);
- step2[5] = HIGHBD_WRAPLOW(step1[4] - step1[5], bd);
- step2[6] = HIGHBD_WRAPLOW(-step1[6] + step1[7], bd);
- step2[7] = HIGHBD_WRAPLOW(step1[6] + step1[7], bd);
-
- step2[8] = step1[8];
- step2[15] = step1[15];
- temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
- temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
- step2[9] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
- step2[14] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
- temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
- temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
- step2[10] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
- step2[13] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
- step2[11] = step1[11];
- step2[12] = step1[12];
-
- step2[16] = HIGHBD_WRAPLOW(step1[16] + step1[19], bd);
- step2[17] = HIGHBD_WRAPLOW(step1[17] + step1[18], bd);
- step2[18] = HIGHBD_WRAPLOW(step1[17] - step1[18], bd);
- step2[19] = HIGHBD_WRAPLOW(step1[16] - step1[19], bd);
- step2[20] = HIGHBD_WRAPLOW(-step1[20] + step1[23], bd);
- step2[21] = HIGHBD_WRAPLOW(-step1[21] + step1[22], bd);
- step2[22] = HIGHBD_WRAPLOW(step1[21] + step1[22], bd);
- step2[23] = HIGHBD_WRAPLOW(step1[20] + step1[23], bd);
-
- step2[24] = HIGHBD_WRAPLOW(step1[24] + step1[27], bd);
- step2[25] = HIGHBD_WRAPLOW(step1[25] + step1[26], bd);
- step2[26] = HIGHBD_WRAPLOW(step1[25] - step1[26], bd);
- step2[27] = HIGHBD_WRAPLOW(step1[24] - step1[27], bd);
- step2[28] = HIGHBD_WRAPLOW(-step1[28] + step1[31], bd);
- step2[29] = HIGHBD_WRAPLOW(-step1[29] + step1[30], bd);
- step2[30] = HIGHBD_WRAPLOW(step1[29] + step1[30], bd);
- step2[31] = HIGHBD_WRAPLOW(step1[28] + step1[31], bd);
-
- // stage 5
- step1[0] = HIGHBD_WRAPLOW(step2[0] + step2[3], bd);
- step1[1] = HIGHBD_WRAPLOW(step2[1] + step2[2], bd);
- step1[2] = HIGHBD_WRAPLOW(step2[1] - step2[2], bd);
- step1[3] = HIGHBD_WRAPLOW(step2[0] - step2[3], bd);
- step1[4] = step2[4];
- temp1 = (step2[6] - step2[5]) * cospi_16_64;
- temp2 = (step2[5] + step2[6]) * cospi_16_64;
- step1[5] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
- step1[6] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
- step1[7] = step2[7];
-
- step1[8] = HIGHBD_WRAPLOW(step2[8] + step2[11], bd);
- step1[9] = HIGHBD_WRAPLOW(step2[9] + step2[10], bd);
- step1[10] = HIGHBD_WRAPLOW(step2[9] - step2[10], bd);
- step1[11] = HIGHBD_WRAPLOW(step2[8] - step2[11], bd);
- step1[12] = HIGHBD_WRAPLOW(-step2[12] + step2[15], bd);
- step1[13] = HIGHBD_WRAPLOW(-step2[13] + step2[14], bd);
- step1[14] = HIGHBD_WRAPLOW(step2[13] + step2[14], bd);
- step1[15] = HIGHBD_WRAPLOW(step2[12] + step2[15], bd);
-
- step1[16] = step2[16];
- step1[17] = step2[17];
- temp1 = -step2[18] * cospi_8_64 + step2[29] * cospi_24_64;
- temp2 = step2[18] * cospi_24_64 + step2[29] * cospi_8_64;
- step1[18] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
- step1[29] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
- temp1 = -step2[19] * cospi_8_64 + step2[28] * cospi_24_64;
- temp2 = step2[19] * cospi_24_64 + step2[28] * cospi_8_64;
- step1[19] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
- step1[28] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
- temp1 = -step2[20] * cospi_24_64 - step2[27] * cospi_8_64;
- temp2 = -step2[20] * cospi_8_64 + step2[27] * cospi_24_64;
- step1[20] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
- step1[27] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
- temp1 = -step2[21] * cospi_24_64 - step2[26] * cospi_8_64;
- temp2 = -step2[21] * cospi_8_64 + step2[26] * cospi_24_64;
- step1[21] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
- step1[26] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
- step1[22] = step2[22];
- step1[23] = step2[23];
- step1[24] = step2[24];
- step1[25] = step2[25];
- step1[30] = step2[30];
- step1[31] = step2[31];
-
- // stage 6
- step2[0] = HIGHBD_WRAPLOW(step1[0] + step1[7], bd);
- step2[1] = HIGHBD_WRAPLOW(step1[1] + step1[6], bd);
- step2[2] = HIGHBD_WRAPLOW(step1[2] + step1[5], bd);
- step2[3] = HIGHBD_WRAPLOW(step1[3] + step1[4], bd);
- step2[4] = HIGHBD_WRAPLOW(step1[3] - step1[4], bd);
- step2[5] = HIGHBD_WRAPLOW(step1[2] - step1[5], bd);
- step2[6] = HIGHBD_WRAPLOW(step1[1] - step1[6], bd);
- step2[7] = HIGHBD_WRAPLOW(step1[0] - step1[7], bd);
- step2[8] = step1[8];
- step2[9] = step1[9];
- temp1 = (-step1[10] + step1[13]) * cospi_16_64;
- temp2 = (step1[10] + step1[13]) * cospi_16_64;
- step2[10] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
- step2[13] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
- temp1 = (-step1[11] + step1[12]) * cospi_16_64;
- temp2 = (step1[11] + step1[12]) * cospi_16_64;
- step2[11] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
- step2[12] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
- step2[14] = step1[14];
- step2[15] = step1[15];
-
- step2[16] = HIGHBD_WRAPLOW(step1[16] + step1[23], bd);
- step2[17] = HIGHBD_WRAPLOW(step1[17] + step1[22], bd);
- step2[18] = HIGHBD_WRAPLOW(step1[18] + step1[21], bd);
- step2[19] = HIGHBD_WRAPLOW(step1[19] + step1[20], bd);
- step2[20] = HIGHBD_WRAPLOW(step1[19] - step1[20], bd);
- step2[21] = HIGHBD_WRAPLOW(step1[18] - step1[21], bd);
- step2[22] = HIGHBD_WRAPLOW(step1[17] - step1[22], bd);
- step2[23] = HIGHBD_WRAPLOW(step1[16] - step1[23], bd);
-
- step2[24] = HIGHBD_WRAPLOW(-step1[24] + step1[31], bd);
- step2[25] = HIGHBD_WRAPLOW(-step1[25] + step1[30], bd);
- step2[26] = HIGHBD_WRAPLOW(-step1[26] + step1[29], bd);
- step2[27] = HIGHBD_WRAPLOW(-step1[27] + step1[28], bd);
- step2[28] = HIGHBD_WRAPLOW(step1[27] + step1[28], bd);
- step2[29] = HIGHBD_WRAPLOW(step1[26] + step1[29], bd);
- step2[30] = HIGHBD_WRAPLOW(step1[25] + step1[30], bd);
- step2[31] = HIGHBD_WRAPLOW(step1[24] + step1[31], bd);
-
- // stage 7
- step1[0] = HIGHBD_WRAPLOW(step2[0] + step2[15], bd);
- step1[1] = HIGHBD_WRAPLOW(step2[1] + step2[14], bd);
- step1[2] = HIGHBD_WRAPLOW(step2[2] + step2[13], bd);
- step1[3] = HIGHBD_WRAPLOW(step2[3] + step2[12], bd);
- step1[4] = HIGHBD_WRAPLOW(step2[4] + step2[11], bd);
- step1[5] = HIGHBD_WRAPLOW(step2[5] + step2[10], bd);
- step1[6] = HIGHBD_WRAPLOW(step2[6] + step2[9], bd);
- step1[7] = HIGHBD_WRAPLOW(step2[7] + step2[8], bd);
- step1[8] = HIGHBD_WRAPLOW(step2[7] - step2[8], bd);
- step1[9] = HIGHBD_WRAPLOW(step2[6] - step2[9], bd);
- step1[10] = HIGHBD_WRAPLOW(step2[5] - step2[10], bd);
- step1[11] = HIGHBD_WRAPLOW(step2[4] - step2[11], bd);
- step1[12] = HIGHBD_WRAPLOW(step2[3] - step2[12], bd);
- step1[13] = HIGHBD_WRAPLOW(step2[2] - step2[13], bd);
- step1[14] = HIGHBD_WRAPLOW(step2[1] - step2[14], bd);
- step1[15] = HIGHBD_WRAPLOW(step2[0] - step2[15], bd);
-
- step1[16] = step2[16];
- step1[17] = step2[17];
- step1[18] = step2[18];
- step1[19] = step2[19];
- temp1 = (-step2[20] + step2[27]) * cospi_16_64;
- temp2 = (step2[20] + step2[27]) * cospi_16_64;
- step1[20] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
- step1[27] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
- temp1 = (-step2[21] + step2[26]) * cospi_16_64;
- temp2 = (step2[21] + step2[26]) * cospi_16_64;
- step1[21] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
- step1[26] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
- temp1 = (-step2[22] + step2[25]) * cospi_16_64;
- temp2 = (step2[22] + step2[25]) * cospi_16_64;
- step1[22] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
- step1[25] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
- temp1 = (-step2[23] + step2[24]) * cospi_16_64;
- temp2 = (step2[23] + step2[24]) * cospi_16_64;
- step1[23] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
- step1[24] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
- step1[28] = step2[28];
- step1[29] = step2[29];
- step1[30] = step2[30];
- step1[31] = step2[31];
-
- // final stage
- output[0] = HIGHBD_WRAPLOW(step1[0] + step1[31], bd);
- output[1] = HIGHBD_WRAPLOW(step1[1] + step1[30], bd);
- output[2] = HIGHBD_WRAPLOW(step1[2] + step1[29], bd);
- output[3] = HIGHBD_WRAPLOW(step1[3] + step1[28], bd);
- output[4] = HIGHBD_WRAPLOW(step1[4] + step1[27], bd);
- output[5] = HIGHBD_WRAPLOW(step1[5] + step1[26], bd);
- output[6] = HIGHBD_WRAPLOW(step1[6] + step1[25], bd);
- output[7] = HIGHBD_WRAPLOW(step1[7] + step1[24], bd);
- output[8] = HIGHBD_WRAPLOW(step1[8] + step1[23], bd);
- output[9] = HIGHBD_WRAPLOW(step1[9] + step1[22], bd);
- output[10] = HIGHBD_WRAPLOW(step1[10] + step1[21], bd);
- output[11] = HIGHBD_WRAPLOW(step1[11] + step1[20], bd);
- output[12] = HIGHBD_WRAPLOW(step1[12] + step1[19], bd);
- output[13] = HIGHBD_WRAPLOW(step1[13] + step1[18], bd);
- output[14] = HIGHBD_WRAPLOW(step1[14] + step1[17], bd);
- output[15] = HIGHBD_WRAPLOW(step1[15] + step1[16], bd);
- output[16] = HIGHBD_WRAPLOW(step1[15] - step1[16], bd);
- output[17] = HIGHBD_WRAPLOW(step1[14] - step1[17], bd);
- output[18] = HIGHBD_WRAPLOW(step1[13] - step1[18], bd);
- output[19] = HIGHBD_WRAPLOW(step1[12] - step1[19], bd);
- output[20] = HIGHBD_WRAPLOW(step1[11] - step1[20], bd);
- output[21] = HIGHBD_WRAPLOW(step1[10] - step1[21], bd);
- output[22] = HIGHBD_WRAPLOW(step1[9] - step1[22], bd);
- output[23] = HIGHBD_WRAPLOW(step1[8] - step1[23], bd);
- output[24] = HIGHBD_WRAPLOW(step1[7] - step1[24], bd);
- output[25] = HIGHBD_WRAPLOW(step1[6] - step1[25], bd);
- output[26] = HIGHBD_WRAPLOW(step1[5] - step1[26], bd);
- output[27] = HIGHBD_WRAPLOW(step1[4] - step1[27], bd);
- output[28] = HIGHBD_WRAPLOW(step1[3] - step1[28], bd);
- output[29] = HIGHBD_WRAPLOW(step1[2] - step1[29], bd);
- output[30] = HIGHBD_WRAPLOW(step1[1] - step1[30], bd);
- output[31] = HIGHBD_WRAPLOW(step1[0] - step1[31], bd);
-}
-
-void vpx_highbd_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest8,
- int stride, int bd) {
- tran_low_t out[32 * 32];
- tran_low_t *outptr = out;
- int i, j;
- tran_low_t temp_in[32], temp_out[32];
- uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
-
- // Rows
- for (i = 0; i < 32; ++i) {
- tran_low_t zero_coeff[16];
- for (j = 0; j < 16; ++j)
- zero_coeff[j] = input[2 * j] | input[2 * j + 1];
- for (j = 0; j < 8; ++j)
- zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
- for (j = 0; j < 4; ++j)
- zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
- for (j = 0; j < 2; ++j)
- zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
-
- if (zero_coeff[0] | zero_coeff[1])
- highbd_idct32_c(input, outptr, bd);
- else
- memset(outptr, 0, sizeof(tran_low_t) * 32);
- input += 32;
- outptr += 32;
- }
-
- // Columns
- for (i = 0; i < 32; ++i) {
- for (j = 0; j < 32; ++j)
- temp_in[j] = out[j * 32 + i];
- highbd_idct32_c(temp_in, temp_out, bd);
- for (j = 0; j < 32; ++j) {
- dest[j * stride + i] = highbd_clip_pixel_add(
- dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
- }
- }
-}
-
-void vpx_highbd_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest8,
- int stride, int bd) {
- tran_low_t out[32 * 32] = {0};
- tran_low_t *outptr = out;
- int i, j;
- tran_low_t temp_in[32], temp_out[32];
- uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
-
- // Rows
- // Only upper-left 8x8 has non-zero coeff.
- for (i = 0; i < 8; ++i) {
- highbd_idct32_c(input, outptr, bd);
- input += 32;
- outptr += 32;
- }
- // Columns
- for (i = 0; i < 32; ++i) {
- for (j = 0; j < 32; ++j)
- temp_in[j] = out[j * 32 + i];
- highbd_idct32_c(temp_in, temp_out, bd);
- for (j = 0; j < 32; ++j) {
- dest[j * stride + i] = highbd_clip_pixel_add(
- dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
- }
- }
-}
-
-void vpx_highbd_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest8,
- int stride, int bd) {
- int i, j;
- int a1;
- uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
-
- tran_low_t out = HIGHBD_WRAPLOW(
- highbd_dct_const_round_shift(input[0] * cospi_16_64), bd);
- out = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(out * cospi_16_64), bd);
- a1 = ROUND_POWER_OF_TWO(out, 6);
-
- for (j = 0; j < 32; ++j) {
- for (i = 0; i < 32; ++i)
- dest[i] = highbd_clip_pixel_add(dest[i], a1, bd);
- dest += stride;
- }
-}
-#endif // CONFIG_VP9_HIGHBITDEPTH
diff --git a/thirdparty/libvpx/vpx_dsp/inv_txfm.h b/thirdparty/libvpx/vpx_dsp/inv_txfm.h
deleted file mode 100644
index 9cfe1be3a7..0000000000
--- a/thirdparty/libvpx/vpx_dsp/inv_txfm.h
+++ /dev/null
@@ -1,133 +0,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#ifndef VPX_DSP_INV_TXFM_H_
-#define VPX_DSP_INV_TXFM_H_
-
-#include <assert.h>
-
-#include "./vpx_config.h"
-#include "vpx_dsp/txfm_common.h"
-#include "vpx_ports/mem.h"
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-static INLINE tran_high_t check_range(tran_high_t input) {
-#if CONFIG_COEFFICIENT_RANGE_CHECKING
- // For valid VP9 input streams, intermediate stage coefficients should always
- // stay within the range of a signed 16 bit integer. Coefficients can go out
- // of this range for invalid/corrupt VP9 streams. However, strictly checking
- // this range for every intermediate coefficient can burdensome for a decoder,
- // therefore the following assertion is only enabled when configured with
- // --enable-coefficient-range-checking.
- assert(INT16_MIN <= input);
- assert(input <= INT16_MAX);
-#endif // CONFIG_COEFFICIENT_RANGE_CHECKING
- return input;
-}
-
-static INLINE tran_high_t dct_const_round_shift(tran_high_t input) {
- tran_high_t rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS);
- return (tran_high_t)rv;
-}
-
-#if CONFIG_VP9_HIGHBITDEPTH
-static INLINE tran_high_t highbd_check_range(tran_high_t input,
- int bd) {
-#if CONFIG_COEFFICIENT_RANGE_CHECKING
- // For valid highbitdepth VP9 streams, intermediate stage coefficients will
- // stay within the ranges:
- // - 8 bit: signed 16 bit integer
- // - 10 bit: signed 18 bit integer
- // - 12 bit: signed 20 bit integer
- const int32_t int_max = (1 << (7 + bd)) - 1;
- const int32_t int_min = -int_max - 1;
- assert(int_min <= input);
- assert(input <= int_max);
- (void) int_min;
-#endif // CONFIG_COEFFICIENT_RANGE_CHECKING
- (void) bd;
- return input;
-}
-
-static INLINE tran_high_t highbd_dct_const_round_shift(tran_high_t input) {
- tran_high_t rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS);
- return (tran_high_t)rv;
-}
-#endif // CONFIG_VP9_HIGHBITDEPTH
-
-#if CONFIG_EMULATE_HARDWARE
-// When CONFIG_EMULATE_HARDWARE is 1 the transform performs a
-// non-normative method to handle overflows. A stream that causes
-// overflows in the inverse transform is considered invalid in VP9,
-// and a hardware implementer is free to choose any reasonable
-// method to handle overflows. However to aid in hardware
-// verification they can use a specific implementation of the
-// WRAPLOW() macro below that is identical to their intended
-// hardware implementation (and also use configure options to trigger
-// the C-implementation of the transform).
-//
-// The particular WRAPLOW implementation below performs strict
-// overflow wrapping to match common hardware implementations.
-// bd of 8 uses trans_low with 16bits, need to remove 16bits
-// bd of 10 uses trans_low with 18bits, need to remove 14bits
-// bd of 12 uses trans_low with 20bits, need to remove 12bits
-// bd of x uses trans_low with 8+x bits, need to remove 24-x bits
-
-#define WRAPLOW(x) ((((int32_t)check_range(x)) << 16) >> 16)
-#if CONFIG_VP9_HIGHBITDEPTH
-#define HIGHBD_WRAPLOW(x, bd) \
- ((((int32_t)highbd_check_range((x), bd)) << (24 - bd)) >> (24 - bd))
-#endif // CONFIG_VP9_HIGHBITDEPTH
-
-#else // CONFIG_EMULATE_HARDWARE
-
-#define WRAPLOW(x) ((int32_t)check_range(x))
-#if CONFIG_VP9_HIGHBITDEPTH
-#define HIGHBD_WRAPLOW(x, bd) \
- ((int32_t)highbd_check_range((x), bd))
-#endif // CONFIG_VP9_HIGHBITDEPTH
-#endif // CONFIG_EMULATE_HARDWARE
-
-void idct4_c(const tran_low_t *input, tran_low_t *output);
-void idct8_c(const tran_low_t *input, tran_low_t *output);
-void idct16_c(const tran_low_t *input, tran_low_t *output);
-void idct32_c(const tran_low_t *input, tran_low_t *output);
-void iadst4_c(const tran_low_t *input, tran_low_t *output);
-void iadst8_c(const tran_low_t *input, tran_low_t *output);
-void iadst16_c(const tran_low_t *input, tran_low_t *output);
-
-#if CONFIG_VP9_HIGHBITDEPTH
-void vpx_highbd_idct4_c(const tran_low_t *input, tran_low_t *output, int bd);
-void vpx_highbd_idct8_c(const tran_low_t *input, tran_low_t *output, int bd);
-void vpx_highbd_idct16_c(const tran_low_t *input, tran_low_t *output, int bd);
-
-void vpx_highbd_iadst4_c(const tran_low_t *input, tran_low_t *output, int bd);
-void vpx_highbd_iadst8_c(const tran_low_t *input, tran_low_t *output, int bd);
-void vpx_highbd_iadst16_c(const tran_low_t *input, tran_low_t *output, int bd);
-
-static INLINE uint16_t highbd_clip_pixel_add(uint16_t dest, tran_high_t trans,
- int bd) {
- trans = HIGHBD_WRAPLOW(trans, bd);
- return clip_pixel_highbd(dest + (int)trans, bd);
-}
-#endif
-
-static INLINE uint8_t clip_pixel_add(uint8_t dest, tran_high_t trans) {
- trans = WRAPLOW(trans);
- return clip_pixel(dest + (int)trans);
-}
-#ifdef __cplusplus
-} // extern "C"
-#endif
-
-#endif // VPX_DSP_INV_TXFM_H_
diff --git a/thirdparty/libvpx/vpx_dsp/loopfilter.c b/thirdparty/libvpx/vpx_dsp/loopfilter.c
deleted file mode 100644
index 645a1ab95e..0000000000
--- a/thirdparty/libvpx/vpx_dsp/loopfilter.c
+++ /dev/null
@@ -1,767 +0,0 @@
-/*
- * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include <stdlib.h>
-
-#include "./vpx_config.h"
-#include "./vpx_dsp_rtcd.h"
-#include "vpx_dsp/vpx_dsp_common.h"
-#include "vpx_ports/mem.h"
-
-static INLINE int8_t signed_char_clamp(int t) {
- return (int8_t)clamp(t, -128, 127);
-}
-
-#if CONFIG_VP9_HIGHBITDEPTH
-static INLINE int16_t signed_char_clamp_high(int t, int bd) {
- switch (bd) {
- case 10:
- return (int16_t)clamp(t, -128*4, 128*4-1);
- case 12:
- return (int16_t)clamp(t, -128*16, 128*16-1);
- case 8:
- default:
- return (int16_t)clamp(t, -128, 128-1);
- }
-}
-#endif
-
-// should we apply any filter at all: 11111111 yes, 00000000 no
-static INLINE int8_t filter_mask(uint8_t limit, uint8_t blimit,
- uint8_t p3, uint8_t p2,
- uint8_t p1, uint8_t p0,
- uint8_t q0, uint8_t q1,
- uint8_t q2, uint8_t q3) {
- int8_t mask = 0;
- mask |= (abs(p3 - p2) > limit) * -1;
- mask |= (abs(p2 - p1) > limit) * -1;
- mask |= (abs(p1 - p0) > limit) * -1;
- mask |= (abs(q1 - q0) > limit) * -1;
- mask |= (abs(q2 - q1) > limit) * -1;
- mask |= (abs(q3 - q2) > limit) * -1;
- mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
- return ~mask;
-}
-
-static INLINE int8_t flat_mask4(uint8_t thresh,
- uint8_t p3, uint8_t p2,
- uint8_t p1, uint8_t p0,
- uint8_t q0, uint8_t q1,
- uint8_t q2, uint8_t q3) {
- int8_t mask = 0;
- mask |= (abs(p1 - p0) > thresh) * -1;
- mask |= (abs(q1 - q0) > thresh) * -1;
- mask |= (abs(p2 - p0) > thresh) * -1;
- mask |= (abs(q2 - q0) > thresh) * -1;
- mask |= (abs(p3 - p0) > thresh) * -1;
- mask |= (abs(q3 - q0) > thresh) * -1;
- return ~mask;
-}
-
-static INLINE int8_t flat_mask5(uint8_t thresh,
- uint8_t p4, uint8_t p3,
- uint8_t p2, uint8_t p1,
- uint8_t p0, uint8_t q0,
- uint8_t q1, uint8_t q2,
- uint8_t q3, uint8_t q4) {
- int8_t mask = ~flat_mask4(thresh, p3, p2, p1, p0, q0, q1, q2, q3);
- mask |= (abs(p4 - p0) > thresh) * -1;
- mask |= (abs(q4 - q0) > thresh) * -1;
- return ~mask;
-}
-
-// is there high edge variance internal edge: 11111111 yes, 00000000 no
-static INLINE int8_t hev_mask(uint8_t thresh, uint8_t p1, uint8_t p0,
- uint8_t q0, uint8_t q1) {
- int8_t hev = 0;
- hev |= (abs(p1 - p0) > thresh) * -1;
- hev |= (abs(q1 - q0) > thresh) * -1;
- return hev;
-}
-
-static INLINE void filter4(int8_t mask, uint8_t thresh, uint8_t *op1,
- uint8_t *op0, uint8_t *oq0, uint8_t *oq1) {
- int8_t filter1, filter2;
-
- const int8_t ps1 = (int8_t) *op1 ^ 0x80;
- const int8_t ps0 = (int8_t) *op0 ^ 0x80;
- const int8_t qs0 = (int8_t) *oq0 ^ 0x80;
- const int8_t qs1 = (int8_t) *oq1 ^ 0x80;
- const uint8_t hev = hev_mask(thresh, *op1, *op0, *oq0, *oq1);
-
- // add outer taps if we have high edge variance
- int8_t filter = signed_char_clamp(ps1 - qs1) & hev;
-
- // inner taps
- filter = signed_char_clamp(filter + 3 * (qs0 - ps0)) & mask;
-
- // save bottom 3 bits so that we round one side +4 and the other +3
- // if it equals 4 we'll set to adjust by -1 to account for the fact
- // we'd round 3 the other way
- filter1 = signed_char_clamp(filter + 4) >> 3;
- filter2 = signed_char_clamp(filter + 3) >> 3;
-
- *oq0 = signed_char_clamp(qs0 - filter1) ^ 0x80;
- *op0 = signed_char_clamp(ps0 + filter2) ^ 0x80;
-
- // outer tap adjustments
- filter = ROUND_POWER_OF_TWO(filter1, 1) & ~hev;
-
- *oq1 = signed_char_clamp(qs1 - filter) ^ 0x80;
- *op1 = signed_char_clamp(ps1 + filter) ^ 0x80;
-}
-
-void vpx_lpf_horizontal_4_c(uint8_t *s, int p /* pitch */,
- const uint8_t *blimit, const uint8_t *limit,
- const uint8_t *thresh) {
- int i;
-
- // loop filter designed to work using chars so that we can make maximum use
- // of 8 bit simd instructions.
- for (i = 0; i < 8; ++i) {
- const uint8_t p3 = s[-4 * p], p2 = s[-3 * p], p1 = s[-2 * p], p0 = s[-p];
- const uint8_t q0 = s[0 * p], q1 = s[1 * p], q2 = s[2 * p], q3 = s[3 * p];
- const int8_t mask = filter_mask(*limit, *blimit,
- p3, p2, p1, p0, q0, q1, q2, q3);
- filter4(mask, *thresh, s - 2 * p, s - 1 * p, s, s + 1 * p);
- ++s;
- }
-}
-
-void vpx_lpf_horizontal_4_dual_c(uint8_t *s, int p, const uint8_t *blimit0,
- const uint8_t *limit0, const uint8_t *thresh0,
- const uint8_t *blimit1, const uint8_t *limit1,
- const uint8_t *thresh1) {
- vpx_lpf_horizontal_4_c(s, p, blimit0, limit0, thresh0);
- vpx_lpf_horizontal_4_c(s + 8, p, blimit1, limit1, thresh1);
-}
-
-void vpx_lpf_vertical_4_c(uint8_t *s, int pitch, const uint8_t *blimit,
- const uint8_t *limit, const uint8_t *thresh) {
- int i;
-
- // loop filter designed to work using chars so that we can make maximum use
- // of 8 bit simd instructions.
- for (i = 0; i < 8; ++i) {
- const uint8_t p3 = s[-4], p2 = s[-3], p1 = s[-2], p0 = s[-1];
- const uint8_t q0 = s[0], q1 = s[1], q2 = s[2], q3 = s[3];
- const int8_t mask = filter_mask(*limit, *blimit,
- p3, p2, p1, p0, q0, q1, q2, q3);
- filter4(mask, *thresh, s - 2, s - 1, s, s + 1);
- s += pitch;
- }
-}
-
-void vpx_lpf_vertical_4_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0,
- const uint8_t *limit0, const uint8_t *thresh0,
- const uint8_t *blimit1, const uint8_t *limit1,
- const uint8_t *thresh1) {
- vpx_lpf_vertical_4_c(s, pitch, blimit0, limit0, thresh0);
- vpx_lpf_vertical_4_c(s + 8 * pitch, pitch, blimit1, limit1, thresh1);
-}
-
-static INLINE void filter8(int8_t mask, uint8_t thresh, uint8_t flat,
- uint8_t *op3, uint8_t *op2,
- uint8_t *op1, uint8_t *op0,
- uint8_t *oq0, uint8_t *oq1,
- uint8_t *oq2, uint8_t *oq3) {
- if (flat && mask) {
- const uint8_t p3 = *op3, p2 = *op2, p1 = *op1, p0 = *op0;
- const uint8_t q0 = *oq0, q1 = *oq1, q2 = *oq2, q3 = *oq3;
-
- // 7-tap filter [1, 1, 1, 2, 1, 1, 1]
- *op2 = ROUND_POWER_OF_TWO(p3 + p3 + p3 + 2 * p2 + p1 + p0 + q0, 3);
- *op1 = ROUND_POWER_OF_TWO(p3 + p3 + p2 + 2 * p1 + p0 + q0 + q1, 3);
- *op0 = ROUND_POWER_OF_TWO(p3 + p2 + p1 + 2 * p0 + q0 + q1 + q2, 3);
- *oq0 = ROUND_POWER_OF_TWO(p2 + p1 + p0 + 2 * q0 + q1 + q2 + q3, 3);
- *oq1 = ROUND_POWER_OF_TWO(p1 + p0 + q0 + 2 * q1 + q2 + q3 + q3, 3);
- *oq2 = ROUND_POWER_OF_TWO(p0 + q0 + q1 + 2 * q2 + q3 + q3 + q3, 3);
- } else {
- filter4(mask, thresh, op1, op0, oq0, oq1);
- }
-}
-
-void vpx_lpf_horizontal_8_c(uint8_t *s, int p, const uint8_t *blimit,
- const uint8_t *limit, const uint8_t *thresh) {
- int i;
-
- // loop filter designed to work using chars so that we can make maximum use
- // of 8 bit simd instructions.
- for (i = 0; i < 8; ++i) {
- const uint8_t p3 = s[-4 * p], p2 = s[-3 * p], p1 = s[-2 * p], p0 = s[-p];
- const uint8_t q0 = s[0 * p], q1 = s[1 * p], q2 = s[2 * p], q3 = s[3 * p];
-
- const int8_t mask = filter_mask(*limit, *blimit,
- p3, p2, p1, p0, q0, q1, q2, q3);
- const int8_t flat = flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3);
- filter8(mask, *thresh, flat, s - 4 * p, s - 3 * p, s - 2 * p, s - 1 * p,
- s, s + 1 * p, s + 2 * p, s + 3 * p);
- ++s;
- }
-}
-
-void vpx_lpf_horizontal_8_dual_c(uint8_t *s, int p, const uint8_t *blimit0,
- const uint8_t *limit0, const uint8_t *thresh0,
- const uint8_t *blimit1, const uint8_t *limit1,
- const uint8_t *thresh1) {
- vpx_lpf_horizontal_8_c(s, p, blimit0, limit0, thresh0);
- vpx_lpf_horizontal_8_c(s + 8, p, blimit1, limit1, thresh1);
-}
-
-void vpx_lpf_vertical_8_c(uint8_t *s, int pitch, const uint8_t *blimit,
- const uint8_t *limit, const uint8_t *thresh) {
- int i;
-
- for (i = 0; i < 8; ++i) {
- const uint8_t p3 = s[-4], p2 = s[-3], p1 = s[-2], p0 = s[-1];
- const uint8_t q0 = s[0], q1 = s[1], q2 = s[2], q3 = s[3];
- const int8_t mask = filter_mask(*limit, *blimit,
- p3, p2, p1, p0, q0, q1, q2, q3);
- const int8_t flat = flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3);
- filter8(mask, *thresh, flat, s - 4, s - 3, s - 2, s - 1,
- s, s + 1, s + 2, s + 3);
- s += pitch;
- }
-}
-
-void vpx_lpf_vertical_8_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0,
- const uint8_t *limit0, const uint8_t *thresh0,
- const uint8_t *blimit1, const uint8_t *limit1,
- const uint8_t *thresh1) {
- vpx_lpf_vertical_8_c(s, pitch, blimit0, limit0, thresh0);
- vpx_lpf_vertical_8_c(s + 8 * pitch, pitch, blimit1, limit1, thresh1);
-}
-
-static INLINE void filter16(int8_t mask, uint8_t thresh,
- uint8_t flat, uint8_t flat2,
- uint8_t *op7, uint8_t *op6,
- uint8_t *op5, uint8_t *op4,
- uint8_t *op3, uint8_t *op2,
- uint8_t *op1, uint8_t *op0,
- uint8_t *oq0, uint8_t *oq1,
- uint8_t *oq2, uint8_t *oq3,
- uint8_t *oq4, uint8_t *oq5,
- uint8_t *oq6, uint8_t *oq7) {
- if (flat2 && flat && mask) {
- const uint8_t p7 = *op7, p6 = *op6, p5 = *op5, p4 = *op4,
- p3 = *op3, p2 = *op2, p1 = *op1, p0 = *op0;
-
- const uint8_t q0 = *oq0, q1 = *oq1, q2 = *oq2, q3 = *oq3,
- q4 = *oq4, q5 = *oq5, q6 = *oq6, q7 = *oq7;
-
- // 15-tap filter [1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1]
- *op6 = ROUND_POWER_OF_TWO(p7 * 7 + p6 * 2 + p5 + p4 + p3 + p2 + p1 + p0 +
- q0, 4);
- *op5 = ROUND_POWER_OF_TWO(p7 * 6 + p6 + p5 * 2 + p4 + p3 + p2 + p1 + p0 +
- q0 + q1, 4);
- *op4 = ROUND_POWER_OF_TWO(p7 * 5 + p6 + p5 + p4 * 2 + p3 + p2 + p1 + p0 +
- q0 + q1 + q2, 4);
- *op3 = ROUND_POWER_OF_TWO(p7 * 4 + p6 + p5 + p4 + p3 * 2 + p2 + p1 + p0 +
- q0 + q1 + q2 + q3, 4);
- *op2 = ROUND_POWER_OF_TWO(p7 * 3 + p6 + p5 + p4 + p3 + p2 * 2 + p1 + p0 +
- q0 + q1 + q2 + q3 + q4, 4);
- *op1 = ROUND_POWER_OF_TWO(p7 * 2 + p6 + p5 + p4 + p3 + p2 + p1 * 2 + p0 +
- q0 + q1 + q2 + q3 + q4 + q5, 4);
- *op0 = ROUND_POWER_OF_TWO(p7 + p6 + p5 + p4 + p3 + p2 + p1 + p0 * 2 +
- q0 + q1 + q2 + q3 + q4 + q5 + q6, 4);
- *oq0 = ROUND_POWER_OF_TWO(p6 + p5 + p4 + p3 + p2 + p1 + p0 +
- q0 * 2 + q1 + q2 + q3 + q4 + q5 + q6 + q7, 4);
- *oq1 = ROUND_POWER_OF_TWO(p5 + p4 + p3 + p2 + p1 + p0 +
- q0 + q1 * 2 + q2 + q3 + q4 + q5 + q6 + q7 * 2, 4);
- *oq2 = ROUND_POWER_OF_TWO(p4 + p3 + p2 + p1 + p0 +
- q0 + q1 + q2 * 2 + q3 + q4 + q5 + q6 + q7 * 3, 4);
- *oq3 = ROUND_POWER_OF_TWO(p3 + p2 + p1 + p0 +
- q0 + q1 + q2 + q3 * 2 + q4 + q5 + q6 + q7 * 4, 4);
- *oq4 = ROUND_POWER_OF_TWO(p2 + p1 + p0 +
- q0 + q1 + q2 + q3 + q4 * 2 + q5 + q6 + q7 * 5, 4);
- *oq5 = ROUND_POWER_OF_TWO(p1 + p0 +
- q0 + q1 + q2 + q3 + q4 + q5 * 2 + q6 + q7 * 6, 4);
- *oq6 = ROUND_POWER_OF_TWO(p0 +
- q0 + q1 + q2 + q3 + q4 + q5 + q6 * 2 + q7 * 7, 4);
- } else {
- filter8(mask, thresh, flat, op3, op2, op1, op0, oq0, oq1, oq2, oq3);
- }
-}
-
-static void mb_lpf_horizontal_edge_w(uint8_t *s, int p, const uint8_t *blimit,
- const uint8_t *limit,
- const uint8_t *thresh, int count) {
- int i;
-
- // loop filter designed to work using chars so that we can make maximum use
- // of 8 bit simd instructions.
- for (i = 0; i < 8 * count; ++i) {
- const uint8_t p3 = s[-4 * p], p2 = s[-3 * p], p1 = s[-2 * p], p0 = s[-p];
- const uint8_t q0 = s[0 * p], q1 = s[1 * p], q2 = s[2 * p], q3 = s[3 * p];
- const int8_t mask = filter_mask(*limit, *blimit,
- p3, p2, p1, p0, q0, q1, q2, q3);
- const int8_t flat = flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3);
- const int8_t flat2 = flat_mask5(1,
- s[-8 * p], s[-7 * p], s[-6 * p], s[-5 * p], p0,
- q0, s[4 * p], s[5 * p], s[6 * p], s[7 * p]);
-
- filter16(mask, *thresh, flat, flat2,
- s - 8 * p, s - 7 * p, s - 6 * p, s - 5 * p,
- s - 4 * p, s - 3 * p, s - 2 * p, s - 1 * p,
- s, s + 1 * p, s + 2 * p, s + 3 * p,
- s + 4 * p, s + 5 * p, s + 6 * p, s + 7 * p);
- ++s;
- }
-}
-
-void vpx_lpf_horizontal_edge_8_c(uint8_t *s, int p, const uint8_t *blimit,
- const uint8_t *limit, const uint8_t *thresh) {
- mb_lpf_horizontal_edge_w(s, p, blimit, limit, thresh, 1);
-}
-
-void vpx_lpf_horizontal_edge_16_c(uint8_t *s, int p, const uint8_t *blimit,
- const uint8_t *limit, const uint8_t *thresh) {
- mb_lpf_horizontal_edge_w(s, p, blimit, limit, thresh, 2);
-}
-
-static void mb_lpf_vertical_edge_w(uint8_t *s, int p,
- const uint8_t *blimit,
- const uint8_t *limit,
- const uint8_t *thresh,
- int count) {
- int i;
-
- for (i = 0; i < count; ++i) {
- const uint8_t p3 = s[-4], p2 = s[-3], p1 = s[-2], p0 = s[-1];
- const uint8_t q0 = s[0], q1 = s[1], q2 = s[2], q3 = s[3];
- const int8_t mask = filter_mask(*limit, *blimit,
- p3, p2, p1, p0, q0, q1, q2, q3);
- const int8_t flat = flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3);
- const int8_t flat2 = flat_mask5(1, s[-8], s[-7], s[-6], s[-5], p0,
- q0, s[4], s[5], s[6], s[7]);
-
- filter16(mask, *thresh, flat, flat2,
- s - 8, s - 7, s - 6, s - 5, s - 4, s - 3, s - 2, s - 1,
- s, s + 1, s + 2, s + 3, s + 4, s + 5, s + 6, s + 7);
- s += p;
- }
-}
-
-void vpx_lpf_vertical_16_c(uint8_t *s, int p, const uint8_t *blimit,
- const uint8_t *limit, const uint8_t *thresh) {
- mb_lpf_vertical_edge_w(s, p, blimit, limit, thresh, 8);
-}
-
-void vpx_lpf_vertical_16_dual_c(uint8_t *s, int p, const uint8_t *blimit,
- const uint8_t *limit, const uint8_t *thresh) {
- mb_lpf_vertical_edge_w(s, p, blimit, limit, thresh, 16);
-}
-
-#if CONFIG_VP9_HIGHBITDEPTH
-// Should we apply any filter at all: 11111111 yes, 00000000 no ?
-static INLINE int8_t highbd_filter_mask(uint8_t limit, uint8_t blimit,
- uint16_t p3, uint16_t p2,
- uint16_t p1, uint16_t p0,
- uint16_t q0, uint16_t q1,
- uint16_t q2, uint16_t q3, int bd) {
- int8_t mask = 0;
- int16_t limit16 = (uint16_t)limit << (bd - 8);
- int16_t blimit16 = (uint16_t)blimit << (bd - 8);
- mask |= (abs(p3 - p2) > limit16) * -1;
- mask |= (abs(p2 - p1) > limit16) * -1;
- mask |= (abs(p1 - p0) > limit16) * -1;
- mask |= (abs(q1 - q0) > limit16) * -1;
- mask |= (abs(q2 - q1) > limit16) * -1;
- mask |= (abs(q3 - q2) > limit16) * -1;
- mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit16) * -1;
- return ~mask;
-}
-
-static INLINE int8_t highbd_flat_mask4(uint8_t thresh,
- uint16_t p3, uint16_t p2,
- uint16_t p1, uint16_t p0,
- uint16_t q0, uint16_t q1,
- uint16_t q2, uint16_t q3, int bd) {
- int8_t mask = 0;
- int16_t thresh16 = (uint16_t)thresh << (bd - 8);
- mask |= (abs(p1 - p0) > thresh16) * -1;
- mask |= (abs(q1 - q0) > thresh16) * -1;
- mask |= (abs(p2 - p0) > thresh16) * -1;
- mask |= (abs(q2 - q0) > thresh16) * -1;
- mask |= (abs(p3 - p0) > thresh16) * -1;
- mask |= (abs(q3 - q0) > thresh16) * -1;
- return ~mask;
-}
-
-static INLINE int8_t highbd_flat_mask5(uint8_t thresh,
- uint16_t p4, uint16_t p3,
- uint16_t p2, uint16_t p1,
- uint16_t p0, uint16_t q0,
- uint16_t q1, uint16_t q2,
- uint16_t q3, uint16_t q4, int bd) {
- int8_t mask = ~highbd_flat_mask4(thresh, p3, p2, p1, p0, q0, q1, q2, q3, bd);
- int16_t thresh16 = (uint16_t)thresh << (bd - 8);
- mask |= (abs(p4 - p0) > thresh16) * -1;
- mask |= (abs(q4 - q0) > thresh16) * -1;
- return ~mask;
-}
-
-// Is there high edge variance internal edge:
-// 11111111_11111111 yes, 00000000_00000000 no ?
-static INLINE int16_t highbd_hev_mask(uint8_t thresh, uint16_t p1, uint16_t p0,
- uint16_t q0, uint16_t q1, int bd) {
- int16_t hev = 0;
- int16_t thresh16 = (uint16_t)thresh << (bd - 8);
- hev |= (abs(p1 - p0) > thresh16) * -1;
- hev |= (abs(q1 - q0) > thresh16) * -1;
- return hev;
-}
-
-static INLINE void highbd_filter4(int8_t mask, uint8_t thresh, uint16_t *op1,
- uint16_t *op0, uint16_t *oq0, uint16_t *oq1,
- int bd) {
- int16_t filter1, filter2;
- // ^0x80 equivalent to subtracting 0x80 from the values to turn them
- // into -128 to +127 instead of 0 to 255.
- int shift = bd - 8;
- const int16_t ps1 = (int16_t)*op1 - (0x80 << shift);
- const int16_t ps0 = (int16_t)*op0 - (0x80 << shift);
- const int16_t qs0 = (int16_t)*oq0 - (0x80 << shift);
- const int16_t qs1 = (int16_t)*oq1 - (0x80 << shift);
- const uint16_t hev = highbd_hev_mask(thresh, *op1, *op0, *oq0, *oq1, bd);
-
- // Add outer taps if we have high edge variance.
- int16_t filter = signed_char_clamp_high(ps1 - qs1, bd) & hev;
-
- // Inner taps.
- filter = signed_char_clamp_high(filter + 3 * (qs0 - ps0), bd) & mask;
-
- // Save bottom 3 bits so that we round one side +4 and the other +3
- // if it equals 4 we'll set to adjust by -1 to account for the fact
- // we'd round 3 the other way.
- filter1 = signed_char_clamp_high(filter + 4, bd) >> 3;
- filter2 = signed_char_clamp_high(filter + 3, bd) >> 3;
-
- *oq0 = signed_char_clamp_high(qs0 - filter1, bd) + (0x80 << shift);
- *op0 = signed_char_clamp_high(ps0 + filter2, bd) + (0x80 << shift);
-
- // Outer tap adjustments.
- filter = ROUND_POWER_OF_TWO(filter1, 1) & ~hev;
-
- *oq1 = signed_char_clamp_high(qs1 - filter, bd) + (0x80 << shift);
- *op1 = signed_char_clamp_high(ps1 + filter, bd) + (0x80 << shift);
-}
-
-void vpx_highbd_lpf_horizontal_4_c(uint16_t *s, int p /* pitch */,
- const uint8_t *blimit, const uint8_t *limit,
- const uint8_t *thresh, int bd) {
- int i;
-
- // loop filter designed to work using chars so that we can make maximum use
- // of 8 bit simd instructions.
- for (i = 0; i < 8; ++i) {
- const uint16_t p3 = s[-4 * p];
- const uint16_t p2 = s[-3 * p];
- const uint16_t p1 = s[-2 * p];
- const uint16_t p0 = s[-p];
- const uint16_t q0 = s[0 * p];
- const uint16_t q1 = s[1 * p];
- const uint16_t q2 = s[2 * p];
- const uint16_t q3 = s[3 * p];
- const int8_t mask = highbd_filter_mask(*limit, *blimit,
- p3, p2, p1, p0, q0, q1, q2, q3, bd);
- highbd_filter4(mask, *thresh, s - 2 * p, s - 1 * p, s, s + 1 * p, bd);
- ++s;
- }
-}
-
-void vpx_highbd_lpf_horizontal_4_dual_c(uint16_t *s, int p,
- const uint8_t *blimit0,
- const uint8_t *limit0,
- const uint8_t *thresh0,
- const uint8_t *blimit1,
- const uint8_t *limit1,
- const uint8_t *thresh1,
- int bd) {
- vpx_highbd_lpf_horizontal_4_c(s, p, blimit0, limit0, thresh0, bd);
- vpx_highbd_lpf_horizontal_4_c(s + 8, p, blimit1, limit1, thresh1, bd);
-}
-
-void vpx_highbd_lpf_vertical_4_c(uint16_t *s, int pitch, const uint8_t *blimit,
- const uint8_t *limit, const uint8_t *thresh,
- int bd) {
- int i;
-
- // loop filter designed to work using chars so that we can make maximum use
- // of 8 bit simd instructions.
- for (i = 0; i < 8; ++i) {
- const uint16_t p3 = s[-4], p2 = s[-3], p1 = s[-2], p0 = s[-1];
- const uint16_t q0 = s[0], q1 = s[1], q2 = s[2], q3 = s[3];
- const int8_t mask = highbd_filter_mask(*limit, *blimit,
- p3, p2, p1, p0, q0, q1, q2, q3, bd);
- highbd_filter4(mask, *thresh, s - 2, s - 1, s, s + 1, bd);
- s += pitch;
- }
-}
-
-void vpx_highbd_lpf_vertical_4_dual_c(uint16_t *s, int pitch,
- const uint8_t *blimit0,
- const uint8_t *limit0,
- const uint8_t *thresh0,
- const uint8_t *blimit1,
- const uint8_t *limit1,
- const uint8_t *thresh1,
- int bd) {
- vpx_highbd_lpf_vertical_4_c(s, pitch, blimit0, limit0, thresh0, bd);
- vpx_highbd_lpf_vertical_4_c(s + 8 * pitch, pitch, blimit1, limit1,
- thresh1, bd);
-}
-
-static INLINE void highbd_filter8(int8_t mask, uint8_t thresh, uint8_t flat,
- uint16_t *op3, uint16_t *op2,
- uint16_t *op1, uint16_t *op0,
- uint16_t *oq0, uint16_t *oq1,
- uint16_t *oq2, uint16_t *oq3, int bd) {
- if (flat && mask) {
- const uint16_t p3 = *op3, p2 = *op2, p1 = *op1, p0 = *op0;
- const uint16_t q0 = *oq0, q1 = *oq1, q2 = *oq2, q3 = *oq3;
-
- // 7-tap filter [1, 1, 1, 2, 1, 1, 1]
- *op2 = ROUND_POWER_OF_TWO(p3 + p3 + p3 + 2 * p2 + p1 + p0 + q0, 3);
- *op1 = ROUND_POWER_OF_TWO(p3 + p3 + p2 + 2 * p1 + p0 + q0 + q1, 3);
- *op0 = ROUND_POWER_OF_TWO(p3 + p2 + p1 + 2 * p0 + q0 + q1 + q2, 3);
- *oq0 = ROUND_POWER_OF_TWO(p2 + p1 + p0 + 2 * q0 + q1 + q2 + q3, 3);
- *oq1 = ROUND_POWER_OF_TWO(p1 + p0 + q0 + 2 * q1 + q2 + q3 + q3, 3);
- *oq2 = ROUND_POWER_OF_TWO(p0 + q0 + q1 + 2 * q2 + q3 + q3 + q3, 3);
- } else {
- highbd_filter4(mask, thresh, op1, op0, oq0, oq1, bd);
- }
-}
-
-void vpx_highbd_lpf_horizontal_8_c(uint16_t *s, int p, const uint8_t *blimit,
- const uint8_t *limit, const uint8_t *thresh,
- int bd) {
- int i;
-
- // loop filter designed to work using chars so that we can make maximum use
- // of 8 bit simd instructions.
- for (i = 0; i < 8; ++i) {
- const uint16_t p3 = s[-4 * p], p2 = s[-3 * p], p1 = s[-2 * p], p0 = s[-p];
- const uint16_t q0 = s[0 * p], q1 = s[1 * p], q2 = s[2 * p], q3 = s[3 * p];
-
- const int8_t mask = highbd_filter_mask(*limit, *blimit,
- p3, p2, p1, p0, q0, q1, q2, q3, bd);
- const int8_t flat = highbd_flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3,
- bd);
- highbd_filter8(mask, *thresh, flat,
- s - 4 * p, s - 3 * p, s - 2 * p, s - 1 * p,
- s, s + 1 * p, s + 2 * p, s + 3 * p, bd);
- ++s;
- }
-}
-
-void vpx_highbd_lpf_horizontal_8_dual_c(uint16_t *s, int p,
- const uint8_t *blimit0,
- const uint8_t *limit0,
- const uint8_t *thresh0,
- const uint8_t *blimit1,
- const uint8_t *limit1,
- const uint8_t *thresh1,
- int bd) {
- vpx_highbd_lpf_horizontal_8_c(s, p, blimit0, limit0, thresh0, bd);
- vpx_highbd_lpf_horizontal_8_c(s + 8, p, blimit1, limit1, thresh1, bd);
-}
-
-void vpx_highbd_lpf_vertical_8_c(uint16_t *s, int pitch, const uint8_t *blimit,
- const uint8_t *limit, const uint8_t *thresh,
- int bd) {
- int i;
-
- for (i = 0; i < 8; ++i) {
- const uint16_t p3 = s[-4], p2 = s[-3], p1 = s[-2], p0 = s[-1];
- const uint16_t q0 = s[0], q1 = s[1], q2 = s[2], q3 = s[3];
- const int8_t mask = highbd_filter_mask(*limit, *blimit,
- p3, p2, p1, p0, q0, q1, q2, q3, bd);
- const int8_t flat = highbd_flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3,
- bd);
- highbd_filter8(mask, *thresh, flat,
- s - 4, s - 3, s - 2, s - 1,
- s, s + 1, s + 2, s + 3,
- bd);
- s += pitch;
- }
-}
-
-void vpx_highbd_lpf_vertical_8_dual_c(uint16_t *s, int pitch,
- const uint8_t *blimit0,
- const uint8_t *limit0,
- const uint8_t *thresh0,
- const uint8_t *blimit1,
- const uint8_t *limit1,
- const uint8_t *thresh1,
- int bd) {
- vpx_highbd_lpf_vertical_8_c(s, pitch, blimit0, limit0, thresh0, bd);
- vpx_highbd_lpf_vertical_8_c(s + 8 * pitch, pitch, blimit1, limit1,
- thresh1, bd);
-}
-
-static INLINE void highbd_filter16(int8_t mask, uint8_t thresh,
- uint8_t flat, uint8_t flat2,
- uint16_t *op7, uint16_t *op6,
- uint16_t *op5, uint16_t *op4,
- uint16_t *op3, uint16_t *op2,
- uint16_t *op1, uint16_t *op0,
- uint16_t *oq0, uint16_t *oq1,
- uint16_t *oq2, uint16_t *oq3,
- uint16_t *oq4, uint16_t *oq5,
- uint16_t *oq6, uint16_t *oq7, int bd) {
- if (flat2 && flat && mask) {
- const uint16_t p7 = *op7;
- const uint16_t p6 = *op6;
- const uint16_t p5 = *op5;
- const uint16_t p4 = *op4;
- const uint16_t p3 = *op3;
- const uint16_t p2 = *op2;
- const uint16_t p1 = *op1;
- const uint16_t p0 = *op0;
- const uint16_t q0 = *oq0;
- const uint16_t q1 = *oq1;
- const uint16_t q2 = *oq2;
- const uint16_t q3 = *oq3;
- const uint16_t q4 = *oq4;
- const uint16_t q5 = *oq5;
- const uint16_t q6 = *oq6;
- const uint16_t q7 = *oq7;
-
- // 15-tap filter [1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1]
- *op6 = ROUND_POWER_OF_TWO(p7 * 7 + p6 * 2 + p5 + p4 + p3 + p2 + p1 + p0 +
- q0, 4);
- *op5 = ROUND_POWER_OF_TWO(p7 * 6 + p6 + p5 * 2 + p4 + p3 + p2 + p1 + p0 +
- q0 + q1, 4);
- *op4 = ROUND_POWER_OF_TWO(p7 * 5 + p6 + p5 + p4 * 2 + p3 + p2 + p1 + p0 +
- q0 + q1 + q2, 4);
- *op3 = ROUND_POWER_OF_TWO(p7 * 4 + p6 + p5 + p4 + p3 * 2 + p2 + p1 + p0 +
- q0 + q1 + q2 + q3, 4);
- *op2 = ROUND_POWER_OF_TWO(p7 * 3 + p6 + p5 + p4 + p3 + p2 * 2 + p1 + p0 +
- q0 + q1 + q2 + q3 + q4, 4);
- *op1 = ROUND_POWER_OF_TWO(p7 * 2 + p6 + p5 + p4 + p3 + p2 + p1 * 2 + p0 +
- q0 + q1 + q2 + q3 + q4 + q5, 4);
- *op0 = ROUND_POWER_OF_TWO(p7 + p6 + p5 + p4 + p3 + p2 + p1 + p0 * 2 +
- q0 + q1 + q2 + q3 + q4 + q5 + q6, 4);
- *oq0 = ROUND_POWER_OF_TWO(p6 + p5 + p4 + p3 + p2 + p1 + p0 +
- q0 * 2 + q1 + q2 + q3 + q4 + q5 + q6 + q7, 4);
- *oq1 = ROUND_POWER_OF_TWO(p5 + p4 + p3 + p2 + p1 + p0 +
- q0 + q1 * 2 + q2 + q3 + q4 + q5 + q6 + q7 * 2, 4);
- *oq2 = ROUND_POWER_OF_TWO(p4 + p3 + p2 + p1 + p0 +
- q0 + q1 + q2 * 2 + q3 + q4 + q5 + q6 + q7 * 3, 4);
- *oq3 = ROUND_POWER_OF_TWO(p3 + p2 + p1 + p0 +
- q0 + q1 + q2 + q3 * 2 + q4 + q5 + q6 + q7 * 4, 4);
- *oq4 = ROUND_POWER_OF_TWO(p2 + p1 + p0 +
- q0 + q1 + q2 + q3 + q4 * 2 + q5 + q6 + q7 * 5, 4);
- *oq5 = ROUND_POWER_OF_TWO(p1 + p0 +
- q0 + q1 + q2 + q3 + q4 + q5 * 2 + q6 + q7 * 6, 4);
- *oq6 = ROUND_POWER_OF_TWO(p0 +
- q0 + q1 + q2 + q3 + q4 + q5 + q6 * 2 + q7 * 7, 4);
- } else {
- highbd_filter8(mask, thresh, flat, op3, op2, op1, op0, oq0, oq1, oq2, oq3,
- bd);
- }
-}
-
-static void highbd_mb_lpf_horizontal_edge_w(uint16_t *s, int p,
- const uint8_t *blimit,
- const uint8_t *limit,
- const uint8_t *thresh,
- int count, int bd) {
- int i;
-
- // loop filter designed to work using chars so that we can make maximum use
- // of 8 bit simd instructions.
- for (i = 0; i < 8 * count; ++i) {
- const uint16_t p3 = s[-4 * p];
- const uint16_t p2 = s[-3 * p];
- const uint16_t p1 = s[-2 * p];
- const uint16_t p0 = s[-p];
- const uint16_t q0 = s[0 * p];
- const uint16_t q1 = s[1 * p];
- const uint16_t q2 = s[2 * p];
- const uint16_t q3 = s[3 * p];
- const int8_t mask = highbd_filter_mask(*limit, *blimit,
- p3, p2, p1, p0, q0, q1, q2, q3, bd);
- const int8_t flat = highbd_flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3,
- bd);
- const int8_t flat2 = highbd_flat_mask5(
- 1, s[-8 * p], s[-7 * p], s[-6 * p], s[-5 * p], p0,
- q0, s[4 * p], s[5 * p], s[6 * p], s[7 * p], bd);
-
- highbd_filter16(mask, *thresh, flat, flat2,
- s - 8 * p, s - 7 * p, s - 6 * p, s - 5 * p,
- s - 4 * p, s - 3 * p, s - 2 * p, s - 1 * p,
- s, s + 1 * p, s + 2 * p, s + 3 * p,
- s + 4 * p, s + 5 * p, s + 6 * p, s + 7 * p,
- bd);
- ++s;
- }
-}
-
-void vpx_highbd_lpf_horizontal_edge_8_c(uint16_t *s, int p,
- const uint8_t *blimit,
- const uint8_t *limit,
- const uint8_t *thresh, int bd) {
- highbd_mb_lpf_horizontal_edge_w(s, p, blimit, limit, thresh, 1, bd);
-}
-
-void vpx_highbd_lpf_horizontal_edge_16_c(uint16_t *s, int p,
- const uint8_t *blimit,
- const uint8_t *limit,
- const uint8_t *thresh, int bd) {
- highbd_mb_lpf_horizontal_edge_w(s, p, blimit, limit, thresh, 2, bd);
-}
-
-static void highbd_mb_lpf_vertical_edge_w(uint16_t *s, int p,
- const uint8_t *blimit,
- const uint8_t *limit,
- const uint8_t *thresh,
- int count, int bd) {
- int i;
-
- for (i = 0; i < count; ++i) {
- const uint16_t p3 = s[-4];
- const uint16_t p2 = s[-3];
- const uint16_t p1 = s[-2];
- const uint16_t p0 = s[-1];
- const uint16_t q0 = s[0];
- const uint16_t q1 = s[1];
- const uint16_t q2 = s[2];
- const uint16_t q3 = s[3];
- const int8_t mask = highbd_filter_mask(*limit, *blimit,
- p3, p2, p1, p0, q0, q1, q2, q3, bd);
- const int8_t flat = highbd_flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3,
- bd);
- const int8_t flat2 = highbd_flat_mask5(1, s[-8], s[-7], s[-6], s[-5], p0,
- q0, s[4], s[5], s[6], s[7], bd);
-
- highbd_filter16(mask, *thresh, flat, flat2,
- s - 8, s - 7, s - 6, s - 5, s - 4, s - 3, s - 2, s - 1,
- s, s + 1, s + 2, s + 3, s + 4, s + 5, s + 6, s + 7,
- bd);
- s += p;
- }
-}
-
-void vpx_highbd_lpf_vertical_16_c(uint16_t *s, int p, const uint8_t *blimit,
- const uint8_t *limit, const uint8_t *thresh,
- int bd) {
- highbd_mb_lpf_vertical_edge_w(s, p, blimit, limit, thresh, 8, bd);
-}
-
-void vpx_highbd_lpf_vertical_16_dual_c(uint16_t *s, int p,
- const uint8_t *blimit,
- const uint8_t *limit,
- const uint8_t *thresh,
- int bd) {
- highbd_mb_lpf_vertical_edge_w(s, p, blimit, limit, thresh, 16, bd);
-}
-#endif // CONFIG_VP9_HIGHBITDEPTH
diff --git a/thirdparty/libvpx/vpx_dsp/prob.c b/thirdparty/libvpx/vpx_dsp/prob.c
deleted file mode 100644
index 639d24dd2f..0000000000
--- a/thirdparty/libvpx/vpx_dsp/prob.c
+++ /dev/null
@@ -1,53 +0,0 @@
-/*
- * Copyright (c) 2013 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "./prob.h"
-
-const uint8_t vpx_norm[256] = {
- 0, 7, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4,
- 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
- 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
- 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
- 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
- 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
- 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
- 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
-};
-
-static unsigned int tree_merge_probs_impl(unsigned int i,
- const vpx_tree_index *tree,
- const vpx_prob *pre_probs,
- const unsigned int *counts,
- vpx_prob *probs) {
- const int l = tree[i];
- const unsigned int left_count = (l <= 0)
- ? counts[-l]
- : tree_merge_probs_impl(l, tree, pre_probs, counts, probs);
- const int r = tree[i + 1];
- const unsigned int right_count = (r <= 0)
- ? counts[-r]
- : tree_merge_probs_impl(r, tree, pre_probs, counts, probs);
- const unsigned int ct[2] = { left_count, right_count };
- probs[i >> 1] = mode_mv_merge_probs(pre_probs[i >> 1], ct);
- return left_count + right_count;
-}
-
-void vpx_tree_merge_probs(const vpx_tree_index *tree, const vpx_prob *pre_probs,
- const unsigned int *counts, vpx_prob *probs) {
- tree_merge_probs_impl(0, tree, pre_probs, counts, probs);
-}
diff --git a/thirdparty/libvpx/vpx_dsp/prob.h b/thirdparty/libvpx/vpx_dsp/prob.h
deleted file mode 100644
index c3cb103ffb..0000000000
--- a/thirdparty/libvpx/vpx_dsp/prob.h
+++ /dev/null
@@ -1,103 +0,0 @@
-/*
- * Copyright (c) 2013 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#ifndef VPX_DSP_PROB_H_
-#define VPX_DSP_PROB_H_
-
-#include "./vpx_config.h"
-#include "./vpx_dsp_common.h"
-
-#include "vpx_ports/mem.h"
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-typedef uint8_t vpx_prob;
-
-#define MAX_PROB 255
-
-#define vpx_prob_half ((vpx_prob) 128)
-
-typedef int8_t vpx_tree_index;
-
-#define TREE_SIZE(leaf_count) (2 * (leaf_count) - 2)
-
-#define vpx_complement(x) (255 - x)
-
-#define MODE_MV_COUNT_SAT 20
-
-/* We build coding trees compactly in arrays.
- Each node of the tree is a pair of vpx_tree_indices.
- Array index often references a corresponding probability table.
- Index <= 0 means done encoding/decoding and value = -Index,
- Index > 0 means need another bit, specification at index.
- Nonnegative indices are always even; processing begins at node 0. */
-
-typedef const vpx_tree_index vpx_tree[];
-
-static INLINE vpx_prob clip_prob(int p) {
- return (p > 255) ? 255 : (p < 1) ? 1 : p;
-}
-
-static INLINE vpx_prob get_prob(int num, int den) {
- return (den == 0) ? 128u : clip_prob(((int64_t)num * 256 + (den >> 1)) / den);
-}
-
-static INLINE vpx_prob get_binary_prob(int n0, int n1) {
- return get_prob(n0, n0 + n1);
-}
-
-/* This function assumes prob1 and prob2 are already within [1,255] range. */
-static INLINE vpx_prob weighted_prob(int prob1, int prob2, int factor) {
- return ROUND_POWER_OF_TWO(prob1 * (256 - factor) + prob2 * factor, 8);
-}
-
-static INLINE vpx_prob merge_probs(vpx_prob pre_prob,
- const unsigned int ct[2],
- unsigned int count_sat,
- unsigned int max_update_factor) {
- const vpx_prob prob = get_binary_prob(ct[0], ct[1]);
- const unsigned int count = VPXMIN(ct[0] + ct[1], count_sat);
- const unsigned int factor = max_update_factor * count / count_sat;
- return weighted_prob(pre_prob, prob, factor);
-}
-
-// MODE_MV_MAX_UPDATE_FACTOR (128) * count / MODE_MV_COUNT_SAT;
-static const int count_to_update_factor[MODE_MV_COUNT_SAT + 1] = {
- 0, 6, 12, 19, 25, 32, 38, 44, 51, 57, 64,
- 70, 76, 83, 89, 96, 102, 108, 115, 121, 128
-};
-
-static INLINE vpx_prob mode_mv_merge_probs(vpx_prob pre_prob,
- const unsigned int ct[2]) {
- const unsigned int den = ct[0] + ct[1];
- if (den == 0) {
- return pre_prob;
- } else {
- const unsigned int count = VPXMIN(den, MODE_MV_COUNT_SAT);
- const unsigned int factor = count_to_update_factor[count];
- const vpx_prob prob =
- clip_prob(((int64_t)(ct[0]) * 256 + (den >> 1)) / den);
- return weighted_prob(pre_prob, prob, factor);
- }
-}
-
-void vpx_tree_merge_probs(const vpx_tree_index *tree, const vpx_prob *pre_probs,
- const unsigned int *counts, vpx_prob *probs);
-
-
-DECLARE_ALIGNED(16, extern const uint8_t, vpx_norm[256]);
-
-#ifdef __cplusplus
-} // extern "C"
-#endif
-
-#endif // VPX_DSP_PROB_H_
diff --git a/thirdparty/libvpx/vpx_dsp/txfm_common.h b/thirdparty/libvpx/vpx_dsp/txfm_common.h
deleted file mode 100644
index 442e6a57b5..0000000000
--- a/thirdparty/libvpx/vpx_dsp/txfm_common.h
+++ /dev/null
@@ -1,66 +0,0 @@
-/*
- * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#ifndef VPX_DSP_TXFM_COMMON_H_
-#define VPX_DSP_TXFM_COMMON_H_
-
-#include "vpx_dsp/vpx_dsp_common.h"
-
-// Constants and Macros used by all idct/dct functions
-#define DCT_CONST_BITS 14
-#define DCT_CONST_ROUNDING (1 << (DCT_CONST_BITS - 1))
-
-#define UNIT_QUANT_SHIFT 2
-#define UNIT_QUANT_FACTOR (1 << UNIT_QUANT_SHIFT)
-
-// Constants:
-// for (int i = 1; i< 32; ++i)
-// printf("static const int cospi_%d_64 = %.0f;\n", i,
-// round(16384 * cos(i*M_PI/64)));
-// Note: sin(k*Pi/64) = cos((32-k)*Pi/64)
-static const tran_high_t cospi_1_64 = 16364;
-static const tran_high_t cospi_2_64 = 16305;
-static const tran_high_t cospi_3_64 = 16207;
-static const tran_high_t cospi_4_64 = 16069;
-static const tran_high_t cospi_5_64 = 15893;
-static const tran_high_t cospi_6_64 = 15679;
-static const tran_high_t cospi_7_64 = 15426;
-static const tran_high_t cospi_8_64 = 15137;
-static const tran_high_t cospi_9_64 = 14811;
-static const tran_high_t cospi_10_64 = 14449;
-static const tran_high_t cospi_11_64 = 14053;
-static const tran_high_t cospi_12_64 = 13623;
-static const tran_high_t cospi_13_64 = 13160;
-static const tran_high_t cospi_14_64 = 12665;
-static const tran_high_t cospi_15_64 = 12140;
-static const tran_high_t cospi_16_64 = 11585;
-static const tran_high_t cospi_17_64 = 11003;
-static const tran_high_t cospi_18_64 = 10394;
-static const tran_high_t cospi_19_64 = 9760;
-static const tran_high_t cospi_20_64 = 9102;
-static const tran_high_t cospi_21_64 = 8423;
-static const tran_high_t cospi_22_64 = 7723;
-static const tran_high_t cospi_23_64 = 7005;
-static const tran_high_t cospi_24_64 = 6270;
-static const tran_high_t cospi_25_64 = 5520;
-static const tran_high_t cospi_26_64 = 4756;
-static const tran_high_t cospi_27_64 = 3981;
-static const tran_high_t cospi_28_64 = 3196;
-static const tran_high_t cospi_29_64 = 2404;
-static const tran_high_t cospi_30_64 = 1606;
-static const tran_high_t cospi_31_64 = 804;
-
-// 16384 * sqrt(2) * sin(kPi/9) * 2 / 3
-static const tran_high_t sinpi_1_9 = 5283;
-static const tran_high_t sinpi_2_9 = 9929;
-static const tran_high_t sinpi_3_9 = 13377;
-static const tran_high_t sinpi_4_9 = 15212;
-
-#endif // VPX_DSP_TXFM_COMMON_H_
diff --git a/thirdparty/libvpx/vpx_dsp/vpx_convolve.c b/thirdparty/libvpx/vpx_dsp/vpx_convolve.c
deleted file mode 100644
index 2d1c927cbe..0000000000
--- a/thirdparty/libvpx/vpx_dsp/vpx_convolve.c
+++ /dev/null
@@ -1,612 +0,0 @@
-/*
- * Copyright (c) 2013 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include <assert.h>
-#include <string.h>
-
-#include "./vpx_config.h"
-#include "./vpx_dsp_rtcd.h"
-#include "vpx/vpx_integer.h"
-#include "vpx_dsp/vpx_convolve.h"
-#include "vpx_dsp/vpx_dsp_common.h"
-#include "vpx_dsp/vpx_filter.h"
-#include "vpx_ports/mem.h"
-
-static void convolve_horiz(const uint8_t *src, ptrdiff_t src_stride,
- uint8_t *dst, ptrdiff_t dst_stride,
- const InterpKernel *x_filters,
- int x0_q4, int x_step_q4, int w, int h) {
- int x, y;
- src -= SUBPEL_TAPS / 2 - 1;
- for (y = 0; y < h; ++y) {
- int x_q4 = x0_q4;
- for (x = 0; x < w; ++x) {
- const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
- const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
- int k, sum = 0;
- for (k = 0; k < SUBPEL_TAPS; ++k)
- sum += src_x[k] * x_filter[k];
- dst[x] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
- x_q4 += x_step_q4;
- }
- src += src_stride;
- dst += dst_stride;
- }
-}
-
-static void convolve_avg_horiz(const uint8_t *src, ptrdiff_t src_stride,
- uint8_t *dst, ptrdiff_t dst_stride,
- const InterpKernel *x_filters,
- int x0_q4, int x_step_q4, int w, int h) {
- int x, y;
- src -= SUBPEL_TAPS / 2 - 1;
- for (y = 0; y < h; ++y) {
- int x_q4 = x0_q4;
- for (x = 0; x < w; ++x) {
- const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
- const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
- int k, sum = 0;
- for (k = 0; k < SUBPEL_TAPS; ++k)
- sum += src_x[k] * x_filter[k];
- dst[x] = ROUND_POWER_OF_TWO(dst[x] +
- clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)), 1);
- x_q4 += x_step_q4;
- }
- src += src_stride;
- dst += dst_stride;
- }
-}
-
-static void convolve_vert(const uint8_t *src, ptrdiff_t src_stride,
- uint8_t *dst, ptrdiff_t dst_stride,
- const InterpKernel *y_filters,
- int y0_q4, int y_step_q4, int w, int h) {
- int x, y;
- src -= src_stride * (SUBPEL_TAPS / 2 - 1);
-
- for (x = 0; x < w; ++x) {
- int y_q4 = y0_q4;
- for (y = 0; y < h; ++y) {
- const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
- const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
- int k, sum = 0;
- for (k = 0; k < SUBPEL_TAPS; ++k)
- sum += src_y[k * src_stride] * y_filter[k];
- dst[y * dst_stride] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
- y_q4 += y_step_q4;
- }
- ++src;
- ++dst;
- }
-}
-
-static void convolve_avg_vert(const uint8_t *src, ptrdiff_t src_stride,
- uint8_t *dst, ptrdiff_t dst_stride,
- const InterpKernel *y_filters,
- int y0_q4, int y_step_q4, int w, int h) {
- int x, y;
- src -= src_stride * (SUBPEL_TAPS / 2 - 1);
-
- for (x = 0; x < w; ++x) {
- int y_q4 = y0_q4;
- for (y = 0; y < h; ++y) {
- const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
- const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
- int k, sum = 0;
- for (k = 0; k < SUBPEL_TAPS; ++k)
- sum += src_y[k * src_stride] * y_filter[k];
- dst[y * dst_stride] = ROUND_POWER_OF_TWO(dst[y * dst_stride] +
- clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)), 1);
- y_q4 += y_step_q4;
- }
- ++src;
- ++dst;
- }
-}
-
-static void convolve(const uint8_t *src, ptrdiff_t src_stride,
- uint8_t *dst, ptrdiff_t dst_stride,
- const InterpKernel *const x_filters,
- int x0_q4, int x_step_q4,
- const InterpKernel *const y_filters,
- int y0_q4, int y_step_q4,
- int w, int h) {
- // Note: Fixed size intermediate buffer, temp, places limits on parameters.
- // 2d filtering proceeds in 2 steps:
- // (1) Interpolate horizontally into an intermediate buffer, temp.
- // (2) Interpolate temp vertically to derive the sub-pixel result.
- // Deriving the maximum number of rows in the temp buffer (135):
- // --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative).
- // --Largest block size is 64x64 pixels.
- // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the
- // original frame (in 1/16th pixel units).
- // --Must round-up because block may be located at sub-pixel position.
- // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails.
- // --((64 - 1) * 32 + 15) >> 4 + 8 = 135.
- uint8_t temp[135 * 64];
- int intermediate_height =
- (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
-
- assert(w <= 64);
- assert(h <= 64);
- assert(y_step_q4 <= 32);
- assert(x_step_q4 <= 32);
-
- convolve_horiz(src - src_stride * (SUBPEL_TAPS / 2 - 1), src_stride, temp, 64,
- x_filters, x0_q4, x_step_q4, w, intermediate_height);
- convolve_vert(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst, dst_stride,
- y_filters, y0_q4, y_step_q4, w, h);
-}
-
-static const InterpKernel *get_filter_base(const int16_t *filter) {
- // NOTE: This assumes that the filter table is 256-byte aligned.
- // TODO(agrange) Modify to make independent of table alignment.
- return (const InterpKernel *)(((intptr_t)filter) & ~((intptr_t)0xFF));
-}
-
-static int get_filter_offset(const int16_t *f, const InterpKernel *base) {
- return (int)((const InterpKernel *)(intptr_t)f - base);
-}
-
-void vpx_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
- uint8_t *dst, ptrdiff_t dst_stride,
- const int16_t *filter_x, int x_step_q4,
- const int16_t *filter_y, int y_step_q4,
- int w, int h) {
- const InterpKernel *const filters_x = get_filter_base(filter_x);
- const int x0_q4 = get_filter_offset(filter_x, filters_x);
-
- (void)filter_y;
- (void)y_step_q4;
-
- convolve_horiz(src, src_stride, dst, dst_stride, filters_x,
- x0_q4, x_step_q4, w, h);
-}
-
-void vpx_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
- uint8_t *dst, ptrdiff_t dst_stride,
- const int16_t *filter_x, int x_step_q4,
- const int16_t *filter_y, int y_step_q4,
- int w, int h) {
- const InterpKernel *const filters_x = get_filter_base(filter_x);
- const int x0_q4 = get_filter_offset(filter_x, filters_x);
-
- (void)filter_y;
- (void)y_step_q4;
-
- convolve_avg_horiz(src, src_stride, dst, dst_stride, filters_x,
- x0_q4, x_step_q4, w, h);
-}
-
-void vpx_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride,
- uint8_t *dst, ptrdiff_t dst_stride,
- const int16_t *filter_x, int x_step_q4,
- const int16_t *filter_y, int y_step_q4,
- int w, int h) {
- const InterpKernel *const filters_y = get_filter_base(filter_y);
- const int y0_q4 = get_filter_offset(filter_y, filters_y);
-
- (void)filter_x;
- (void)x_step_q4;
-
- convolve_vert(src, src_stride, dst, dst_stride, filters_y,
- y0_q4, y_step_q4, w, h);
-}
-
-void vpx_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride,
- uint8_t *dst, ptrdiff_t dst_stride,
- const int16_t *filter_x, int x_step_q4,
- const int16_t *filter_y, int y_step_q4,
- int w, int h) {
- const InterpKernel *const filters_y = get_filter_base(filter_y);
- const int y0_q4 = get_filter_offset(filter_y, filters_y);
-
- (void)filter_x;
- (void)x_step_q4;
-
- convolve_avg_vert(src, src_stride, dst, dst_stride, filters_y,
- y0_q4, y_step_q4, w, h);
-}
-
-void vpx_convolve8_c(const uint8_t *src, ptrdiff_t src_stride,
- uint8_t *dst, ptrdiff_t dst_stride,
- const int16_t *filter_x, int x_step_q4,
- const int16_t *filter_y, int y_step_q4,
- int w, int h) {
- const InterpKernel *const filters_x = get_filter_base(filter_x);
- const int x0_q4 = get_filter_offset(filter_x, filters_x);
-
- const InterpKernel *const filters_y = get_filter_base(filter_y);
- const int y0_q4 = get_filter_offset(filter_y, filters_y);
-
- convolve(src, src_stride, dst, dst_stride,
- filters_x, x0_q4, x_step_q4,
- filters_y, y0_q4, y_step_q4, w, h);
-}
-
-void vpx_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride,
- uint8_t *dst, ptrdiff_t dst_stride,
- const int16_t *filter_x, int x_step_q4,
- const int16_t *filter_y, int y_step_q4,
- int w, int h) {
- /* Fixed size intermediate buffer places limits on parameters. */
- DECLARE_ALIGNED(16, uint8_t, temp[64 * 64]);
- assert(w <= 64);
- assert(h <= 64);
-
- vpx_convolve8_c(src, src_stride, temp, 64,
- filter_x, x_step_q4, filter_y, y_step_q4, w, h);
- vpx_convolve_avg_c(temp, 64, dst, dst_stride, NULL, 0, NULL, 0, w, h);
-}
-
-void vpx_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride,
- uint8_t *dst, ptrdiff_t dst_stride,
- const int16_t *filter_x, int filter_x_stride,
- const int16_t *filter_y, int filter_y_stride,
- int w, int h) {
- int r;
-
- (void)filter_x; (void)filter_x_stride;
- (void)filter_y; (void)filter_y_stride;
-
- for (r = h; r > 0; --r) {
- memcpy(dst, src, w);
- src += src_stride;
- dst += dst_stride;
- }
-}
-
-void vpx_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride,
- uint8_t *dst, ptrdiff_t dst_stride,
- const int16_t *filter_x, int filter_x_stride,
- const int16_t *filter_y, int filter_y_stride,
- int w, int h) {
- int x, y;
-
- (void)filter_x; (void)filter_x_stride;
- (void)filter_y; (void)filter_y_stride;
-
- for (y = 0; y < h; ++y) {
- for (x = 0; x < w; ++x)
- dst[x] = ROUND_POWER_OF_TWO(dst[x] + src[x], 1);
-
- src += src_stride;
- dst += dst_stride;
- }
-}
-
-void vpx_scaled_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
- uint8_t *dst, ptrdiff_t dst_stride,
- const int16_t *filter_x, int x_step_q4,
- const int16_t *filter_y, int y_step_q4,
- int w, int h) {
- vpx_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4,
- filter_y, y_step_q4, w, h);
-}
-
-void vpx_scaled_vert_c(const uint8_t *src, ptrdiff_t src_stride,
- uint8_t *dst, ptrdiff_t dst_stride,
- const int16_t *filter_x, int x_step_q4,
- const int16_t *filter_y, int y_step_q4,
- int w, int h) {
- vpx_convolve8_vert_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4,
- filter_y, y_step_q4, w, h);
-}
-
-void vpx_scaled_2d_c(const uint8_t *src, ptrdiff_t src_stride,
- uint8_t *dst, ptrdiff_t dst_stride,
- const int16_t *filter_x, int x_step_q4,
- const int16_t *filter_y, int y_step_q4,
- int w, int h) {
- vpx_convolve8_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4,
- filter_y, y_step_q4, w, h);
-}
-
-void vpx_scaled_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
- uint8_t *dst, ptrdiff_t dst_stride,
- const int16_t *filter_x, int x_step_q4,
- const int16_t *filter_y, int y_step_q4,
- int w, int h) {
- vpx_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, filter_x,
- x_step_q4, filter_y, y_step_q4, w, h);
-}
-
-void vpx_scaled_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride,
- uint8_t *dst, ptrdiff_t dst_stride,
- const int16_t *filter_x, int x_step_q4,
- const int16_t *filter_y, int y_step_q4,
- int w, int h) {
- vpx_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter_x,
- x_step_q4, filter_y, y_step_q4, w, h);
-}
-
-void vpx_scaled_avg_2d_c(const uint8_t *src, ptrdiff_t src_stride,
- uint8_t *dst, ptrdiff_t dst_stride,
- const int16_t *filter_x, int x_step_q4,
- const int16_t *filter_y, int y_step_q4,
- int w, int h) {
- vpx_convolve8_avg_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4,
- filter_y, y_step_q4, w, h);
-}
-
-#if CONFIG_VP9_HIGHBITDEPTH
-static void highbd_convolve_horiz(const uint8_t *src8, ptrdiff_t src_stride,
- uint8_t *dst8, ptrdiff_t dst_stride,
- const InterpKernel *x_filters,
- int x0_q4, int x_step_q4,
- int w, int h, int bd) {
- int x, y;
- uint16_t *src = CONVERT_TO_SHORTPTR(src8);
- uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
- src -= SUBPEL_TAPS / 2 - 1;
- for (y = 0; y < h; ++y) {
- int x_q4 = x0_q4;
- for (x = 0; x < w; ++x) {
- const uint16_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
- const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
- int k, sum = 0;
- for (k = 0; k < SUBPEL_TAPS; ++k)
- sum += src_x[k] * x_filter[k];
- dst[x] = clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd);
- x_q4 += x_step_q4;
- }
- src += src_stride;
- dst += dst_stride;
- }
-}
-
-static void highbd_convolve_avg_horiz(const uint8_t *src8, ptrdiff_t src_stride,
- uint8_t *dst8, ptrdiff_t dst_stride,
- const InterpKernel *x_filters,
- int x0_q4, int x_step_q4,
- int w, int h, int bd) {
- int x, y;
- uint16_t *src = CONVERT_TO_SHORTPTR(src8);
- uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
- src -= SUBPEL_TAPS / 2 - 1;
- for (y = 0; y < h; ++y) {
- int x_q4 = x0_q4;
- for (x = 0; x < w; ++x) {
- const uint16_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
- const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
- int k, sum = 0;
- for (k = 0; k < SUBPEL_TAPS; ++k)
- sum += src_x[k] * x_filter[k];
- dst[x] = ROUND_POWER_OF_TWO(dst[x] +
- clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd), 1);
- x_q4 += x_step_q4;
- }
- src += src_stride;
- dst += dst_stride;
- }
-}
-
-static void highbd_convolve_vert(const uint8_t *src8, ptrdiff_t src_stride,
- uint8_t *dst8, ptrdiff_t dst_stride,
- const InterpKernel *y_filters,
- int y0_q4, int y_step_q4, int w, int h,
- int bd) {
- int x, y;
- uint16_t *src = CONVERT_TO_SHORTPTR(src8);
- uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
- src -= src_stride * (SUBPEL_TAPS / 2 - 1);
- for (x = 0; x < w; ++x) {
- int y_q4 = y0_q4;
- for (y = 0; y < h; ++y) {
- const uint16_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
- const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
- int k, sum = 0;
- for (k = 0; k < SUBPEL_TAPS; ++k)
- sum += src_y[k * src_stride] * y_filter[k];
- dst[y * dst_stride] = clip_pixel_highbd(
- ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd);
- y_q4 += y_step_q4;
- }
- ++src;
- ++dst;
- }
-}
-
-static void highbd_convolve_avg_vert(const uint8_t *src8, ptrdiff_t src_stride,
- uint8_t *dst8, ptrdiff_t dst_stride,
- const InterpKernel *y_filters,
- int y0_q4, int y_step_q4, int w, int h,
- int bd) {
- int x, y;
- uint16_t *src = CONVERT_TO_SHORTPTR(src8);
- uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
- src -= src_stride * (SUBPEL_TAPS / 2 - 1);
- for (x = 0; x < w; ++x) {
- int y_q4 = y0_q4;
- for (y = 0; y < h; ++y) {
- const uint16_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
- const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
- int k, sum = 0;
- for (k = 0; k < SUBPEL_TAPS; ++k)
- sum += src_y[k * src_stride] * y_filter[k];
- dst[y * dst_stride] = ROUND_POWER_OF_TWO(dst[y * dst_stride] +
- clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd), 1);
- y_q4 += y_step_q4;
- }
- ++src;
- ++dst;
- }
-}
-
-static void highbd_convolve(const uint8_t *src, ptrdiff_t src_stride,
- uint8_t *dst, ptrdiff_t dst_stride,
- const InterpKernel *const x_filters,
- int x0_q4, int x_step_q4,
- const InterpKernel *const y_filters,
- int y0_q4, int y_step_q4,
- int w, int h, int bd) {
- // Note: Fixed size intermediate buffer, temp, places limits on parameters.
- // 2d filtering proceeds in 2 steps:
- // (1) Interpolate horizontally into an intermediate buffer, temp.
- // (2) Interpolate temp vertically to derive the sub-pixel result.
- // Deriving the maximum number of rows in the temp buffer (135):
- // --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative).
- // --Largest block size is 64x64 pixels.
- // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the
- // original frame (in 1/16th pixel units).
- // --Must round-up because block may be located at sub-pixel position.
- // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails.
- // --((64 - 1) * 32 + 15) >> 4 + 8 = 135.
- uint16_t temp[64 * 135];
- int intermediate_height =
- (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
-
- assert(w <= 64);
- assert(h <= 64);
- assert(y_step_q4 <= 32);
- assert(x_step_q4 <= 32);
-
- highbd_convolve_horiz(src - src_stride * (SUBPEL_TAPS / 2 - 1),
- src_stride, CONVERT_TO_BYTEPTR(temp), 64,
- x_filters, x0_q4, x_step_q4, w,
- intermediate_height, bd);
- highbd_convolve_vert(CONVERT_TO_BYTEPTR(temp) + 64 * (SUBPEL_TAPS / 2 - 1),
- 64, dst, dst_stride, y_filters, y0_q4, y_step_q4,
- w, h, bd);
-}
-
-
-void vpx_highbd_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
- uint8_t *dst, ptrdiff_t dst_stride,
- const int16_t *filter_x, int x_step_q4,
- const int16_t *filter_y, int y_step_q4,
- int w, int h, int bd) {
- const InterpKernel *const filters_x = get_filter_base(filter_x);
- const int x0_q4 = get_filter_offset(filter_x, filters_x);
- (void)filter_y;
- (void)y_step_q4;
-
- highbd_convolve_horiz(src, src_stride, dst, dst_stride, filters_x,
- x0_q4, x_step_q4, w, h, bd);
-}
-
-void vpx_highbd_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
- uint8_t *dst, ptrdiff_t dst_stride,
- const int16_t *filter_x, int x_step_q4,
- const int16_t *filter_y, int y_step_q4,
- int w, int h, int bd) {
- const InterpKernel *const filters_x = get_filter_base(filter_x);
- const int x0_q4 = get_filter_offset(filter_x, filters_x);
- (void)filter_y;
- (void)y_step_q4;
-
- highbd_convolve_avg_horiz(src, src_stride, dst, dst_stride, filters_x,
- x0_q4, x_step_q4, w, h, bd);
-}
-
-void vpx_highbd_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride,
- uint8_t *dst, ptrdiff_t dst_stride,
- const int16_t *filter_x, int x_step_q4,
- const int16_t *filter_y, int y_step_q4,
- int w, int h, int bd) {
- const InterpKernel *const filters_y = get_filter_base(filter_y);
- const int y0_q4 = get_filter_offset(filter_y, filters_y);
- (void)filter_x;
- (void)x_step_q4;
-
- highbd_convolve_vert(src, src_stride, dst, dst_stride, filters_y,
- y0_q4, y_step_q4, w, h, bd);
-}
-
-void vpx_highbd_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride,
- uint8_t *dst, ptrdiff_t dst_stride,
- const int16_t *filter_x, int x_step_q4,
- const int16_t *filter_y, int y_step_q4,
- int w, int h, int bd) {
- const InterpKernel *const filters_y = get_filter_base(filter_y);
- const int y0_q4 = get_filter_offset(filter_y, filters_y);
- (void)filter_x;
- (void)x_step_q4;
-
- highbd_convolve_avg_vert(src, src_stride, dst, dst_stride, filters_y,
- y0_q4, y_step_q4, w, h, bd);
-}
-
-void vpx_highbd_convolve8_c(const uint8_t *src, ptrdiff_t src_stride,
- uint8_t *dst, ptrdiff_t dst_stride,
- const int16_t *filter_x, int x_step_q4,
- const int16_t *filter_y, int y_step_q4,
- int w, int h, int bd) {
- const InterpKernel *const filters_x = get_filter_base(filter_x);
- const int x0_q4 = get_filter_offset(filter_x, filters_x);
-
- const InterpKernel *const filters_y = get_filter_base(filter_y);
- const int y0_q4 = get_filter_offset(filter_y, filters_y);
-
- highbd_convolve(src, src_stride, dst, dst_stride,
- filters_x, x0_q4, x_step_q4,
- filters_y, y0_q4, y_step_q4, w, h, bd);
-}
-
-void vpx_highbd_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride,
- uint8_t *dst, ptrdiff_t dst_stride,
- const int16_t *filter_x, int x_step_q4,
- const int16_t *filter_y, int y_step_q4,
- int w, int h, int bd) {
- // Fixed size intermediate buffer places limits on parameters.
- DECLARE_ALIGNED(16, uint16_t, temp[64 * 64]);
- assert(w <= 64);
- assert(h <= 64);
-
- vpx_highbd_convolve8_c(src, src_stride, CONVERT_TO_BYTEPTR(temp), 64,
- filter_x, x_step_q4, filter_y, y_step_q4, w, h, bd);
- vpx_highbd_convolve_avg_c(CONVERT_TO_BYTEPTR(temp), 64, dst, dst_stride,
- NULL, 0, NULL, 0, w, h, bd);
-}
-
-void vpx_highbd_convolve_copy_c(const uint8_t *src8, ptrdiff_t src_stride,
- uint8_t *dst8, ptrdiff_t dst_stride,
- const int16_t *filter_x, int filter_x_stride,
- const int16_t *filter_y, int filter_y_stride,
- int w, int h, int bd) {
- int r;
- uint16_t *src = CONVERT_TO_SHORTPTR(src8);
- uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
- (void)filter_x;
- (void)filter_y;
- (void)filter_x_stride;
- (void)filter_y_stride;
- (void)bd;
-
- for (r = h; r > 0; --r) {
- memcpy(dst, src, w * sizeof(uint16_t));
- src += src_stride;
- dst += dst_stride;
- }
-}
-
-void vpx_highbd_convolve_avg_c(const uint8_t *src8, ptrdiff_t src_stride,
- uint8_t *dst8, ptrdiff_t dst_stride,
- const int16_t *filter_x, int filter_x_stride,
- const int16_t *filter_y, int filter_y_stride,
- int w, int h, int bd) {
- int x, y;
- uint16_t *src = CONVERT_TO_SHORTPTR(src8);
- uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
- (void)filter_x;
- (void)filter_y;
- (void)filter_x_stride;
- (void)filter_y_stride;
- (void)bd;
-
- for (y = 0; y < h; ++y) {
- for (x = 0; x < w; ++x) {
- dst[x] = ROUND_POWER_OF_TWO(dst[x] + src[x], 1);
- }
- src += src_stride;
- dst += dst_stride;
- }
-}
-#endif
diff --git a/thirdparty/libvpx/vpx_dsp/vpx_convolve.h b/thirdparty/libvpx/vpx_dsp/vpx_convolve.h
deleted file mode 100644
index 9ed3f1750f..0000000000
--- a/thirdparty/libvpx/vpx_dsp/vpx_convolve.h
+++ /dev/null
@@ -1,38 +0,0 @@
-/*
- * Copyright (c) 2013 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-#ifndef VPX_DSP_VPX_CONVOLVE_H_
-#define VPX_DSP_VPX_CONVOLVE_H_
-
-#include "./vpx_config.h"
-#include "vpx/vpx_integer.h"
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-typedef void (*convolve_fn_t)(const uint8_t *src, ptrdiff_t src_stride,
- uint8_t *dst, ptrdiff_t dst_stride,
- const int16_t *filter_x, int x_step_q4,
- const int16_t *filter_y, int y_step_q4,
- int w, int h);
-
-#if CONFIG_VP9_HIGHBITDEPTH
-typedef void (*highbd_convolve_fn_t)(const uint8_t *src, ptrdiff_t src_stride,
- uint8_t *dst, ptrdiff_t dst_stride,
- const int16_t *filter_x, int x_step_q4,
- const int16_t *filter_y, int y_step_q4,
- int w, int h, int bd);
-#endif
-
-#ifdef __cplusplus
-} // extern "C"
-#endif
-
-#endif // VPX_DSP_VPX_CONVOLVE_H_
diff --git a/thirdparty/libvpx/vpx_dsp/vpx_dsp_common.h b/thirdparty/libvpx/vpx_dsp/vpx_dsp_common.h
deleted file mode 100644
index a1d0a51ef5..0000000000
--- a/thirdparty/libvpx/vpx_dsp/vpx_dsp_common.h
+++ /dev/null
@@ -1,69 +0,0 @@
-/*
- * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#ifndef VPX_DSP_VPX_DSP_COMMON_H_
-#define VPX_DSP_VPX_DSP_COMMON_H_
-
-#include "./vpx_config.h"
-#include "vpx/vpx_integer.h"
-#include "vpx_ports/mem.h"
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-#define VPXMIN(x, y) (((x) < (y)) ? (x) : (y))
-#define VPXMAX(x, y) (((x) > (y)) ? (x) : (y))
-
-#if CONFIG_VP9_HIGHBITDEPTH
-// Note:
-// tran_low_t is the datatype used for final transform coefficients.
-// tran_high_t is the datatype used for intermediate transform stages.
-typedef int64_t tran_high_t;
-typedef int32_t tran_low_t;
-#else
-// Note:
-// tran_low_t is the datatype used for final transform coefficients.
-// tran_high_t is the datatype used for intermediate transform stages.
-typedef int32_t tran_high_t;
-typedef int16_t tran_low_t;
-#endif // CONFIG_VP9_HIGHBITDEPTH
-
-static INLINE uint8_t clip_pixel(int val) {
- return (val > 255) ? 255 : (val < 0) ? 0 : val;
-}
-
-static INLINE int clamp(int value, int low, int high) {
- return value < low ? low : (value > high ? high : value);
-}
-
-static INLINE double fclamp(double value, double low, double high) {
- return value < low ? low : (value > high ? high : value);
-}
-
-#if CONFIG_VP9_HIGHBITDEPTH
-static INLINE uint16_t clip_pixel_highbd(int val, int bd) {
- switch (bd) {
- case 8:
- default:
- return (uint16_t)clamp(val, 0, 255);
- case 10:
- return (uint16_t)clamp(val, 0, 1023);
- case 12:
- return (uint16_t)clamp(val, 0, 4095);
- }
-}
-#endif // CONFIG_VP9_HIGHBITDEPTH
-
-#ifdef __cplusplus
-} // extern "C"
-#endif
-
-#endif // VPX_DSP_VPX_DSP_COMMON_H_
diff --git a/thirdparty/libvpx/vpx_dsp/vpx_dsp_rtcd.c b/thirdparty/libvpx/vpx_dsp/vpx_dsp_rtcd.c
deleted file mode 100644
index 5fe27b614b..0000000000
--- a/thirdparty/libvpx/vpx_dsp/vpx_dsp_rtcd.c
+++ /dev/null
@@ -1,17 +0,0 @@
-/*
- * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-#include "./vpx_config.h"
-#define RTCD_C
-#include "./vpx_dsp_rtcd.h"
-#include "vpx_ports/vpx_once.h"
-
-void vpx_dsp_rtcd() {
- once(setup_rtcd_internal);
-}
diff --git a/thirdparty/libvpx/vpx_dsp/vpx_filter.h b/thirdparty/libvpx/vpx_dsp/vpx_filter.h
deleted file mode 100644
index 2617febf3b..0000000000
--- a/thirdparty/libvpx/vpx_dsp/vpx_filter.h
+++ /dev/null
@@ -1,34 +0,0 @@
-/*
- * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#ifndef VPX_DSP_VPX_FILTER_H_
-#define VPX_DSP_VPX_FILTER_H_
-
-#include "vpx/vpx_integer.h"
-
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-#define FILTER_BITS 7
-
-#define SUBPEL_BITS 4
-#define SUBPEL_MASK ((1 << SUBPEL_BITS) - 1)
-#define SUBPEL_SHIFTS (1 << SUBPEL_BITS)
-#define SUBPEL_TAPS 8
-
-typedef int16_t InterpKernel[SUBPEL_TAPS];
-
-#ifdef __cplusplus
-} // extern "C"
-#endif
-
-#endif // VPX_DSP_VPX_FILTER_H_
diff --git a/thirdparty/libvpx/vpx_dsp/x86/convolve.h b/thirdparty/libvpx/vpx_dsp/x86/convolve.h
deleted file mode 100644
index 7e43eb7c72..0000000000
--- a/thirdparty/libvpx/vpx_dsp/x86/convolve.h
+++ /dev/null
@@ -1,274 +0,0 @@
-/*
- * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-#ifndef VPX_DSP_X86_CONVOLVE_H_
-#define VPX_DSP_X86_CONVOLVE_H_
-
-#include <assert.h>
-
-#include "./vpx_config.h"
-#include "vpx/vpx_integer.h"
-#include "vpx_ports/mem.h"
-
-typedef void filter8_1dfunction (
- const uint8_t *src_ptr,
- ptrdiff_t src_pitch,
- uint8_t *output_ptr,
- ptrdiff_t out_pitch,
- uint32_t output_height,
- const int16_t *filter
-);
-
-#define FUN_CONV_1D(name, step_q4, filter, dir, src_start, avg, opt) \
- void vpx_convolve8_##name##_##opt(const uint8_t *src, ptrdiff_t src_stride, \
- uint8_t *dst, ptrdiff_t dst_stride, \
- const int16_t *filter_x, int x_step_q4, \
- const int16_t *filter_y, int y_step_q4, \
- int w, int h) { \
- assert(filter[3] != 128); \
- assert(step_q4 == 16); \
- if (filter[0] | filter[1] | filter[2]) { \
- while (w >= 16) { \
- vpx_filter_block1d16_##dir##8_##avg##opt(src_start, \
- src_stride, \
- dst, \
- dst_stride, \
- h, \
- filter); \
- src += 16; \
- dst += 16; \
- w -= 16; \
- } \
- if (w == 8) { \
- vpx_filter_block1d8_##dir##8_##avg##opt(src_start, \
- src_stride, \
- dst, \
- dst_stride, \
- h, \
- filter); \
- } else if (w == 4) { \
- vpx_filter_block1d4_##dir##8_##avg##opt(src_start, \
- src_stride, \
- dst, \
- dst_stride, \
- h, \
- filter); \
- } \
- } else { \
- while (w >= 16) { \
- vpx_filter_block1d16_##dir##2_##avg##opt(src, \
- src_stride, \
- dst, \
- dst_stride, \
- h, \
- filter); \
- src += 16; \
- dst += 16; \
- w -= 16; \
- } \
- if (w == 8) { \
- vpx_filter_block1d8_##dir##2_##avg##opt(src, \
- src_stride, \
- dst, \
- dst_stride, \
- h, \
- filter); \
- } else if (w == 4) { \
- vpx_filter_block1d4_##dir##2_##avg##opt(src, \
- src_stride, \
- dst, \
- dst_stride, \
- h, \
- filter); \
- } \
- } \
-}
-
-#define FUN_CONV_2D(avg, opt) \
-void vpx_convolve8_##avg##opt(const uint8_t *src, ptrdiff_t src_stride, \
- uint8_t *dst, ptrdiff_t dst_stride, \
- const int16_t *filter_x, int x_step_q4, \
- const int16_t *filter_y, int y_step_q4, \
- int w, int h) { \
- assert(filter_x[3] != 128); \
- assert(filter_y[3] != 128); \
- assert(w <= 64); \
- assert(h <= 64); \
- assert(x_step_q4 == 16); \
- assert(y_step_q4 == 16); \
- if (filter_x[0] | filter_x[1] | filter_x[2]) { \
- DECLARE_ALIGNED(16, uint8_t, fdata2[64 * 71]); \
- vpx_convolve8_horiz_##opt(src - 3 * src_stride, src_stride, fdata2, 64, \
- filter_x, x_step_q4, filter_y, y_step_q4, \
- w, h + 7); \
- vpx_convolve8_##avg##vert_##opt(fdata2 + 3 * 64, 64, dst, dst_stride, \
- filter_x, x_step_q4, filter_y, \
- y_step_q4, w, h); \
- } else { \
- DECLARE_ALIGNED(16, uint8_t, fdata2[64 * 65]); \
- vpx_convolve8_horiz_##opt(src, src_stride, fdata2, 64, \
- filter_x, x_step_q4, filter_y, y_step_q4, \
- w, h + 1); \
- vpx_convolve8_##avg##vert_##opt(fdata2, 64, dst, dst_stride, \
- filter_x, x_step_q4, filter_y, \
- y_step_q4, w, h); \
- } \
-}
-
-#if CONFIG_VP9_HIGHBITDEPTH
-
-typedef void highbd_filter8_1dfunction (
- const uint16_t *src_ptr,
- const ptrdiff_t src_pitch,
- uint16_t *output_ptr,
- ptrdiff_t out_pitch,
- unsigned int output_height,
- const int16_t *filter,
- int bd
-);
-
-#define HIGH_FUN_CONV_1D(name, step_q4, filter, dir, src_start, avg, opt) \
- void vpx_highbd_convolve8_##name##_##opt(const uint8_t *src8, \
- ptrdiff_t src_stride, \
- uint8_t *dst8, \
- ptrdiff_t dst_stride, \
- const int16_t *filter_x, \
- int x_step_q4, \
- const int16_t *filter_y, \
- int y_step_q4, \
- int w, int h, int bd) { \
- if (step_q4 == 16 && filter[3] != 128) { \
- uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
- uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
- if (filter[0] | filter[1] | filter[2]) { \
- while (w >= 16) { \
- vpx_highbd_filter_block1d16_##dir##8_##avg##opt(src_start, \
- src_stride, \
- dst, \
- dst_stride, \
- h, \
- filter, \
- bd); \
- src += 16; \
- dst += 16; \
- w -= 16; \
- } \
- while (w >= 8) { \
- vpx_highbd_filter_block1d8_##dir##8_##avg##opt(src_start, \
- src_stride, \
- dst, \
- dst_stride, \
- h, \
- filter, \
- bd); \
- src += 8; \
- dst += 8; \
- w -= 8; \
- } \
- while (w >= 4) { \
- vpx_highbd_filter_block1d4_##dir##8_##avg##opt(src_start, \
- src_stride, \
- dst, \
- dst_stride, \
- h, \
- filter, \
- bd); \
- src += 4; \
- dst += 4; \
- w -= 4; \
- } \
- } else { \
- while (w >= 16) { \
- vpx_highbd_filter_block1d16_##dir##2_##avg##opt(src, \
- src_stride, \
- dst, \
- dst_stride, \
- h, \
- filter, \
- bd); \
- src += 16; \
- dst += 16; \
- w -= 16; \
- } \
- while (w >= 8) { \
- vpx_highbd_filter_block1d8_##dir##2_##avg##opt(src, \
- src_stride, \
- dst, \
- dst_stride, \
- h, \
- filter, \
- bd); \
- src += 8; \
- dst += 8; \
- w -= 8; \
- } \
- while (w >= 4) { \
- vpx_highbd_filter_block1d4_##dir##2_##avg##opt(src, \
- src_stride, \
- dst, \
- dst_stride, \
- h, \
- filter, \
- bd); \
- src += 4; \
- dst += 4; \
- w -= 4; \
- } \
- } \
- } \
- if (w) { \
- vpx_highbd_convolve8_##name##_c(src8, src_stride, dst8, dst_stride, \
- filter_x, x_step_q4, filter_y, y_step_q4, \
- w, h, bd); \
- } \
-}
-
-#define HIGH_FUN_CONV_2D(avg, opt) \
-void vpx_highbd_convolve8_##avg##opt(const uint8_t *src, ptrdiff_t src_stride, \
- uint8_t *dst, ptrdiff_t dst_stride, \
- const int16_t *filter_x, int x_step_q4, \
- const int16_t *filter_y, int y_step_q4, \
- int w, int h, int bd) { \
- assert(w <= 64); \
- assert(h <= 64); \
- if (x_step_q4 == 16 && y_step_q4 == 16) { \
- if ((filter_x[0] | filter_x[1] | filter_x[2]) || filter_x[3] == 128) { \
- DECLARE_ALIGNED(16, uint16_t, fdata2[64 * 71]); \
- vpx_highbd_convolve8_horiz_##opt(src - 3 * src_stride, src_stride, \
- CONVERT_TO_BYTEPTR(fdata2), 64, \
- filter_x, x_step_q4, \
- filter_y, y_step_q4, \
- w, h + 7, bd); \
- vpx_highbd_convolve8_##avg##vert_##opt(CONVERT_TO_BYTEPTR(fdata2) + 192, \
- 64, dst, dst_stride, \
- filter_x, x_step_q4, \
- filter_y, y_step_q4, \
- w, h, bd); \
- } else { \
- DECLARE_ALIGNED(16, uint16_t, fdata2[64 * 65]); \
- vpx_highbd_convolve8_horiz_##opt(src, src_stride, \
- CONVERT_TO_BYTEPTR(fdata2), 64, \
- filter_x, x_step_q4, \
- filter_y, y_step_q4, \
- w, h + 1, bd); \
- vpx_highbd_convolve8_##avg##vert_##opt(CONVERT_TO_BYTEPTR(fdata2), 64, \
- dst, dst_stride, \
- filter_x, x_step_q4, \
- filter_y, y_step_q4, \
- w, h, bd); \
- } \
- } else { \
- vpx_highbd_convolve8_##avg##c(src, src_stride, dst, dst_stride, \
- filter_x, x_step_q4, filter_y, y_step_q4, w, \
- h, bd); \
- } \
-}
-#endif // CONFIG_VP9_HIGHBITDEPTH
-
-#endif // VPX_DSP_X86_CONVOLVE_H_
diff --git a/thirdparty/libvpx/vpx_dsp/x86/intrapred_sse2.asm b/thirdparty/libvpx/vpx_dsp/x86/intrapred_sse2.asm
deleted file mode 100644
index cd6a6ae982..0000000000
--- a/thirdparty/libvpx/vpx_dsp/x86/intrapred_sse2.asm
+++ /dev/null
@@ -1,860 +0,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-%include "third_party/x86inc/x86inc.asm"
-
-SECTION_RODATA
-pb_1: times 16 db 1
-pw_4: times 8 dw 4
-pw_8: times 8 dw 8
-pw_16: times 8 dw 16
-pw_32: times 8 dw 32
-dc_128: times 16 db 128
-pw2_4: times 8 dw 2
-pw2_8: times 8 dw 4
-pw2_16: times 8 dw 8
-pw2_32: times 8 dw 16
-
-SECTION .text
-
-; ------------------------------------------
-; input: x, y, z, result
-;
-; trick from pascal
-; (x+2y+z+2)>>2 can be calculated as:
-; result = avg(x,z)
-; result -= xor(x,z) & 1
-; result = avg(result,y)
-; ------------------------------------------
-%macro X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 4
- pavgb %4, %1, %3
- pxor %3, %1
- pand %3, [GLOBAL(pb_1)]
- psubb %4, %3
- pavgb %4, %2
-%endmacro
-
-INIT_XMM sse2
-cglobal d45_predictor_4x4, 3, 4, 4, dst, stride, above, goffset
- GET_GOT goffsetq
-
- movq m0, [aboveq]
- DEFINE_ARGS dst, stride, temp
- psrldq m1, m0, 1
- psrldq m2, m0, 2
- X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m1, m2, m3
-
- ; store 4 lines
- movd [dstq ], m3
- psrlq m3, 8
- movd [dstq+strideq ], m3
- lea dstq, [dstq+strideq*2]
- psrlq m3, 8
- movd [dstq ], m3
- psrlq m3, 8
- movd [dstq+strideq ], m3
- psrlq m0, 56
- movd tempq, m0
- mov [dstq+strideq+3], tempb
-
- RESTORE_GOT
- RET
-
-INIT_XMM sse2
-cglobal d45_predictor_8x8, 3, 4, 4, dst, stride, above, goffset
- GET_GOT goffsetq
-
- movu m1, [aboveq]
- pslldq m0, m1, 1
- psrldq m2, m1, 1
- DEFINE_ARGS dst, stride, stride3
- lea stride3q, [strideq*3]
- X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m1, m2, m3
- punpckhbw m0, m0 ; 7 7
- punpcklwd m0, m0 ; 7 7 7 7
- punpckldq m0, m0 ; 7 7 7 7 7 7 7 7
- punpcklqdq m3, m0 ; -1 0 1 2 3 4 5 6 7 7 7 7 7 7 7 7
-
- ; store 4 lines
- psrldq m3, 1
- movq [dstq ], m3
- psrldq m3, 1
- movq [dstq+strideq ], m3
- psrldq m3, 1
- movq [dstq+strideq*2], m3
- psrldq m3, 1
- movq [dstq+stride3q ], m3
- lea dstq, [dstq+strideq*4]
-
- ; store next 4 lines
- psrldq m3, 1
- movq [dstq ], m3
- psrldq m3, 1
- movq [dstq+strideq ], m3
- psrldq m3, 1
- movq [dstq+strideq*2], m3
- psrldq m3, 1
- movq [dstq+stride3q ], m3
-
- RESTORE_GOT
- RET
-
-INIT_XMM sse2
-cglobal d207_predictor_4x4, 4, 4, 5, dst, stride, unused, left, goffset
- GET_GOT goffsetq
-
- movd m0, [leftq] ; abcd [byte]
- punpcklbw m4, m0, m0 ; aabb ccdd
- punpcklwd m4, m4 ; aaaa bbbb cccc dddd
- psrldq m4, 12 ; dddd
- punpckldq m0, m4 ; abcd dddd
- psrldq m1, m0, 1 ; bcdd
- psrldq m2, m0, 2 ; cddd
-
- X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m1, m2, m3 ; a2bc b2cd c3d d
- pavgb m1, m0 ; ab, bc, cd, d [byte]
-
- punpcklbw m1, m3 ; ab, a2bc, bc, b2cd, cd, c3d, d, d
- movd [dstq ], m1
- psrlq m1, 16 ; bc, b2cd, cd, c3d, d, d
- movd [dstq+strideq], m1
-
- lea dstq, [dstq+strideq*2]
- psrlq m1, 16 ; cd, c3d, d, d
- movd [dstq ], m1
- movd [dstq+strideq], m4 ; d, d, d, d
- RESTORE_GOT
- RET
-
-INIT_XMM sse2
-cglobal dc_predictor_4x4, 4, 5, 3, dst, stride, above, left, goffset
- GET_GOT goffsetq
-
- movd m2, [leftq]
- movd m0, [aboveq]
- pxor m1, m1
- punpckldq m0, m2
- psadbw m0, m1
- paddw m0, [GLOBAL(pw_4)]
- psraw m0, 3
- pshuflw m0, m0, 0x0
- packuswb m0, m0
- movd [dstq ], m0
- movd [dstq+strideq], m0
- lea dstq, [dstq+strideq*2]
- movd [dstq ], m0
- movd [dstq+strideq], m0
-
- RESTORE_GOT
- RET
-
-INIT_XMM sse2
-cglobal dc_left_predictor_4x4, 2, 5, 2, dst, stride, above, left, goffset
- movifnidn leftq, leftmp
- GET_GOT goffsetq
-
- pxor m1, m1
- movd m0, [leftq]
- psadbw m0, m1
- paddw m0, [GLOBAL(pw2_4)]
- psraw m0, 2
- pshuflw m0, m0, 0x0
- packuswb m0, m0
- movd [dstq ], m0
- movd [dstq+strideq], m0
- lea dstq, [dstq+strideq*2]
- movd [dstq ], m0
- movd [dstq+strideq], m0
-
- RESTORE_GOT
- RET
-
-INIT_XMM sse2
-cglobal dc_top_predictor_4x4, 3, 5, 2, dst, stride, above, left, goffset
- GET_GOT goffsetq
-
- pxor m1, m1
- movd m0, [aboveq]
- psadbw m0, m1
- paddw m0, [GLOBAL(pw2_4)]
- psraw m0, 2
- pshuflw m0, m0, 0x0
- packuswb m0, m0
- movd [dstq ], m0
- movd [dstq+strideq], m0
- lea dstq, [dstq+strideq*2]
- movd [dstq ], m0
- movd [dstq+strideq], m0
-
- RESTORE_GOT
- RET
-
-INIT_XMM sse2
-cglobal dc_predictor_8x8, 4, 5, 3, dst, stride, above, left, goffset
- GET_GOT goffsetq
-
- pxor m1, m1
- movq m0, [aboveq]
- movq m2, [leftq]
- DEFINE_ARGS dst, stride, stride3
- lea stride3q, [strideq*3]
- psadbw m0, m1
- psadbw m2, m1
- paddw m0, m2
- paddw m0, [GLOBAL(pw_8)]
- psraw m0, 4
- punpcklbw m0, m0
- pshuflw m0, m0, 0x0
- movq [dstq ], m0
- movq [dstq+strideq ], m0
- movq [dstq+strideq*2], m0
- movq [dstq+stride3q ], m0
- lea dstq, [dstq+strideq*4]
- movq [dstq ], m0
- movq [dstq+strideq ], m0
- movq [dstq+strideq*2], m0
- movq [dstq+stride3q ], m0
-
- RESTORE_GOT
- RET
-
-INIT_XMM sse2
-cglobal dc_top_predictor_8x8, 3, 5, 2, dst, stride, above, left, goffset
- GET_GOT goffsetq
-
- pxor m1, m1
- movq m0, [aboveq]
- DEFINE_ARGS dst, stride, stride3
- lea stride3q, [strideq*3]
- psadbw m0, m1
- paddw m0, [GLOBAL(pw2_8)]
- psraw m0, 3
- punpcklbw m0, m0
- pshuflw m0, m0, 0x0
- movq [dstq ], m0
- movq [dstq+strideq ], m0
- movq [dstq+strideq*2], m0
- movq [dstq+stride3q ], m0
- lea dstq, [dstq+strideq*4]
- movq [dstq ], m0
- movq [dstq+strideq ], m0
- movq [dstq+strideq*2], m0
- movq [dstq+stride3q ], m0
-
- RESTORE_GOT
- RET
-
-INIT_XMM sse2
-cglobal dc_left_predictor_8x8, 2, 5, 2, dst, stride, above, left, goffset
- movifnidn leftq, leftmp
- GET_GOT goffsetq
-
- pxor m1, m1
- movq m0, [leftq]
- DEFINE_ARGS dst, stride, stride3
- lea stride3q, [strideq*3]
- psadbw m0, m1
- paddw m0, [GLOBAL(pw2_8)]
- psraw m0, 3
- punpcklbw m0, m0
- pshuflw m0, m0, 0x0
- movq [dstq ], m0
- movq [dstq+strideq ], m0
- movq [dstq+strideq*2], m0
- movq [dstq+stride3q ], m0
- lea dstq, [dstq+strideq*4]
- movq [dstq ], m0
- movq [dstq+strideq ], m0
- movq [dstq+strideq*2], m0
- movq [dstq+stride3q ], m0
-
- RESTORE_GOT
- RET
-
-INIT_XMM sse2
-cglobal dc_128_predictor_4x4, 2, 5, 1, dst, stride, above, left, goffset
- GET_GOT goffsetq
-
- DEFINE_ARGS dst, stride, stride3
- lea stride3q, [strideq*3]
- movd m0, [GLOBAL(dc_128)]
- movd [dstq ], m0
- movd [dstq+strideq ], m0
- movd [dstq+strideq*2], m0
- movd [dstq+stride3q ], m0
- RESTORE_GOT
- RET
-
-INIT_XMM sse2
-cglobal dc_128_predictor_8x8, 2, 5, 1, dst, stride, above, left, goffset
- GET_GOT goffsetq
-
- DEFINE_ARGS dst, stride, stride3
- lea stride3q, [strideq*3]
- movq m0, [GLOBAL(dc_128)]
- movq [dstq ], m0
- movq [dstq+strideq ], m0
- movq [dstq+strideq*2], m0
- movq [dstq+stride3q ], m0
- lea dstq, [dstq+strideq*4]
- movq [dstq ], m0
- movq [dstq+strideq ], m0
- movq [dstq+strideq*2], m0
- movq [dstq+stride3q ], m0
- RESTORE_GOT
- RET
-
-INIT_XMM sse2
-cglobal dc_predictor_16x16, 4, 5, 3, dst, stride, above, left, goffset
- GET_GOT goffsetq
-
- pxor m1, m1
- mova m0, [aboveq]
- mova m2, [leftq]
- DEFINE_ARGS dst, stride, stride3, lines4
- lea stride3q, [strideq*3]
- mov lines4d, 4
- psadbw m0, m1
- psadbw m2, m1
- paddw m0, m2
- movhlps m2, m0
- paddw m0, m2
- paddw m0, [GLOBAL(pw_16)]
- psraw m0, 5
- pshuflw m0, m0, 0x0
- punpcklqdq m0, m0
- packuswb m0, m0
-.loop:
- mova [dstq ], m0
- mova [dstq+strideq ], m0
- mova [dstq+strideq*2], m0
- mova [dstq+stride3q ], m0
- lea dstq, [dstq+strideq*4]
- dec lines4d
- jnz .loop
-
- RESTORE_GOT
- REP_RET
-
-
-INIT_XMM sse2
-cglobal dc_top_predictor_16x16, 4, 5, 3, dst, stride, above, left, goffset
- GET_GOT goffsetq
-
- pxor m1, m1
- mova m0, [aboveq]
- DEFINE_ARGS dst, stride, stride3, lines4
- lea stride3q, [strideq*3]
- mov lines4d, 4
- psadbw m0, m1
- movhlps m2, m0
- paddw m0, m2
- paddw m0, [GLOBAL(pw2_16)]
- psraw m0, 4
- pshuflw m0, m0, 0x0
- punpcklqdq m0, m0
- packuswb m0, m0
-.loop:
- mova [dstq ], m0
- mova [dstq+strideq ], m0
- mova [dstq+strideq*2], m0
- mova [dstq+stride3q ], m0
- lea dstq, [dstq+strideq*4]
- dec lines4d
- jnz .loop
-
- RESTORE_GOT
- REP_RET
-
-INIT_XMM sse2
-cglobal dc_left_predictor_16x16, 4, 5, 3, dst, stride, above, left, goffset
- GET_GOT goffsetq
-
- pxor m1, m1
- mova m0, [leftq]
- DEFINE_ARGS dst, stride, stride3, lines4
- lea stride3q, [strideq*3]
- mov lines4d, 4
- psadbw m0, m1
- movhlps m2, m0
- paddw m0, m2
- paddw m0, [GLOBAL(pw2_16)]
- psraw m0, 4
- pshuflw m0, m0, 0x0
- punpcklqdq m0, m0
- packuswb m0, m0
-.loop:
- mova [dstq ], m0
- mova [dstq+strideq ], m0
- mova [dstq+strideq*2], m0
- mova [dstq+stride3q ], m0
- lea dstq, [dstq+strideq*4]
- dec lines4d
- jnz .loop
-
- RESTORE_GOT
- REP_RET
-
-INIT_XMM sse2
-cglobal dc_128_predictor_16x16, 4, 5, 3, dst, stride, above, left, goffset
- GET_GOT goffsetq
-
- DEFINE_ARGS dst, stride, stride3, lines4
- lea stride3q, [strideq*3]
- mov lines4d, 4
- mova m0, [GLOBAL(dc_128)]
-.loop:
- mova [dstq ], m0
- mova [dstq+strideq ], m0
- mova [dstq+strideq*2], m0
- mova [dstq+stride3q ], m0
- lea dstq, [dstq+strideq*4]
- dec lines4d
- jnz .loop
- RESTORE_GOT
- RET
-
-
-INIT_XMM sse2
-cglobal dc_predictor_32x32, 4, 5, 5, dst, stride, above, left, goffset
- GET_GOT goffsetq
-
- pxor m1, m1
- mova m0, [aboveq]
- mova m2, [aboveq+16]
- mova m3, [leftq]
- mova m4, [leftq+16]
- DEFINE_ARGS dst, stride, stride3, lines4
- lea stride3q, [strideq*3]
- mov lines4d, 8
- psadbw m0, m1
- psadbw m2, m1
- psadbw m3, m1
- psadbw m4, m1
- paddw m0, m2
- paddw m0, m3
- paddw m0, m4
- movhlps m2, m0
- paddw m0, m2
- paddw m0, [GLOBAL(pw_32)]
- psraw m0, 6
- pshuflw m0, m0, 0x0
- punpcklqdq m0, m0
- packuswb m0, m0
-.loop:
- mova [dstq ], m0
- mova [dstq +16], m0
- mova [dstq+strideq ], m0
- mova [dstq+strideq +16], m0
- mova [dstq+strideq*2 ], m0
- mova [dstq+strideq*2+16], m0
- mova [dstq+stride3q ], m0
- mova [dstq+stride3q +16], m0
- lea dstq, [dstq+strideq*4]
- dec lines4d
- jnz .loop
-
- RESTORE_GOT
- REP_RET
-
-INIT_XMM sse2
-cglobal dc_top_predictor_32x32, 4, 5, 5, dst, stride, above, left, goffset
- GET_GOT goffsetq
-
- pxor m1, m1
- mova m0, [aboveq]
- mova m2, [aboveq+16]
- DEFINE_ARGS dst, stride, stride3, lines4
- lea stride3q, [strideq*3]
- mov lines4d, 8
- psadbw m0, m1
- psadbw m2, m1
- paddw m0, m2
- movhlps m2, m0
- paddw m0, m2
- paddw m0, [GLOBAL(pw2_32)]
- psraw m0, 5
- pshuflw m0, m0, 0x0
- punpcklqdq m0, m0
- packuswb m0, m0
-.loop:
- mova [dstq ], m0
- mova [dstq +16], m0
- mova [dstq+strideq ], m0
- mova [dstq+strideq +16], m0
- mova [dstq+strideq*2 ], m0
- mova [dstq+strideq*2+16], m0
- mova [dstq+stride3q ], m0
- mova [dstq+stride3q +16], m0
- lea dstq, [dstq+strideq*4]
- dec lines4d
- jnz .loop
-
- RESTORE_GOT
- REP_RET
-
-INIT_XMM sse2
-cglobal dc_left_predictor_32x32, 4, 5, 5, dst, stride, above, left, goffset
- GET_GOT goffsetq
-
- pxor m1, m1
- mova m0, [leftq]
- mova m2, [leftq+16]
- DEFINE_ARGS dst, stride, stride3, lines4
- lea stride3q, [strideq*3]
- mov lines4d, 8
- psadbw m0, m1
- psadbw m2, m1
- paddw m0, m2
- movhlps m2, m0
- paddw m0, m2
- paddw m0, [GLOBAL(pw2_32)]
- psraw m0, 5
- pshuflw m0, m0, 0x0
- punpcklqdq m0, m0
- packuswb m0, m0
-.loop:
- mova [dstq ], m0
- mova [dstq +16], m0
- mova [dstq+strideq ], m0
- mova [dstq+strideq +16], m0
- mova [dstq+strideq*2 ], m0
- mova [dstq+strideq*2+16], m0
- mova [dstq+stride3q ], m0
- mova [dstq+stride3q +16], m0
- lea dstq, [dstq+strideq*4]
- dec lines4d
- jnz .loop
-
- RESTORE_GOT
- REP_RET
-
-INIT_XMM sse2
-cglobal dc_128_predictor_32x32, 4, 5, 3, dst, stride, above, left, goffset
- GET_GOT goffsetq
-
- DEFINE_ARGS dst, stride, stride3, lines4
- lea stride3q, [strideq*3]
- mov lines4d, 8
- mova m0, [GLOBAL(dc_128)]
-.loop:
- mova [dstq ], m0
- mova [dstq +16], m0
- mova [dstq+strideq ], m0
- mova [dstq+strideq +16], m0
- mova [dstq+strideq*2 ], m0
- mova [dstq+strideq*2+16], m0
- mova [dstq+stride3q ], m0
- mova [dstq+stride3q +16], m0
- lea dstq, [dstq+strideq*4]
- dec lines4d
- jnz .loop
- RESTORE_GOT
- RET
-
-INIT_XMM sse2
-cglobal v_predictor_4x4, 3, 3, 1, dst, stride, above
- movd m0, [aboveq]
- movd [dstq ], m0
- movd [dstq+strideq], m0
- lea dstq, [dstq+strideq*2]
- movd [dstq ], m0
- movd [dstq+strideq], m0
- RET
-
-INIT_XMM sse2
-cglobal v_predictor_8x8, 3, 3, 1, dst, stride, above
- movq m0, [aboveq]
- DEFINE_ARGS dst, stride, stride3
- lea stride3q, [strideq*3]
- movq [dstq ], m0
- movq [dstq+strideq ], m0
- movq [dstq+strideq*2], m0
- movq [dstq+stride3q ], m0
- lea dstq, [dstq+strideq*4]
- movq [dstq ], m0
- movq [dstq+strideq ], m0
- movq [dstq+strideq*2], m0
- movq [dstq+stride3q ], m0
- RET
-
-INIT_XMM sse2
-cglobal v_predictor_16x16, 3, 4, 1, dst, stride, above
- mova m0, [aboveq]
- DEFINE_ARGS dst, stride, stride3, nlines4
- lea stride3q, [strideq*3]
- mov nlines4d, 4
-.loop:
- mova [dstq ], m0
- mova [dstq+strideq ], m0
- mova [dstq+strideq*2], m0
- mova [dstq+stride3q ], m0
- lea dstq, [dstq+strideq*4]
- dec nlines4d
- jnz .loop
- REP_RET
-
-INIT_XMM sse2
-cglobal v_predictor_32x32, 3, 4, 2, dst, stride, above
- mova m0, [aboveq]
- mova m1, [aboveq+16]
- DEFINE_ARGS dst, stride, stride3, nlines4
- lea stride3q, [strideq*3]
- mov nlines4d, 8
-.loop:
- mova [dstq ], m0
- mova [dstq +16], m1
- mova [dstq+strideq ], m0
- mova [dstq+strideq +16], m1
- mova [dstq+strideq*2 ], m0
- mova [dstq+strideq*2+16], m1
- mova [dstq+stride3q ], m0
- mova [dstq+stride3q +16], m1
- lea dstq, [dstq+strideq*4]
- dec nlines4d
- jnz .loop
- REP_RET
-
-INIT_XMM sse2
-cglobal h_predictor_4x4, 2, 4, 4, dst, stride, line, left
- movifnidn leftq, leftmp
- movd m0, [leftq]
- punpcklbw m0, m0
- punpcklbw m0, m0
- pshufd m1, m0, 0x1
- movd [dstq ], m0
- movd [dstq+strideq], m1
- pshufd m2, m0, 0x2
- lea dstq, [dstq+strideq*2]
- pshufd m3, m0, 0x3
- movd [dstq ], m2
- movd [dstq+strideq], m3
- RET
-
-INIT_XMM sse2
-cglobal h_predictor_8x8, 2, 5, 3, dst, stride, line, left
- movifnidn leftq, leftmp
- mov lineq, -2
- DEFINE_ARGS dst, stride, line, left, stride3
- lea stride3q, [strideq*3]
- movq m0, [leftq ]
- punpcklbw m0, m0 ; l1 l1 l2 l2 ... l8 l8
-.loop:
- pshuflw m1, m0, 0x0 ; l1 l1 l1 l1 l1 l1 l1 l1
- pshuflw m2, m0, 0x55 ; l2 l2 l2 l2 l2 l2 l2 l2
- movq [dstq ], m1
- movq [dstq+strideq], m2
- pshuflw m1, m0, 0xaa
- pshuflw m2, m0, 0xff
- movq [dstq+strideq*2], m1
- movq [dstq+stride3q ], m2
- pshufd m0, m0, 0xe ; [63:0] l5 l5 l6 l6 l7 l7 l8 l8
- inc lineq
- lea dstq, [dstq+strideq*4]
- jnz .loop
- REP_RET
-
-INIT_XMM sse2
-cglobal h_predictor_16x16, 2, 5, 3, dst, stride, line, left
- movifnidn leftq, leftmp
- mov lineq, -4
- DEFINE_ARGS dst, stride, line, left, stride3
- lea stride3q, [strideq*3]
-.loop:
- movd m0, [leftq]
- punpcklbw m0, m0
- punpcklbw m0, m0 ; l1 to l4 each repeated 4 times
- pshufd m1, m0, 0x0 ; l1 repeated 16 times
- pshufd m2, m0, 0x55 ; l2 repeated 16 times
- mova [dstq ], m1
- mova [dstq+strideq ], m2
- pshufd m1, m0, 0xaa
- pshufd m2, m0, 0xff
- mova [dstq+strideq*2], m1
- mova [dstq+stride3q ], m2
- inc lineq
- lea leftq, [leftq+4 ]
- lea dstq, [dstq+strideq*4]
- jnz .loop
- REP_RET
-
-INIT_XMM sse2
-cglobal h_predictor_32x32, 2, 5, 3, dst, stride, line, left
- movifnidn leftq, leftmp
- mov lineq, -8
- DEFINE_ARGS dst, stride, line, left, stride3
- lea stride3q, [strideq*3]
-.loop:
- movd m0, [leftq]
- punpcklbw m0, m0
- punpcklbw m0, m0 ; l1 to l4 each repeated 4 times
- pshufd m1, m0, 0x0 ; l1 repeated 16 times
- pshufd m2, m0, 0x55 ; l2 repeated 16 times
- mova [dstq ], m1
- mova [dstq+16 ], m1
- mova [dstq+strideq ], m2
- mova [dstq+strideq+16 ], m2
- pshufd m1, m0, 0xaa
- pshufd m2, m0, 0xff
- mova [dstq+strideq*2 ], m1
- mova [dstq+strideq*2+16], m1
- mova [dstq+stride3q ], m2
- mova [dstq+stride3q+16 ], m2
- inc lineq
- lea leftq, [leftq+4 ]
- lea dstq, [dstq+strideq*4]
- jnz .loop
- REP_RET
-
-INIT_XMM sse2
-cglobal tm_predictor_4x4, 4, 4, 5, dst, stride, above, left
- pxor m1, m1
- movq m0, [aboveq-1]; [63:0] tl t1 t2 t3 t4 x x x
- punpcklbw m0, m1
- pshuflw m2, m0, 0x0 ; [63:0] tl tl tl tl [word]
- psrldq m0, 2
- psubw m0, m2 ; [63:0] t1-tl t2-tl t3-tl t4-tl [word]
- movd m2, [leftq]
- punpcklbw m2, m1
- pshuflw m4, m2, 0x0 ; [63:0] l1 l1 l1 l1 [word]
- pshuflw m3, m2, 0x55 ; [63:0] l2 l2 l2 l2 [word]
- paddw m4, m0
- paddw m3, m0
- packuswb m4, m4
- packuswb m3, m3
- movd [dstq ], m4
- movd [dstq+strideq], m3
- lea dstq, [dstq+strideq*2]
- pshuflw m4, m2, 0xaa
- pshuflw m3, m2, 0xff
- paddw m4, m0
- paddw m3, m0
- packuswb m4, m4
- packuswb m3, m3
- movd [dstq ], m4
- movd [dstq+strideq], m3
- RET
-
-INIT_XMM sse2
-cglobal tm_predictor_8x8, 4, 4, 5, dst, stride, above, left
- pxor m1, m1
- movd m2, [aboveq-1]
- movq m0, [aboveq]
- punpcklbw m2, m1
- punpcklbw m0, m1 ; t1 t2 t3 t4 t5 t6 t7 t8 [word]
- pshuflw m2, m2, 0x0 ; [63:0] tl tl tl tl [word]
- DEFINE_ARGS dst, stride, line, left
- mov lineq, -4
- punpcklqdq m2, m2 ; tl tl tl tl tl tl tl tl [word]
- psubw m0, m2 ; t1-tl t2-tl ... t8-tl [word]
- movq m2, [leftq]
- punpcklbw m2, m1 ; l1 l2 l3 l4 l5 l6 l7 l8 [word]
-.loop
- pshuflw m4, m2, 0x0 ; [63:0] l1 l1 l1 l1 [word]
- pshuflw m3, m2, 0x55 ; [63:0] l2 l2 l2 l2 [word]
- punpcklqdq m4, m4 ; l1 l1 l1 l1 l1 l1 l1 l1 [word]
- punpcklqdq m3, m3 ; l2 l2 l2 l2 l2 l2 l2 l2 [word]
- paddw m4, m0
- paddw m3, m0
- packuswb m4, m3
- movq [dstq ], m4
- movhps [dstq+strideq], m4
- lea dstq, [dstq+strideq*2]
- psrldq m2, 4
- inc lineq
- jnz .loop
- REP_RET
-
-INIT_XMM sse2
-cglobal tm_predictor_16x16, 4, 5, 8, dst, stride, above, left
- pxor m1, m1
- mova m2, [aboveq-16];
- mova m0, [aboveq] ; t1 t2 ... t16 [byte]
- punpckhbw m2, m1 ; [127:112] tl [word]
- punpckhbw m4, m0, m1
- punpcklbw m0, m1 ; m0:m4 t1 t2 ... t16 [word]
- DEFINE_ARGS dst, stride, line, left, stride8
- mov lineq, -8
- pshufhw m2, m2, 0xff
- mova m3, [leftq] ; l1 l2 ... l16 [byte]
- punpckhqdq m2, m2 ; tl repeated 8 times [word]
- psubw m0, m2
- psubw m4, m2 ; m0:m4 t1-tl t2-tl ... t16-tl [word]
- punpckhbw m5, m3, m1
- punpcklbw m3, m1 ; m3:m5 l1 l2 ... l16 [word]
- lea stride8q, [strideq*8]
-.loop:
- pshuflw m6, m3, 0x0
- pshuflw m7, m5, 0x0
- punpcklqdq m6, m6 ; l1 repeated 8 times [word]
- punpcklqdq m7, m7 ; l8 repeated 8 times [word]
- paddw m1, m6, m0
- paddw m6, m4 ; m1:m6 ti-tl+l1 [i=1,15] [word]
- psrldq m5, 2
- packuswb m1, m6
- mova [dstq ], m1
- paddw m1, m7, m0
- paddw m7, m4 ; m1:m7 ti-tl+l8 [i=1,15] [word]
- psrldq m3, 2
- packuswb m1, m7
- mova [dstq+stride8q], m1
- inc lineq
- lea dstq, [dstq+strideq]
- jnz .loop
- REP_RET
-
-INIT_XMM sse2
-cglobal tm_predictor_32x32, 4, 4, 8, dst, stride, above, left
- pxor m1, m1
- movd m2, [aboveq-1]
- mova m0, [aboveq]
- mova m4, [aboveq+16]
- punpcklbw m2, m1
- punpckhbw m3, m0, m1
- punpckhbw m5, m4, m1
- punpcklbw m0, m1
- punpcklbw m4, m1
- pshuflw m2, m2, 0x0
- DEFINE_ARGS dst, stride, line, left
- mov lineq, -16
- punpcklqdq m2, m2
- add leftq, 32
- psubw m0, m2
- psubw m3, m2
- psubw m4, m2
- psubw m5, m2
-.loop:
- movd m2, [leftq+lineq*2]
- pxor m1, m1
- punpcklbw m2, m1
- pshuflw m7, m2, 0x55
- pshuflw m2, m2, 0x0
- punpcklqdq m2, m2
- punpcklqdq m7, m7
- paddw m6, m2, m3
- paddw m1, m2, m0
- packuswb m1, m6
- mova [dstq ], m1
- paddw m6, m2, m5
- paddw m1, m2, m4
- packuswb m1, m6
- mova [dstq+16 ], m1
- paddw m6, m7, m3
- paddw m1, m7, m0
- packuswb m1, m6
- mova [dstq+strideq ], m1
- paddw m6, m7, m5
- paddw m1, m7, m4
- packuswb m1, m6
- mova [dstq+strideq+16], m1
- lea dstq, [dstq+strideq*2]
- inc lineq
- jnz .loop
- REP_RET
diff --git a/thirdparty/libvpx/vpx_dsp/x86/intrapred_ssse3.asm b/thirdparty/libvpx/vpx_dsp/x86/intrapred_ssse3.asm
deleted file mode 100644
index 5e0139fa8d..0000000000
--- a/thirdparty/libvpx/vpx_dsp/x86/intrapred_ssse3.asm
+++ /dev/null
@@ -1,871 +0,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-%include "third_party/x86inc/x86inc.asm"
-
-SECTION_RODATA
-
-pb_1: times 16 db 1
-sh_b12345677: db 1, 2, 3, 4, 5, 6, 7, 7, 0, 0, 0, 0, 0, 0, 0, 0
-sh_b23456777: db 2, 3, 4, 5, 6, 7, 7, 7, 0, 0, 0, 0, 0, 0, 0, 0
-sh_b0123456777777777: db 0, 1, 2, 3, 4, 5, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7
-sh_b1234567777777777: db 1, 2, 3, 4, 5, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7
-sh_b2345677777777777: db 2, 3, 4, 5, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7
-sh_b123456789abcdeff: db 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 15
-sh_b23456789abcdefff: db 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 15, 15
-sh_b32104567: db 3, 2, 1, 0, 4, 5, 6, 7, 0, 0, 0, 0, 0, 0, 0, 0
-sh_b8091a2b345: db 8, 0, 9, 1, 10, 2, 11, 3, 4, 5, 0, 0, 0, 0, 0, 0
-sh_b76543210: db 7, 6, 5, 4, 3, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0
-sh_b65432108: db 6, 5, 4, 3, 2, 1, 0, 8, 0, 0, 0, 0, 0, 0, 0, 0
-sh_b54321089: db 5, 4, 3, 2, 1, 0, 8, 9, 0, 0, 0, 0, 0, 0, 0, 0
-sh_b89abcdef: db 8, 9, 10, 11, 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0
-sh_bfedcba9876543210: db 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
-
-SECTION .text
-
-INIT_XMM ssse3
-cglobal d45_predictor_16x16, 3, 6, 4, dst, stride, above, dst8, line, goffset
- GET_GOT goffsetq
-
- mova m0, [aboveq]
- DEFINE_ARGS dst, stride, stride3, dst8, line
- lea stride3q, [strideq*3]
- lea dst8q, [dstq+strideq*8]
- mova m1, [GLOBAL(sh_b123456789abcdeff)]
- pshufb m2, m0, [GLOBAL(sh_b23456789abcdefff)]
- pavgb m3, m2, m0
- pxor m2, m0
- pshufb m0, m1
- pand m2, [GLOBAL(pb_1)]
- psubb m3, m2
- pavgb m0, m3
-
- ; first 4 lines and first half of 3rd 4 lines
- mov lined, 2
-.loop:
- mova [dstq ], m0
- movhps [dst8q ], m0
- pshufb m0, m1
- mova [dstq +strideq ], m0
- movhps [dst8q+strideq ], m0
- pshufb m0, m1
- mova [dstq +strideq*2 ], m0
- movhps [dst8q+strideq*2 ], m0
- pshufb m0, m1
- mova [dstq +stride3q ], m0
- movhps [dst8q+stride3q ], m0
- pshufb m0, m1
- lea dstq, [dstq +strideq*4]
- lea dst8q, [dst8q+strideq*4]
- dec lined
- jnz .loop
-
- ; bottom-right 8x8 block
- movhps [dstq +8], m0
- movhps [dstq+strideq +8], m0
- movhps [dstq+strideq*2+8], m0
- movhps [dstq+stride3q +8], m0
- lea dstq, [dstq+strideq*4]
- movhps [dstq +8], m0
- movhps [dstq+strideq +8], m0
- movhps [dstq+strideq*2+8], m0
- movhps [dstq+stride3q +8], m0
-
- RESTORE_GOT
- RET
-
-INIT_XMM ssse3
-cglobal d45_predictor_32x32, 3, 6, 7, dst, stride, above, dst16, line, goffset
- GET_GOT goffsetq
-
- mova m0, [aboveq]
- mova m4, [aboveq+16]
- DEFINE_ARGS dst, stride, stride3, dst16, line
- lea stride3q, [strideq*3]
- lea dst16q, [dstq +strideq*8]
- lea dst16q, [dst16q+strideq*8]
- mova m1, [GLOBAL(sh_b123456789abcdeff)]
- pshufb m2, m4, [GLOBAL(sh_b23456789abcdefff)]
- pavgb m3, m2, m4
- pxor m2, m4
- palignr m5, m4, m0, 1
- palignr m6, m4, m0, 2
- pshufb m4, m1
- pand m2, [GLOBAL(pb_1)]
- psubb m3, m2
- pavgb m4, m3
- pavgb m3, m0, m6
- pxor m0, m6
- pand m0, [GLOBAL(pb_1)]
- psubb m3, m0
- pavgb m5, m3
-
- ; write 4x4 lines (and the first half of the second 4x4 lines)
- mov lined, 4
-.loop:
- mova [dstq ], m5
- mova [dstq +16], m4
- mova [dst16q ], m4
- palignr m3, m4, m5, 1
- pshufb m4, m1
- mova [dstq +strideq ], m3
- mova [dstq +strideq +16], m4
- mova [dst16q+strideq ], m4
- palignr m5, m4, m3, 1
- pshufb m4, m1
- mova [dstq +strideq*2 ], m5
- mova [dstq +strideq*2+16], m4
- mova [dst16q+strideq*2 ], m4
- palignr m3, m4, m5, 1
- pshufb m4, m1
- mova [dstq +stride3q ], m3
- mova [dstq +stride3q +16], m4
- mova [dst16q+stride3q ], m4
- palignr m5, m4, m3, 1
- pshufb m4, m1
- lea dstq, [dstq +strideq*4]
- lea dst16q, [dst16q+strideq*4]
- dec lined
- jnz .loop
-
- ; write second half of second 4x4 lines
- mova [dstq +16], m4
- mova [dstq +strideq +16], m4
- mova [dstq +strideq*2+16], m4
- mova [dstq +stride3q +16], m4
- lea dstq, [dstq +strideq*4]
- mova [dstq +16], m4
- mova [dstq +strideq +16], m4
- mova [dstq +strideq*2+16], m4
- mova [dstq +stride3q +16], m4
- lea dstq, [dstq +strideq*4]
- mova [dstq +16], m4
- mova [dstq +strideq +16], m4
- mova [dstq +strideq*2+16], m4
- mova [dstq +stride3q +16], m4
- lea dstq, [dstq +strideq*4]
- mova [dstq +16], m4
- mova [dstq +strideq +16], m4
- mova [dstq +strideq*2+16], m4
- mova [dstq +stride3q +16], m4
-
- RESTORE_GOT
- RET
-
-; ------------------------------------------
-; input: x, y, z, result
-;
-; trick from pascal
-; (x+2y+z+2)>>2 can be calculated as:
-; result = avg(x,z)
-; result -= xor(x,z) & 1
-; result = avg(result,y)
-; ------------------------------------------
-%macro X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 4
- pavgb %4, %1, %3
- pxor %3, %1
- pand %3, [GLOBAL(pb_1)]
- psubb %4, %3
- pavgb %4, %2
-%endmacro
-
-INIT_XMM ssse3
-cglobal d63_predictor_4x4, 3, 4, 5, dst, stride, above, goffset
- GET_GOT goffsetq
-
- movq m3, [aboveq]
- pshufb m1, m3, [GLOBAL(sh_b23456777)]
- pshufb m2, m3, [GLOBAL(sh_b12345677)]
-
- X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m3, m2, m1, m4
- pavgb m3, m2
-
- ; store 4 lines
- movd [dstq ], m3
- movd [dstq+strideq], m4
- lea dstq, [dstq+strideq*2]
- psrldq m3, 1
- psrldq m4, 1
- movd [dstq ], m3
- movd [dstq+strideq], m4
- RESTORE_GOT
- RET
-
-INIT_XMM ssse3
-cglobal d63_predictor_8x8, 3, 4, 5, dst, stride, above, goffset
- GET_GOT goffsetq
-
- movq m3, [aboveq]
- DEFINE_ARGS dst, stride, stride3
- lea stride3q, [strideq*3]
- pshufb m1, m3, [GLOBAL(sh_b2345677777777777)]
- pshufb m0, m3, [GLOBAL(sh_b0123456777777777)]
- pshufb m2, m3, [GLOBAL(sh_b1234567777777777)]
- pshufb m3, [GLOBAL(sh_b0123456777777777)]
-
- X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m2, m1, m4
- pavgb m3, m2
-
- ; store 4 lines
- movq [dstq ], m3
- movq [dstq+strideq], m4
- psrldq m3, 1
- psrldq m4, 1
- movq [dstq+strideq*2], m3
- movq [dstq+stride3q ], m4
- lea dstq, [dstq+strideq*4]
- psrldq m3, 1
- psrldq m4, 1
-
- ; store 4 lines
- movq [dstq ], m3
- movq [dstq+strideq], m4
- psrldq m3, 1
- psrldq m4, 1
- movq [dstq+strideq*2], m3
- movq [dstq+stride3q ], m4
- RESTORE_GOT
- RET
-
-INIT_XMM ssse3
-cglobal d63_predictor_16x16, 3, 5, 5, dst, stride, above, line, goffset
- GET_GOT goffsetq
-
- mova m0, [aboveq]
- DEFINE_ARGS dst, stride, stride3, line
- lea stride3q, [strideq*3]
- mova m1, [GLOBAL(sh_b123456789abcdeff)]
- pshufb m2, m0, [GLOBAL(sh_b23456789abcdefff)]
- pshufb m3, m0, m1
-
- X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m3, m2, m4
- pavgb m0, m3
-
- mov lined, 4
-.loop:
- mova [dstq ], m0
- mova [dstq+strideq ], m4
- pshufb m0, m1
- pshufb m4, m1
- mova [dstq+strideq*2], m0
- mova [dstq+stride3q ], m4
- pshufb m0, m1
- pshufb m4, m1
- lea dstq, [dstq+strideq*4]
- dec lined
- jnz .loop
- RESTORE_GOT
- REP_RET
-
-INIT_XMM ssse3
-cglobal d63_predictor_32x32, 3, 5, 8, dst, stride, above, line, goffset
- GET_GOT goffsetq
-
- mova m0, [aboveq]
- mova m7, [aboveq+16]
- DEFINE_ARGS dst, stride, stride3, line
- mova m1, [GLOBAL(sh_b123456789abcdeff)]
- lea stride3q, [strideq*3]
- pshufb m2, m7, [GLOBAL(sh_b23456789abcdefff)]
- pshufb m3, m7, m1
-
- X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m7, m3, m2, m4
- palignr m6, m7, m0, 1
- palignr m5, m7, m0, 2
- pavgb m7, m3
-
- X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m6, m5, m2
- pavgb m0, m6
-
- mov lined, 8
-.loop:
- mova [dstq ], m0
- mova [dstq +16], m7
- mova [dstq+strideq ], m2
- mova [dstq+strideq +16], m4
- palignr m3, m7, m0, 1
- palignr m5, m4, m2, 1
- pshufb m7, m1
- pshufb m4, m1
-
- mova [dstq+strideq*2 ], m3
- mova [dstq+strideq*2+16], m7
- mova [dstq+stride3q ], m5
- mova [dstq+stride3q +16], m4
- palignr m0, m7, m3, 1
- palignr m2, m4, m5, 1
- pshufb m7, m1
- pshufb m4, m1
- lea dstq, [dstq+strideq*4]
- dec lined
- jnz .loop
- RESTORE_GOT
- REP_RET
-
-INIT_XMM ssse3
-cglobal d153_predictor_4x4, 4, 5, 4, dst, stride, above, left, goffset
- GET_GOT goffsetq
- movd m0, [leftq] ; l1, l2, l3, l4
- movd m1, [aboveq-1] ; tl, t1, t2, t3
- punpckldq m0, m1 ; l1, l2, l3, l4, tl, t1, t2, t3
- pshufb m0, [GLOBAL(sh_b32104567)]; l4, l3, l2, l1, tl, t1, t2, t3
- psrldq m1, m0, 1 ; l3, l2, l1, tl, t1, t2, t3
- psrldq m2, m0, 2 ; l2, l1, tl, t1, t2, t3
- ; comments below are for a predictor like this
- ; A1 B1 C1 D1
- ; A2 B2 A1 B1
- ; A3 B3 A2 B2
- ; A4 B4 A3 B3
- X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m1, m2, m3 ; 3-tap avg B4 B3 B2 B1 C1 D1
- pavgb m1, m0 ; 2-tap avg A4 A3 A2 A1
-
- punpcklqdq m3, m1 ; B4 B3 B2 B1 C1 D1 x x A4 A3 A2 A1 ..
-
- DEFINE_ARGS dst, stride, stride3
- lea stride3q, [strideq*3]
- pshufb m3, [GLOBAL(sh_b8091a2b345)] ; A4 B4 A3 B3 A2 B2 A1 B1 C1 D1 ..
- movd [dstq+stride3q ], m3
- psrldq m3, 2 ; A3 B3 A2 B2 A1 B1 C1 D1 ..
- movd [dstq+strideq*2], m3
- psrldq m3, 2 ; A2 B2 A1 B1 C1 D1 ..
- movd [dstq+strideq ], m3
- psrldq m3, 2 ; A1 B1 C1 D1 ..
- movd [dstq ], m3
- RESTORE_GOT
- RET
-
-INIT_XMM ssse3
-cglobal d153_predictor_8x8, 4, 5, 8, dst, stride, above, left, goffset
- GET_GOT goffsetq
- movq m0, [leftq] ; [0- 7] l1-8 [byte]
- movhps m0, [aboveq-1] ; [8-15] tl, t1-7 [byte]
- pshufb m1, m0, [GLOBAL(sh_b76543210)] ; l8-1 [word]
- pshufb m2, m0, [GLOBAL(sh_b65432108)] ; l7-1,tl [word]
- pshufb m3, m0, [GLOBAL(sh_b54321089)] ; l6-1,tl,t1 [word]
- pshufb m0, [GLOBAL(sh_b89abcdef)] ; tl,t1-7 [word]
- psrldq m4, m0, 1 ; t1-7 [word]
- psrldq m5, m0, 2 ; t2-7 [word]
- ; comments below are for a predictor like this
- ; A1 B1 C1 D1 E1 F1 G1 H1
- ; A2 B2 A1 B1 C1 D1 E1 F1
- ; A3 B3 A2 B2 A1 B1 C1 D1
- ; A4 B4 A3 B3 A2 B2 A1 B1
- ; A5 B5 A4 B4 A3 B3 A2 B2
- ; A6 B6 A5 B5 A4 B4 A3 B3
- ; A7 B7 A6 B6 A5 B5 A4 B4
- ; A8 B8 A7 B7 A6 B6 A5 B5
- pavgb m6, m1, m2 ; 2-tap avg A8-A1
-
- X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m4, m5, m7 ; 3-tap avg C-H1
-
- X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m1, m2, m3, m0 ; 3-tap avg B8-1
-
- punpcklbw m6, m0 ; A-B8, A-B7 ... A-B2, A-B1
-
- DEFINE_ARGS dst, stride, stride3
- lea stride3q, [strideq*3]
-
- movhps [dstq+stride3q], m6 ; A-B4, A-B3, A-B2, A-B1
- palignr m0, m7, m6, 10 ; A-B3, A-B2, A-B1, C-H1
- movq [dstq+strideq*2], m0
- psrldq m0, 2 ; A-B2, A-B1, C-H1
- movq [dstq+strideq ], m0
- psrldq m0, 2 ; A-H1
- movq [dstq ], m0
- lea dstq, [dstq+strideq*4]
- movq [dstq+stride3q ], m6 ; A-B8, A-B7, A-B6, A-B5
- psrldq m6, 2 ; A-B7, A-B6, A-B5, A-B4
- movq [dstq+strideq*2], m6
- psrldq m6, 2 ; A-B6, A-B5, A-B4, A-B3
- movq [dstq+strideq ], m6
- psrldq m6, 2 ; A-B5, A-B4, A-B3, A-B2
- movq [dstq ], m6
- RESTORE_GOT
- RET
-
-INIT_XMM ssse3
-cglobal d153_predictor_16x16, 4, 5, 8, dst, stride, above, left, goffset
- GET_GOT goffsetq
- mova m0, [leftq]
- movu m7, [aboveq-1]
- ; comments below are for a predictor like this
- ; A1 B1 C1 D1 E1 F1 G1 H1 I1 J1 K1 L1 M1 N1 O1 P1
- ; A2 B2 A1 B1 C1 D1 E1 F1 G1 H1 I1 J1 K1 L1 M1 N1
- ; A3 B3 A2 B2 A1 B1 C1 D1 E1 F1 G1 H1 I1 J1 K1 L1
- ; A4 B4 A3 B3 A2 B2 A1 B1 C1 D1 E1 F1 G1 H1 I1 J1
- ; A5 B5 A4 B4 A3 B3 A2 B2 A1 B1 C1 D1 E1 F1 G1 H1
- ; A6 B6 A5 B5 A4 B4 A3 B3 A2 B2 A1 B1 C1 D1 E1 F1
- ; A7 B7 A6 B6 A5 B5 A4 B4 A3 B3 A2 B2 A1 B1 C1 D1
- ; A8 B8 A7 B7 A6 B6 A5 B5 A4 B4 A3 B3 A2 B2 A1 B1
- ; A9 B9 A8 B8 A7 B7 A6 B6 A5 B5 A4 B4 A3 B3 A2 B2
- ; Aa Ba A9 B9 A8 B8 A7 B7 A6 B6 A5 B5 A4 B4 A3 B3
- ; Ab Bb Aa Ba A9 B9 A8 B8 A7 B7 A6 B6 A5 B5 A4 B4
- ; Ac Bc Ab Bb Aa Ba A9 B9 A8 B8 A7 B7 A6 B6 A5 B5
- ; Ad Bd Ac Bc Ab Bb Aa Ba A9 B9 A8 B8 A7 B7 A6 B6
- ; Ae Be Ad Bd Ac Bc Ab Bb Aa Ba A9 B9 A8 B8 A7 B7
- ; Af Bf Ae Be Ad Bd Ac Bc Ab Bb Aa Ba A9 B9 A8 B8
- ; Ag Bg Af Bf Ae Be Ad Bd Ac Bc Ab Bb Aa Ba A9 B9
- pshufb m6, m7, [GLOBAL(sh_bfedcba9876543210)]
- palignr m5, m0, m6, 15
- palignr m3, m0, m6, 14
-
- X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m5, m3, m4 ; 3-tap avg B3-Bg
- pshufb m1, m0, [GLOBAL(sh_b123456789abcdeff)]
- pavgb m5, m0 ; A1 - Ag
-
- punpcklbw m0, m4, m5 ; A-B8 ... A-B1
- punpckhbw m4, m5 ; A-B9 ... A-Bg
-
- pshufb m3, m7, [GLOBAL(sh_b123456789abcdeff)]
- pshufb m5, m7, [GLOBAL(sh_b23456789abcdefff)]
-
- X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m7, m3, m5, m1 ; 3-tap avg C1-P1
-
- pshufb m6, m0, [GLOBAL(sh_bfedcba9876543210)]
- DEFINE_ARGS dst, stride, stride3
- lea stride3q, [strideq*3]
- palignr m2, m1, m6, 14
- mova [dstq ], m2
- palignr m2, m1, m6, 12
- mova [dstq+strideq ], m2
- palignr m2, m1, m6, 10
- mova [dstq+strideq*2], m2
- palignr m2, m1, m6, 8
- mova [dstq+stride3q ], m2
- lea dstq, [dstq+strideq*4]
- palignr m2, m1, m6, 6
- mova [dstq ], m2
- palignr m2, m1, m6, 4
- mova [dstq+strideq ], m2
- palignr m2, m1, m6, 2
- mova [dstq+strideq*2], m2
- pshufb m4, [GLOBAL(sh_bfedcba9876543210)]
- mova [dstq+stride3q ], m6
- lea dstq, [dstq+strideq*4]
-
- palignr m2, m6, m4, 14
- mova [dstq ], m2
- palignr m2, m6, m4, 12
- mova [dstq+strideq ], m2
- palignr m2, m6, m4, 10
- mova [dstq+strideq*2], m2
- palignr m2, m6, m4, 8
- mova [dstq+stride3q ], m2
- lea dstq, [dstq+strideq*4]
- palignr m2, m6, m4, 6
- mova [dstq ], m2
- palignr m2, m6, m4, 4
- mova [dstq+strideq ], m2
- palignr m2, m6, m4, 2
- mova [dstq+strideq*2], m2
- mova [dstq+stride3q ], m4
- RESTORE_GOT
- RET
-
-INIT_XMM ssse3
-cglobal d153_predictor_32x32, 4, 5, 8, dst, stride, above, left, goffset
- GET_GOT goffsetq
- mova m0, [leftq]
- movu m7, [aboveq-1]
- movu m1, [aboveq+15]
-
- pshufb m4, m1, [GLOBAL(sh_b123456789abcdeff)]
- pshufb m6, m1, [GLOBAL(sh_b23456789abcdefff)]
-
- X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m1, m4, m6, m2 ; 3-tap avg above [high]
-
- palignr m3, m1, m7, 1
- palignr m5, m1, m7, 2
-
- X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m7, m3, m5, m1 ; 3-tap avg above [low]
-
- pshufb m7, [GLOBAL(sh_bfedcba9876543210)]
- palignr m5, m0, m7, 15
- palignr m3, m0, m7, 14
-
- X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m5, m3, m4 ; 3-tap avg B3-Bg
- pavgb m5, m0 ; A1 - Ag
- punpcklbw m6, m4, m5 ; A-B8 ... A-B1
- punpckhbw m4, m5 ; A-B9 ... A-Bg
- pshufb m6, [GLOBAL(sh_bfedcba9876543210)]
- pshufb m4, [GLOBAL(sh_bfedcba9876543210)]
-
- DEFINE_ARGS dst, stride, stride3, left, line
- lea stride3q, [strideq*3]
-
- palignr m5, m2, m1, 14
- palignr m7, m1, m6, 14
- mova [dstq ], m7
- mova [dstq+16 ], m5
- palignr m5, m2, m1, 12
- palignr m7, m1, m6, 12
- mova [dstq+strideq ], m7
- mova [dstq+strideq+16 ], m5
- palignr m5, m2, m1, 10
- palignr m7, m1, m6, 10
- mova [dstq+strideq*2 ], m7
- mova [dstq+strideq*2+16], m5
- palignr m5, m2, m1, 8
- palignr m7, m1, m6, 8
- mova [dstq+stride3q ], m7
- mova [dstq+stride3q+16 ], m5
- lea dstq, [dstq+strideq*4]
- palignr m5, m2, m1, 6
- palignr m7, m1, m6, 6
- mova [dstq ], m7
- mova [dstq+16 ], m5
- palignr m5, m2, m1, 4
- palignr m7, m1, m6, 4
- mova [dstq+strideq ], m7
- mova [dstq+strideq+16 ], m5
- palignr m5, m2, m1, 2
- palignr m7, m1, m6, 2
- mova [dstq+strideq*2 ], m7
- mova [dstq+strideq*2+16], m5
- mova [dstq+stride3q ], m6
- mova [dstq+stride3q+16 ], m1
- lea dstq, [dstq+strideq*4]
-
- palignr m5, m1, m6, 14
- palignr m3, m6, m4, 14
- mova [dstq ], m3
- mova [dstq+16 ], m5
- palignr m5, m1, m6, 12
- palignr m3, m6, m4, 12
- mova [dstq+strideq ], m3
- mova [dstq+strideq+16 ], m5
- palignr m5, m1, m6, 10
- palignr m3, m6, m4, 10
- mova [dstq+strideq*2 ], m3
- mova [dstq+strideq*2+16], m5
- palignr m5, m1, m6, 8
- palignr m3, m6, m4, 8
- mova [dstq+stride3q ], m3
- mova [dstq+stride3q+16 ], m5
- lea dstq, [dstq+strideq*4]
- palignr m5, m1, m6, 6
- palignr m3, m6, m4, 6
- mova [dstq ], m3
- mova [dstq+16 ], m5
- palignr m5, m1, m6, 4
- palignr m3, m6, m4, 4
- mova [dstq+strideq ], m3
- mova [dstq+strideq+16 ], m5
- palignr m5, m1, m6, 2
- palignr m3, m6, m4, 2
- mova [dstq+strideq*2 ], m3
- mova [dstq+strideq*2+16], m5
- mova [dstq+stride3q ], m4
- mova [dstq+stride3q+16 ], m6
- lea dstq, [dstq+strideq*4]
-
- mova m7, [leftq]
- mova m3, [leftq+16]
- palignr m5, m3, m7, 15
- palignr m0, m3, m7, 14
-
- X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m3, m5, m0, m2 ; 3-tap avg Bh -
- pavgb m5, m3 ; Ah -
- punpcklbw m3, m2, m5 ; A-B8 ... A-B1
- punpckhbw m2, m5 ; A-B9 ... A-Bg
- pshufb m3, [GLOBAL(sh_bfedcba9876543210)]
- pshufb m2, [GLOBAL(sh_bfedcba9876543210)]
-
- palignr m7, m6, m4, 14
- palignr m0, m4, m3, 14
- mova [dstq ], m0
- mova [dstq+16 ], m7
- palignr m7, m6, m4, 12
- palignr m0, m4, m3, 12
- mova [dstq+strideq ], m0
- mova [dstq+strideq+16 ], m7
- palignr m7, m6, m4, 10
- palignr m0, m4, m3, 10
- mova [dstq+strideq*2 ], m0
- mova [dstq+strideq*2+16], m7
- palignr m7, m6, m4, 8
- palignr m0, m4, m3, 8
- mova [dstq+stride3q ], m0
- mova [dstq+stride3q+16 ], m7
- lea dstq, [dstq+strideq*4]
- palignr m7, m6, m4, 6
- palignr m0, m4, m3, 6
- mova [dstq ], m0
- mova [dstq+16 ], m7
- palignr m7, m6, m4, 4
- palignr m0, m4, m3, 4
- mova [dstq+strideq ], m0
- mova [dstq+strideq+16 ], m7
- palignr m7, m6, m4, 2
- palignr m0, m4, m3, 2
- mova [dstq+strideq*2 ], m0
- mova [dstq+strideq*2+16], m7
- mova [dstq+stride3q ], m3
- mova [dstq+stride3q+16 ], m4
- lea dstq, [dstq+strideq*4]
-
- palignr m7, m4, m3, 14
- palignr m0, m3, m2, 14
- mova [dstq ], m0
- mova [dstq+16 ], m7
- palignr m7, m4, m3, 12
- palignr m0, m3, m2, 12
- mova [dstq+strideq ], m0
- mova [dstq+strideq+16 ], m7
- palignr m7, m4, m3, 10
- palignr m0, m3, m2, 10
- mova [dstq+strideq*2 ], m0
- mova [dstq+strideq*2+16], m7
- palignr m7, m4, m3, 8
- palignr m0, m3, m2, 8
- mova [dstq+stride3q ], m0
- mova [dstq+stride3q+16 ], m7
- lea dstq, [dstq+strideq*4]
- palignr m7, m4, m3, 6
- palignr m0, m3, m2, 6
- mova [dstq ], m0
- mova [dstq+16 ], m7
- palignr m7, m4, m3, 4
- palignr m0, m3, m2, 4
- mova [dstq+strideq ], m0
- mova [dstq+strideq+16 ], m7
- palignr m7, m4, m3, 2
- palignr m0, m3, m2, 2
- mova [dstq+strideq*2 ], m0
- mova [dstq+strideq*2+16], m7
- mova [dstq+stride3q ], m2
- mova [dstq+stride3q+16 ], m3
-
- RESTORE_GOT
- RET
-
-INIT_XMM ssse3
-cglobal d207_predictor_8x8, 4, 5, 4, dst, stride, stride3, left, goffset
- GET_GOT goffsetq
- movq m3, [leftq] ; abcdefgh [byte]
- lea stride3q, [strideq*3]
-
- pshufb m1, m3, [GLOBAL(sh_b2345677777777777)]
- pshufb m0, m3, [GLOBAL(sh_b0123456777777777)]
- pshufb m2, m3, [GLOBAL(sh_b1234567777777777)]
-
- X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m2, m1, m3
- pavgb m0, m2
- punpcklbw m0, m3 ; interleaved output
-
- movq [dstq ], m0
- psrldq m0, 2
- movq [dstq+strideq ], m0
- psrldq m0, 2
- movq [dstq+strideq*2], m0
- psrldq m0, 2
- movq [dstq+stride3q ], m0
- lea dstq, [dstq+strideq*4]
- pshufhw m0, m0, q0000 ; de, d2ef, ef, e2fg, fg, f2gh, gh, g3h, 8xh
- psrldq m0, 2
- movq [dstq ], m0
- psrldq m0, 2
- movq [dstq+strideq ], m0
- psrldq m0, 2
- movq [dstq+strideq*2], m0
- psrldq m0, 2
- movq [dstq+stride3q ], m0
- RESTORE_GOT
- RET
-
-INIT_XMM ssse3
-cglobal d207_predictor_16x16, 4, 5, 5, dst, stride, stride3, left, goffset
- GET_GOT goffsetq
- lea stride3q, [strideq*3]
- mova m0, [leftq] ; abcdefghijklmnop [byte]
- pshufb m1, m0, [GLOBAL(sh_b123456789abcdeff)] ; bcdefghijklmnopp
- pshufb m2, m0, [GLOBAL(sh_b23456789abcdefff)]
-
- X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m1, m2, m3
- pavgb m1, m0 ; ab, bc, cd .. no, op, pp [byte]
-
- punpckhbw m4, m1, m3 ; interleaved input
- punpcklbw m1, m3 ; interleaved output
- mova [dstq ], m1
- palignr m3, m4, m1, 2
- mova [dstq+strideq ], m3
- palignr m3, m4, m1, 4
- mova [dstq+strideq*2], m3
- palignr m3, m4, m1, 6
- mova [dstq+stride3q ], m3
- lea dstq, [dstq+strideq*4]
- palignr m3, m4, m1, 8
- mova [dstq ], m3
- palignr m3, m4, m1, 10
- mova [dstq+strideq ], m3
- palignr m3, m4, m1, 12
- mova [dstq+strideq*2], m3
- palignr m3, m4, m1, 14
- mova [dstq+stride3q ], m3
- DEFINE_ARGS dst, stride, stride3, line
- mov lined, 2
- mova m0, [GLOBAL(sh_b23456789abcdefff)]
-.loop:
- lea dstq, [dstq+strideq*4]
- mova [dstq ], m4
- pshufb m4, m0
- mova [dstq+strideq ], m4
- pshufb m4, m0
- mova [dstq+strideq*2], m4
- pshufb m4, m0
- mova [dstq+stride3q ], m4
- pshufb m4, m0
- dec lined
- jnz .loop
- RESTORE_GOT
- REP_RET
-
-INIT_XMM ssse3
-cglobal d207_predictor_32x32, 4, 5, 8, dst, stride, stride3, left, goffset
- GET_GOT goffsetq
- lea stride3q, [strideq*3]
- mova m1, [leftq] ; 0-15 [byte]
- mova m2, [leftq+16] ; 16-31 [byte]
- pshufb m0, m2, [GLOBAL(sh_b23456789abcdefff)]
- pshufb m4, m2, [GLOBAL(sh_b123456789abcdeff)]
-
- X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m2, m4, m0, m3
- palignr m6, m2, m1, 1
- palignr m5, m2, m1, 2
- pavgb m2, m4 ; high 16px even lines
-
- X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m1, m6, m5, m0
- pavgb m1, m6 ; low 16px even lines
-
- punpckhbw m6, m1, m0 ; interleaved output 2
- punpcklbw m1, m0 ; interleaved output 1
-
- punpckhbw m7, m2, m3 ; interleaved output 4
- punpcklbw m2, m3 ; interleaved output 3
-
- ; output 1st 8 lines (and half of 2nd 8 lines)
- DEFINE_ARGS dst, stride, stride3, dst8
- lea dst8q, [dstq+strideq*8]
- mova [dstq ], m1
- mova [dstq +16], m6
- mova [dst8q ], m6
- palignr m0, m6, m1, 2
- palignr m4, m2, m6, 2
- mova [dstq +strideq ], m0
- mova [dstq +strideq +16], m4
- mova [dst8q+strideq ], m4
- palignr m0, m6, m1, 4
- palignr m4, m2, m6, 4
- mova [dstq +strideq*2 ], m0
- mova [dstq +strideq*2+16], m4
- mova [dst8q+strideq*2 ], m4
- palignr m0, m6, m1, 6
- palignr m4, m2, m6, 6
- mova [dstq +stride3q ], m0
- mova [dstq +stride3q +16], m4
- mova [dst8q+stride3q ], m4
- lea dstq, [dstq +strideq*4]
- lea dst8q, [dst8q+strideq*4]
- palignr m0, m6, m1, 8
- palignr m4, m2, m6, 8
- mova [dstq ], m0
- mova [dstq +16], m4
- mova [dst8q ], m4
- palignr m0, m6, m1, 10
- palignr m4, m2, m6, 10
- mova [dstq +strideq ], m0
- mova [dstq +strideq +16], m4
- mova [dst8q+strideq ], m4
- palignr m0, m6, m1, 12
- palignr m4, m2, m6, 12
- mova [dstq +strideq*2 ], m0
- mova [dstq +strideq*2+16], m4
- mova [dst8q+strideq*2 ], m4
- palignr m0, m6, m1, 14
- palignr m4, m2, m6, 14
- mova [dstq +stride3q ], m0
- mova [dstq +stride3q +16], m4
- mova [dst8q+stride3q ], m4
- lea dstq, [dstq+strideq*4]
- lea dst8q, [dst8q+strideq*4]
-
- ; output 2nd half of 2nd 8 lines and half of 3rd 8 lines
- mova [dstq +16], m2
- mova [dst8q ], m2
- palignr m4, m7, m2, 2
- mova [dstq +strideq +16], m4
- mova [dst8q+strideq ], m4
- palignr m4, m7, m2, 4
- mova [dstq +strideq*2+16], m4
- mova [dst8q+strideq*2 ], m4
- palignr m4, m7, m2, 6
- mova [dstq +stride3q +16], m4
- mova [dst8q+stride3q ], m4
- lea dstq, [dstq+strideq*4]
- lea dst8q, [dst8q+strideq*4]
- palignr m4, m7, m2, 8
- mova [dstq +16], m4
- mova [dst8q ], m4
- palignr m4, m7, m2, 10
- mova [dstq +strideq +16], m4
- mova [dst8q+strideq ], m4
- palignr m4, m7, m2, 12
- mova [dstq +strideq*2+16], m4
- mova [dst8q+strideq*2 ], m4
- palignr m4, m7, m2, 14
- mova [dstq +stride3q +16], m4
- mova [dst8q+stride3q ], m4
- lea dstq, [dstq+strideq*4]
- lea dst8q, [dst8q+strideq*4]
-
- ; output 2nd half of 3rd 8 lines and half of 4th 8 lines
- mova m0, [GLOBAL(sh_b23456789abcdefff)]
- mova [dstq +16], m7
- mova [dst8q ], m7
- pshufb m7, m0
- mova [dstq +strideq +16], m7
- mova [dst8q+strideq ], m7
- pshufb m7, m0
- mova [dstq +strideq*2+16], m7
- mova [dst8q+strideq*2 ], m7
- pshufb m7, m0
- mova [dstq +stride3q +16], m7
- mova [dst8q+stride3q ], m7
- pshufb m7, m0
- lea dstq, [dstq+strideq*4]
- lea dst8q, [dst8q+strideq*4]
- mova [dstq +16], m7
- mova [dst8q ], m7
- pshufb m7, m0
- mova [dstq +strideq +16], m7
- mova [dst8q+strideq ], m7
- pshufb m7, m0
- mova [dstq +strideq*2+16], m7
- mova [dst8q+strideq*2 ], m7
- pshufb m7, m0
- mova [dstq +stride3q +16], m7
- mova [dst8q+stride3q ], m7
- pshufb m7, m0
- lea dstq, [dstq+strideq*4]
-
- ; output last half of 4th 8 lines
- mova [dstq +16], m7
- mova [dstq +strideq +16], m7
- mova [dstq +strideq*2+16], m7
- mova [dstq +stride3q +16], m7
- lea dstq, [dstq+strideq*4]
- mova [dstq +16], m7
- mova [dstq +strideq +16], m7
- mova [dstq +strideq*2+16], m7
- mova [dstq +stride3q +16], m7
-
- ; done!
- RESTORE_GOT
- RET
diff --git a/thirdparty/libvpx/vpx_dsp/x86/inv_txfm_sse2.c b/thirdparty/libvpx/vpx_dsp/x86/inv_txfm_sse2.c
deleted file mode 100644
index df5068c624..0000000000
--- a/thirdparty/libvpx/vpx_dsp/x86/inv_txfm_sse2.c
+++ /dev/null
@@ -1,4046 +0,0 @@
-/*
- * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "./vpx_dsp_rtcd.h"
-#include "vpx_dsp/x86/inv_txfm_sse2.h"
-#include "vpx_dsp/x86/txfm_common_sse2.h"
-
-#define RECON_AND_STORE4X4(dest, in_x) \
-{ \
- __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest)); \
- d0 = _mm_unpacklo_epi8(d0, zero); \
- d0 = _mm_add_epi16(in_x, d0); \
- d0 = _mm_packus_epi16(d0, d0); \
- *(int *)(dest) = _mm_cvtsi128_si32(d0); \
-}
-
-void vpx_idct4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest,
- int stride) {
- const __m128i zero = _mm_setzero_si128();
- const __m128i eight = _mm_set1_epi16(8);
- const __m128i cst = _mm_setr_epi16(
- (int16_t)cospi_16_64, (int16_t)cospi_16_64, (int16_t)cospi_16_64,
- (int16_t)-cospi_16_64, (int16_t)cospi_24_64, (int16_t)-cospi_8_64,
- (int16_t)cospi_8_64, (int16_t)cospi_24_64);
- const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
- __m128i input0, input1, input2, input3;
-
- // Rows
- input0 = load_input_data(input);
- input2 = load_input_data(input + 8);
-
- // Construct i3, i1, i3, i1, i2, i0, i2, i0
- input0 = _mm_shufflelo_epi16(input0, 0xd8);
- input0 = _mm_shufflehi_epi16(input0, 0xd8);
- input2 = _mm_shufflelo_epi16(input2, 0xd8);
- input2 = _mm_shufflehi_epi16(input2, 0xd8);
-
- input1 = _mm_unpackhi_epi32(input0, input0);
- input0 = _mm_unpacklo_epi32(input0, input0);
- input3 = _mm_unpackhi_epi32(input2, input2);
- input2 = _mm_unpacklo_epi32(input2, input2);
-
- // Stage 1
- input0 = _mm_madd_epi16(input0, cst);
- input1 = _mm_madd_epi16(input1, cst);
- input2 = _mm_madd_epi16(input2, cst);
- input3 = _mm_madd_epi16(input3, cst);
-
- input0 = _mm_add_epi32(input0, rounding);
- input1 = _mm_add_epi32(input1, rounding);
- input2 = _mm_add_epi32(input2, rounding);
- input3 = _mm_add_epi32(input3, rounding);
-
- input0 = _mm_srai_epi32(input0, DCT_CONST_BITS);
- input1 = _mm_srai_epi32(input1, DCT_CONST_BITS);
- input2 = _mm_srai_epi32(input2, DCT_CONST_BITS);
- input3 = _mm_srai_epi32(input3, DCT_CONST_BITS);
-
- // Stage 2
- input0 = _mm_packs_epi32(input0, input1);
- input1 = _mm_packs_epi32(input2, input3);
-
- // Transpose
- input2 = _mm_unpacklo_epi16(input0, input1);
- input3 = _mm_unpackhi_epi16(input0, input1);
- input0 = _mm_unpacklo_epi32(input2, input3);
- input1 = _mm_unpackhi_epi32(input2, input3);
-
- // Switch column2, column 3, and then, we got:
- // input2: column1, column 0; input3: column2, column 3.
- input1 = _mm_shuffle_epi32(input1, 0x4e);
- input2 = _mm_add_epi16(input0, input1);
- input3 = _mm_sub_epi16(input0, input1);
-
- // Columns
- // Construct i3, i1, i3, i1, i2, i0, i2, i0
- input0 = _mm_unpacklo_epi32(input2, input2);
- input1 = _mm_unpackhi_epi32(input2, input2);
- input2 = _mm_unpackhi_epi32(input3, input3);
- input3 = _mm_unpacklo_epi32(input3, input3);
-
- // Stage 1
- input0 = _mm_madd_epi16(input0, cst);
- input1 = _mm_madd_epi16(input1, cst);
- input2 = _mm_madd_epi16(input2, cst);
- input3 = _mm_madd_epi16(input3, cst);
-
- input0 = _mm_add_epi32(input0, rounding);
- input1 = _mm_add_epi32(input1, rounding);
- input2 = _mm_add_epi32(input2, rounding);
- input3 = _mm_add_epi32(input3, rounding);
-
- input0 = _mm_srai_epi32(input0, DCT_CONST_BITS);
- input1 = _mm_srai_epi32(input1, DCT_CONST_BITS);
- input2 = _mm_srai_epi32(input2, DCT_CONST_BITS);
- input3 = _mm_srai_epi32(input3, DCT_CONST_BITS);
-
- // Stage 2
- input0 = _mm_packs_epi32(input0, input2);
- input1 = _mm_packs_epi32(input1, input3);
-
- // Transpose
- input2 = _mm_unpacklo_epi16(input0, input1);
- input3 = _mm_unpackhi_epi16(input0, input1);
- input0 = _mm_unpacklo_epi32(input2, input3);
- input1 = _mm_unpackhi_epi32(input2, input3);
-
- // Switch column2, column 3, and then, we got:
- // input2: column1, column 0; input3: column2, column 3.
- input1 = _mm_shuffle_epi32(input1, 0x4e);
- input2 = _mm_add_epi16(input0, input1);
- input3 = _mm_sub_epi16(input0, input1);
-
- // Final round and shift
- input2 = _mm_add_epi16(input2, eight);
- input3 = _mm_add_epi16(input3, eight);
-
- input2 = _mm_srai_epi16(input2, 4);
- input3 = _mm_srai_epi16(input3, 4);
-
- // Reconstruction and Store
- {
- __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest));
- __m128i d2 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 2));
- d0 = _mm_unpacklo_epi32(d0,
- _mm_cvtsi32_si128(*(const int *)(dest + stride)));
- d2 = _mm_unpacklo_epi32(
- _mm_cvtsi32_si128(*(const int *)(dest + stride * 3)), d2);
- d0 = _mm_unpacklo_epi8(d0, zero);
- d2 = _mm_unpacklo_epi8(d2, zero);
- d0 = _mm_add_epi16(d0, input2);
- d2 = _mm_add_epi16(d2, input3);
- d0 = _mm_packus_epi16(d0, d2);
- // store input0
- *(int *)dest = _mm_cvtsi128_si32(d0);
- // store input1
- d0 = _mm_srli_si128(d0, 4);
- *(int *)(dest + stride) = _mm_cvtsi128_si32(d0);
- // store input2
- d0 = _mm_srli_si128(d0, 4);
- *(int *)(dest + stride * 3) = _mm_cvtsi128_si32(d0);
- // store input3
- d0 = _mm_srli_si128(d0, 4);
- *(int *)(dest + stride * 2) = _mm_cvtsi128_si32(d0);
- }
-}
-
-void vpx_idct4x4_1_add_sse2(const tran_low_t *input, uint8_t *dest,
- int stride) {
- __m128i dc_value;
- const __m128i zero = _mm_setzero_si128();
- int a;
-
- a = (int)dct_const_round_shift(input[0] * cospi_16_64);
- a = (int)dct_const_round_shift(a * cospi_16_64);
- a = ROUND_POWER_OF_TWO(a, 4);
-
- dc_value = _mm_set1_epi16(a);
-
- RECON_AND_STORE4X4(dest + 0 * stride, dc_value);
- RECON_AND_STORE4X4(dest + 1 * stride, dc_value);
- RECON_AND_STORE4X4(dest + 2 * stride, dc_value);
- RECON_AND_STORE4X4(dest + 3 * stride, dc_value);
-}
-
-static INLINE void transpose_4x4(__m128i *res) {
- const __m128i tr0_0 = _mm_unpacklo_epi16(res[0], res[1]);
- const __m128i tr0_1 = _mm_unpackhi_epi16(res[0], res[1]);
-
- res[0] = _mm_unpacklo_epi16(tr0_0, tr0_1);
- res[1] = _mm_unpackhi_epi16(tr0_0, tr0_1);
-}
-
-void idct4_sse2(__m128i *in) {
- const __m128i k__cospi_p16_p16 = pair_set_epi16(cospi_16_64, cospi_16_64);
- const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
- const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
- const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
- const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
- __m128i u[8], v[8];
-
- transpose_4x4(in);
- // stage 1
- u[0] = _mm_unpacklo_epi16(in[0], in[1]);
- u[1] = _mm_unpackhi_epi16(in[0], in[1]);
- v[0] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
- v[1] = _mm_madd_epi16(u[0], k__cospi_p16_m16);
- v[2] = _mm_madd_epi16(u[1], k__cospi_p24_m08);
- v[3] = _mm_madd_epi16(u[1], k__cospi_p08_p24);
-
- u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
- u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
- u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
- u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
-
- v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
- v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
- v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
- v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
-
- u[0] = _mm_packs_epi32(v[0], v[1]);
- u[1] = _mm_packs_epi32(v[3], v[2]);
-
- // stage 2
- in[0] = _mm_add_epi16(u[0], u[1]);
- in[1] = _mm_sub_epi16(u[0], u[1]);
- in[1] = _mm_shuffle_epi32(in[1], 0x4E);
-}
-
-void iadst4_sse2(__m128i *in) {
- const __m128i k__sinpi_p01_p04 = pair_set_epi16(sinpi_1_9, sinpi_4_9);
- const __m128i k__sinpi_p03_p02 = pair_set_epi16(sinpi_3_9, sinpi_2_9);
- const __m128i k__sinpi_p02_m01 = pair_set_epi16(sinpi_2_9, -sinpi_1_9);
- const __m128i k__sinpi_p03_m04 = pair_set_epi16(sinpi_3_9, -sinpi_4_9);
- const __m128i k__sinpi_p03_p03 = _mm_set1_epi16((int16_t)sinpi_3_9);
- const __m128i kZero = _mm_set1_epi16(0);
- const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
- __m128i u[8], v[8], in7;
-
- transpose_4x4(in);
- in7 = _mm_srli_si128(in[1], 8);
- in7 = _mm_add_epi16(in7, in[0]);
- in7 = _mm_sub_epi16(in7, in[1]);
-
- u[0] = _mm_unpacklo_epi16(in[0], in[1]);
- u[1] = _mm_unpackhi_epi16(in[0], in[1]);
- u[2] = _mm_unpacklo_epi16(in7, kZero);
- u[3] = _mm_unpackhi_epi16(in[0], kZero);
-
- v[0] = _mm_madd_epi16(u[0], k__sinpi_p01_p04); // s0 + s3
- v[1] = _mm_madd_epi16(u[1], k__sinpi_p03_p02); // s2 + s5
- v[2] = _mm_madd_epi16(u[2], k__sinpi_p03_p03); // x2
- v[3] = _mm_madd_epi16(u[0], k__sinpi_p02_m01); // s1 - s4
- v[4] = _mm_madd_epi16(u[1], k__sinpi_p03_m04); // s2 - s6
- v[5] = _mm_madd_epi16(u[3], k__sinpi_p03_p03); // s2
-
- u[0] = _mm_add_epi32(v[0], v[1]);
- u[1] = _mm_add_epi32(v[3], v[4]);
- u[2] = v[2];
- u[3] = _mm_add_epi32(u[0], u[1]);
- u[4] = _mm_slli_epi32(v[5], 2);
- u[5] = _mm_add_epi32(u[3], v[5]);
- u[6] = _mm_sub_epi32(u[5], u[4]);
-
- v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
- v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
- v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
- v[3] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
-
- u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
- u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
- u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
- u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
-
- in[0] = _mm_packs_epi32(u[0], u[1]);
- in[1] = _mm_packs_epi32(u[2], u[3]);
-}
-
-#define TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, \
- out0, out1, out2, out3, out4, out5, out6, out7) \
- { \
- const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \
- const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \
- const __m128i tr0_2 = _mm_unpackhi_epi16(in0, in1); \
- const __m128i tr0_3 = _mm_unpackhi_epi16(in2, in3); \
- const __m128i tr0_4 = _mm_unpacklo_epi16(in4, in5); \
- const __m128i tr0_5 = _mm_unpacklo_epi16(in6, in7); \
- const __m128i tr0_6 = _mm_unpackhi_epi16(in4, in5); \
- const __m128i tr0_7 = _mm_unpackhi_epi16(in6, in7); \
- \
- const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \
- const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3); \
- const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); \
- const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3); \
- const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); \
- const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7); \
- const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); \
- const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7); \
- \
- out0 = _mm_unpacklo_epi64(tr1_0, tr1_4); \
- out1 = _mm_unpackhi_epi64(tr1_0, tr1_4); \
- out2 = _mm_unpacklo_epi64(tr1_2, tr1_6); \
- out3 = _mm_unpackhi_epi64(tr1_2, tr1_6); \
- out4 = _mm_unpacklo_epi64(tr1_1, tr1_5); \
- out5 = _mm_unpackhi_epi64(tr1_1, tr1_5); \
- out6 = _mm_unpacklo_epi64(tr1_3, tr1_7); \
- out7 = _mm_unpackhi_epi64(tr1_3, tr1_7); \
- }
-
-#define TRANSPOSE_4X8_10(tmp0, tmp1, tmp2, tmp3, \
- out0, out1, out2, out3) \
- { \
- const __m128i tr0_0 = _mm_unpackhi_epi16(tmp0, tmp1); \
- const __m128i tr0_1 = _mm_unpacklo_epi16(tmp1, tmp0); \
- const __m128i tr0_4 = _mm_unpacklo_epi16(tmp2, tmp3); \
- const __m128i tr0_5 = _mm_unpackhi_epi16(tmp3, tmp2); \
- \
- const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \
- const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); \
- const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); \
- const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); \
- \
- out0 = _mm_unpacklo_epi64(tr1_0, tr1_4); \
- out1 = _mm_unpackhi_epi64(tr1_0, tr1_4); \
- out2 = _mm_unpacklo_epi64(tr1_2, tr1_6); \
- out3 = _mm_unpackhi_epi64(tr1_2, tr1_6); \
- }
-
-#define TRANSPOSE_8X8_10(in0, in1, in2, in3, out0, out1) \
- { \
- const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \
- const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \
- out0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \
- out1 = _mm_unpackhi_epi32(tr0_0, tr0_1); \
- }
-
-// Define Macro for multiplying elements by constants and adding them together.
-#define MULTIPLICATION_AND_ADD(lo_0, hi_0, lo_1, hi_1, \
- cst0, cst1, cst2, cst3, res0, res1, res2, res3) \
- { \
- tmp0 = _mm_madd_epi16(lo_0, cst0); \
- tmp1 = _mm_madd_epi16(hi_0, cst0); \
- tmp2 = _mm_madd_epi16(lo_0, cst1); \
- tmp3 = _mm_madd_epi16(hi_0, cst1); \
- tmp4 = _mm_madd_epi16(lo_1, cst2); \
- tmp5 = _mm_madd_epi16(hi_1, cst2); \
- tmp6 = _mm_madd_epi16(lo_1, cst3); \
- tmp7 = _mm_madd_epi16(hi_1, cst3); \
- \
- tmp0 = _mm_add_epi32(tmp0, rounding); \
- tmp1 = _mm_add_epi32(tmp1, rounding); \
- tmp2 = _mm_add_epi32(tmp2, rounding); \
- tmp3 = _mm_add_epi32(tmp3, rounding); \
- tmp4 = _mm_add_epi32(tmp4, rounding); \
- tmp5 = _mm_add_epi32(tmp5, rounding); \
- tmp6 = _mm_add_epi32(tmp6, rounding); \
- tmp7 = _mm_add_epi32(tmp7, rounding); \
- \
- tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
- tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
- tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
- tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
- tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS); \
- tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS); \
- tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS); \
- tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS); \
- \
- res0 = _mm_packs_epi32(tmp0, tmp1); \
- res1 = _mm_packs_epi32(tmp2, tmp3); \
- res2 = _mm_packs_epi32(tmp4, tmp5); \
- res3 = _mm_packs_epi32(tmp6, tmp7); \
- }
-
-#define MULTIPLICATION_AND_ADD_2(lo_0, hi_0, cst0, cst1, res0, res1) \
- { \
- tmp0 = _mm_madd_epi16(lo_0, cst0); \
- tmp1 = _mm_madd_epi16(hi_0, cst0); \
- tmp2 = _mm_madd_epi16(lo_0, cst1); \
- tmp3 = _mm_madd_epi16(hi_0, cst1); \
- \
- tmp0 = _mm_add_epi32(tmp0, rounding); \
- tmp1 = _mm_add_epi32(tmp1, rounding); \
- tmp2 = _mm_add_epi32(tmp2, rounding); \
- tmp3 = _mm_add_epi32(tmp3, rounding); \
- \
- tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
- tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
- tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
- tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
- \
- res0 = _mm_packs_epi32(tmp0, tmp1); \
- res1 = _mm_packs_epi32(tmp2, tmp3); \
- }
-
-#define IDCT8(in0, in1, in2, in3, in4, in5, in6, in7, \
- out0, out1, out2, out3, out4, out5, out6, out7) \
- { \
- /* Stage1 */ \
- { \
- const __m128i lo_17 = _mm_unpacklo_epi16(in1, in7); \
- const __m128i hi_17 = _mm_unpackhi_epi16(in1, in7); \
- const __m128i lo_35 = _mm_unpacklo_epi16(in3, in5); \
- const __m128i hi_35 = _mm_unpackhi_epi16(in3, in5); \
- \
- MULTIPLICATION_AND_ADD(lo_17, hi_17, lo_35, hi_35, stg1_0, \
- stg1_1, stg1_2, stg1_3, stp1_4, \
- stp1_7, stp1_5, stp1_6) \
- } \
- \
- /* Stage2 */ \
- { \
- const __m128i lo_04 = _mm_unpacklo_epi16(in0, in4); \
- const __m128i hi_04 = _mm_unpackhi_epi16(in0, in4); \
- const __m128i lo_26 = _mm_unpacklo_epi16(in2, in6); \
- const __m128i hi_26 = _mm_unpackhi_epi16(in2, in6); \
- \
- MULTIPLICATION_AND_ADD(lo_04, hi_04, lo_26, hi_26, stg2_0, \
- stg2_1, stg2_2, stg2_3, stp2_0, \
- stp2_1, stp2_2, stp2_3) \
- \
- stp2_4 = _mm_adds_epi16(stp1_4, stp1_5); \
- stp2_5 = _mm_subs_epi16(stp1_4, stp1_5); \
- stp2_6 = _mm_subs_epi16(stp1_7, stp1_6); \
- stp2_7 = _mm_adds_epi16(stp1_7, stp1_6); \
- } \
- \
- /* Stage3 */ \
- { \
- const __m128i lo_56 = _mm_unpacklo_epi16(stp2_6, stp2_5); \
- const __m128i hi_56 = _mm_unpackhi_epi16(stp2_6, stp2_5); \
- \
- stp1_0 = _mm_adds_epi16(stp2_0, stp2_3); \
- stp1_1 = _mm_adds_epi16(stp2_1, stp2_2); \
- stp1_2 = _mm_subs_epi16(stp2_1, stp2_2); \
- stp1_3 = _mm_subs_epi16(stp2_0, stp2_3); \
- \
- tmp0 = _mm_madd_epi16(lo_56, stg2_1); \
- tmp1 = _mm_madd_epi16(hi_56, stg2_1); \
- tmp2 = _mm_madd_epi16(lo_56, stg2_0); \
- tmp3 = _mm_madd_epi16(hi_56, stg2_0); \
- \
- tmp0 = _mm_add_epi32(tmp0, rounding); \
- tmp1 = _mm_add_epi32(tmp1, rounding); \
- tmp2 = _mm_add_epi32(tmp2, rounding); \
- tmp3 = _mm_add_epi32(tmp3, rounding); \
- \
- tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
- tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
- tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
- tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
- \
- stp1_5 = _mm_packs_epi32(tmp0, tmp1); \
- stp1_6 = _mm_packs_epi32(tmp2, tmp3); \
- } \
- \
- /* Stage4 */ \
- out0 = _mm_adds_epi16(stp1_0, stp2_7); \
- out1 = _mm_adds_epi16(stp1_1, stp1_6); \
- out2 = _mm_adds_epi16(stp1_2, stp1_5); \
- out3 = _mm_adds_epi16(stp1_3, stp2_4); \
- out4 = _mm_subs_epi16(stp1_3, stp2_4); \
- out5 = _mm_subs_epi16(stp1_2, stp1_5); \
- out6 = _mm_subs_epi16(stp1_1, stp1_6); \
- out7 = _mm_subs_epi16(stp1_0, stp2_7); \
- }
-
-void vpx_idct8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest,
- int stride) {
- const __m128i zero = _mm_setzero_si128();
- const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
- const __m128i final_rounding = _mm_set1_epi16(1 << 4);
- const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
- const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
- const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64);
- const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64);
- const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
- const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
- const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
- const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
-
- __m128i in0, in1, in2, in3, in4, in5, in6, in7;
- __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7;
- __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7;
- __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
- int i;
-
- // Load input data.
- in0 = load_input_data(input);
- in1 = load_input_data(input + 8 * 1);
- in2 = load_input_data(input + 8 * 2);
- in3 = load_input_data(input + 8 * 3);
- in4 = load_input_data(input + 8 * 4);
- in5 = load_input_data(input + 8 * 5);
- in6 = load_input_data(input + 8 * 6);
- in7 = load_input_data(input + 8 * 7);
-
- // 2-D
- for (i = 0; i < 2; i++) {
- // 8x8 Transpose is copied from vpx_fdct8x8_sse2()
- TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7,
- in0, in1, in2, in3, in4, in5, in6, in7);
-
- // 4-stage 1D idct8x8
- IDCT8(in0, in1, in2, in3, in4, in5, in6, in7,
- in0, in1, in2, in3, in4, in5, in6, in7);
- }
-
- // Final rounding and shift
- in0 = _mm_adds_epi16(in0, final_rounding);
- in1 = _mm_adds_epi16(in1, final_rounding);
- in2 = _mm_adds_epi16(in2, final_rounding);
- in3 = _mm_adds_epi16(in3, final_rounding);
- in4 = _mm_adds_epi16(in4, final_rounding);
- in5 = _mm_adds_epi16(in5, final_rounding);
- in6 = _mm_adds_epi16(in6, final_rounding);
- in7 = _mm_adds_epi16(in7, final_rounding);
-
- in0 = _mm_srai_epi16(in0, 5);
- in1 = _mm_srai_epi16(in1, 5);
- in2 = _mm_srai_epi16(in2, 5);
- in3 = _mm_srai_epi16(in3, 5);
- in4 = _mm_srai_epi16(in4, 5);
- in5 = _mm_srai_epi16(in5, 5);
- in6 = _mm_srai_epi16(in6, 5);
- in7 = _mm_srai_epi16(in7, 5);
-
- RECON_AND_STORE(dest + 0 * stride, in0);
- RECON_AND_STORE(dest + 1 * stride, in1);
- RECON_AND_STORE(dest + 2 * stride, in2);
- RECON_AND_STORE(dest + 3 * stride, in3);
- RECON_AND_STORE(dest + 4 * stride, in4);
- RECON_AND_STORE(dest + 5 * stride, in5);
- RECON_AND_STORE(dest + 6 * stride, in6);
- RECON_AND_STORE(dest + 7 * stride, in7);
-}
-
-void vpx_idct8x8_1_add_sse2(const tran_low_t *input, uint8_t *dest,
- int stride) {
- __m128i dc_value;
- const __m128i zero = _mm_setzero_si128();
- int a;
-
- a = (int)dct_const_round_shift(input[0] * cospi_16_64);
- a = (int)dct_const_round_shift(a * cospi_16_64);
- a = ROUND_POWER_OF_TWO(a, 5);
-
- dc_value = _mm_set1_epi16(a);
-
- RECON_AND_STORE(dest + 0 * stride, dc_value);
- RECON_AND_STORE(dest + 1 * stride, dc_value);
- RECON_AND_STORE(dest + 2 * stride, dc_value);
- RECON_AND_STORE(dest + 3 * stride, dc_value);
- RECON_AND_STORE(dest + 4 * stride, dc_value);
- RECON_AND_STORE(dest + 5 * stride, dc_value);
- RECON_AND_STORE(dest + 6 * stride, dc_value);
- RECON_AND_STORE(dest + 7 * stride, dc_value);
-}
-
-void idct8_sse2(__m128i *in) {
- const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
- const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
- const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
- const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64);
- const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64);
- const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
- const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
- const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
- const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
-
- __m128i in0, in1, in2, in3, in4, in5, in6, in7;
- __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7;
- __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7;
- __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
-
- // 8x8 Transpose is copied from vpx_fdct8x8_sse2()
- TRANSPOSE_8X8(in[0], in[1], in[2], in[3], in[4], in[5], in[6], in[7],
- in0, in1, in2, in3, in4, in5, in6, in7);
-
- // 4-stage 1D idct8x8
- IDCT8(in0, in1, in2, in3, in4, in5, in6, in7,
- in[0], in[1], in[2], in[3], in[4], in[5], in[6], in[7]);
-}
-
-void iadst8_sse2(__m128i *in) {
- const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64);
- const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64);
- const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64);
- const __m128i k__cospi_p22_m10 = pair_set_epi16(cospi_22_64, -cospi_10_64);
- const __m128i k__cospi_p18_p14 = pair_set_epi16(cospi_18_64, cospi_14_64);
- const __m128i k__cospi_p14_m18 = pair_set_epi16(cospi_14_64, -cospi_18_64);
- const __m128i k__cospi_p26_p06 = pair_set_epi16(cospi_26_64, cospi_6_64);
- const __m128i k__cospi_p06_m26 = pair_set_epi16(cospi_6_64, -cospi_26_64);
- const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
- const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
- const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64);
- const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
- const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
- const __m128i k__const_0 = _mm_set1_epi16(0);
- const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
-
- __m128i u0, u1, u2, u3, u4, u5, u6, u7, u8, u9, u10, u11, u12, u13, u14, u15;
- __m128i v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15;
- __m128i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9, w10, w11, w12, w13, w14, w15;
- __m128i s0, s1, s2, s3, s4, s5, s6, s7;
- __m128i in0, in1, in2, in3, in4, in5, in6, in7;
-
- // transpose
- array_transpose_8x8(in, in);
-
- // properly aligned for butterfly input
- in0 = in[7];
- in1 = in[0];
- in2 = in[5];
- in3 = in[2];
- in4 = in[3];
- in5 = in[4];
- in6 = in[1];
- in7 = in[6];
-
- // column transformation
- // stage 1
- // interleave and multiply/add into 32-bit integer
- s0 = _mm_unpacklo_epi16(in0, in1);
- s1 = _mm_unpackhi_epi16(in0, in1);
- s2 = _mm_unpacklo_epi16(in2, in3);
- s3 = _mm_unpackhi_epi16(in2, in3);
- s4 = _mm_unpacklo_epi16(in4, in5);
- s5 = _mm_unpackhi_epi16(in4, in5);
- s6 = _mm_unpacklo_epi16(in6, in7);
- s7 = _mm_unpackhi_epi16(in6, in7);
-
- u0 = _mm_madd_epi16(s0, k__cospi_p02_p30);
- u1 = _mm_madd_epi16(s1, k__cospi_p02_p30);
- u2 = _mm_madd_epi16(s0, k__cospi_p30_m02);
- u3 = _mm_madd_epi16(s1, k__cospi_p30_m02);
- u4 = _mm_madd_epi16(s2, k__cospi_p10_p22);
- u5 = _mm_madd_epi16(s3, k__cospi_p10_p22);
- u6 = _mm_madd_epi16(s2, k__cospi_p22_m10);
- u7 = _mm_madd_epi16(s3, k__cospi_p22_m10);
- u8 = _mm_madd_epi16(s4, k__cospi_p18_p14);
- u9 = _mm_madd_epi16(s5, k__cospi_p18_p14);
- u10 = _mm_madd_epi16(s4, k__cospi_p14_m18);
- u11 = _mm_madd_epi16(s5, k__cospi_p14_m18);
- u12 = _mm_madd_epi16(s6, k__cospi_p26_p06);
- u13 = _mm_madd_epi16(s7, k__cospi_p26_p06);
- u14 = _mm_madd_epi16(s6, k__cospi_p06_m26);
- u15 = _mm_madd_epi16(s7, k__cospi_p06_m26);
-
- // addition
- w0 = _mm_add_epi32(u0, u8);
- w1 = _mm_add_epi32(u1, u9);
- w2 = _mm_add_epi32(u2, u10);
- w3 = _mm_add_epi32(u3, u11);
- w4 = _mm_add_epi32(u4, u12);
- w5 = _mm_add_epi32(u5, u13);
- w6 = _mm_add_epi32(u6, u14);
- w7 = _mm_add_epi32(u7, u15);
- w8 = _mm_sub_epi32(u0, u8);
- w9 = _mm_sub_epi32(u1, u9);
- w10 = _mm_sub_epi32(u2, u10);
- w11 = _mm_sub_epi32(u3, u11);
- w12 = _mm_sub_epi32(u4, u12);
- w13 = _mm_sub_epi32(u5, u13);
- w14 = _mm_sub_epi32(u6, u14);
- w15 = _mm_sub_epi32(u7, u15);
-
- // shift and rounding
- v0 = _mm_add_epi32(w0, k__DCT_CONST_ROUNDING);
- v1 = _mm_add_epi32(w1, k__DCT_CONST_ROUNDING);
- v2 = _mm_add_epi32(w2, k__DCT_CONST_ROUNDING);
- v3 = _mm_add_epi32(w3, k__DCT_CONST_ROUNDING);
- v4 = _mm_add_epi32(w4, k__DCT_CONST_ROUNDING);
- v5 = _mm_add_epi32(w5, k__DCT_CONST_ROUNDING);
- v6 = _mm_add_epi32(w6, k__DCT_CONST_ROUNDING);
- v7 = _mm_add_epi32(w7, k__DCT_CONST_ROUNDING);
- v8 = _mm_add_epi32(w8, k__DCT_CONST_ROUNDING);
- v9 = _mm_add_epi32(w9, k__DCT_CONST_ROUNDING);
- v10 = _mm_add_epi32(w10, k__DCT_CONST_ROUNDING);
- v11 = _mm_add_epi32(w11, k__DCT_CONST_ROUNDING);
- v12 = _mm_add_epi32(w12, k__DCT_CONST_ROUNDING);
- v13 = _mm_add_epi32(w13, k__DCT_CONST_ROUNDING);
- v14 = _mm_add_epi32(w14, k__DCT_CONST_ROUNDING);
- v15 = _mm_add_epi32(w15, k__DCT_CONST_ROUNDING);
-
- u0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
- u1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
- u2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
- u3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
- u4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
- u5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
- u6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
- u7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
- u8 = _mm_srai_epi32(v8, DCT_CONST_BITS);
- u9 = _mm_srai_epi32(v9, DCT_CONST_BITS);
- u10 = _mm_srai_epi32(v10, DCT_CONST_BITS);
- u11 = _mm_srai_epi32(v11, DCT_CONST_BITS);
- u12 = _mm_srai_epi32(v12, DCT_CONST_BITS);
- u13 = _mm_srai_epi32(v13, DCT_CONST_BITS);
- u14 = _mm_srai_epi32(v14, DCT_CONST_BITS);
- u15 = _mm_srai_epi32(v15, DCT_CONST_BITS);
-
- // back to 16-bit and pack 8 integers into __m128i
- in[0] = _mm_packs_epi32(u0, u1);
- in[1] = _mm_packs_epi32(u2, u3);
- in[2] = _mm_packs_epi32(u4, u5);
- in[3] = _mm_packs_epi32(u6, u7);
- in[4] = _mm_packs_epi32(u8, u9);
- in[5] = _mm_packs_epi32(u10, u11);
- in[6] = _mm_packs_epi32(u12, u13);
- in[7] = _mm_packs_epi32(u14, u15);
-
- // stage 2
- s0 = _mm_add_epi16(in[0], in[2]);
- s1 = _mm_add_epi16(in[1], in[3]);
- s2 = _mm_sub_epi16(in[0], in[2]);
- s3 = _mm_sub_epi16(in[1], in[3]);
- u0 = _mm_unpacklo_epi16(in[4], in[5]);
- u1 = _mm_unpackhi_epi16(in[4], in[5]);
- u2 = _mm_unpacklo_epi16(in[6], in[7]);
- u3 = _mm_unpackhi_epi16(in[6], in[7]);
-
- v0 = _mm_madd_epi16(u0, k__cospi_p08_p24);
- v1 = _mm_madd_epi16(u1, k__cospi_p08_p24);
- v2 = _mm_madd_epi16(u0, k__cospi_p24_m08);
- v3 = _mm_madd_epi16(u1, k__cospi_p24_m08);
- v4 = _mm_madd_epi16(u2, k__cospi_m24_p08);
- v5 = _mm_madd_epi16(u3, k__cospi_m24_p08);
- v6 = _mm_madd_epi16(u2, k__cospi_p08_p24);
- v7 = _mm_madd_epi16(u3, k__cospi_p08_p24);
-
- w0 = _mm_add_epi32(v0, v4);
- w1 = _mm_add_epi32(v1, v5);
- w2 = _mm_add_epi32(v2, v6);
- w3 = _mm_add_epi32(v3, v7);
- w4 = _mm_sub_epi32(v0, v4);
- w5 = _mm_sub_epi32(v1, v5);
- w6 = _mm_sub_epi32(v2, v6);
- w7 = _mm_sub_epi32(v3, v7);
-
- v0 = _mm_add_epi32(w0, k__DCT_CONST_ROUNDING);
- v1 = _mm_add_epi32(w1, k__DCT_CONST_ROUNDING);
- v2 = _mm_add_epi32(w2, k__DCT_CONST_ROUNDING);
- v3 = _mm_add_epi32(w3, k__DCT_CONST_ROUNDING);
- v4 = _mm_add_epi32(w4, k__DCT_CONST_ROUNDING);
- v5 = _mm_add_epi32(w5, k__DCT_CONST_ROUNDING);
- v6 = _mm_add_epi32(w6, k__DCT_CONST_ROUNDING);
- v7 = _mm_add_epi32(w7, k__DCT_CONST_ROUNDING);
-
- u0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
- u1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
- u2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
- u3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
- u4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
- u5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
- u6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
- u7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
-
- // back to 16-bit intergers
- s4 = _mm_packs_epi32(u0, u1);
- s5 = _mm_packs_epi32(u2, u3);
- s6 = _mm_packs_epi32(u4, u5);
- s7 = _mm_packs_epi32(u6, u7);
-
- // stage 3
- u0 = _mm_unpacklo_epi16(s2, s3);
- u1 = _mm_unpackhi_epi16(s2, s3);
- u2 = _mm_unpacklo_epi16(s6, s7);
- u3 = _mm_unpackhi_epi16(s6, s7);
-
- v0 = _mm_madd_epi16(u0, k__cospi_p16_p16);
- v1 = _mm_madd_epi16(u1, k__cospi_p16_p16);
- v2 = _mm_madd_epi16(u0, k__cospi_p16_m16);
- v3 = _mm_madd_epi16(u1, k__cospi_p16_m16);
- v4 = _mm_madd_epi16(u2, k__cospi_p16_p16);
- v5 = _mm_madd_epi16(u3, k__cospi_p16_p16);
- v6 = _mm_madd_epi16(u2, k__cospi_p16_m16);
- v7 = _mm_madd_epi16(u3, k__cospi_p16_m16);
-
- u0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING);
- u1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING);
- u2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING);
- u3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING);
- u4 = _mm_add_epi32(v4, k__DCT_CONST_ROUNDING);
- u5 = _mm_add_epi32(v5, k__DCT_CONST_ROUNDING);
- u6 = _mm_add_epi32(v6, k__DCT_CONST_ROUNDING);
- u7 = _mm_add_epi32(v7, k__DCT_CONST_ROUNDING);
-
- v0 = _mm_srai_epi32(u0, DCT_CONST_BITS);
- v1 = _mm_srai_epi32(u1, DCT_CONST_BITS);
- v2 = _mm_srai_epi32(u2, DCT_CONST_BITS);
- v3 = _mm_srai_epi32(u3, DCT_CONST_BITS);
- v4 = _mm_srai_epi32(u4, DCT_CONST_BITS);
- v5 = _mm_srai_epi32(u5, DCT_CONST_BITS);
- v6 = _mm_srai_epi32(u6, DCT_CONST_BITS);
- v7 = _mm_srai_epi32(u7, DCT_CONST_BITS);
-
- s2 = _mm_packs_epi32(v0, v1);
- s3 = _mm_packs_epi32(v2, v3);
- s6 = _mm_packs_epi32(v4, v5);
- s7 = _mm_packs_epi32(v6, v7);
-
- in[0] = s0;
- in[1] = _mm_sub_epi16(k__const_0, s4);
- in[2] = s6;
- in[3] = _mm_sub_epi16(k__const_0, s2);
- in[4] = s3;
- in[5] = _mm_sub_epi16(k__const_0, s7);
- in[6] = s5;
- in[7] = _mm_sub_epi16(k__const_0, s1);
-}
-
-void vpx_idct8x8_12_add_sse2(const tran_low_t *input, uint8_t *dest,
- int stride) {
- const __m128i zero = _mm_setzero_si128();
- const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
- const __m128i final_rounding = _mm_set1_epi16(1 << 4);
- const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
- const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
- const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64);
- const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64);
- const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
- const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
- const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
- const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
- const __m128i stg3_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
-
- __m128i in0, in1, in2, in3, in4, in5, in6, in7;
- __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7;
- __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7;
- __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
-
- // Rows. Load 4-row input data.
- in0 = load_input_data(input);
- in1 = load_input_data(input + 8 * 1);
- in2 = load_input_data(input + 8 * 2);
- in3 = load_input_data(input + 8 * 3);
-
- // 8x4 Transpose
- TRANSPOSE_8X8_10(in0, in1, in2, in3, in0, in1);
- // Stage1
- {
- const __m128i lo_17 = _mm_unpackhi_epi16(in0, zero);
- const __m128i lo_35 = _mm_unpackhi_epi16(in1, zero);
-
- tmp0 = _mm_madd_epi16(lo_17, stg1_0);
- tmp2 = _mm_madd_epi16(lo_17, stg1_1);
- tmp4 = _mm_madd_epi16(lo_35, stg1_2);
- tmp6 = _mm_madd_epi16(lo_35, stg1_3);
-
- tmp0 = _mm_add_epi32(tmp0, rounding);
- tmp2 = _mm_add_epi32(tmp2, rounding);
- tmp4 = _mm_add_epi32(tmp4, rounding);
- tmp6 = _mm_add_epi32(tmp6, rounding);
- tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
- tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
- tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);
- tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS);
-
- stp1_4 = _mm_packs_epi32(tmp0, tmp2);
- stp1_5 = _mm_packs_epi32(tmp4, tmp6);
- }
-
- // Stage2
- {
- const __m128i lo_04 = _mm_unpacklo_epi16(in0, zero);
- const __m128i lo_26 = _mm_unpacklo_epi16(in1, zero);
-
- tmp0 = _mm_madd_epi16(lo_04, stg2_0);
- tmp2 = _mm_madd_epi16(lo_04, stg2_1);
- tmp4 = _mm_madd_epi16(lo_26, stg2_2);
- tmp6 = _mm_madd_epi16(lo_26, stg2_3);
-
- tmp0 = _mm_add_epi32(tmp0, rounding);
- tmp2 = _mm_add_epi32(tmp2, rounding);
- tmp4 = _mm_add_epi32(tmp4, rounding);
- tmp6 = _mm_add_epi32(tmp6, rounding);
- tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
- tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
- tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);
- tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS);
-
- stp2_0 = _mm_packs_epi32(tmp0, tmp2);
- stp2_2 = _mm_packs_epi32(tmp6, tmp4);
-
- tmp0 = _mm_adds_epi16(stp1_4, stp1_5);
- tmp1 = _mm_subs_epi16(stp1_4, stp1_5);
-
- stp2_4 = tmp0;
- stp2_5 = _mm_unpacklo_epi64(tmp1, zero);
- stp2_6 = _mm_unpackhi_epi64(tmp1, zero);
- }
-
- // Stage3
- {
- const __m128i lo_56 = _mm_unpacklo_epi16(stp2_5, stp2_6);
-
- tmp4 = _mm_adds_epi16(stp2_0, stp2_2);
- tmp6 = _mm_subs_epi16(stp2_0, stp2_2);
-
- stp1_2 = _mm_unpackhi_epi64(tmp6, tmp4);
- stp1_3 = _mm_unpacklo_epi64(tmp6, tmp4);
-
- tmp0 = _mm_madd_epi16(lo_56, stg3_0);
- tmp2 = _mm_madd_epi16(lo_56, stg2_0); // stg3_1 = stg2_0
-
- tmp0 = _mm_add_epi32(tmp0, rounding);
- tmp2 = _mm_add_epi32(tmp2, rounding);
- tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
- tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
-
- stp1_5 = _mm_packs_epi32(tmp0, tmp2);
- }
-
- // Stage4
- tmp0 = _mm_adds_epi16(stp1_3, stp2_4);
- tmp1 = _mm_adds_epi16(stp1_2, stp1_5);
- tmp2 = _mm_subs_epi16(stp1_3, stp2_4);
- tmp3 = _mm_subs_epi16(stp1_2, stp1_5);
-
- TRANSPOSE_4X8_10(tmp0, tmp1, tmp2, tmp3, in0, in1, in2, in3)
-
- IDCT8(in0, in1, in2, in3, zero, zero, zero, zero,
- in0, in1, in2, in3, in4, in5, in6, in7);
- // Final rounding and shift
- in0 = _mm_adds_epi16(in0, final_rounding);
- in1 = _mm_adds_epi16(in1, final_rounding);
- in2 = _mm_adds_epi16(in2, final_rounding);
- in3 = _mm_adds_epi16(in3, final_rounding);
- in4 = _mm_adds_epi16(in4, final_rounding);
- in5 = _mm_adds_epi16(in5, final_rounding);
- in6 = _mm_adds_epi16(in6, final_rounding);
- in7 = _mm_adds_epi16(in7, final_rounding);
-
- in0 = _mm_srai_epi16(in0, 5);
- in1 = _mm_srai_epi16(in1, 5);
- in2 = _mm_srai_epi16(in2, 5);
- in3 = _mm_srai_epi16(in3, 5);
- in4 = _mm_srai_epi16(in4, 5);
- in5 = _mm_srai_epi16(in5, 5);
- in6 = _mm_srai_epi16(in6, 5);
- in7 = _mm_srai_epi16(in7, 5);
-
- RECON_AND_STORE(dest + 0 * stride, in0);
- RECON_AND_STORE(dest + 1 * stride, in1);
- RECON_AND_STORE(dest + 2 * stride, in2);
- RECON_AND_STORE(dest + 3 * stride, in3);
- RECON_AND_STORE(dest + 4 * stride, in4);
- RECON_AND_STORE(dest + 5 * stride, in5);
- RECON_AND_STORE(dest + 6 * stride, in6);
- RECON_AND_STORE(dest + 7 * stride, in7);
-}
-
-#define IDCT16 \
- /* Stage2 */ \
- { \
- const __m128i lo_1_15 = _mm_unpacklo_epi16(in[1], in[15]); \
- const __m128i hi_1_15 = _mm_unpackhi_epi16(in[1], in[15]); \
- const __m128i lo_9_7 = _mm_unpacklo_epi16(in[9], in[7]); \
- const __m128i hi_9_7 = _mm_unpackhi_epi16(in[9], in[7]); \
- const __m128i lo_5_11 = _mm_unpacklo_epi16(in[5], in[11]); \
- const __m128i hi_5_11 = _mm_unpackhi_epi16(in[5], in[11]); \
- const __m128i lo_13_3 = _mm_unpacklo_epi16(in[13], in[3]); \
- const __m128i hi_13_3 = _mm_unpackhi_epi16(in[13], in[3]); \
- \
- MULTIPLICATION_AND_ADD(lo_1_15, hi_1_15, lo_9_7, hi_9_7, \
- stg2_0, stg2_1, stg2_2, stg2_3, \
- stp2_8, stp2_15, stp2_9, stp2_14) \
- \
- MULTIPLICATION_AND_ADD(lo_5_11, hi_5_11, lo_13_3, hi_13_3, \
- stg2_4, stg2_5, stg2_6, stg2_7, \
- stp2_10, stp2_13, stp2_11, stp2_12) \
- } \
- \
- /* Stage3 */ \
- { \
- const __m128i lo_2_14 = _mm_unpacklo_epi16(in[2], in[14]); \
- const __m128i hi_2_14 = _mm_unpackhi_epi16(in[2], in[14]); \
- const __m128i lo_10_6 = _mm_unpacklo_epi16(in[10], in[6]); \
- const __m128i hi_10_6 = _mm_unpackhi_epi16(in[10], in[6]); \
- \
- MULTIPLICATION_AND_ADD(lo_2_14, hi_2_14, lo_10_6, hi_10_6, \
- stg3_0, stg3_1, stg3_2, stg3_3, \
- stp1_4, stp1_7, stp1_5, stp1_6) \
- \
- stp1_8_0 = _mm_add_epi16(stp2_8, stp2_9); \
- stp1_9 = _mm_sub_epi16(stp2_8, stp2_9); \
- stp1_10 = _mm_sub_epi16(stp2_11, stp2_10); \
- stp1_11 = _mm_add_epi16(stp2_11, stp2_10); \
- \
- stp1_12_0 = _mm_add_epi16(stp2_12, stp2_13); \
- stp1_13 = _mm_sub_epi16(stp2_12, stp2_13); \
- stp1_14 = _mm_sub_epi16(stp2_15, stp2_14); \
- stp1_15 = _mm_add_epi16(stp2_15, stp2_14); \
- } \
- \
- /* Stage4 */ \
- { \
- const __m128i lo_0_8 = _mm_unpacklo_epi16(in[0], in[8]); \
- const __m128i hi_0_8 = _mm_unpackhi_epi16(in[0], in[8]); \
- const __m128i lo_4_12 = _mm_unpacklo_epi16(in[4], in[12]); \
- const __m128i hi_4_12 = _mm_unpackhi_epi16(in[4], in[12]); \
- \
- const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); \
- const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); \
- const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
- const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
- \
- MULTIPLICATION_AND_ADD(lo_0_8, hi_0_8, lo_4_12, hi_4_12, \
- stg4_0, stg4_1, stg4_2, stg4_3, \
- stp2_0, stp2_1, stp2_2, stp2_3) \
- \
- stp2_4 = _mm_add_epi16(stp1_4, stp1_5); \
- stp2_5 = _mm_sub_epi16(stp1_4, stp1_5); \
- stp2_6 = _mm_sub_epi16(stp1_7, stp1_6); \
- stp2_7 = _mm_add_epi16(stp1_7, stp1_6); \
- \
- MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, \
- stg4_4, stg4_5, stg4_6, stg4_7, \
- stp2_9, stp2_14, stp2_10, stp2_13) \
- } \
- \
- /* Stage5 */ \
- { \
- const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \
- const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \
- \
- stp1_0 = _mm_add_epi16(stp2_0, stp2_3); \
- stp1_1 = _mm_add_epi16(stp2_1, stp2_2); \
- stp1_2 = _mm_sub_epi16(stp2_1, stp2_2); \
- stp1_3 = _mm_sub_epi16(stp2_0, stp2_3); \
- \
- tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \
- tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \
- tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \
- tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \
- \
- tmp0 = _mm_add_epi32(tmp0, rounding); \
- tmp1 = _mm_add_epi32(tmp1, rounding); \
- tmp2 = _mm_add_epi32(tmp2, rounding); \
- tmp3 = _mm_add_epi32(tmp3, rounding); \
- \
- tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
- tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
- tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
- tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
- \
- stp1_5 = _mm_packs_epi32(tmp0, tmp1); \
- stp1_6 = _mm_packs_epi32(tmp2, tmp3); \
- \
- stp1_8 = _mm_add_epi16(stp1_8_0, stp1_11); \
- stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \
- stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \
- stp1_11 = _mm_sub_epi16(stp1_8_0, stp1_11); \
- \
- stp1_12 = _mm_sub_epi16(stp1_15, stp1_12_0); \
- stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \
- stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \
- stp1_15 = _mm_add_epi16(stp1_15, stp1_12_0); \
- } \
- \
- /* Stage6 */ \
- { \
- const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
- const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
- const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \
- const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \
- \
- stp2_0 = _mm_add_epi16(stp1_0, stp2_7); \
- stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \
- stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \
- stp2_3 = _mm_add_epi16(stp1_3, stp2_4); \
- stp2_4 = _mm_sub_epi16(stp1_3, stp2_4); \
- stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \
- stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \
- stp2_7 = _mm_sub_epi16(stp1_0, stp2_7); \
- \
- MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, \
- stg6_0, stg4_0, stg6_0, stg4_0, \
- stp2_10, stp2_13, stp2_11, stp2_12) \
- }
-
-#define IDCT16_10 \
- /* Stage2 */ \
- { \
- const __m128i lo_1_15 = _mm_unpacklo_epi16(in[1], zero); \
- const __m128i hi_1_15 = _mm_unpackhi_epi16(in[1], zero); \
- const __m128i lo_13_3 = _mm_unpacklo_epi16(zero, in[3]); \
- const __m128i hi_13_3 = _mm_unpackhi_epi16(zero, in[3]); \
- \
- MULTIPLICATION_AND_ADD(lo_1_15, hi_1_15, lo_13_3, hi_13_3, \
- stg2_0, stg2_1, stg2_6, stg2_7, \
- stp1_8_0, stp1_15, stp1_11, stp1_12_0) \
- } \
- \
- /* Stage3 */ \
- { \
- const __m128i lo_2_14 = _mm_unpacklo_epi16(in[2], zero); \
- const __m128i hi_2_14 = _mm_unpackhi_epi16(in[2], zero); \
- \
- MULTIPLICATION_AND_ADD_2(lo_2_14, hi_2_14, \
- stg3_0, stg3_1, \
- stp2_4, stp2_7) \
- \
- stp1_9 = stp1_8_0; \
- stp1_10 = stp1_11; \
- \
- stp1_13 = stp1_12_0; \
- stp1_14 = stp1_15; \
- } \
- \
- /* Stage4 */ \
- { \
- const __m128i lo_0_8 = _mm_unpacklo_epi16(in[0], zero); \
- const __m128i hi_0_8 = _mm_unpackhi_epi16(in[0], zero); \
- \
- const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); \
- const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); \
- const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
- const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
- \
- MULTIPLICATION_AND_ADD_2(lo_0_8, hi_0_8, \
- stg4_0, stg4_1, \
- stp1_0, stp1_1) \
- stp2_5 = stp2_4; \
- stp2_6 = stp2_7; \
- \
- MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, \
- stg4_4, stg4_5, stg4_6, stg4_7, \
- stp2_9, stp2_14, stp2_10, stp2_13) \
- } \
- \
- /* Stage5 */ \
- { \
- const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \
- const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \
- \
- stp1_2 = stp1_1; \
- stp1_3 = stp1_0; \
- \
- tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \
- tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \
- tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \
- tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \
- \
- tmp0 = _mm_add_epi32(tmp0, rounding); \
- tmp1 = _mm_add_epi32(tmp1, rounding); \
- tmp2 = _mm_add_epi32(tmp2, rounding); \
- tmp3 = _mm_add_epi32(tmp3, rounding); \
- \
- tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
- tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
- tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
- tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
- \
- stp1_5 = _mm_packs_epi32(tmp0, tmp1); \
- stp1_6 = _mm_packs_epi32(tmp2, tmp3); \
- \
- stp1_8 = _mm_add_epi16(stp1_8_0, stp1_11); \
- stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \
- stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \
- stp1_11 = _mm_sub_epi16(stp1_8_0, stp1_11); \
- \
- stp1_12 = _mm_sub_epi16(stp1_15, stp1_12_0); \
- stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \
- stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \
- stp1_15 = _mm_add_epi16(stp1_15, stp1_12_0); \
- } \
- \
- /* Stage6 */ \
- { \
- const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
- const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
- const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \
- const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \
- \
- stp2_0 = _mm_add_epi16(stp1_0, stp2_7); \
- stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \
- stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \
- stp2_3 = _mm_add_epi16(stp1_3, stp2_4); \
- stp2_4 = _mm_sub_epi16(stp1_3, stp2_4); \
- stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \
- stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \
- stp2_7 = _mm_sub_epi16(stp1_0, stp2_7); \
- \
- MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, \
- stg6_0, stg4_0, stg6_0, stg4_0, \
- stp2_10, stp2_13, stp2_11, stp2_12) \
- }
-
-void vpx_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest,
- int stride) {
- const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
- const __m128i final_rounding = _mm_set1_epi16(1 << 5);
- const __m128i zero = _mm_setzero_si128();
-
- const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
- const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);
- const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64);
- const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64);
- const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64);
- const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64);
- const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);
- const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64);
-
- const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
- const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
- const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64);
- const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64);
-
- const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
- const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
- const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
- const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
- const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
- const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
- const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
- const __m128i stg4_7 = pair_set_epi16(-cospi_8_64, cospi_24_64);
-
- const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
-
- __m128i in[16], l[16], r[16], *curr1;
- __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7,
- stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,
- stp1_8_0, stp1_12_0;
- __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
- stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15;
- __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
- int i;
-
- curr1 = l;
- for (i = 0; i < 2; i++) {
- // 1-D idct
-
- // Load input data.
- in[0] = load_input_data(input);
- in[8] = load_input_data(input + 8 * 1);
- in[1] = load_input_data(input + 8 * 2);
- in[9] = load_input_data(input + 8 * 3);
- in[2] = load_input_data(input + 8 * 4);
- in[10] = load_input_data(input + 8 * 5);
- in[3] = load_input_data(input + 8 * 6);
- in[11] = load_input_data(input + 8 * 7);
- in[4] = load_input_data(input + 8 * 8);
- in[12] = load_input_data(input + 8 * 9);
- in[5] = load_input_data(input + 8 * 10);
- in[13] = load_input_data(input + 8 * 11);
- in[6] = load_input_data(input + 8 * 12);
- in[14] = load_input_data(input + 8 * 13);
- in[7] = load_input_data(input + 8 * 14);
- in[15] = load_input_data(input + 8 * 15);
-
- array_transpose_8x8(in, in);
- array_transpose_8x8(in + 8, in + 8);
-
- IDCT16
-
- // Stage7
- curr1[0] = _mm_add_epi16(stp2_0, stp1_15);
- curr1[1] = _mm_add_epi16(stp2_1, stp1_14);
- curr1[2] = _mm_add_epi16(stp2_2, stp2_13);
- curr1[3] = _mm_add_epi16(stp2_3, stp2_12);
- curr1[4] = _mm_add_epi16(stp2_4, stp2_11);
- curr1[5] = _mm_add_epi16(stp2_5, stp2_10);
- curr1[6] = _mm_add_epi16(stp2_6, stp1_9);
- curr1[7] = _mm_add_epi16(stp2_7, stp1_8);
- curr1[8] = _mm_sub_epi16(stp2_7, stp1_8);
- curr1[9] = _mm_sub_epi16(stp2_6, stp1_9);
- curr1[10] = _mm_sub_epi16(stp2_5, stp2_10);
- curr1[11] = _mm_sub_epi16(stp2_4, stp2_11);
- curr1[12] = _mm_sub_epi16(stp2_3, stp2_12);
- curr1[13] = _mm_sub_epi16(stp2_2, stp2_13);
- curr1[14] = _mm_sub_epi16(stp2_1, stp1_14);
- curr1[15] = _mm_sub_epi16(stp2_0, stp1_15);
-
- curr1 = r;
- input += 128;
- }
- for (i = 0; i < 2; i++) {
- int j;
- // 1-D idct
- array_transpose_8x8(l + i * 8, in);
- array_transpose_8x8(r + i * 8, in + 8);
-
- IDCT16
-
- // 2-D
- in[0] = _mm_add_epi16(stp2_0, stp1_15);
- in[1] = _mm_add_epi16(stp2_1, stp1_14);
- in[2] = _mm_add_epi16(stp2_2, stp2_13);
- in[3] = _mm_add_epi16(stp2_3, stp2_12);
- in[4] = _mm_add_epi16(stp2_4, stp2_11);
- in[5] = _mm_add_epi16(stp2_5, stp2_10);
- in[6] = _mm_add_epi16(stp2_6, stp1_9);
- in[7] = _mm_add_epi16(stp2_7, stp1_8);
- in[8] = _mm_sub_epi16(stp2_7, stp1_8);
- in[9] = _mm_sub_epi16(stp2_6, stp1_9);
- in[10] = _mm_sub_epi16(stp2_5, stp2_10);
- in[11] = _mm_sub_epi16(stp2_4, stp2_11);
- in[12] = _mm_sub_epi16(stp2_3, stp2_12);
- in[13] = _mm_sub_epi16(stp2_2, stp2_13);
- in[14] = _mm_sub_epi16(stp2_1, stp1_14);
- in[15] = _mm_sub_epi16(stp2_0, stp1_15);
-
- for (j = 0; j < 16; ++j) {
- // Final rounding and shift
- in[j] = _mm_adds_epi16(in[j], final_rounding);
- in[j] = _mm_srai_epi16(in[j], 6);
- RECON_AND_STORE(dest + j * stride, in[j]);
- }
-
- dest += 8;
- }
-}
-
-void vpx_idct16x16_1_add_sse2(const tran_low_t *input, uint8_t *dest,
- int stride) {
- __m128i dc_value;
- const __m128i zero = _mm_setzero_si128();
- int a, i;
-
- a = (int)dct_const_round_shift(input[0] * cospi_16_64);
- a = (int)dct_const_round_shift(a * cospi_16_64);
- a = ROUND_POWER_OF_TWO(a, 6);
-
- dc_value = _mm_set1_epi16(a);
-
- for (i = 0; i < 16; ++i) {
- RECON_AND_STORE(dest + 0, dc_value);
- RECON_AND_STORE(dest + 8, dc_value);
- dest += stride;
- }
-}
-
-static void iadst16_8col(__m128i *in) {
- // perform 16x16 1-D ADST for 8 columns
- __m128i s[16], x[16], u[32], v[32];
- const __m128i k__cospi_p01_p31 = pair_set_epi16(cospi_1_64, cospi_31_64);
- const __m128i k__cospi_p31_m01 = pair_set_epi16(cospi_31_64, -cospi_1_64);
- const __m128i k__cospi_p05_p27 = pair_set_epi16(cospi_5_64, cospi_27_64);
- const __m128i k__cospi_p27_m05 = pair_set_epi16(cospi_27_64, -cospi_5_64);
- const __m128i k__cospi_p09_p23 = pair_set_epi16(cospi_9_64, cospi_23_64);
- const __m128i k__cospi_p23_m09 = pair_set_epi16(cospi_23_64, -cospi_9_64);
- const __m128i k__cospi_p13_p19 = pair_set_epi16(cospi_13_64, cospi_19_64);
- const __m128i k__cospi_p19_m13 = pair_set_epi16(cospi_19_64, -cospi_13_64);
- const __m128i k__cospi_p17_p15 = pair_set_epi16(cospi_17_64, cospi_15_64);
- const __m128i k__cospi_p15_m17 = pair_set_epi16(cospi_15_64, -cospi_17_64);
- const __m128i k__cospi_p21_p11 = pair_set_epi16(cospi_21_64, cospi_11_64);
- const __m128i k__cospi_p11_m21 = pair_set_epi16(cospi_11_64, -cospi_21_64);
- const __m128i k__cospi_p25_p07 = pair_set_epi16(cospi_25_64, cospi_7_64);
- const __m128i k__cospi_p07_m25 = pair_set_epi16(cospi_7_64, -cospi_25_64);
- const __m128i k__cospi_p29_p03 = pair_set_epi16(cospi_29_64, cospi_3_64);
- const __m128i k__cospi_p03_m29 = pair_set_epi16(cospi_3_64, -cospi_29_64);
- const __m128i k__cospi_p04_p28 = pair_set_epi16(cospi_4_64, cospi_28_64);
- const __m128i k__cospi_p28_m04 = pair_set_epi16(cospi_28_64, -cospi_4_64);
- const __m128i k__cospi_p20_p12 = pair_set_epi16(cospi_20_64, cospi_12_64);
- const __m128i k__cospi_p12_m20 = pair_set_epi16(cospi_12_64, -cospi_20_64);
- const __m128i k__cospi_m28_p04 = pair_set_epi16(-cospi_28_64, cospi_4_64);
- const __m128i k__cospi_m12_p20 = pair_set_epi16(-cospi_12_64, cospi_20_64);
- const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
- const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
- const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64);
- const __m128i k__cospi_m16_m16 = _mm_set1_epi16((int16_t)-cospi_16_64);
- const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
- const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
- const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64);
- const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
- const __m128i kZero = _mm_set1_epi16(0);
-
- u[0] = _mm_unpacklo_epi16(in[15], in[0]);
- u[1] = _mm_unpackhi_epi16(in[15], in[0]);
- u[2] = _mm_unpacklo_epi16(in[13], in[2]);
- u[3] = _mm_unpackhi_epi16(in[13], in[2]);
- u[4] = _mm_unpacklo_epi16(in[11], in[4]);
- u[5] = _mm_unpackhi_epi16(in[11], in[4]);
- u[6] = _mm_unpacklo_epi16(in[9], in[6]);
- u[7] = _mm_unpackhi_epi16(in[9], in[6]);
- u[8] = _mm_unpacklo_epi16(in[7], in[8]);
- u[9] = _mm_unpackhi_epi16(in[7], in[8]);
- u[10] = _mm_unpacklo_epi16(in[5], in[10]);
- u[11] = _mm_unpackhi_epi16(in[5], in[10]);
- u[12] = _mm_unpacklo_epi16(in[3], in[12]);
- u[13] = _mm_unpackhi_epi16(in[3], in[12]);
- u[14] = _mm_unpacklo_epi16(in[1], in[14]);
- u[15] = _mm_unpackhi_epi16(in[1], in[14]);
-
- v[0] = _mm_madd_epi16(u[0], k__cospi_p01_p31);
- v[1] = _mm_madd_epi16(u[1], k__cospi_p01_p31);
- v[2] = _mm_madd_epi16(u[0], k__cospi_p31_m01);
- v[3] = _mm_madd_epi16(u[1], k__cospi_p31_m01);
- v[4] = _mm_madd_epi16(u[2], k__cospi_p05_p27);
- v[5] = _mm_madd_epi16(u[3], k__cospi_p05_p27);
- v[6] = _mm_madd_epi16(u[2], k__cospi_p27_m05);
- v[7] = _mm_madd_epi16(u[3], k__cospi_p27_m05);
- v[8] = _mm_madd_epi16(u[4], k__cospi_p09_p23);
- v[9] = _mm_madd_epi16(u[5], k__cospi_p09_p23);
- v[10] = _mm_madd_epi16(u[4], k__cospi_p23_m09);
- v[11] = _mm_madd_epi16(u[5], k__cospi_p23_m09);
- v[12] = _mm_madd_epi16(u[6], k__cospi_p13_p19);
- v[13] = _mm_madd_epi16(u[7], k__cospi_p13_p19);
- v[14] = _mm_madd_epi16(u[6], k__cospi_p19_m13);
- v[15] = _mm_madd_epi16(u[7], k__cospi_p19_m13);
- v[16] = _mm_madd_epi16(u[8], k__cospi_p17_p15);
- v[17] = _mm_madd_epi16(u[9], k__cospi_p17_p15);
- v[18] = _mm_madd_epi16(u[8], k__cospi_p15_m17);
- v[19] = _mm_madd_epi16(u[9], k__cospi_p15_m17);
- v[20] = _mm_madd_epi16(u[10], k__cospi_p21_p11);
- v[21] = _mm_madd_epi16(u[11], k__cospi_p21_p11);
- v[22] = _mm_madd_epi16(u[10], k__cospi_p11_m21);
- v[23] = _mm_madd_epi16(u[11], k__cospi_p11_m21);
- v[24] = _mm_madd_epi16(u[12], k__cospi_p25_p07);
- v[25] = _mm_madd_epi16(u[13], k__cospi_p25_p07);
- v[26] = _mm_madd_epi16(u[12], k__cospi_p07_m25);
- v[27] = _mm_madd_epi16(u[13], k__cospi_p07_m25);
- v[28] = _mm_madd_epi16(u[14], k__cospi_p29_p03);
- v[29] = _mm_madd_epi16(u[15], k__cospi_p29_p03);
- v[30] = _mm_madd_epi16(u[14], k__cospi_p03_m29);
- v[31] = _mm_madd_epi16(u[15], k__cospi_p03_m29);
-
- u[0] = _mm_add_epi32(v[0], v[16]);
- u[1] = _mm_add_epi32(v[1], v[17]);
- u[2] = _mm_add_epi32(v[2], v[18]);
- u[3] = _mm_add_epi32(v[3], v[19]);
- u[4] = _mm_add_epi32(v[4], v[20]);
- u[5] = _mm_add_epi32(v[5], v[21]);
- u[6] = _mm_add_epi32(v[6], v[22]);
- u[7] = _mm_add_epi32(v[7], v[23]);
- u[8] = _mm_add_epi32(v[8], v[24]);
- u[9] = _mm_add_epi32(v[9], v[25]);
- u[10] = _mm_add_epi32(v[10], v[26]);
- u[11] = _mm_add_epi32(v[11], v[27]);
- u[12] = _mm_add_epi32(v[12], v[28]);
- u[13] = _mm_add_epi32(v[13], v[29]);
- u[14] = _mm_add_epi32(v[14], v[30]);
- u[15] = _mm_add_epi32(v[15], v[31]);
- u[16] = _mm_sub_epi32(v[0], v[16]);
- u[17] = _mm_sub_epi32(v[1], v[17]);
- u[18] = _mm_sub_epi32(v[2], v[18]);
- u[19] = _mm_sub_epi32(v[3], v[19]);
- u[20] = _mm_sub_epi32(v[4], v[20]);
- u[21] = _mm_sub_epi32(v[5], v[21]);
- u[22] = _mm_sub_epi32(v[6], v[22]);
- u[23] = _mm_sub_epi32(v[7], v[23]);
- u[24] = _mm_sub_epi32(v[8], v[24]);
- u[25] = _mm_sub_epi32(v[9], v[25]);
- u[26] = _mm_sub_epi32(v[10], v[26]);
- u[27] = _mm_sub_epi32(v[11], v[27]);
- u[28] = _mm_sub_epi32(v[12], v[28]);
- u[29] = _mm_sub_epi32(v[13], v[29]);
- u[30] = _mm_sub_epi32(v[14], v[30]);
- u[31] = _mm_sub_epi32(v[15], v[31]);
-
- v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
- v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
- v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
- v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
- v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
- v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
- v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
- v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
- v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
- v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
- v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
- v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
- v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
- v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
- v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
- v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
- v[16] = _mm_add_epi32(u[16], k__DCT_CONST_ROUNDING);
- v[17] = _mm_add_epi32(u[17], k__DCT_CONST_ROUNDING);
- v[18] = _mm_add_epi32(u[18], k__DCT_CONST_ROUNDING);
- v[19] = _mm_add_epi32(u[19], k__DCT_CONST_ROUNDING);
- v[20] = _mm_add_epi32(u[20], k__DCT_CONST_ROUNDING);
- v[21] = _mm_add_epi32(u[21], k__DCT_CONST_ROUNDING);
- v[22] = _mm_add_epi32(u[22], k__DCT_CONST_ROUNDING);
- v[23] = _mm_add_epi32(u[23], k__DCT_CONST_ROUNDING);
- v[24] = _mm_add_epi32(u[24], k__DCT_CONST_ROUNDING);
- v[25] = _mm_add_epi32(u[25], k__DCT_CONST_ROUNDING);
- v[26] = _mm_add_epi32(u[26], k__DCT_CONST_ROUNDING);
- v[27] = _mm_add_epi32(u[27], k__DCT_CONST_ROUNDING);
- v[28] = _mm_add_epi32(u[28], k__DCT_CONST_ROUNDING);
- v[29] = _mm_add_epi32(u[29], k__DCT_CONST_ROUNDING);
- v[30] = _mm_add_epi32(u[30], k__DCT_CONST_ROUNDING);
- v[31] = _mm_add_epi32(u[31], k__DCT_CONST_ROUNDING);
-
- u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
- u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
- u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
- u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
- u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
- u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
- u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
- u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
- u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS);
- u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS);
- u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
- u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
- u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
- u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
- u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
- u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
- u[16] = _mm_srai_epi32(v[16], DCT_CONST_BITS);
- u[17] = _mm_srai_epi32(v[17], DCT_CONST_BITS);
- u[18] = _mm_srai_epi32(v[18], DCT_CONST_BITS);
- u[19] = _mm_srai_epi32(v[19], DCT_CONST_BITS);
- u[20] = _mm_srai_epi32(v[20], DCT_CONST_BITS);
- u[21] = _mm_srai_epi32(v[21], DCT_CONST_BITS);
- u[22] = _mm_srai_epi32(v[22], DCT_CONST_BITS);
- u[23] = _mm_srai_epi32(v[23], DCT_CONST_BITS);
- u[24] = _mm_srai_epi32(v[24], DCT_CONST_BITS);
- u[25] = _mm_srai_epi32(v[25], DCT_CONST_BITS);
- u[26] = _mm_srai_epi32(v[26], DCT_CONST_BITS);
- u[27] = _mm_srai_epi32(v[27], DCT_CONST_BITS);
- u[28] = _mm_srai_epi32(v[28], DCT_CONST_BITS);
- u[29] = _mm_srai_epi32(v[29], DCT_CONST_BITS);
- u[30] = _mm_srai_epi32(v[30], DCT_CONST_BITS);
- u[31] = _mm_srai_epi32(v[31], DCT_CONST_BITS);
-
- s[0] = _mm_packs_epi32(u[0], u[1]);
- s[1] = _mm_packs_epi32(u[2], u[3]);
- s[2] = _mm_packs_epi32(u[4], u[5]);
- s[3] = _mm_packs_epi32(u[6], u[7]);
- s[4] = _mm_packs_epi32(u[8], u[9]);
- s[5] = _mm_packs_epi32(u[10], u[11]);
- s[6] = _mm_packs_epi32(u[12], u[13]);
- s[7] = _mm_packs_epi32(u[14], u[15]);
- s[8] = _mm_packs_epi32(u[16], u[17]);
- s[9] = _mm_packs_epi32(u[18], u[19]);
- s[10] = _mm_packs_epi32(u[20], u[21]);
- s[11] = _mm_packs_epi32(u[22], u[23]);
- s[12] = _mm_packs_epi32(u[24], u[25]);
- s[13] = _mm_packs_epi32(u[26], u[27]);
- s[14] = _mm_packs_epi32(u[28], u[29]);
- s[15] = _mm_packs_epi32(u[30], u[31]);
-
- // stage 2
- u[0] = _mm_unpacklo_epi16(s[8], s[9]);
- u[1] = _mm_unpackhi_epi16(s[8], s[9]);
- u[2] = _mm_unpacklo_epi16(s[10], s[11]);
- u[3] = _mm_unpackhi_epi16(s[10], s[11]);
- u[4] = _mm_unpacklo_epi16(s[12], s[13]);
- u[5] = _mm_unpackhi_epi16(s[12], s[13]);
- u[6] = _mm_unpacklo_epi16(s[14], s[15]);
- u[7] = _mm_unpackhi_epi16(s[14], s[15]);
-
- v[0] = _mm_madd_epi16(u[0], k__cospi_p04_p28);
- v[1] = _mm_madd_epi16(u[1], k__cospi_p04_p28);
- v[2] = _mm_madd_epi16(u[0], k__cospi_p28_m04);
- v[3] = _mm_madd_epi16(u[1], k__cospi_p28_m04);
- v[4] = _mm_madd_epi16(u[2], k__cospi_p20_p12);
- v[5] = _mm_madd_epi16(u[3], k__cospi_p20_p12);
- v[6] = _mm_madd_epi16(u[2], k__cospi_p12_m20);
- v[7] = _mm_madd_epi16(u[3], k__cospi_p12_m20);
- v[8] = _mm_madd_epi16(u[4], k__cospi_m28_p04);
- v[9] = _mm_madd_epi16(u[5], k__cospi_m28_p04);
- v[10] = _mm_madd_epi16(u[4], k__cospi_p04_p28);
- v[11] = _mm_madd_epi16(u[5], k__cospi_p04_p28);
- v[12] = _mm_madd_epi16(u[6], k__cospi_m12_p20);
- v[13] = _mm_madd_epi16(u[7], k__cospi_m12_p20);
- v[14] = _mm_madd_epi16(u[6], k__cospi_p20_p12);
- v[15] = _mm_madd_epi16(u[7], k__cospi_p20_p12);
-
- u[0] = _mm_add_epi32(v[0], v[8]);
- u[1] = _mm_add_epi32(v[1], v[9]);
- u[2] = _mm_add_epi32(v[2], v[10]);
- u[3] = _mm_add_epi32(v[3], v[11]);
- u[4] = _mm_add_epi32(v[4], v[12]);
- u[5] = _mm_add_epi32(v[5], v[13]);
- u[6] = _mm_add_epi32(v[6], v[14]);
- u[7] = _mm_add_epi32(v[7], v[15]);
- u[8] = _mm_sub_epi32(v[0], v[8]);
- u[9] = _mm_sub_epi32(v[1], v[9]);
- u[10] = _mm_sub_epi32(v[2], v[10]);
- u[11] = _mm_sub_epi32(v[3], v[11]);
- u[12] = _mm_sub_epi32(v[4], v[12]);
- u[13] = _mm_sub_epi32(v[5], v[13]);
- u[14] = _mm_sub_epi32(v[6], v[14]);
- u[15] = _mm_sub_epi32(v[7], v[15]);
-
- v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
- v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
- v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
- v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
- v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
- v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
- v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
- v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
- v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
- v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
- v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
- v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
- v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
- v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
- v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
- v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
-
- u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
- u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
- u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
- u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
- u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
- u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
- u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
- u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
- u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS);
- u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS);
- u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
- u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
- u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
- u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
- u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
- u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
-
- x[0] = _mm_add_epi16(s[0], s[4]);
- x[1] = _mm_add_epi16(s[1], s[5]);
- x[2] = _mm_add_epi16(s[2], s[6]);
- x[3] = _mm_add_epi16(s[3], s[7]);
- x[4] = _mm_sub_epi16(s[0], s[4]);
- x[5] = _mm_sub_epi16(s[1], s[5]);
- x[6] = _mm_sub_epi16(s[2], s[6]);
- x[7] = _mm_sub_epi16(s[3], s[7]);
- x[8] = _mm_packs_epi32(u[0], u[1]);
- x[9] = _mm_packs_epi32(u[2], u[3]);
- x[10] = _mm_packs_epi32(u[4], u[5]);
- x[11] = _mm_packs_epi32(u[6], u[7]);
- x[12] = _mm_packs_epi32(u[8], u[9]);
- x[13] = _mm_packs_epi32(u[10], u[11]);
- x[14] = _mm_packs_epi32(u[12], u[13]);
- x[15] = _mm_packs_epi32(u[14], u[15]);
-
- // stage 3
- u[0] = _mm_unpacklo_epi16(x[4], x[5]);
- u[1] = _mm_unpackhi_epi16(x[4], x[5]);
- u[2] = _mm_unpacklo_epi16(x[6], x[7]);
- u[3] = _mm_unpackhi_epi16(x[6], x[7]);
- u[4] = _mm_unpacklo_epi16(x[12], x[13]);
- u[5] = _mm_unpackhi_epi16(x[12], x[13]);
- u[6] = _mm_unpacklo_epi16(x[14], x[15]);
- u[7] = _mm_unpackhi_epi16(x[14], x[15]);
-
- v[0] = _mm_madd_epi16(u[0], k__cospi_p08_p24);
- v[1] = _mm_madd_epi16(u[1], k__cospi_p08_p24);
- v[2] = _mm_madd_epi16(u[0], k__cospi_p24_m08);
- v[3] = _mm_madd_epi16(u[1], k__cospi_p24_m08);
- v[4] = _mm_madd_epi16(u[2], k__cospi_m24_p08);
- v[5] = _mm_madd_epi16(u[3], k__cospi_m24_p08);
- v[6] = _mm_madd_epi16(u[2], k__cospi_p08_p24);
- v[7] = _mm_madd_epi16(u[3], k__cospi_p08_p24);
- v[8] = _mm_madd_epi16(u[4], k__cospi_p08_p24);
- v[9] = _mm_madd_epi16(u[5], k__cospi_p08_p24);
- v[10] = _mm_madd_epi16(u[4], k__cospi_p24_m08);
- v[11] = _mm_madd_epi16(u[5], k__cospi_p24_m08);
- v[12] = _mm_madd_epi16(u[6], k__cospi_m24_p08);
- v[13] = _mm_madd_epi16(u[7], k__cospi_m24_p08);
- v[14] = _mm_madd_epi16(u[6], k__cospi_p08_p24);
- v[15] = _mm_madd_epi16(u[7], k__cospi_p08_p24);
-
- u[0] = _mm_add_epi32(v[0], v[4]);
- u[1] = _mm_add_epi32(v[1], v[5]);
- u[2] = _mm_add_epi32(v[2], v[6]);
- u[3] = _mm_add_epi32(v[3], v[7]);
- u[4] = _mm_sub_epi32(v[0], v[4]);
- u[5] = _mm_sub_epi32(v[1], v[5]);
- u[6] = _mm_sub_epi32(v[2], v[6]);
- u[7] = _mm_sub_epi32(v[3], v[7]);
- u[8] = _mm_add_epi32(v[8], v[12]);
- u[9] = _mm_add_epi32(v[9], v[13]);
- u[10] = _mm_add_epi32(v[10], v[14]);
- u[11] = _mm_add_epi32(v[11], v[15]);
- u[12] = _mm_sub_epi32(v[8], v[12]);
- u[13] = _mm_sub_epi32(v[9], v[13]);
- u[14] = _mm_sub_epi32(v[10], v[14]);
- u[15] = _mm_sub_epi32(v[11], v[15]);
-
- u[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
- u[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
- u[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
- u[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
- u[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
- u[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
- u[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
- u[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
- u[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
- u[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
- u[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
- u[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
- u[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
- u[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
- u[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
- u[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
-
- v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
- v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
- v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
- v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
- v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
- v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
- v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
- v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
- v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
- v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
- v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
- v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
- v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
- v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
- v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
- v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);
-
- s[0] = _mm_add_epi16(x[0], x[2]);
- s[1] = _mm_add_epi16(x[1], x[3]);
- s[2] = _mm_sub_epi16(x[0], x[2]);
- s[3] = _mm_sub_epi16(x[1], x[3]);
- s[4] = _mm_packs_epi32(v[0], v[1]);
- s[5] = _mm_packs_epi32(v[2], v[3]);
- s[6] = _mm_packs_epi32(v[4], v[5]);
- s[7] = _mm_packs_epi32(v[6], v[7]);
- s[8] = _mm_add_epi16(x[8], x[10]);
- s[9] = _mm_add_epi16(x[9], x[11]);
- s[10] = _mm_sub_epi16(x[8], x[10]);
- s[11] = _mm_sub_epi16(x[9], x[11]);
- s[12] = _mm_packs_epi32(v[8], v[9]);
- s[13] = _mm_packs_epi32(v[10], v[11]);
- s[14] = _mm_packs_epi32(v[12], v[13]);
- s[15] = _mm_packs_epi32(v[14], v[15]);
-
- // stage 4
- u[0] = _mm_unpacklo_epi16(s[2], s[3]);
- u[1] = _mm_unpackhi_epi16(s[2], s[3]);
- u[2] = _mm_unpacklo_epi16(s[6], s[7]);
- u[3] = _mm_unpackhi_epi16(s[6], s[7]);
- u[4] = _mm_unpacklo_epi16(s[10], s[11]);
- u[5] = _mm_unpackhi_epi16(s[10], s[11]);
- u[6] = _mm_unpacklo_epi16(s[14], s[15]);
- u[7] = _mm_unpackhi_epi16(s[14], s[15]);
-
- v[0] = _mm_madd_epi16(u[0], k__cospi_m16_m16);
- v[1] = _mm_madd_epi16(u[1], k__cospi_m16_m16);
- v[2] = _mm_madd_epi16(u[0], k__cospi_p16_m16);
- v[3] = _mm_madd_epi16(u[1], k__cospi_p16_m16);
- v[4] = _mm_madd_epi16(u[2], k__cospi_p16_p16);
- v[5] = _mm_madd_epi16(u[3], k__cospi_p16_p16);
- v[6] = _mm_madd_epi16(u[2], k__cospi_m16_p16);
- v[7] = _mm_madd_epi16(u[3], k__cospi_m16_p16);
- v[8] = _mm_madd_epi16(u[4], k__cospi_p16_p16);
- v[9] = _mm_madd_epi16(u[5], k__cospi_p16_p16);
- v[10] = _mm_madd_epi16(u[4], k__cospi_m16_p16);
- v[11] = _mm_madd_epi16(u[5], k__cospi_m16_p16);
- v[12] = _mm_madd_epi16(u[6], k__cospi_m16_m16);
- v[13] = _mm_madd_epi16(u[7], k__cospi_m16_m16);
- v[14] = _mm_madd_epi16(u[6], k__cospi_p16_m16);
- v[15] = _mm_madd_epi16(u[7], k__cospi_p16_m16);
-
- u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
- u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
- u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
- u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
- u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
- u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
- u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
- u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
- u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING);
- u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING);
- u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING);
- u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING);
- u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING);
- u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING);
- u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING);
- u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING);
-
- v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
- v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
- v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
- v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
- v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
- v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
- v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
- v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
- v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
- v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
- v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
- v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
- v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
- v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
- v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
- v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);
-
- in[0] = s[0];
- in[1] = _mm_sub_epi16(kZero, s[8]);
- in[2] = s[12];
- in[3] = _mm_sub_epi16(kZero, s[4]);
- in[4] = _mm_packs_epi32(v[4], v[5]);
- in[5] = _mm_packs_epi32(v[12], v[13]);
- in[6] = _mm_packs_epi32(v[8], v[9]);
- in[7] = _mm_packs_epi32(v[0], v[1]);
- in[8] = _mm_packs_epi32(v[2], v[3]);
- in[9] = _mm_packs_epi32(v[10], v[11]);
- in[10] = _mm_packs_epi32(v[14], v[15]);
- in[11] = _mm_packs_epi32(v[6], v[7]);
- in[12] = s[5];
- in[13] = _mm_sub_epi16(kZero, s[13]);
- in[14] = s[9];
- in[15] = _mm_sub_epi16(kZero, s[1]);
-}
-
-static void idct16_8col(__m128i *in) {
- const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64);
- const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64);
- const __m128i k__cospi_p14_m18 = pair_set_epi16(cospi_14_64, -cospi_18_64);
- const __m128i k__cospi_p18_p14 = pair_set_epi16(cospi_18_64, cospi_14_64);
- const __m128i k__cospi_p22_m10 = pair_set_epi16(cospi_22_64, -cospi_10_64);
- const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64);
- const __m128i k__cospi_p06_m26 = pair_set_epi16(cospi_6_64, -cospi_26_64);
- const __m128i k__cospi_p26_p06 = pair_set_epi16(cospi_26_64, cospi_6_64);
- const __m128i k__cospi_p28_m04 = pair_set_epi16(cospi_28_64, -cospi_4_64);
- const __m128i k__cospi_p04_p28 = pair_set_epi16(cospi_4_64, cospi_28_64);
- const __m128i k__cospi_p12_m20 = pair_set_epi16(cospi_12_64, -cospi_20_64);
- const __m128i k__cospi_p20_p12 = pair_set_epi16(cospi_20_64, cospi_12_64);
- const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
- const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
- const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
- const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
- const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
- const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
- const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
- const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64);
- const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
- __m128i v[16], u[16], s[16], t[16];
-
- // stage 1
- s[0] = in[0];
- s[1] = in[8];
- s[2] = in[4];
- s[3] = in[12];
- s[4] = in[2];
- s[5] = in[10];
- s[6] = in[6];
- s[7] = in[14];
- s[8] = in[1];
- s[9] = in[9];
- s[10] = in[5];
- s[11] = in[13];
- s[12] = in[3];
- s[13] = in[11];
- s[14] = in[7];
- s[15] = in[15];
-
- // stage 2
- u[0] = _mm_unpacklo_epi16(s[8], s[15]);
- u[1] = _mm_unpackhi_epi16(s[8], s[15]);
- u[2] = _mm_unpacklo_epi16(s[9], s[14]);
- u[3] = _mm_unpackhi_epi16(s[9], s[14]);
- u[4] = _mm_unpacklo_epi16(s[10], s[13]);
- u[5] = _mm_unpackhi_epi16(s[10], s[13]);
- u[6] = _mm_unpacklo_epi16(s[11], s[12]);
- u[7] = _mm_unpackhi_epi16(s[11], s[12]);
-
- v[0] = _mm_madd_epi16(u[0], k__cospi_p30_m02);
- v[1] = _mm_madd_epi16(u[1], k__cospi_p30_m02);
- v[2] = _mm_madd_epi16(u[0], k__cospi_p02_p30);
- v[3] = _mm_madd_epi16(u[1], k__cospi_p02_p30);
- v[4] = _mm_madd_epi16(u[2], k__cospi_p14_m18);
- v[5] = _mm_madd_epi16(u[3], k__cospi_p14_m18);
- v[6] = _mm_madd_epi16(u[2], k__cospi_p18_p14);
- v[7] = _mm_madd_epi16(u[3], k__cospi_p18_p14);
- v[8] = _mm_madd_epi16(u[4], k__cospi_p22_m10);
- v[9] = _mm_madd_epi16(u[5], k__cospi_p22_m10);
- v[10] = _mm_madd_epi16(u[4], k__cospi_p10_p22);
- v[11] = _mm_madd_epi16(u[5], k__cospi_p10_p22);
- v[12] = _mm_madd_epi16(u[6], k__cospi_p06_m26);
- v[13] = _mm_madd_epi16(u[7], k__cospi_p06_m26);
- v[14] = _mm_madd_epi16(u[6], k__cospi_p26_p06);
- v[15] = _mm_madd_epi16(u[7], k__cospi_p26_p06);
-
- u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
- u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
- u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
- u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
- u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
- u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
- u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
- u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
- u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING);
- u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING);
- u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING);
- u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING);
- u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING);
- u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING);
- u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING);
- u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING);
-
- u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
- u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
- u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
- u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
- u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
- u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
- u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
- u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
- u[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
- u[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
- u[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
- u[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
- u[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
- u[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
- u[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
- u[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);
-
- s[8] = _mm_packs_epi32(u[0], u[1]);
- s[15] = _mm_packs_epi32(u[2], u[3]);
- s[9] = _mm_packs_epi32(u[4], u[5]);
- s[14] = _mm_packs_epi32(u[6], u[7]);
- s[10] = _mm_packs_epi32(u[8], u[9]);
- s[13] = _mm_packs_epi32(u[10], u[11]);
- s[11] = _mm_packs_epi32(u[12], u[13]);
- s[12] = _mm_packs_epi32(u[14], u[15]);
-
- // stage 3
- t[0] = s[0];
- t[1] = s[1];
- t[2] = s[2];
- t[3] = s[3];
- u[0] = _mm_unpacklo_epi16(s[4], s[7]);
- u[1] = _mm_unpackhi_epi16(s[4], s[7]);
- u[2] = _mm_unpacklo_epi16(s[5], s[6]);
- u[3] = _mm_unpackhi_epi16(s[5], s[6]);
-
- v[0] = _mm_madd_epi16(u[0], k__cospi_p28_m04);
- v[1] = _mm_madd_epi16(u[1], k__cospi_p28_m04);
- v[2] = _mm_madd_epi16(u[0], k__cospi_p04_p28);
- v[3] = _mm_madd_epi16(u[1], k__cospi_p04_p28);
- v[4] = _mm_madd_epi16(u[2], k__cospi_p12_m20);
- v[5] = _mm_madd_epi16(u[3], k__cospi_p12_m20);
- v[6] = _mm_madd_epi16(u[2], k__cospi_p20_p12);
- v[7] = _mm_madd_epi16(u[3], k__cospi_p20_p12);
-
- u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
- u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
- u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
- u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
- u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
- u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
- u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
- u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
-
- u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
- u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
- u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
- u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
- u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
- u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
- u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
- u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
-
- t[4] = _mm_packs_epi32(u[0], u[1]);
- t[7] = _mm_packs_epi32(u[2], u[3]);
- t[5] = _mm_packs_epi32(u[4], u[5]);
- t[6] = _mm_packs_epi32(u[6], u[7]);
- t[8] = _mm_add_epi16(s[8], s[9]);
- t[9] = _mm_sub_epi16(s[8], s[9]);
- t[10] = _mm_sub_epi16(s[11], s[10]);
- t[11] = _mm_add_epi16(s[10], s[11]);
- t[12] = _mm_add_epi16(s[12], s[13]);
- t[13] = _mm_sub_epi16(s[12], s[13]);
- t[14] = _mm_sub_epi16(s[15], s[14]);
- t[15] = _mm_add_epi16(s[14], s[15]);
-
- // stage 4
- u[0] = _mm_unpacklo_epi16(t[0], t[1]);
- u[1] = _mm_unpackhi_epi16(t[0], t[1]);
- u[2] = _mm_unpacklo_epi16(t[2], t[3]);
- u[3] = _mm_unpackhi_epi16(t[2], t[3]);
- u[4] = _mm_unpacklo_epi16(t[9], t[14]);
- u[5] = _mm_unpackhi_epi16(t[9], t[14]);
- u[6] = _mm_unpacklo_epi16(t[10], t[13]);
- u[7] = _mm_unpackhi_epi16(t[10], t[13]);
-
- v[0] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
- v[1] = _mm_madd_epi16(u[1], k__cospi_p16_p16);
- v[2] = _mm_madd_epi16(u[0], k__cospi_p16_m16);
- v[3] = _mm_madd_epi16(u[1], k__cospi_p16_m16);
- v[4] = _mm_madd_epi16(u[2], k__cospi_p24_m08);
- v[5] = _mm_madd_epi16(u[3], k__cospi_p24_m08);
- v[6] = _mm_madd_epi16(u[2], k__cospi_p08_p24);
- v[7] = _mm_madd_epi16(u[3], k__cospi_p08_p24);
- v[8] = _mm_madd_epi16(u[4], k__cospi_m08_p24);
- v[9] = _mm_madd_epi16(u[5], k__cospi_m08_p24);
- v[10] = _mm_madd_epi16(u[4], k__cospi_p24_p08);
- v[11] = _mm_madd_epi16(u[5], k__cospi_p24_p08);
- v[12] = _mm_madd_epi16(u[6], k__cospi_m24_m08);
- v[13] = _mm_madd_epi16(u[7], k__cospi_m24_m08);
- v[14] = _mm_madd_epi16(u[6], k__cospi_m08_p24);
- v[15] = _mm_madd_epi16(u[7], k__cospi_m08_p24);
-
- u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
- u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
- u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
- u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
- u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
- u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
- u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
- u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
- u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING);
- u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING);
- u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING);
- u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING);
- u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING);
- u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING);
- u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING);
- u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING);
-
- u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
- u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
- u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
- u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
- u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
- u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
- u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
- u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
- u[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
- u[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
- u[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
- u[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
- u[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
- u[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
- u[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
- u[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);
-
- s[0] = _mm_packs_epi32(u[0], u[1]);
- s[1] = _mm_packs_epi32(u[2], u[3]);
- s[2] = _mm_packs_epi32(u[4], u[5]);
- s[3] = _mm_packs_epi32(u[6], u[7]);
- s[4] = _mm_add_epi16(t[4], t[5]);
- s[5] = _mm_sub_epi16(t[4], t[5]);
- s[6] = _mm_sub_epi16(t[7], t[6]);
- s[7] = _mm_add_epi16(t[6], t[7]);
- s[8] = t[8];
- s[15] = t[15];
- s[9] = _mm_packs_epi32(u[8], u[9]);
- s[14] = _mm_packs_epi32(u[10], u[11]);
- s[10] = _mm_packs_epi32(u[12], u[13]);
- s[13] = _mm_packs_epi32(u[14], u[15]);
- s[11] = t[11];
- s[12] = t[12];
-
- // stage 5
- t[0] = _mm_add_epi16(s[0], s[3]);
- t[1] = _mm_add_epi16(s[1], s[2]);
- t[2] = _mm_sub_epi16(s[1], s[2]);
- t[3] = _mm_sub_epi16(s[0], s[3]);
- t[4] = s[4];
- t[7] = s[7];
-
- u[0] = _mm_unpacklo_epi16(s[5], s[6]);
- u[1] = _mm_unpackhi_epi16(s[5], s[6]);
- v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16);
- v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16);
- v[2] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
- v[3] = _mm_madd_epi16(u[1], k__cospi_p16_p16);
- u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
- u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
- u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
- u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
- u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
- u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
- u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
- u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
- t[5] = _mm_packs_epi32(u[0], u[1]);
- t[6] = _mm_packs_epi32(u[2], u[3]);
-
- t[8] = _mm_add_epi16(s[8], s[11]);
- t[9] = _mm_add_epi16(s[9], s[10]);
- t[10] = _mm_sub_epi16(s[9], s[10]);
- t[11] = _mm_sub_epi16(s[8], s[11]);
- t[12] = _mm_sub_epi16(s[15], s[12]);
- t[13] = _mm_sub_epi16(s[14], s[13]);
- t[14] = _mm_add_epi16(s[13], s[14]);
- t[15] = _mm_add_epi16(s[12], s[15]);
-
- // stage 6
- s[0] = _mm_add_epi16(t[0], t[7]);
- s[1] = _mm_add_epi16(t[1], t[6]);
- s[2] = _mm_add_epi16(t[2], t[5]);
- s[3] = _mm_add_epi16(t[3], t[4]);
- s[4] = _mm_sub_epi16(t[3], t[4]);
- s[5] = _mm_sub_epi16(t[2], t[5]);
- s[6] = _mm_sub_epi16(t[1], t[6]);
- s[7] = _mm_sub_epi16(t[0], t[7]);
- s[8] = t[8];
- s[9] = t[9];
-
- u[0] = _mm_unpacklo_epi16(t[10], t[13]);
- u[1] = _mm_unpackhi_epi16(t[10], t[13]);
- u[2] = _mm_unpacklo_epi16(t[11], t[12]);
- u[3] = _mm_unpackhi_epi16(t[11], t[12]);
-
- v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16);
- v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16);
- v[2] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
- v[3] = _mm_madd_epi16(u[1], k__cospi_p16_p16);
- v[4] = _mm_madd_epi16(u[2], k__cospi_m16_p16);
- v[5] = _mm_madd_epi16(u[3], k__cospi_m16_p16);
- v[6] = _mm_madd_epi16(u[2], k__cospi_p16_p16);
- v[7] = _mm_madd_epi16(u[3], k__cospi_p16_p16);
-
- u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
- u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
- u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
- u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
- u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
- u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
- u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
- u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
-
- u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
- u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
- u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
- u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
- u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
- u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
- u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
- u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
-
- s[10] = _mm_packs_epi32(u[0], u[1]);
- s[13] = _mm_packs_epi32(u[2], u[3]);
- s[11] = _mm_packs_epi32(u[4], u[5]);
- s[12] = _mm_packs_epi32(u[6], u[7]);
- s[14] = t[14];
- s[15] = t[15];
-
- // stage 7
- in[0] = _mm_add_epi16(s[0], s[15]);
- in[1] = _mm_add_epi16(s[1], s[14]);
- in[2] = _mm_add_epi16(s[2], s[13]);
- in[3] = _mm_add_epi16(s[3], s[12]);
- in[4] = _mm_add_epi16(s[4], s[11]);
- in[5] = _mm_add_epi16(s[5], s[10]);
- in[6] = _mm_add_epi16(s[6], s[9]);
- in[7] = _mm_add_epi16(s[7], s[8]);
- in[8] = _mm_sub_epi16(s[7], s[8]);
- in[9] = _mm_sub_epi16(s[6], s[9]);
- in[10] = _mm_sub_epi16(s[5], s[10]);
- in[11] = _mm_sub_epi16(s[4], s[11]);
- in[12] = _mm_sub_epi16(s[3], s[12]);
- in[13] = _mm_sub_epi16(s[2], s[13]);
- in[14] = _mm_sub_epi16(s[1], s[14]);
- in[15] = _mm_sub_epi16(s[0], s[15]);
-}
-
-void idct16_sse2(__m128i *in0, __m128i *in1) {
- array_transpose_16x16(in0, in1);
- idct16_8col(in0);
- idct16_8col(in1);
-}
-
-void iadst16_sse2(__m128i *in0, __m128i *in1) {
- array_transpose_16x16(in0, in1);
- iadst16_8col(in0);
- iadst16_8col(in1);
-}
-
-void vpx_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest,
- int stride) {
- const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
- const __m128i final_rounding = _mm_set1_epi16(1 << 5);
- const __m128i zero = _mm_setzero_si128();
-
- const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
- const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);
- const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);
- const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64);
-
- const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
- const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
-
- const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
- const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
- const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
- const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
- const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
- const __m128i stg4_7 = pair_set_epi16(-cospi_8_64, cospi_24_64);
-
- const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
- __m128i in[16], l[16];
- __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6,
- stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,
- stp1_8_0, stp1_12_0;
- __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
- stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14;
- __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
- int i;
- // First 1-D inverse DCT
- // Load input data.
- in[0] = load_input_data(input);
- in[1] = load_input_data(input + 8 * 2);
- in[2] = load_input_data(input + 8 * 4);
- in[3] = load_input_data(input + 8 * 6);
-
- TRANSPOSE_8X4(in[0], in[1], in[2], in[3], in[0], in[1]);
-
- // Stage2
- {
- const __m128i lo_1_15 = _mm_unpackhi_epi16(in[0], zero);
- const __m128i lo_13_3 = _mm_unpackhi_epi16(zero, in[1]);
-
- tmp0 = _mm_madd_epi16(lo_1_15, stg2_0);
- tmp2 = _mm_madd_epi16(lo_1_15, stg2_1);
- tmp5 = _mm_madd_epi16(lo_13_3, stg2_6);
- tmp7 = _mm_madd_epi16(lo_13_3, stg2_7);
-
- tmp0 = _mm_add_epi32(tmp0, rounding);
- tmp2 = _mm_add_epi32(tmp2, rounding);
- tmp5 = _mm_add_epi32(tmp5, rounding);
- tmp7 = _mm_add_epi32(tmp7, rounding);
-
- tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
- tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
- tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS);
- tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS);
-
- stp2_8 = _mm_packs_epi32(tmp0, tmp2);
- stp2_11 = _mm_packs_epi32(tmp5, tmp7);
- }
-
- // Stage3
- {
- const __m128i lo_2_14 = _mm_unpacklo_epi16(in[1], zero);
-
- tmp0 = _mm_madd_epi16(lo_2_14, stg3_0);
- tmp2 = _mm_madd_epi16(lo_2_14, stg3_1);
-
- tmp0 = _mm_add_epi32(tmp0, rounding);
- tmp2 = _mm_add_epi32(tmp2, rounding);
- tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
- tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
-
- stp1_13 = _mm_unpackhi_epi64(stp2_11, zero);
- stp1_14 = _mm_unpackhi_epi64(stp2_8, zero);
-
- stp1_4 = _mm_packs_epi32(tmp0, tmp2);
- }
-
- // Stage4
- {
- const __m128i lo_0_8 = _mm_unpacklo_epi16(in[0], zero);
- const __m128i lo_9_14 = _mm_unpacklo_epi16(stp2_8, stp1_14);
- const __m128i lo_10_13 = _mm_unpacklo_epi16(stp2_11, stp1_13);
-
- tmp0 = _mm_madd_epi16(lo_0_8, stg4_0);
- tmp2 = _mm_madd_epi16(lo_0_8, stg4_1);
- tmp1 = _mm_madd_epi16(lo_9_14, stg4_4);
- tmp3 = _mm_madd_epi16(lo_9_14, stg4_5);
- tmp5 = _mm_madd_epi16(lo_10_13, stg4_6);
- tmp7 = _mm_madd_epi16(lo_10_13, stg4_7);
-
- tmp0 = _mm_add_epi32(tmp0, rounding);
- tmp2 = _mm_add_epi32(tmp2, rounding);
- tmp1 = _mm_add_epi32(tmp1, rounding);
- tmp3 = _mm_add_epi32(tmp3, rounding);
- tmp5 = _mm_add_epi32(tmp5, rounding);
- tmp7 = _mm_add_epi32(tmp7, rounding);
-
- tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
- tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
- tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS);
- tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);
- tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS);
- tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS);
-
- stp1_0 = _mm_packs_epi32(tmp0, tmp0);
- stp1_1 = _mm_packs_epi32(tmp2, tmp2);
- stp2_9 = _mm_packs_epi32(tmp1, tmp3);
- stp2_10 = _mm_packs_epi32(tmp5, tmp7);
-
- stp2_6 = _mm_unpackhi_epi64(stp1_4, zero);
- }
-
- // Stage5 and Stage6
- {
- tmp0 = _mm_add_epi16(stp2_8, stp2_11);
- tmp1 = _mm_sub_epi16(stp2_8, stp2_11);
- tmp2 = _mm_add_epi16(stp2_9, stp2_10);
- tmp3 = _mm_sub_epi16(stp2_9, stp2_10);
-
- stp1_9 = _mm_unpacklo_epi64(tmp2, zero);
- stp1_10 = _mm_unpacklo_epi64(tmp3, zero);
- stp1_8 = _mm_unpacklo_epi64(tmp0, zero);
- stp1_11 = _mm_unpacklo_epi64(tmp1, zero);
-
- stp1_13 = _mm_unpackhi_epi64(tmp3, zero);
- stp1_14 = _mm_unpackhi_epi64(tmp2, zero);
- stp1_12 = _mm_unpackhi_epi64(tmp1, zero);
- stp1_15 = _mm_unpackhi_epi64(tmp0, zero);
- }
-
- // Stage6
- {
- const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp1_4);
- const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13);
- const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12);
-
- tmp1 = _mm_madd_epi16(lo_6_5, stg4_1);
- tmp3 = _mm_madd_epi16(lo_6_5, stg4_0);
- tmp0 = _mm_madd_epi16(lo_10_13, stg6_0);
- tmp2 = _mm_madd_epi16(lo_10_13, stg4_0);
- tmp4 = _mm_madd_epi16(lo_11_12, stg6_0);
- tmp6 = _mm_madd_epi16(lo_11_12, stg4_0);
-
- tmp1 = _mm_add_epi32(tmp1, rounding);
- tmp3 = _mm_add_epi32(tmp3, rounding);
- tmp0 = _mm_add_epi32(tmp0, rounding);
- tmp2 = _mm_add_epi32(tmp2, rounding);
- tmp4 = _mm_add_epi32(tmp4, rounding);
- tmp6 = _mm_add_epi32(tmp6, rounding);
-
- tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS);
- tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);
- tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
- tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
- tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);
- tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS);
-
- stp1_6 = _mm_packs_epi32(tmp3, tmp1);
-
- stp2_10 = _mm_packs_epi32(tmp0, zero);
- stp2_13 = _mm_packs_epi32(tmp2, zero);
- stp2_11 = _mm_packs_epi32(tmp4, zero);
- stp2_12 = _mm_packs_epi32(tmp6, zero);
-
- tmp0 = _mm_add_epi16(stp1_0, stp1_4);
- tmp1 = _mm_sub_epi16(stp1_0, stp1_4);
- tmp2 = _mm_add_epi16(stp1_1, stp1_6);
- tmp3 = _mm_sub_epi16(stp1_1, stp1_6);
-
- stp2_0 = _mm_unpackhi_epi64(tmp0, zero);
- stp2_1 = _mm_unpacklo_epi64(tmp2, zero);
- stp2_2 = _mm_unpackhi_epi64(tmp2, zero);
- stp2_3 = _mm_unpacklo_epi64(tmp0, zero);
- stp2_4 = _mm_unpacklo_epi64(tmp1, zero);
- stp2_5 = _mm_unpackhi_epi64(tmp3, zero);
- stp2_6 = _mm_unpacklo_epi64(tmp3, zero);
- stp2_7 = _mm_unpackhi_epi64(tmp1, zero);
- }
-
- // Stage7. Left 8x16 only.
- l[0] = _mm_add_epi16(stp2_0, stp1_15);
- l[1] = _mm_add_epi16(stp2_1, stp1_14);
- l[2] = _mm_add_epi16(stp2_2, stp2_13);
- l[3] = _mm_add_epi16(stp2_3, stp2_12);
- l[4] = _mm_add_epi16(stp2_4, stp2_11);
- l[5] = _mm_add_epi16(stp2_5, stp2_10);
- l[6] = _mm_add_epi16(stp2_6, stp1_9);
- l[7] = _mm_add_epi16(stp2_7, stp1_8);
- l[8] = _mm_sub_epi16(stp2_7, stp1_8);
- l[9] = _mm_sub_epi16(stp2_6, stp1_9);
- l[10] = _mm_sub_epi16(stp2_5, stp2_10);
- l[11] = _mm_sub_epi16(stp2_4, stp2_11);
- l[12] = _mm_sub_epi16(stp2_3, stp2_12);
- l[13] = _mm_sub_epi16(stp2_2, stp2_13);
- l[14] = _mm_sub_epi16(stp2_1, stp1_14);
- l[15] = _mm_sub_epi16(stp2_0, stp1_15);
-
- // Second 1-D inverse transform, performed per 8x16 block
- for (i = 0; i < 2; i++) {
- int j;
- array_transpose_4X8(l + 8 * i, in);
-
- IDCT16_10
-
- // Stage7
- in[0] = _mm_add_epi16(stp2_0, stp1_15);
- in[1] = _mm_add_epi16(stp2_1, stp1_14);
- in[2] = _mm_add_epi16(stp2_2, stp2_13);
- in[3] = _mm_add_epi16(stp2_3, stp2_12);
- in[4] = _mm_add_epi16(stp2_4, stp2_11);
- in[5] = _mm_add_epi16(stp2_5, stp2_10);
- in[6] = _mm_add_epi16(stp2_6, stp1_9);
- in[7] = _mm_add_epi16(stp2_7, stp1_8);
- in[8] = _mm_sub_epi16(stp2_7, stp1_8);
- in[9] = _mm_sub_epi16(stp2_6, stp1_9);
- in[10] = _mm_sub_epi16(stp2_5, stp2_10);
- in[11] = _mm_sub_epi16(stp2_4, stp2_11);
- in[12] = _mm_sub_epi16(stp2_3, stp2_12);
- in[13] = _mm_sub_epi16(stp2_2, stp2_13);
- in[14] = _mm_sub_epi16(stp2_1, stp1_14);
- in[15] = _mm_sub_epi16(stp2_0, stp1_15);
-
- for (j = 0; j < 16; ++j) {
- // Final rounding and shift
- in[j] = _mm_adds_epi16(in[j], final_rounding);
- in[j] = _mm_srai_epi16(in[j], 6);
- RECON_AND_STORE(dest + j * stride, in[j]);
- }
-
- dest += 8;
- }
-}
-
-#define LOAD_DQCOEFF(reg, input) \
- { \
- reg = load_input_data(input); \
- input += 8; \
- } \
-
-#define IDCT32_34 \
-/* Stage1 */ \
-{ \
- const __m128i zero = _mm_setzero_si128();\
- const __m128i lo_1_31 = _mm_unpacklo_epi16(in[1], zero); \
- const __m128i hi_1_31 = _mm_unpackhi_epi16(in[1], zero); \
- \
- const __m128i lo_25_7= _mm_unpacklo_epi16(zero, in[7]); \
- const __m128i hi_25_7 = _mm_unpackhi_epi16(zero, in[7]); \
- \
- const __m128i lo_5_27 = _mm_unpacklo_epi16(in[5], zero); \
- const __m128i hi_5_27 = _mm_unpackhi_epi16(in[5], zero); \
- \
- const __m128i lo_29_3 = _mm_unpacklo_epi16(zero, in[3]); \
- const __m128i hi_29_3 = _mm_unpackhi_epi16(zero, in[3]); \
- \
- MULTIPLICATION_AND_ADD_2(lo_1_31, hi_1_31, stg1_0, \
- stg1_1, stp1_16, stp1_31); \
- MULTIPLICATION_AND_ADD_2(lo_25_7, hi_25_7, stg1_6, \
- stg1_7, stp1_19, stp1_28); \
- MULTIPLICATION_AND_ADD_2(lo_5_27, hi_5_27, stg1_8, \
- stg1_9, stp1_20, stp1_27); \
- MULTIPLICATION_AND_ADD_2(lo_29_3, hi_29_3, stg1_14, \
- stg1_15, stp1_23, stp1_24); \
-} \
-\
-/* Stage2 */ \
-{ \
- const __m128i zero = _mm_setzero_si128();\
- const __m128i lo_2_30 = _mm_unpacklo_epi16(in[2], zero); \
- const __m128i hi_2_30 = _mm_unpackhi_epi16(in[2], zero); \
- \
- const __m128i lo_26_6 = _mm_unpacklo_epi16(zero, in[6]); \
- const __m128i hi_26_6 = _mm_unpackhi_epi16(zero, in[6]); \
- \
- MULTIPLICATION_AND_ADD_2(lo_2_30, hi_2_30, stg2_0, \
- stg2_1, stp2_8, stp2_15); \
- MULTIPLICATION_AND_ADD_2(lo_26_6, hi_26_6, stg2_6, \
- stg2_7, stp2_11, stp2_12); \
- \
- stp2_16 = stp1_16; \
- stp2_19 = stp1_19; \
- \
- stp2_20 = stp1_20; \
- stp2_23 = stp1_23; \
- \
- stp2_24 = stp1_24; \
- stp2_27 = stp1_27; \
- \
- stp2_28 = stp1_28; \
- stp2_31 = stp1_31; \
-} \
-\
-/* Stage3 */ \
-{ \
- const __m128i zero = _mm_setzero_si128();\
- const __m128i lo_4_28 = _mm_unpacklo_epi16(in[4], zero); \
- const __m128i hi_4_28 = _mm_unpackhi_epi16(in[4], zero); \
- \
- const __m128i lo_17_30 = _mm_unpacklo_epi16(stp1_16, stp1_31); \
- const __m128i hi_17_30 = _mm_unpackhi_epi16(stp1_16, stp1_31); \
- const __m128i lo_18_29 = _mm_unpacklo_epi16(stp1_19, stp1_28); \
- const __m128i hi_18_29 = _mm_unpackhi_epi16(stp1_19, stp1_28); \
- \
- const __m128i lo_21_26 = _mm_unpacklo_epi16(stp1_20, stp1_27); \
- const __m128i hi_21_26 = _mm_unpackhi_epi16(stp1_20, stp1_27); \
- const __m128i lo_22_25 = _mm_unpacklo_epi16(stp1_23, stp1_24); \
- const __m128i hi_22_25 = _mm_unpackhi_epi16(stp1_23, stp2_24); \
- \
- MULTIPLICATION_AND_ADD_2(lo_4_28, hi_4_28, stg3_0, \
- stg3_1, stp1_4, stp1_7); \
- \
- stp1_8 = stp2_8; \
- stp1_11 = stp2_11; \
- stp1_12 = stp2_12; \
- stp1_15 = stp2_15; \
- \
- MULTIPLICATION_AND_ADD(lo_17_30, hi_17_30, lo_18_29, hi_18_29, stg3_4, \
- stg3_5, stg3_6, stg3_4, stp1_17, stp1_30, \
- stp1_18, stp1_29) \
- MULTIPLICATION_AND_ADD(lo_21_26, hi_21_26, lo_22_25, hi_22_25, stg3_8, \
- stg3_9, stg3_10, stg3_8, stp1_21, stp1_26, \
- stp1_22, stp1_25) \
- \
- stp1_16 = stp2_16; \
- stp1_31 = stp2_31; \
- stp1_19 = stp2_19; \
- stp1_20 = stp2_20; \
- stp1_23 = stp2_23; \
- stp1_24 = stp2_24; \
- stp1_27 = stp2_27; \
- stp1_28 = stp2_28; \
-} \
-\
-/* Stage4 */ \
-{ \
- const __m128i zero = _mm_setzero_si128();\
- const __m128i lo_0_16 = _mm_unpacklo_epi16(in[0], zero); \
- const __m128i hi_0_16 = _mm_unpackhi_epi16(in[0], zero); \
- \
- const __m128i lo_9_14 = _mm_unpacklo_epi16(stp2_8, stp2_15); \
- const __m128i hi_9_14 = _mm_unpackhi_epi16(stp2_8, stp2_15); \
- const __m128i lo_10_13 = _mm_unpacklo_epi16(stp2_11, stp2_12); \
- const __m128i hi_10_13 = _mm_unpackhi_epi16(stp2_11, stp2_12); \
- \
- MULTIPLICATION_AND_ADD_2(lo_0_16, hi_0_16, stg4_0, \
- stg4_1, stp2_0, stp2_1); \
- \
- stp2_4 = stp1_4; \
- stp2_5 = stp1_4; \
- stp2_6 = stp1_7; \
- stp2_7 = stp1_7; \
- \
- MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, stg4_4, \
- stg4_5, stg4_6, stg4_4, stp2_9, stp2_14, \
- stp2_10, stp2_13) \
- \
- stp2_8 = stp1_8; \
- stp2_15 = stp1_15; \
- stp2_11 = stp1_11; \
- stp2_12 = stp1_12; \
- \
- stp2_16 = _mm_add_epi16(stp1_16, stp1_19); \
- stp2_17 = _mm_add_epi16(stp1_17, stp1_18); \
- stp2_18 = _mm_sub_epi16(stp1_17, stp1_18); \
- stp2_19 = _mm_sub_epi16(stp1_16, stp1_19); \
- stp2_20 = _mm_sub_epi16(stp1_23, stp1_20); \
- stp2_21 = _mm_sub_epi16(stp1_22, stp1_21); \
- stp2_22 = _mm_add_epi16(stp1_22, stp1_21); \
- stp2_23 = _mm_add_epi16(stp1_23, stp1_20); \
- \
- stp2_24 = _mm_add_epi16(stp1_24, stp1_27); \
- stp2_25 = _mm_add_epi16(stp1_25, stp1_26); \
- stp2_26 = _mm_sub_epi16(stp1_25, stp1_26); \
- stp2_27 = _mm_sub_epi16(stp1_24, stp1_27); \
- stp2_28 = _mm_sub_epi16(stp1_31, stp1_28); \
- stp2_29 = _mm_sub_epi16(stp1_30, stp1_29); \
- stp2_30 = _mm_add_epi16(stp1_29, stp1_30); \
- stp2_31 = _mm_add_epi16(stp1_28, stp1_31); \
-} \
-\
-/* Stage5 */ \
-{ \
- const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \
- const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \
- const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29); \
- const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29); \
- \
- const __m128i lo_19_28 = _mm_unpacklo_epi16(stp2_19, stp2_28); \
- const __m128i hi_19_28 = _mm_unpackhi_epi16(stp2_19, stp2_28); \
- const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \
- const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \
- \
- const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \
- const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \
- \
- stp1_0 = stp2_0; \
- stp1_1 = stp2_1; \
- stp1_2 = stp2_1; \
- stp1_3 = stp2_0; \
- \
- tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \
- tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \
- tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \
- tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \
- \
- tmp0 = _mm_add_epi32(tmp0, rounding); \
- tmp1 = _mm_add_epi32(tmp1, rounding); \
- tmp2 = _mm_add_epi32(tmp2, rounding); \
- tmp3 = _mm_add_epi32(tmp3, rounding); \
- \
- tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
- tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
- tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
- tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
- \
- stp1_5 = _mm_packs_epi32(tmp0, tmp1); \
- stp1_6 = _mm_packs_epi32(tmp2, tmp3); \
- \
- stp1_4 = stp2_4; \
- stp1_7 = stp2_7; \
- \
- stp1_8 = _mm_add_epi16(stp2_8, stp2_11); \
- stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \
- stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \
- stp1_11 = _mm_sub_epi16(stp2_8, stp2_11); \
- stp1_12 = _mm_sub_epi16(stp2_15, stp2_12); \
- stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \
- stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \
- stp1_15 = _mm_add_epi16(stp2_15, stp2_12); \
- \
- stp1_16 = stp2_16; \
- stp1_17 = stp2_17; \
- \
- MULTIPLICATION_AND_ADD(lo_18_29, hi_18_29, lo_19_28, hi_19_28, stg4_4, \
- stg4_5, stg4_4, stg4_5, stp1_18, stp1_29, \
- stp1_19, stp1_28) \
- MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg4_6, \
- stg4_4, stg4_6, stg4_4, stp1_20, stp1_27, \
- stp1_21, stp1_26) \
- \
- stp1_22 = stp2_22; \
- stp1_23 = stp2_23; \
- stp1_24 = stp2_24; \
- stp1_25 = stp2_25; \
- stp1_30 = stp2_30; \
- stp1_31 = stp2_31; \
-} \
-\
-/* Stage6 */ \
-{ \
- const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
- const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
- const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \
- const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \
- \
- stp2_0 = _mm_add_epi16(stp1_0, stp1_7); \
- stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \
- stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \
- stp2_3 = _mm_add_epi16(stp1_3, stp1_4); \
- stp2_4 = _mm_sub_epi16(stp1_3, stp1_4); \
- stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \
- stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \
- stp2_7 = _mm_sub_epi16(stp1_0, stp1_7); \
- \
- stp2_8 = stp1_8; \
- stp2_9 = stp1_9; \
- stp2_14 = stp1_14; \
- stp2_15 = stp1_15; \
- \
- MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, \
- stg6_0, stg4_0, stg6_0, stg4_0, stp2_10, \
- stp2_13, stp2_11, stp2_12) \
- \
- stp2_16 = _mm_add_epi16(stp1_16, stp1_23); \
- stp2_17 = _mm_add_epi16(stp1_17, stp1_22); \
- stp2_18 = _mm_add_epi16(stp1_18, stp1_21); \
- stp2_19 = _mm_add_epi16(stp1_19, stp1_20); \
- stp2_20 = _mm_sub_epi16(stp1_19, stp1_20); \
- stp2_21 = _mm_sub_epi16(stp1_18, stp1_21); \
- stp2_22 = _mm_sub_epi16(stp1_17, stp1_22); \
- stp2_23 = _mm_sub_epi16(stp1_16, stp1_23); \
- \
- stp2_24 = _mm_sub_epi16(stp1_31, stp1_24); \
- stp2_25 = _mm_sub_epi16(stp1_30, stp1_25); \
- stp2_26 = _mm_sub_epi16(stp1_29, stp1_26); \
- stp2_27 = _mm_sub_epi16(stp1_28, stp1_27); \
- stp2_28 = _mm_add_epi16(stp1_27, stp1_28); \
- stp2_29 = _mm_add_epi16(stp1_26, stp1_29); \
- stp2_30 = _mm_add_epi16(stp1_25, stp1_30); \
- stp2_31 = _mm_add_epi16(stp1_24, stp1_31); \
-} \
-\
-/* Stage7 */ \
-{ \
- const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \
- const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \
- const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \
- const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \
- \
- const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25); \
- const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25); \
- const __m128i lo_23_24 = _mm_unpacklo_epi16(stp2_23, stp2_24); \
- const __m128i hi_23_24 = _mm_unpackhi_epi16(stp2_23, stp2_24); \
- \
- stp1_0 = _mm_add_epi16(stp2_0, stp2_15); \
- stp1_1 = _mm_add_epi16(stp2_1, stp2_14); \
- stp1_2 = _mm_add_epi16(stp2_2, stp2_13); \
- stp1_3 = _mm_add_epi16(stp2_3, stp2_12); \
- stp1_4 = _mm_add_epi16(stp2_4, stp2_11); \
- stp1_5 = _mm_add_epi16(stp2_5, stp2_10); \
- stp1_6 = _mm_add_epi16(stp2_6, stp2_9); \
- stp1_7 = _mm_add_epi16(stp2_7, stp2_8); \
- stp1_8 = _mm_sub_epi16(stp2_7, stp2_8); \
- stp1_9 = _mm_sub_epi16(stp2_6, stp2_9); \
- stp1_10 = _mm_sub_epi16(stp2_5, stp2_10); \
- stp1_11 = _mm_sub_epi16(stp2_4, stp2_11); \
- stp1_12 = _mm_sub_epi16(stp2_3, stp2_12); \
- stp1_13 = _mm_sub_epi16(stp2_2, stp2_13); \
- stp1_14 = _mm_sub_epi16(stp2_1, stp2_14); \
- stp1_15 = _mm_sub_epi16(stp2_0, stp2_15); \
- \
- stp1_16 = stp2_16; \
- stp1_17 = stp2_17; \
- stp1_18 = stp2_18; \
- stp1_19 = stp2_19; \
- \
- MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg6_0, \
- stg4_0, stg6_0, stg4_0, stp1_20, stp1_27, \
- stp1_21, stp1_26) \
- MULTIPLICATION_AND_ADD(lo_22_25, hi_22_25, lo_23_24, hi_23_24, stg6_0, \
- stg4_0, stg6_0, stg4_0, stp1_22, stp1_25, \
- stp1_23, stp1_24) \
- \
- stp1_28 = stp2_28; \
- stp1_29 = stp2_29; \
- stp1_30 = stp2_30; \
- stp1_31 = stp2_31; \
-}
-
-
-#define IDCT32 \
-/* Stage1 */ \
-{ \
- const __m128i lo_1_31 = _mm_unpacklo_epi16(in[1], in[31]); \
- const __m128i hi_1_31 = _mm_unpackhi_epi16(in[1], in[31]); \
- const __m128i lo_17_15 = _mm_unpacklo_epi16(in[17], in[15]); \
- const __m128i hi_17_15 = _mm_unpackhi_epi16(in[17], in[15]); \
- \
- const __m128i lo_9_23 = _mm_unpacklo_epi16(in[9], in[23]); \
- const __m128i hi_9_23 = _mm_unpackhi_epi16(in[9], in[23]); \
- const __m128i lo_25_7= _mm_unpacklo_epi16(in[25], in[7]); \
- const __m128i hi_25_7 = _mm_unpackhi_epi16(in[25], in[7]); \
- \
- const __m128i lo_5_27 = _mm_unpacklo_epi16(in[5], in[27]); \
- const __m128i hi_5_27 = _mm_unpackhi_epi16(in[5], in[27]); \
- const __m128i lo_21_11 = _mm_unpacklo_epi16(in[21], in[11]); \
- const __m128i hi_21_11 = _mm_unpackhi_epi16(in[21], in[11]); \
- \
- const __m128i lo_13_19 = _mm_unpacklo_epi16(in[13], in[19]); \
- const __m128i hi_13_19 = _mm_unpackhi_epi16(in[13], in[19]); \
- const __m128i lo_29_3 = _mm_unpacklo_epi16(in[29], in[3]); \
- const __m128i hi_29_3 = _mm_unpackhi_epi16(in[29], in[3]); \
- \
- MULTIPLICATION_AND_ADD(lo_1_31, hi_1_31, lo_17_15, hi_17_15, stg1_0, \
- stg1_1, stg1_2, stg1_3, stp1_16, stp1_31, \
- stp1_17, stp1_30) \
- MULTIPLICATION_AND_ADD(lo_9_23, hi_9_23, lo_25_7, hi_25_7, stg1_4, \
- stg1_5, stg1_6, stg1_7, stp1_18, stp1_29, \
- stp1_19, stp1_28) \
- MULTIPLICATION_AND_ADD(lo_5_27, hi_5_27, lo_21_11, hi_21_11, stg1_8, \
- stg1_9, stg1_10, stg1_11, stp1_20, stp1_27, \
- stp1_21, stp1_26) \
- MULTIPLICATION_AND_ADD(lo_13_19, hi_13_19, lo_29_3, hi_29_3, stg1_12, \
- stg1_13, stg1_14, stg1_15, stp1_22, stp1_25, \
- stp1_23, stp1_24) \
-} \
-\
-/* Stage2 */ \
-{ \
- const __m128i lo_2_30 = _mm_unpacklo_epi16(in[2], in[30]); \
- const __m128i hi_2_30 = _mm_unpackhi_epi16(in[2], in[30]); \
- const __m128i lo_18_14 = _mm_unpacklo_epi16(in[18], in[14]); \
- const __m128i hi_18_14 = _mm_unpackhi_epi16(in[18], in[14]); \
- \
- const __m128i lo_10_22 = _mm_unpacklo_epi16(in[10], in[22]); \
- const __m128i hi_10_22 = _mm_unpackhi_epi16(in[10], in[22]); \
- const __m128i lo_26_6 = _mm_unpacklo_epi16(in[26], in[6]); \
- const __m128i hi_26_6 = _mm_unpackhi_epi16(in[26], in[6]); \
- \
- MULTIPLICATION_AND_ADD(lo_2_30, hi_2_30, lo_18_14, hi_18_14, stg2_0, \
- stg2_1, stg2_2, stg2_3, stp2_8, stp2_15, stp2_9, \
- stp2_14) \
- MULTIPLICATION_AND_ADD(lo_10_22, hi_10_22, lo_26_6, hi_26_6, stg2_4, \
- stg2_5, stg2_6, stg2_7, stp2_10, stp2_13, \
- stp2_11, stp2_12) \
- \
- stp2_16 = _mm_add_epi16(stp1_16, stp1_17); \
- stp2_17 = _mm_sub_epi16(stp1_16, stp1_17); \
- stp2_18 = _mm_sub_epi16(stp1_19, stp1_18); \
- stp2_19 = _mm_add_epi16(stp1_19, stp1_18); \
- \
- stp2_20 = _mm_add_epi16(stp1_20, stp1_21); \
- stp2_21 = _mm_sub_epi16(stp1_20, stp1_21); \
- stp2_22 = _mm_sub_epi16(stp1_23, stp1_22); \
- stp2_23 = _mm_add_epi16(stp1_23, stp1_22); \
- \
- stp2_24 = _mm_add_epi16(stp1_24, stp1_25); \
- stp2_25 = _mm_sub_epi16(stp1_24, stp1_25); \
- stp2_26 = _mm_sub_epi16(stp1_27, stp1_26); \
- stp2_27 = _mm_add_epi16(stp1_27, stp1_26); \
- \
- stp2_28 = _mm_add_epi16(stp1_28, stp1_29); \
- stp2_29 = _mm_sub_epi16(stp1_28, stp1_29); \
- stp2_30 = _mm_sub_epi16(stp1_31, stp1_30); \
- stp2_31 = _mm_add_epi16(stp1_31, stp1_30); \
-} \
-\
-/* Stage3 */ \
-{ \
- const __m128i lo_4_28 = _mm_unpacklo_epi16(in[4], in[28]); \
- const __m128i hi_4_28 = _mm_unpackhi_epi16(in[4], in[28]); \
- const __m128i lo_20_12 = _mm_unpacklo_epi16(in[20], in[12]); \
- const __m128i hi_20_12 = _mm_unpackhi_epi16(in[20], in[12]); \
- \
- const __m128i lo_17_30 = _mm_unpacklo_epi16(stp2_17, stp2_30); \
- const __m128i hi_17_30 = _mm_unpackhi_epi16(stp2_17, stp2_30); \
- const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29); \
- const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29); \
- \
- const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \
- const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \
- const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25); \
- const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25); \
- \
- MULTIPLICATION_AND_ADD(lo_4_28, hi_4_28, lo_20_12, hi_20_12, stg3_0, \
- stg3_1, stg3_2, stg3_3, stp1_4, stp1_7, stp1_5, \
- stp1_6) \
- \
- stp1_8 = _mm_add_epi16(stp2_8, stp2_9); \
- stp1_9 = _mm_sub_epi16(stp2_8, stp2_9); \
- stp1_10 = _mm_sub_epi16(stp2_11, stp2_10); \
- stp1_11 = _mm_add_epi16(stp2_11, stp2_10); \
- stp1_12 = _mm_add_epi16(stp2_12, stp2_13); \
- stp1_13 = _mm_sub_epi16(stp2_12, stp2_13); \
- stp1_14 = _mm_sub_epi16(stp2_15, stp2_14); \
- stp1_15 = _mm_add_epi16(stp2_15, stp2_14); \
- \
- MULTIPLICATION_AND_ADD(lo_17_30, hi_17_30, lo_18_29, hi_18_29, stg3_4, \
- stg3_5, stg3_6, stg3_4, stp1_17, stp1_30, \
- stp1_18, stp1_29) \
- MULTIPLICATION_AND_ADD(lo_21_26, hi_21_26, lo_22_25, hi_22_25, stg3_8, \
- stg3_9, stg3_10, stg3_8, stp1_21, stp1_26, \
- stp1_22, stp1_25) \
- \
- stp1_16 = stp2_16; \
- stp1_31 = stp2_31; \
- stp1_19 = stp2_19; \
- stp1_20 = stp2_20; \
- stp1_23 = stp2_23; \
- stp1_24 = stp2_24; \
- stp1_27 = stp2_27; \
- stp1_28 = stp2_28; \
-} \
-\
-/* Stage4 */ \
-{ \
- const __m128i lo_0_16 = _mm_unpacklo_epi16(in[0], in[16]); \
- const __m128i hi_0_16 = _mm_unpackhi_epi16(in[0], in[16]); \
- const __m128i lo_8_24 = _mm_unpacklo_epi16(in[8], in[24]); \
- const __m128i hi_8_24 = _mm_unpackhi_epi16(in[8], in[24]); \
- \
- const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); \
- const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); \
- const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
- const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
- \
- MULTIPLICATION_AND_ADD(lo_0_16, hi_0_16, lo_8_24, hi_8_24, stg4_0, \
- stg4_1, stg4_2, stg4_3, stp2_0, stp2_1, \
- stp2_2, stp2_3) \
- \
- stp2_4 = _mm_add_epi16(stp1_4, stp1_5); \
- stp2_5 = _mm_sub_epi16(stp1_4, stp1_5); \
- stp2_6 = _mm_sub_epi16(stp1_7, stp1_6); \
- stp2_7 = _mm_add_epi16(stp1_7, stp1_6); \
- \
- MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, stg4_4, \
- stg4_5, stg4_6, stg4_4, stp2_9, stp2_14, \
- stp2_10, stp2_13) \
- \
- stp2_8 = stp1_8; \
- stp2_15 = stp1_15; \
- stp2_11 = stp1_11; \
- stp2_12 = stp1_12; \
- \
- stp2_16 = _mm_add_epi16(stp1_16, stp1_19); \
- stp2_17 = _mm_add_epi16(stp1_17, stp1_18); \
- stp2_18 = _mm_sub_epi16(stp1_17, stp1_18); \
- stp2_19 = _mm_sub_epi16(stp1_16, stp1_19); \
- stp2_20 = _mm_sub_epi16(stp1_23, stp1_20); \
- stp2_21 = _mm_sub_epi16(stp1_22, stp1_21); \
- stp2_22 = _mm_add_epi16(stp1_22, stp1_21); \
- stp2_23 = _mm_add_epi16(stp1_23, stp1_20); \
- \
- stp2_24 = _mm_add_epi16(stp1_24, stp1_27); \
- stp2_25 = _mm_add_epi16(stp1_25, stp1_26); \
- stp2_26 = _mm_sub_epi16(stp1_25, stp1_26); \
- stp2_27 = _mm_sub_epi16(stp1_24, stp1_27); \
- stp2_28 = _mm_sub_epi16(stp1_31, stp1_28); \
- stp2_29 = _mm_sub_epi16(stp1_30, stp1_29); \
- stp2_30 = _mm_add_epi16(stp1_29, stp1_30); \
- stp2_31 = _mm_add_epi16(stp1_28, stp1_31); \
-} \
-\
-/* Stage5 */ \
-{ \
- const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \
- const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \
- const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29); \
- const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29); \
- \
- const __m128i lo_19_28 = _mm_unpacklo_epi16(stp2_19, stp2_28); \
- const __m128i hi_19_28 = _mm_unpackhi_epi16(stp2_19, stp2_28); \
- const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \
- const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \
- \
- const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \
- const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \
- \
- stp1_0 = _mm_add_epi16(stp2_0, stp2_3); \
- stp1_1 = _mm_add_epi16(stp2_1, stp2_2); \
- stp1_2 = _mm_sub_epi16(stp2_1, stp2_2); \
- stp1_3 = _mm_sub_epi16(stp2_0, stp2_3); \
- \
- tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \
- tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \
- tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \
- tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \
- \
- tmp0 = _mm_add_epi32(tmp0, rounding); \
- tmp1 = _mm_add_epi32(tmp1, rounding); \
- tmp2 = _mm_add_epi32(tmp2, rounding); \
- tmp3 = _mm_add_epi32(tmp3, rounding); \
- \
- tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
- tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
- tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
- tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
- \
- stp1_5 = _mm_packs_epi32(tmp0, tmp1); \
- stp1_6 = _mm_packs_epi32(tmp2, tmp3); \
- \
- stp1_4 = stp2_4; \
- stp1_7 = stp2_7; \
- \
- stp1_8 = _mm_add_epi16(stp2_8, stp2_11); \
- stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \
- stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \
- stp1_11 = _mm_sub_epi16(stp2_8, stp2_11); \
- stp1_12 = _mm_sub_epi16(stp2_15, stp2_12); \
- stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \
- stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \
- stp1_15 = _mm_add_epi16(stp2_15, stp2_12); \
- \
- stp1_16 = stp2_16; \
- stp1_17 = stp2_17; \
- \
- MULTIPLICATION_AND_ADD(lo_18_29, hi_18_29, lo_19_28, hi_19_28, stg4_4, \
- stg4_5, stg4_4, stg4_5, stp1_18, stp1_29, \
- stp1_19, stp1_28) \
- MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg4_6, \
- stg4_4, stg4_6, stg4_4, stp1_20, stp1_27, \
- stp1_21, stp1_26) \
- \
- stp1_22 = stp2_22; \
- stp1_23 = stp2_23; \
- stp1_24 = stp2_24; \
- stp1_25 = stp2_25; \
- stp1_30 = stp2_30; \
- stp1_31 = stp2_31; \
-} \
-\
-/* Stage6 */ \
-{ \
- const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
- const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
- const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \
- const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \
- \
- stp2_0 = _mm_add_epi16(stp1_0, stp1_7); \
- stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \
- stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \
- stp2_3 = _mm_add_epi16(stp1_3, stp1_4); \
- stp2_4 = _mm_sub_epi16(stp1_3, stp1_4); \
- stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \
- stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \
- stp2_7 = _mm_sub_epi16(stp1_0, stp1_7); \
- \
- stp2_8 = stp1_8; \
- stp2_9 = stp1_9; \
- stp2_14 = stp1_14; \
- stp2_15 = stp1_15; \
- \
- MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, \
- stg6_0, stg4_0, stg6_0, stg4_0, stp2_10, \
- stp2_13, stp2_11, stp2_12) \
- \
- stp2_16 = _mm_add_epi16(stp1_16, stp1_23); \
- stp2_17 = _mm_add_epi16(stp1_17, stp1_22); \
- stp2_18 = _mm_add_epi16(stp1_18, stp1_21); \
- stp2_19 = _mm_add_epi16(stp1_19, stp1_20); \
- stp2_20 = _mm_sub_epi16(stp1_19, stp1_20); \
- stp2_21 = _mm_sub_epi16(stp1_18, stp1_21); \
- stp2_22 = _mm_sub_epi16(stp1_17, stp1_22); \
- stp2_23 = _mm_sub_epi16(stp1_16, stp1_23); \
- \
- stp2_24 = _mm_sub_epi16(stp1_31, stp1_24); \
- stp2_25 = _mm_sub_epi16(stp1_30, stp1_25); \
- stp2_26 = _mm_sub_epi16(stp1_29, stp1_26); \
- stp2_27 = _mm_sub_epi16(stp1_28, stp1_27); \
- stp2_28 = _mm_add_epi16(stp1_27, stp1_28); \
- stp2_29 = _mm_add_epi16(stp1_26, stp1_29); \
- stp2_30 = _mm_add_epi16(stp1_25, stp1_30); \
- stp2_31 = _mm_add_epi16(stp1_24, stp1_31); \
-} \
-\
-/* Stage7 */ \
-{ \
- const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \
- const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \
- const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \
- const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \
- \
- const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25); \
- const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25); \
- const __m128i lo_23_24 = _mm_unpacklo_epi16(stp2_23, stp2_24); \
- const __m128i hi_23_24 = _mm_unpackhi_epi16(stp2_23, stp2_24); \
- \
- stp1_0 = _mm_add_epi16(stp2_0, stp2_15); \
- stp1_1 = _mm_add_epi16(stp2_1, stp2_14); \
- stp1_2 = _mm_add_epi16(stp2_2, stp2_13); \
- stp1_3 = _mm_add_epi16(stp2_3, stp2_12); \
- stp1_4 = _mm_add_epi16(stp2_4, stp2_11); \
- stp1_5 = _mm_add_epi16(stp2_5, stp2_10); \
- stp1_6 = _mm_add_epi16(stp2_6, stp2_9); \
- stp1_7 = _mm_add_epi16(stp2_7, stp2_8); \
- stp1_8 = _mm_sub_epi16(stp2_7, stp2_8); \
- stp1_9 = _mm_sub_epi16(stp2_6, stp2_9); \
- stp1_10 = _mm_sub_epi16(stp2_5, stp2_10); \
- stp1_11 = _mm_sub_epi16(stp2_4, stp2_11); \
- stp1_12 = _mm_sub_epi16(stp2_3, stp2_12); \
- stp1_13 = _mm_sub_epi16(stp2_2, stp2_13); \
- stp1_14 = _mm_sub_epi16(stp2_1, stp2_14); \
- stp1_15 = _mm_sub_epi16(stp2_0, stp2_15); \
- \
- stp1_16 = stp2_16; \
- stp1_17 = stp2_17; \
- stp1_18 = stp2_18; \
- stp1_19 = stp2_19; \
- \
- MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg6_0, \
- stg4_0, stg6_0, stg4_0, stp1_20, stp1_27, \
- stp1_21, stp1_26) \
- MULTIPLICATION_AND_ADD(lo_22_25, hi_22_25, lo_23_24, hi_23_24, stg6_0, \
- stg4_0, stg6_0, stg4_0, stp1_22, stp1_25, \
- stp1_23, stp1_24) \
- \
- stp1_28 = stp2_28; \
- stp1_29 = stp2_29; \
- stp1_30 = stp2_30; \
- stp1_31 = stp2_31; \
-}
-
-// Only upper-left 8x8 has non-zero coeff
-void vpx_idct32x32_34_add_sse2(const tran_low_t *input, uint8_t *dest,
- int stride) {
- const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
- const __m128i final_rounding = _mm_set1_epi16(1<<5);
-
- // idct constants for each stage
- const __m128i stg1_0 = pair_set_epi16(cospi_31_64, -cospi_1_64);
- const __m128i stg1_1 = pair_set_epi16(cospi_1_64, cospi_31_64);
- const __m128i stg1_6 = pair_set_epi16(cospi_7_64, -cospi_25_64);
- const __m128i stg1_7 = pair_set_epi16(cospi_25_64, cospi_7_64);
- const __m128i stg1_8 = pair_set_epi16(cospi_27_64, -cospi_5_64);
- const __m128i stg1_9 = pair_set_epi16(cospi_5_64, cospi_27_64);
- const __m128i stg1_14 = pair_set_epi16(cospi_3_64, -cospi_29_64);
- const __m128i stg1_15 = pair_set_epi16(cospi_29_64, cospi_3_64);
-
- const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
- const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);
- const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);
- const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64);
-
- const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
- const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
- const __m128i stg3_4 = pair_set_epi16(-cospi_4_64, cospi_28_64);
- const __m128i stg3_5 = pair_set_epi16(cospi_28_64, cospi_4_64);
- const __m128i stg3_6 = pair_set_epi16(-cospi_28_64, -cospi_4_64);
- const __m128i stg3_8 = pair_set_epi16(-cospi_20_64, cospi_12_64);
- const __m128i stg3_9 = pair_set_epi16(cospi_12_64, cospi_20_64);
- const __m128i stg3_10 = pair_set_epi16(-cospi_12_64, -cospi_20_64);
-
- const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
- const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
- const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
- const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
- const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
-
- const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
-
- __m128i in[32], col[32];
- __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7,
- stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,
- stp1_16, stp1_17, stp1_18, stp1_19, stp1_20, stp1_21, stp1_22,
- stp1_23, stp1_24, stp1_25, stp1_26, stp1_27, stp1_28, stp1_29,
- stp1_30, stp1_31;
- __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
- stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15,
- stp2_16, stp2_17, stp2_18, stp2_19, stp2_20, stp2_21, stp2_22,
- stp2_23, stp2_24, stp2_25, stp2_26, stp2_27, stp2_28, stp2_29,
- stp2_30, stp2_31;
- __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
- int i;
-
- // Load input data. Only need to load the top left 8x8 block.
- in[0] = load_input_data(input);
- in[1] = load_input_data(input + 32);
- in[2] = load_input_data(input + 64);
- in[3] = load_input_data(input + 96);
- in[4] = load_input_data(input + 128);
- in[5] = load_input_data(input + 160);
- in[6] = load_input_data(input + 192);
- in[7] = load_input_data(input + 224);
-
- for (i = 8; i < 32; ++i) {
- in[i] = _mm_setzero_si128();
- }
-
- array_transpose_8x8(in, in);
- // TODO(hkuang): Following transposes are unnecessary. But remove them will
- // lead to performance drop on some devices.
- array_transpose_8x8(in + 8, in + 8);
- array_transpose_8x8(in + 16, in + 16);
- array_transpose_8x8(in + 24, in + 24);
-
- IDCT32_34
-
- // 1_D: Store 32 intermediate results for each 8x32 block.
- col[0] = _mm_add_epi16(stp1_0, stp1_31);
- col[1] = _mm_add_epi16(stp1_1, stp1_30);
- col[2] = _mm_add_epi16(stp1_2, stp1_29);
- col[3] = _mm_add_epi16(stp1_3, stp1_28);
- col[4] = _mm_add_epi16(stp1_4, stp1_27);
- col[5] = _mm_add_epi16(stp1_5, stp1_26);
- col[6] = _mm_add_epi16(stp1_6, stp1_25);
- col[7] = _mm_add_epi16(stp1_7, stp1_24);
- col[8] = _mm_add_epi16(stp1_8, stp1_23);
- col[9] = _mm_add_epi16(stp1_9, stp1_22);
- col[10] = _mm_add_epi16(stp1_10, stp1_21);
- col[11] = _mm_add_epi16(stp1_11, stp1_20);
- col[12] = _mm_add_epi16(stp1_12, stp1_19);
- col[13] = _mm_add_epi16(stp1_13, stp1_18);
- col[14] = _mm_add_epi16(stp1_14, stp1_17);
- col[15] = _mm_add_epi16(stp1_15, stp1_16);
- col[16] = _mm_sub_epi16(stp1_15, stp1_16);
- col[17] = _mm_sub_epi16(stp1_14, stp1_17);
- col[18] = _mm_sub_epi16(stp1_13, stp1_18);
- col[19] = _mm_sub_epi16(stp1_12, stp1_19);
- col[20] = _mm_sub_epi16(stp1_11, stp1_20);
- col[21] = _mm_sub_epi16(stp1_10, stp1_21);
- col[22] = _mm_sub_epi16(stp1_9, stp1_22);
- col[23] = _mm_sub_epi16(stp1_8, stp1_23);
- col[24] = _mm_sub_epi16(stp1_7, stp1_24);
- col[25] = _mm_sub_epi16(stp1_6, stp1_25);
- col[26] = _mm_sub_epi16(stp1_5, stp1_26);
- col[27] = _mm_sub_epi16(stp1_4, stp1_27);
- col[28] = _mm_sub_epi16(stp1_3, stp1_28);
- col[29] = _mm_sub_epi16(stp1_2, stp1_29);
- col[30] = _mm_sub_epi16(stp1_1, stp1_30);
- col[31] = _mm_sub_epi16(stp1_0, stp1_31);
- for (i = 0; i < 4; i++) {
- int j;
- const __m128i zero = _mm_setzero_si128();
- // Transpose 32x8 block to 8x32 block
- array_transpose_8x8(col + i * 8, in);
- IDCT32_34
-
- // 2_D: Calculate the results and store them to destination.
- in[0] = _mm_add_epi16(stp1_0, stp1_31);
- in[1] = _mm_add_epi16(stp1_1, stp1_30);
- in[2] = _mm_add_epi16(stp1_2, stp1_29);
- in[3] = _mm_add_epi16(stp1_3, stp1_28);
- in[4] = _mm_add_epi16(stp1_4, stp1_27);
- in[5] = _mm_add_epi16(stp1_5, stp1_26);
- in[6] = _mm_add_epi16(stp1_6, stp1_25);
- in[7] = _mm_add_epi16(stp1_7, stp1_24);
- in[8] = _mm_add_epi16(stp1_8, stp1_23);
- in[9] = _mm_add_epi16(stp1_9, stp1_22);
- in[10] = _mm_add_epi16(stp1_10, stp1_21);
- in[11] = _mm_add_epi16(stp1_11, stp1_20);
- in[12] = _mm_add_epi16(stp1_12, stp1_19);
- in[13] = _mm_add_epi16(stp1_13, stp1_18);
- in[14] = _mm_add_epi16(stp1_14, stp1_17);
- in[15] = _mm_add_epi16(stp1_15, stp1_16);
- in[16] = _mm_sub_epi16(stp1_15, stp1_16);
- in[17] = _mm_sub_epi16(stp1_14, stp1_17);
- in[18] = _mm_sub_epi16(stp1_13, stp1_18);
- in[19] = _mm_sub_epi16(stp1_12, stp1_19);
- in[20] = _mm_sub_epi16(stp1_11, stp1_20);
- in[21] = _mm_sub_epi16(stp1_10, stp1_21);
- in[22] = _mm_sub_epi16(stp1_9, stp1_22);
- in[23] = _mm_sub_epi16(stp1_8, stp1_23);
- in[24] = _mm_sub_epi16(stp1_7, stp1_24);
- in[25] = _mm_sub_epi16(stp1_6, stp1_25);
- in[26] = _mm_sub_epi16(stp1_5, stp1_26);
- in[27] = _mm_sub_epi16(stp1_4, stp1_27);
- in[28] = _mm_sub_epi16(stp1_3, stp1_28);
- in[29] = _mm_sub_epi16(stp1_2, stp1_29);
- in[30] = _mm_sub_epi16(stp1_1, stp1_30);
- in[31] = _mm_sub_epi16(stp1_0, stp1_31);
-
- for (j = 0; j < 32; ++j) {
- // Final rounding and shift
- in[j] = _mm_adds_epi16(in[j], final_rounding);
- in[j] = _mm_srai_epi16(in[j], 6);
- RECON_AND_STORE(dest + j * stride, in[j]);
- }
-
- dest += 8;
- }
-}
-
-void vpx_idct32x32_1024_add_sse2(const tran_low_t *input, uint8_t *dest,
- int stride) {
- const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
- const __m128i final_rounding = _mm_set1_epi16(1 << 5);
- const __m128i zero = _mm_setzero_si128();
-
- // idct constants for each stage
- const __m128i stg1_0 = pair_set_epi16(cospi_31_64, -cospi_1_64);
- const __m128i stg1_1 = pair_set_epi16(cospi_1_64, cospi_31_64);
- const __m128i stg1_2 = pair_set_epi16(cospi_15_64, -cospi_17_64);
- const __m128i stg1_3 = pair_set_epi16(cospi_17_64, cospi_15_64);
- const __m128i stg1_4 = pair_set_epi16(cospi_23_64, -cospi_9_64);
- const __m128i stg1_5 = pair_set_epi16(cospi_9_64, cospi_23_64);
- const __m128i stg1_6 = pair_set_epi16(cospi_7_64, -cospi_25_64);
- const __m128i stg1_7 = pair_set_epi16(cospi_25_64, cospi_7_64);
- const __m128i stg1_8 = pair_set_epi16(cospi_27_64, -cospi_5_64);
- const __m128i stg1_9 = pair_set_epi16(cospi_5_64, cospi_27_64);
- const __m128i stg1_10 = pair_set_epi16(cospi_11_64, -cospi_21_64);
- const __m128i stg1_11 = pair_set_epi16(cospi_21_64, cospi_11_64);
- const __m128i stg1_12 = pair_set_epi16(cospi_19_64, -cospi_13_64);
- const __m128i stg1_13 = pair_set_epi16(cospi_13_64, cospi_19_64);
- const __m128i stg1_14 = pair_set_epi16(cospi_3_64, -cospi_29_64);
- const __m128i stg1_15 = pair_set_epi16(cospi_29_64, cospi_3_64);
-
- const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
- const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);
- const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64);
- const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64);
- const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64);
- const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64);
- const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);
- const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64);
-
- const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
- const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
- const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64);
- const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64);
- const __m128i stg3_4 = pair_set_epi16(-cospi_4_64, cospi_28_64);
- const __m128i stg3_5 = pair_set_epi16(cospi_28_64, cospi_4_64);
- const __m128i stg3_6 = pair_set_epi16(-cospi_28_64, -cospi_4_64);
- const __m128i stg3_8 = pair_set_epi16(-cospi_20_64, cospi_12_64);
- const __m128i stg3_9 = pair_set_epi16(cospi_12_64, cospi_20_64);
- const __m128i stg3_10 = pair_set_epi16(-cospi_12_64, -cospi_20_64);
-
- const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
- const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
- const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
- const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
- const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
- const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
- const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
-
- const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
-
- __m128i in[32], col[128], zero_idx[16];
- __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7,
- stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,
- stp1_16, stp1_17, stp1_18, stp1_19, stp1_20, stp1_21, stp1_22,
- stp1_23, stp1_24, stp1_25, stp1_26, stp1_27, stp1_28, stp1_29,
- stp1_30, stp1_31;
- __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
- stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15,
- stp2_16, stp2_17, stp2_18, stp2_19, stp2_20, stp2_21, stp2_22,
- stp2_23, stp2_24, stp2_25, stp2_26, stp2_27, stp2_28, stp2_29,
- stp2_30, stp2_31;
- __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
- int i, j, i32;
-
- for (i = 0; i < 4; i++) {
- i32 = (i << 5);
- // First 1-D idct
- // Load input data.
- LOAD_DQCOEFF(in[0], input);
- LOAD_DQCOEFF(in[8], input);
- LOAD_DQCOEFF(in[16], input);
- LOAD_DQCOEFF(in[24], input);
- LOAD_DQCOEFF(in[1], input);
- LOAD_DQCOEFF(in[9], input);
- LOAD_DQCOEFF(in[17], input);
- LOAD_DQCOEFF(in[25], input);
- LOAD_DQCOEFF(in[2], input);
- LOAD_DQCOEFF(in[10], input);
- LOAD_DQCOEFF(in[18], input);
- LOAD_DQCOEFF(in[26], input);
- LOAD_DQCOEFF(in[3], input);
- LOAD_DQCOEFF(in[11], input);
- LOAD_DQCOEFF(in[19], input);
- LOAD_DQCOEFF(in[27], input);
-
- LOAD_DQCOEFF(in[4], input);
- LOAD_DQCOEFF(in[12], input);
- LOAD_DQCOEFF(in[20], input);
- LOAD_DQCOEFF(in[28], input);
- LOAD_DQCOEFF(in[5], input);
- LOAD_DQCOEFF(in[13], input);
- LOAD_DQCOEFF(in[21], input);
- LOAD_DQCOEFF(in[29], input);
- LOAD_DQCOEFF(in[6], input);
- LOAD_DQCOEFF(in[14], input);
- LOAD_DQCOEFF(in[22], input);
- LOAD_DQCOEFF(in[30], input);
- LOAD_DQCOEFF(in[7], input);
- LOAD_DQCOEFF(in[15], input);
- LOAD_DQCOEFF(in[23], input);
- LOAD_DQCOEFF(in[31], input);
-
- // checking if all entries are zero
- zero_idx[0] = _mm_or_si128(in[0], in[1]);
- zero_idx[1] = _mm_or_si128(in[2], in[3]);
- zero_idx[2] = _mm_or_si128(in[4], in[5]);
- zero_idx[3] = _mm_or_si128(in[6], in[7]);
- zero_idx[4] = _mm_or_si128(in[8], in[9]);
- zero_idx[5] = _mm_or_si128(in[10], in[11]);
- zero_idx[6] = _mm_or_si128(in[12], in[13]);
- zero_idx[7] = _mm_or_si128(in[14], in[15]);
- zero_idx[8] = _mm_or_si128(in[16], in[17]);
- zero_idx[9] = _mm_or_si128(in[18], in[19]);
- zero_idx[10] = _mm_or_si128(in[20], in[21]);
- zero_idx[11] = _mm_or_si128(in[22], in[23]);
- zero_idx[12] = _mm_or_si128(in[24], in[25]);
- zero_idx[13] = _mm_or_si128(in[26], in[27]);
- zero_idx[14] = _mm_or_si128(in[28], in[29]);
- zero_idx[15] = _mm_or_si128(in[30], in[31]);
-
- zero_idx[0] = _mm_or_si128(zero_idx[0], zero_idx[1]);
- zero_idx[1] = _mm_or_si128(zero_idx[2], zero_idx[3]);
- zero_idx[2] = _mm_or_si128(zero_idx[4], zero_idx[5]);
- zero_idx[3] = _mm_or_si128(zero_idx[6], zero_idx[7]);
- zero_idx[4] = _mm_or_si128(zero_idx[8], zero_idx[9]);
- zero_idx[5] = _mm_or_si128(zero_idx[10], zero_idx[11]);
- zero_idx[6] = _mm_or_si128(zero_idx[12], zero_idx[13]);
- zero_idx[7] = _mm_or_si128(zero_idx[14], zero_idx[15]);
-
- zero_idx[8] = _mm_or_si128(zero_idx[0], zero_idx[1]);
- zero_idx[9] = _mm_or_si128(zero_idx[2], zero_idx[3]);
- zero_idx[10] = _mm_or_si128(zero_idx[4], zero_idx[5]);
- zero_idx[11] = _mm_or_si128(zero_idx[6], zero_idx[7]);
- zero_idx[12] = _mm_or_si128(zero_idx[8], zero_idx[9]);
- zero_idx[13] = _mm_or_si128(zero_idx[10], zero_idx[11]);
- zero_idx[14] = _mm_or_si128(zero_idx[12], zero_idx[13]);
-
- if (_mm_movemask_epi8(_mm_cmpeq_epi32(zero_idx[14], zero)) == 0xFFFF) {
- col[i32 + 0] = _mm_setzero_si128();
- col[i32 + 1] = _mm_setzero_si128();
- col[i32 + 2] = _mm_setzero_si128();
- col[i32 + 3] = _mm_setzero_si128();
- col[i32 + 4] = _mm_setzero_si128();
- col[i32 + 5] = _mm_setzero_si128();
- col[i32 + 6] = _mm_setzero_si128();
- col[i32 + 7] = _mm_setzero_si128();
- col[i32 + 8] = _mm_setzero_si128();
- col[i32 + 9] = _mm_setzero_si128();
- col[i32 + 10] = _mm_setzero_si128();
- col[i32 + 11] = _mm_setzero_si128();
- col[i32 + 12] = _mm_setzero_si128();
- col[i32 + 13] = _mm_setzero_si128();
- col[i32 + 14] = _mm_setzero_si128();
- col[i32 + 15] = _mm_setzero_si128();
- col[i32 + 16] = _mm_setzero_si128();
- col[i32 + 17] = _mm_setzero_si128();
- col[i32 + 18] = _mm_setzero_si128();
- col[i32 + 19] = _mm_setzero_si128();
- col[i32 + 20] = _mm_setzero_si128();
- col[i32 + 21] = _mm_setzero_si128();
- col[i32 + 22] = _mm_setzero_si128();
- col[i32 + 23] = _mm_setzero_si128();
- col[i32 + 24] = _mm_setzero_si128();
- col[i32 + 25] = _mm_setzero_si128();
- col[i32 + 26] = _mm_setzero_si128();
- col[i32 + 27] = _mm_setzero_si128();
- col[i32 + 28] = _mm_setzero_si128();
- col[i32 + 29] = _mm_setzero_si128();
- col[i32 + 30] = _mm_setzero_si128();
- col[i32 + 31] = _mm_setzero_si128();
- continue;
- }
-
- // Transpose 32x8 block to 8x32 block
- array_transpose_8x8(in, in);
- array_transpose_8x8(in + 8, in + 8);
- array_transpose_8x8(in + 16, in + 16);
- array_transpose_8x8(in + 24, in + 24);
-
- IDCT32
-
- // 1_D: Store 32 intermediate results for each 8x32 block.
- col[i32 + 0] = _mm_add_epi16(stp1_0, stp1_31);
- col[i32 + 1] = _mm_add_epi16(stp1_1, stp1_30);
- col[i32 + 2] = _mm_add_epi16(stp1_2, stp1_29);
- col[i32 + 3] = _mm_add_epi16(stp1_3, stp1_28);
- col[i32 + 4] = _mm_add_epi16(stp1_4, stp1_27);
- col[i32 + 5] = _mm_add_epi16(stp1_5, stp1_26);
- col[i32 + 6] = _mm_add_epi16(stp1_6, stp1_25);
- col[i32 + 7] = _mm_add_epi16(stp1_7, stp1_24);
- col[i32 + 8] = _mm_add_epi16(stp1_8, stp1_23);
- col[i32 + 9] = _mm_add_epi16(stp1_9, stp1_22);
- col[i32 + 10] = _mm_add_epi16(stp1_10, stp1_21);
- col[i32 + 11] = _mm_add_epi16(stp1_11, stp1_20);
- col[i32 + 12] = _mm_add_epi16(stp1_12, stp1_19);
- col[i32 + 13] = _mm_add_epi16(stp1_13, stp1_18);
- col[i32 + 14] = _mm_add_epi16(stp1_14, stp1_17);
- col[i32 + 15] = _mm_add_epi16(stp1_15, stp1_16);
- col[i32 + 16] = _mm_sub_epi16(stp1_15, stp1_16);
- col[i32 + 17] = _mm_sub_epi16(stp1_14, stp1_17);
- col[i32 + 18] = _mm_sub_epi16(stp1_13, stp1_18);
- col[i32 + 19] = _mm_sub_epi16(stp1_12, stp1_19);
- col[i32 + 20] = _mm_sub_epi16(stp1_11, stp1_20);
- col[i32 + 21] = _mm_sub_epi16(stp1_10, stp1_21);
- col[i32 + 22] = _mm_sub_epi16(stp1_9, stp1_22);
- col[i32 + 23] = _mm_sub_epi16(stp1_8, stp1_23);
- col[i32 + 24] = _mm_sub_epi16(stp1_7, stp1_24);
- col[i32 + 25] = _mm_sub_epi16(stp1_6, stp1_25);
- col[i32 + 26] = _mm_sub_epi16(stp1_5, stp1_26);
- col[i32 + 27] = _mm_sub_epi16(stp1_4, stp1_27);
- col[i32 + 28] = _mm_sub_epi16(stp1_3, stp1_28);
- col[i32 + 29] = _mm_sub_epi16(stp1_2, stp1_29);
- col[i32 + 30] = _mm_sub_epi16(stp1_1, stp1_30);
- col[i32 + 31] = _mm_sub_epi16(stp1_0, stp1_31);
- }
- for (i = 0; i < 4; i++) {
- // Second 1-D idct
- j = i << 3;
-
- // Transpose 32x8 block to 8x32 block
- array_transpose_8x8(col + j, in);
- array_transpose_8x8(col + j + 32, in + 8);
- array_transpose_8x8(col + j + 64, in + 16);
- array_transpose_8x8(col + j + 96, in + 24);
-
- IDCT32
-
- // 2_D: Calculate the results and store them to destination.
- in[0] = _mm_add_epi16(stp1_0, stp1_31);
- in[1] = _mm_add_epi16(stp1_1, stp1_30);
- in[2] = _mm_add_epi16(stp1_2, stp1_29);
- in[3] = _mm_add_epi16(stp1_3, stp1_28);
- in[4] = _mm_add_epi16(stp1_4, stp1_27);
- in[5] = _mm_add_epi16(stp1_5, stp1_26);
- in[6] = _mm_add_epi16(stp1_6, stp1_25);
- in[7] = _mm_add_epi16(stp1_7, stp1_24);
- in[8] = _mm_add_epi16(stp1_8, stp1_23);
- in[9] = _mm_add_epi16(stp1_9, stp1_22);
- in[10] = _mm_add_epi16(stp1_10, stp1_21);
- in[11] = _mm_add_epi16(stp1_11, stp1_20);
- in[12] = _mm_add_epi16(stp1_12, stp1_19);
- in[13] = _mm_add_epi16(stp1_13, stp1_18);
- in[14] = _mm_add_epi16(stp1_14, stp1_17);
- in[15] = _mm_add_epi16(stp1_15, stp1_16);
- in[16] = _mm_sub_epi16(stp1_15, stp1_16);
- in[17] = _mm_sub_epi16(stp1_14, stp1_17);
- in[18] = _mm_sub_epi16(stp1_13, stp1_18);
- in[19] = _mm_sub_epi16(stp1_12, stp1_19);
- in[20] = _mm_sub_epi16(stp1_11, stp1_20);
- in[21] = _mm_sub_epi16(stp1_10, stp1_21);
- in[22] = _mm_sub_epi16(stp1_9, stp1_22);
- in[23] = _mm_sub_epi16(stp1_8, stp1_23);
- in[24] = _mm_sub_epi16(stp1_7, stp1_24);
- in[25] = _mm_sub_epi16(stp1_6, stp1_25);
- in[26] = _mm_sub_epi16(stp1_5, stp1_26);
- in[27] = _mm_sub_epi16(stp1_4, stp1_27);
- in[28] = _mm_sub_epi16(stp1_3, stp1_28);
- in[29] = _mm_sub_epi16(stp1_2, stp1_29);
- in[30] = _mm_sub_epi16(stp1_1, stp1_30);
- in[31] = _mm_sub_epi16(stp1_0, stp1_31);
-
- for (j = 0; j < 32; ++j) {
- // Final rounding and shift
- in[j] = _mm_adds_epi16(in[j], final_rounding);
- in[j] = _mm_srai_epi16(in[j], 6);
- RECON_AND_STORE(dest + j * stride, in[j]);
- }
-
- dest += 8;
- }
-}
-
-void vpx_idct32x32_1_add_sse2(const tran_low_t *input, uint8_t *dest,
- int stride) {
- __m128i dc_value;
- const __m128i zero = _mm_setzero_si128();
- int a, j;
-
- a = (int)dct_const_round_shift(input[0] * cospi_16_64);
- a = (int)dct_const_round_shift(a * cospi_16_64);
- a = ROUND_POWER_OF_TWO(a, 6);
-
- dc_value = _mm_set1_epi16(a);
-
- for (j = 0; j < 32; ++j) {
- RECON_AND_STORE(dest + 0 + j * stride, dc_value);
- RECON_AND_STORE(dest + 8 + j * stride, dc_value);
- RECON_AND_STORE(dest + 16 + j * stride, dc_value);
- RECON_AND_STORE(dest + 24 + j * stride, dc_value);
- }
-}
-
-#if CONFIG_VP9_HIGHBITDEPTH
-static INLINE __m128i clamp_high_sse2(__m128i value, int bd) {
- __m128i ubounded, retval;
- const __m128i zero = _mm_set1_epi16(0);
- const __m128i one = _mm_set1_epi16(1);
- const __m128i max = _mm_subs_epi16(_mm_slli_epi16(one, bd), one);
- ubounded = _mm_cmpgt_epi16(value, max);
- retval = _mm_andnot_si128(ubounded, value);
- ubounded = _mm_and_si128(ubounded, max);
- retval = _mm_or_si128(retval, ubounded);
- retval = _mm_and_si128(retval, _mm_cmpgt_epi16(retval, zero));
- return retval;
-}
-
-void vpx_highbd_idct4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest8,
- int stride, int bd) {
- tran_low_t out[4 * 4];
- tran_low_t *outptr = out;
- int i, j;
- __m128i inptr[4];
- __m128i sign_bits[2];
- __m128i temp_mm, min_input, max_input;
- int test;
- uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
- int optimised_cols = 0;
- const __m128i zero = _mm_set1_epi16(0);
- const __m128i eight = _mm_set1_epi16(8);
- const __m128i max = _mm_set1_epi16(12043);
- const __m128i min = _mm_set1_epi16(-12043);
- // Load input into __m128i
- inptr[0] = _mm_loadu_si128((const __m128i *)input);
- inptr[1] = _mm_loadu_si128((const __m128i *)(input + 4));
- inptr[2] = _mm_loadu_si128((const __m128i *)(input + 8));
- inptr[3] = _mm_loadu_si128((const __m128i *)(input + 12));
-
- // Pack to 16 bits
- inptr[0] = _mm_packs_epi32(inptr[0], inptr[1]);
- inptr[1] = _mm_packs_epi32(inptr[2], inptr[3]);
-
- max_input = _mm_max_epi16(inptr[0], inptr[1]);
- min_input = _mm_min_epi16(inptr[0], inptr[1]);
- max_input = _mm_cmpgt_epi16(max_input, max);
- min_input = _mm_cmplt_epi16(min_input, min);
- temp_mm = _mm_or_si128(max_input, min_input);
- test = _mm_movemask_epi8(temp_mm);
-
- if (!test) {
- // Do the row transform
- idct4_sse2(inptr);
-
- // Check the min & max values
- max_input = _mm_max_epi16(inptr[0], inptr[1]);
- min_input = _mm_min_epi16(inptr[0], inptr[1]);
- max_input = _mm_cmpgt_epi16(max_input, max);
- min_input = _mm_cmplt_epi16(min_input, min);
- temp_mm = _mm_or_si128(max_input, min_input);
- test = _mm_movemask_epi8(temp_mm);
-
- if (test) {
- transpose_4x4(inptr);
- sign_bits[0] = _mm_cmplt_epi16(inptr[0], zero);
- sign_bits[1] = _mm_cmplt_epi16(inptr[1], zero);
- inptr[3] = _mm_unpackhi_epi16(inptr[1], sign_bits[1]);
- inptr[2] = _mm_unpacklo_epi16(inptr[1], sign_bits[1]);
- inptr[1] = _mm_unpackhi_epi16(inptr[0], sign_bits[0]);
- inptr[0] = _mm_unpacklo_epi16(inptr[0], sign_bits[0]);
- _mm_storeu_si128((__m128i *)outptr, inptr[0]);
- _mm_storeu_si128((__m128i *)(outptr + 4), inptr[1]);
- _mm_storeu_si128((__m128i *)(outptr + 8), inptr[2]);
- _mm_storeu_si128((__m128i *)(outptr + 12), inptr[3]);
- } else {
- // Set to use the optimised transform for the column
- optimised_cols = 1;
- }
- } else {
- // Run the un-optimised row transform
- for (i = 0; i < 4; ++i) {
- vpx_highbd_idct4_c(input, outptr, bd);
- input += 4;
- outptr += 4;
- }
- }
-
- if (optimised_cols) {
- idct4_sse2(inptr);
-
- // Final round and shift
- inptr[0] = _mm_add_epi16(inptr[0], eight);
- inptr[1] = _mm_add_epi16(inptr[1], eight);
-
- inptr[0] = _mm_srai_epi16(inptr[0], 4);
- inptr[1] = _mm_srai_epi16(inptr[1], 4);
-
- // Reconstruction and Store
- {
- __m128i d0 = _mm_loadl_epi64((const __m128i *)dest);
- __m128i d2 = _mm_loadl_epi64((const __m128i *)(dest + stride * 2));
- d0 = _mm_unpacklo_epi64(
- d0, _mm_loadl_epi64((const __m128i *)(dest + stride)));
- d2 = _mm_unpacklo_epi64(
- d2, _mm_loadl_epi64((const __m128i *)(dest + stride * 3)));
- d0 = clamp_high_sse2(_mm_adds_epi16(d0, inptr[0]), bd);
- d2 = clamp_high_sse2(_mm_adds_epi16(d2, inptr[1]), bd);
- // store input0
- _mm_storel_epi64((__m128i *)dest, d0);
- // store input1
- d0 = _mm_srli_si128(d0, 8);
- _mm_storel_epi64((__m128i *)(dest + stride), d0);
- // store input2
- _mm_storel_epi64((__m128i *)(dest + stride * 2), d2);
- // store input3
- d2 = _mm_srli_si128(d2, 8);
- _mm_storel_epi64((__m128i *)(dest + stride * 3), d2);
- }
- } else {
- // Run the un-optimised column transform
- tran_low_t temp_in[4], temp_out[4];
- // Columns
- for (i = 0; i < 4; ++i) {
- for (j = 0; j < 4; ++j)
- temp_in[j] = out[j * 4 + i];
- vpx_highbd_idct4_c(temp_in, temp_out, bd);
- for (j = 0; j < 4; ++j) {
- dest[j * stride + i] = highbd_clip_pixel_add(
- dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 4), bd);
- }
- }
- }
-}
-
-void vpx_highbd_idct8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest8,
- int stride, int bd) {
- tran_low_t out[8 * 8];
- tran_low_t *outptr = out;
- int i, j, test;
- __m128i inptr[8];
- __m128i min_input, max_input, temp1, temp2, sign_bits;
- uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
- const __m128i zero = _mm_set1_epi16(0);
- const __m128i sixteen = _mm_set1_epi16(16);
- const __m128i max = _mm_set1_epi16(6201);
- const __m128i min = _mm_set1_epi16(-6201);
- int optimised_cols = 0;
-
- // Load input into __m128i & pack to 16 bits
- for (i = 0; i < 8; i++) {
- temp1 = _mm_loadu_si128((const __m128i *)(input + 8 * i));
- temp2 = _mm_loadu_si128((const __m128i *)(input + 8 * i + 4));
- inptr[i] = _mm_packs_epi32(temp1, temp2);
- }
-
- // Find the min & max for the row transform
- max_input = _mm_max_epi16(inptr[0], inptr[1]);
- min_input = _mm_min_epi16(inptr[0], inptr[1]);
- for (i = 2; i < 8; i++) {
- max_input = _mm_max_epi16(max_input, inptr[i]);
- min_input = _mm_min_epi16(min_input, inptr[i]);
- }
- max_input = _mm_cmpgt_epi16(max_input, max);
- min_input = _mm_cmplt_epi16(min_input, min);
- temp1 = _mm_or_si128(max_input, min_input);
- test = _mm_movemask_epi8(temp1);
-
- if (!test) {
- // Do the row transform
- idct8_sse2(inptr);
-
- // Find the min & max for the column transform
- max_input = _mm_max_epi16(inptr[0], inptr[1]);
- min_input = _mm_min_epi16(inptr[0], inptr[1]);
- for (i = 2; i < 8; i++) {
- max_input = _mm_max_epi16(max_input, inptr[i]);
- min_input = _mm_min_epi16(min_input, inptr[i]);
- }
- max_input = _mm_cmpgt_epi16(max_input, max);
- min_input = _mm_cmplt_epi16(min_input, min);
- temp1 = _mm_or_si128(max_input, min_input);
- test = _mm_movemask_epi8(temp1);
-
- if (test) {
- array_transpose_8x8(inptr, inptr);
- for (i = 0; i < 8; i++) {
- sign_bits = _mm_cmplt_epi16(inptr[i], zero);
- temp1 = _mm_unpackhi_epi16(inptr[i], sign_bits);
- temp2 = _mm_unpacklo_epi16(inptr[i], sign_bits);
- _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i + 1)), temp1);
- _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i)), temp2);
- }
- } else {
- // Set to use the optimised transform for the column
- optimised_cols = 1;
- }
- } else {
- // Run the un-optimised row transform
- for (i = 0; i < 8; ++i) {
- vpx_highbd_idct8_c(input, outptr, bd);
- input += 8;
- outptr += 8;
- }
- }
-
- if (optimised_cols) {
- idct8_sse2(inptr);
-
- // Final round & shift and Reconstruction and Store
- {
- __m128i d[8];
- for (i = 0; i < 8; i++) {
- inptr[i] = _mm_add_epi16(inptr[i], sixteen);
- d[i] = _mm_loadu_si128((const __m128i *)(dest + stride*i));
- inptr[i] = _mm_srai_epi16(inptr[i], 5);
- d[i] = clamp_high_sse2(_mm_adds_epi16(d[i], inptr[i]), bd);
- // Store
- _mm_storeu_si128((__m128i *)(dest + stride*i), d[i]);
- }
- }
- } else {
- // Run the un-optimised column transform
- tran_low_t temp_in[8], temp_out[8];
- for (i = 0; i < 8; ++i) {
- for (j = 0; j < 8; ++j)
- temp_in[j] = out[j * 8 + i];
- vpx_highbd_idct8_c(temp_in, temp_out, bd);
- for (j = 0; j < 8; ++j) {
- dest[j * stride + i] = highbd_clip_pixel_add(
- dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd);
- }
- }
- }
-}
-
-void vpx_highbd_idct8x8_10_add_sse2(const tran_low_t *input, uint8_t *dest8,
- int stride, int bd) {
- tran_low_t out[8 * 8] = { 0 };
- tran_low_t *outptr = out;
- int i, j, test;
- __m128i inptr[8];
- __m128i min_input, max_input, temp1, temp2, sign_bits;
- uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
- const __m128i zero = _mm_set1_epi16(0);
- const __m128i sixteen = _mm_set1_epi16(16);
- const __m128i max = _mm_set1_epi16(6201);
- const __m128i min = _mm_set1_epi16(-6201);
- int optimised_cols = 0;
-
- // Load input into __m128i & pack to 16 bits
- for (i = 0; i < 8; i++) {
- temp1 = _mm_loadu_si128((const __m128i *)(input + 8 * i));
- temp2 = _mm_loadu_si128((const __m128i *)(input + 8 * i + 4));
- inptr[i] = _mm_packs_epi32(temp1, temp2);
- }
-
- // Find the min & max for the row transform
- // only first 4 row has non-zero coefs
- max_input = _mm_max_epi16(inptr[0], inptr[1]);
- min_input = _mm_min_epi16(inptr[0], inptr[1]);
- for (i = 2; i < 4; i++) {
- max_input = _mm_max_epi16(max_input, inptr[i]);
- min_input = _mm_min_epi16(min_input, inptr[i]);
- }
- max_input = _mm_cmpgt_epi16(max_input, max);
- min_input = _mm_cmplt_epi16(min_input, min);
- temp1 = _mm_or_si128(max_input, min_input);
- test = _mm_movemask_epi8(temp1);
-
- if (!test) {
- // Do the row transform
- idct8_sse2(inptr);
-
- // Find the min & max for the column transform
- // N.B. Only first 4 cols contain non-zero coeffs
- max_input = _mm_max_epi16(inptr[0], inptr[1]);
- min_input = _mm_min_epi16(inptr[0], inptr[1]);
- for (i = 2; i < 8; i++) {
- max_input = _mm_max_epi16(max_input, inptr[i]);
- min_input = _mm_min_epi16(min_input, inptr[i]);
- }
- max_input = _mm_cmpgt_epi16(max_input, max);
- min_input = _mm_cmplt_epi16(min_input, min);
- temp1 = _mm_or_si128(max_input, min_input);
- test = _mm_movemask_epi8(temp1);
-
- if (test) {
- // Use fact only first 4 rows contain non-zero coeffs
- array_transpose_4X8(inptr, inptr);
- for (i = 0; i < 4; i++) {
- sign_bits = _mm_cmplt_epi16(inptr[i], zero);
- temp1 = _mm_unpackhi_epi16(inptr[i], sign_bits);
- temp2 = _mm_unpacklo_epi16(inptr[i], sign_bits);
- _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i + 1)), temp1);
- _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i)), temp2);
- }
- } else {
- // Set to use the optimised transform for the column
- optimised_cols = 1;
- }
- } else {
- // Run the un-optimised row transform
- for (i = 0; i < 4; ++i) {
- vpx_highbd_idct8_c(input, outptr, bd);
- input += 8;
- outptr += 8;
- }
- }
-
- if (optimised_cols) {
- idct8_sse2(inptr);
-
- // Final round & shift and Reconstruction and Store
- {
- __m128i d[8];
- for (i = 0; i < 8; i++) {
- inptr[i] = _mm_add_epi16(inptr[i], sixteen);
- d[i] = _mm_loadu_si128((const __m128i *)(dest + stride*i));
- inptr[i] = _mm_srai_epi16(inptr[i], 5);
- d[i] = clamp_high_sse2(_mm_adds_epi16(d[i], inptr[i]), bd);
- // Store
- _mm_storeu_si128((__m128i *)(dest + stride*i), d[i]);
- }
- }
- } else {
- // Run the un-optimised column transform
- tran_low_t temp_in[8], temp_out[8];
- for (i = 0; i < 8; ++i) {
- for (j = 0; j < 8; ++j)
- temp_in[j] = out[j * 8 + i];
- vpx_highbd_idct8_c(temp_in, temp_out, bd);
- for (j = 0; j < 8; ++j) {
- dest[j * stride + i] = highbd_clip_pixel_add(
- dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd);
- }
- }
- }
-}
-
-void vpx_highbd_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest8,
- int stride, int bd) {
- tran_low_t out[16 * 16];
- tran_low_t *outptr = out;
- int i, j, test;
- __m128i inptr[32];
- __m128i min_input, max_input, temp1, temp2, sign_bits;
- uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
- const __m128i zero = _mm_set1_epi16(0);
- const __m128i rounding = _mm_set1_epi16(32);
- const __m128i max = _mm_set1_epi16(3155);
- const __m128i min = _mm_set1_epi16(-3155);
- int optimised_cols = 0;
-
- // Load input into __m128i & pack to 16 bits
- for (i = 0; i < 16; i++) {
- temp1 = _mm_loadu_si128((const __m128i *)(input + 16 * i));
- temp2 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 4));
- inptr[i] = _mm_packs_epi32(temp1, temp2);
- temp1 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 8));
- temp2 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 12));
- inptr[i + 16] = _mm_packs_epi32(temp1, temp2);
- }
-
- // Find the min & max for the row transform
- max_input = _mm_max_epi16(inptr[0], inptr[1]);
- min_input = _mm_min_epi16(inptr[0], inptr[1]);
- for (i = 2; i < 32; i++) {
- max_input = _mm_max_epi16(max_input, inptr[i]);
- min_input = _mm_min_epi16(min_input, inptr[i]);
- }
- max_input = _mm_cmpgt_epi16(max_input, max);
- min_input = _mm_cmplt_epi16(min_input, min);
- temp1 = _mm_or_si128(max_input, min_input);
- test = _mm_movemask_epi8(temp1);
-
- if (!test) {
- // Do the row transform
- idct16_sse2(inptr, inptr + 16);
-
- // Find the min & max for the column transform
- max_input = _mm_max_epi16(inptr[0], inptr[1]);
- min_input = _mm_min_epi16(inptr[0], inptr[1]);
- for (i = 2; i < 32; i++) {
- max_input = _mm_max_epi16(max_input, inptr[i]);
- min_input = _mm_min_epi16(min_input, inptr[i]);
- }
- max_input = _mm_cmpgt_epi16(max_input, max);
- min_input = _mm_cmplt_epi16(min_input, min);
- temp1 = _mm_or_si128(max_input, min_input);
- test = _mm_movemask_epi8(temp1);
-
- if (test) {
- array_transpose_16x16(inptr, inptr + 16);
- for (i = 0; i < 16; i++) {
- sign_bits = _mm_cmplt_epi16(inptr[i], zero);
- temp1 = _mm_unpacklo_epi16(inptr[i], sign_bits);
- temp2 = _mm_unpackhi_epi16(inptr[i], sign_bits);
- _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4)), temp1);
- _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 1)), temp2);
- sign_bits = _mm_cmplt_epi16(inptr[i + 16], zero);
- temp1 = _mm_unpacklo_epi16(inptr[i + 16], sign_bits);
- temp2 = _mm_unpackhi_epi16(inptr[i + 16], sign_bits);
- _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 2)), temp1);
- _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 3)), temp2);
- }
- } else {
- // Set to use the optimised transform for the column
- optimised_cols = 1;
- }
- } else {
- // Run the un-optimised row transform
- for (i = 0; i < 16; ++i) {
- vpx_highbd_idct16_c(input, outptr, bd);
- input += 16;
- outptr += 16;
- }
- }
-
- if (optimised_cols) {
- idct16_sse2(inptr, inptr + 16);
-
- // Final round & shift and Reconstruction and Store
- {
- __m128i d[2];
- for (i = 0; i < 16; i++) {
- inptr[i ] = _mm_add_epi16(inptr[i ], rounding);
- inptr[i+16] = _mm_add_epi16(inptr[i+16], rounding);
- d[0] = _mm_loadu_si128((const __m128i *)(dest + stride*i));
- d[1] = _mm_loadu_si128((const __m128i *)(dest + stride*i + 8));
- inptr[i ] = _mm_srai_epi16(inptr[i ], 6);
- inptr[i+16] = _mm_srai_epi16(inptr[i+16], 6);
- d[0] = clamp_high_sse2(_mm_add_epi16(d[0], inptr[i ]), bd);
- d[1] = clamp_high_sse2(_mm_add_epi16(d[1], inptr[i+16]), bd);
- // Store
- _mm_storeu_si128((__m128i *)(dest + stride*i), d[0]);
- _mm_storeu_si128((__m128i *)(dest + stride*i + 8), d[1]);
- }
- }
- } else {
- // Run the un-optimised column transform
- tran_low_t temp_in[16], temp_out[16];
- for (i = 0; i < 16; ++i) {
- for (j = 0; j < 16; ++j)
- temp_in[j] = out[j * 16 + i];
- vpx_highbd_idct16_c(temp_in, temp_out, bd);
- for (j = 0; j < 16; ++j) {
- dest[j * stride + i] = highbd_clip_pixel_add(
- dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
- }
- }
- }
-}
-
-void vpx_highbd_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest8,
- int stride, int bd) {
- tran_low_t out[16 * 16] = { 0 };
- tran_low_t *outptr = out;
- int i, j, test;
- __m128i inptr[32];
- __m128i min_input, max_input, temp1, temp2, sign_bits;
- uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
- const __m128i zero = _mm_set1_epi16(0);
- const __m128i rounding = _mm_set1_epi16(32);
- const __m128i max = _mm_set1_epi16(3155);
- const __m128i min = _mm_set1_epi16(-3155);
- int optimised_cols = 0;
-
- // Load input into __m128i & pack to 16 bits
- for (i = 0; i < 16; i++) {
- temp1 = _mm_loadu_si128((const __m128i *)(input + 16 * i));
- temp2 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 4));
- inptr[i] = _mm_packs_epi32(temp1, temp2);
- temp1 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 8));
- temp2 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 12));
- inptr[i + 16] = _mm_packs_epi32(temp1, temp2);
- }
-
- // Find the min & max for the row transform
- // Since all non-zero dct coefficients are in upper-left 4x4 area,
- // we only need to consider first 4 rows here.
- max_input = _mm_max_epi16(inptr[0], inptr[1]);
- min_input = _mm_min_epi16(inptr[0], inptr[1]);
- for (i = 2; i < 4; i++) {
- max_input = _mm_max_epi16(max_input, inptr[i]);
- min_input = _mm_min_epi16(min_input, inptr[i]);
- }
- max_input = _mm_cmpgt_epi16(max_input, max);
- min_input = _mm_cmplt_epi16(min_input, min);
- temp1 = _mm_or_si128(max_input, min_input);
- test = _mm_movemask_epi8(temp1);
-
- if (!test) {
- // Do the row transform (N.B. This transposes inptr)
- idct16_sse2(inptr, inptr + 16);
-
- // Find the min & max for the column transform
- // N.B. Only first 4 cols contain non-zero coeffs
- max_input = _mm_max_epi16(inptr[0], inptr[1]);
- min_input = _mm_min_epi16(inptr[0], inptr[1]);
- for (i = 2; i < 16; i++) {
- max_input = _mm_max_epi16(max_input, inptr[i]);
- min_input = _mm_min_epi16(min_input, inptr[i]);
- }
- max_input = _mm_cmpgt_epi16(max_input, max);
- min_input = _mm_cmplt_epi16(min_input, min);
- temp1 = _mm_or_si128(max_input, min_input);
- test = _mm_movemask_epi8(temp1);
-
- if (test) {
- // Use fact only first 4 rows contain non-zero coeffs
- array_transpose_8x8(inptr, inptr);
- array_transpose_8x8(inptr + 8, inptr + 16);
- for (i = 0; i < 4; i++) {
- sign_bits = _mm_cmplt_epi16(inptr[i], zero);
- temp1 = _mm_unpacklo_epi16(inptr[i], sign_bits);
- temp2 = _mm_unpackhi_epi16(inptr[i], sign_bits);
- _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4)), temp1);
- _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 1)), temp2);
- sign_bits = _mm_cmplt_epi16(inptr[i + 16], zero);
- temp1 = _mm_unpacklo_epi16(inptr[i + 16], sign_bits);
- temp2 = _mm_unpackhi_epi16(inptr[i + 16], sign_bits);
- _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 2)), temp1);
- _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 3)), temp2);
- }
- } else {
- // Set to use the optimised transform for the column
- optimised_cols = 1;
- }
- } else {
- // Run the un-optimised row transform
- for (i = 0; i < 4; ++i) {
- vpx_highbd_idct16_c(input, outptr, bd);
- input += 16;
- outptr += 16;
- }
- }
-
- if (optimised_cols) {
- idct16_sse2(inptr, inptr + 16);
-
- // Final round & shift and Reconstruction and Store
- {
- __m128i d[2];
- for (i = 0; i < 16; i++) {
- inptr[i ] = _mm_add_epi16(inptr[i ], rounding);
- inptr[i+16] = _mm_add_epi16(inptr[i+16], rounding);
- d[0] = _mm_loadu_si128((const __m128i *)(dest + stride*i));
- d[1] = _mm_loadu_si128((const __m128i *)(dest + stride*i + 8));
- inptr[i ] = _mm_srai_epi16(inptr[i ], 6);
- inptr[i+16] = _mm_srai_epi16(inptr[i+16], 6);
- d[0] = clamp_high_sse2(_mm_add_epi16(d[0], inptr[i ]), bd);
- d[1] = clamp_high_sse2(_mm_add_epi16(d[1], inptr[i+16]), bd);
- // Store
- _mm_storeu_si128((__m128i *)(dest + stride*i), d[0]);
- _mm_storeu_si128((__m128i *)(dest + stride*i + 8), d[1]);
- }
- }
- } else {
- // Run the un-optimised column transform
- tran_low_t temp_in[16], temp_out[16];
- for (i = 0; i < 16; ++i) {
- for (j = 0; j < 16; ++j)
- temp_in[j] = out[j * 16 + i];
- vpx_highbd_idct16_c(temp_in, temp_out, bd);
- for (j = 0; j < 16; ++j) {
- dest[j * stride + i] = highbd_clip_pixel_add(
- dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
- }
- }
- }
-}
-#endif // CONFIG_VP9_HIGHBITDEPTH
diff --git a/thirdparty/libvpx/vpx_dsp/x86/inv_txfm_sse2.h b/thirdparty/libvpx/vpx_dsp/x86/inv_txfm_sse2.h
deleted file mode 100644
index bd520c18e5..0000000000
--- a/thirdparty/libvpx/vpx_dsp/x86/inv_txfm_sse2.h
+++ /dev/null
@@ -1,196 +0,0 @@
-/*
- * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#ifndef VPX_DSP_X86_INV_TXFM_SSE2_H_
-#define VPX_DSP_X86_INV_TXFM_SSE2_H_
-
-#include <emmintrin.h> // SSE2
-#include "./vpx_config.h"
-#include "vpx/vpx_integer.h"
-#include "vpx_dsp/inv_txfm.h"
-#include "vpx_dsp/x86/txfm_common_sse2.h"
-
-// perform 8x8 transpose
-static INLINE void array_transpose_8x8(__m128i *in, __m128i *res) {
- const __m128i tr0_0 = _mm_unpacklo_epi16(in[0], in[1]);
- const __m128i tr0_1 = _mm_unpacklo_epi16(in[2], in[3]);
- const __m128i tr0_2 = _mm_unpackhi_epi16(in[0], in[1]);
- const __m128i tr0_3 = _mm_unpackhi_epi16(in[2], in[3]);
- const __m128i tr0_4 = _mm_unpacklo_epi16(in[4], in[5]);
- const __m128i tr0_5 = _mm_unpacklo_epi16(in[6], in[7]);
- const __m128i tr0_6 = _mm_unpackhi_epi16(in[4], in[5]);
- const __m128i tr0_7 = _mm_unpackhi_epi16(in[6], in[7]);
-
- const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
- const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_4, tr0_5);
- const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
- const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_4, tr0_5);
- const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_2, tr0_3);
- const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7);
- const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_2, tr0_3);
- const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);
-
- res[0] = _mm_unpacklo_epi64(tr1_0, tr1_1);
- res[1] = _mm_unpackhi_epi64(tr1_0, tr1_1);
- res[2] = _mm_unpacklo_epi64(tr1_2, tr1_3);
- res[3] = _mm_unpackhi_epi64(tr1_2, tr1_3);
- res[4] = _mm_unpacklo_epi64(tr1_4, tr1_5);
- res[5] = _mm_unpackhi_epi64(tr1_4, tr1_5);
- res[6] = _mm_unpacklo_epi64(tr1_6, tr1_7);
- res[7] = _mm_unpackhi_epi64(tr1_6, tr1_7);
-}
-
-#define TRANSPOSE_8X4(in0, in1, in2, in3, out0, out1) \
- { \
- const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \
- const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \
- \
- in0 = _mm_unpacklo_epi32(tr0_0, tr0_1); /* i1 i0 */ \
- in1 = _mm_unpackhi_epi32(tr0_0, tr0_1); /* i3 i2 */ \
- }
-
-static INLINE void array_transpose_4X8(__m128i *in, __m128i * out) {
- const __m128i tr0_0 = _mm_unpacklo_epi16(in[0], in[1]);
- const __m128i tr0_1 = _mm_unpacklo_epi16(in[2], in[3]);
- const __m128i tr0_4 = _mm_unpacklo_epi16(in[4], in[5]);
- const __m128i tr0_5 = _mm_unpacklo_epi16(in[6], in[7]);
-
- const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
- const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
- const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5);
- const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5);
-
- out[0] = _mm_unpacklo_epi64(tr1_0, tr1_4);
- out[1] = _mm_unpackhi_epi64(tr1_0, tr1_4);
- out[2] = _mm_unpacklo_epi64(tr1_2, tr1_6);
- out[3] = _mm_unpackhi_epi64(tr1_2, tr1_6);
-}
-
-static INLINE void array_transpose_16x16(__m128i *res0, __m128i *res1) {
- __m128i tbuf[8];
- array_transpose_8x8(res0, res0);
- array_transpose_8x8(res1, tbuf);
- array_transpose_8x8(res0 + 8, res1);
- array_transpose_8x8(res1 + 8, res1 + 8);
-
- res0[8] = tbuf[0];
- res0[9] = tbuf[1];
- res0[10] = tbuf[2];
- res0[11] = tbuf[3];
- res0[12] = tbuf[4];
- res0[13] = tbuf[5];
- res0[14] = tbuf[6];
- res0[15] = tbuf[7];
-}
-
-// Function to allow 8 bit optimisations to be used when profile 0 is used with
-// highbitdepth enabled
-static INLINE __m128i load_input_data(const tran_low_t *data) {
-#if CONFIG_VP9_HIGHBITDEPTH
- return octa_set_epi16(data[0], data[1], data[2], data[3], data[4], data[5],
- data[6], data[7]);
-#else
- return _mm_load_si128((const __m128i *)data);
-#endif
-}
-
-static INLINE void load_buffer_8x16(const tran_low_t *input, __m128i *in) {
- in[0] = load_input_data(input + 0 * 16);
- in[1] = load_input_data(input + 1 * 16);
- in[2] = load_input_data(input + 2 * 16);
- in[3] = load_input_data(input + 3 * 16);
- in[4] = load_input_data(input + 4 * 16);
- in[5] = load_input_data(input + 5 * 16);
- in[6] = load_input_data(input + 6 * 16);
- in[7] = load_input_data(input + 7 * 16);
-
- in[8] = load_input_data(input + 8 * 16);
- in[9] = load_input_data(input + 9 * 16);
- in[10] = load_input_data(input + 10 * 16);
- in[11] = load_input_data(input + 11 * 16);
- in[12] = load_input_data(input + 12 * 16);
- in[13] = load_input_data(input + 13 * 16);
- in[14] = load_input_data(input + 14 * 16);
- in[15] = load_input_data(input + 15 * 16);
-}
-
-#define RECON_AND_STORE(dest, in_x) \
- { \
- __m128i d0 = _mm_loadl_epi64((__m128i *)(dest)); \
- d0 = _mm_unpacklo_epi8(d0, zero); \
- d0 = _mm_add_epi16(in_x, d0); \
- d0 = _mm_packus_epi16(d0, d0); \
- _mm_storel_epi64((__m128i *)(dest), d0); \
- }
-
-static INLINE void write_buffer_8x16(uint8_t *dest, __m128i *in, int stride) {
- const __m128i final_rounding = _mm_set1_epi16(1<<5);
- const __m128i zero = _mm_setzero_si128();
- // Final rounding and shift
- in[0] = _mm_adds_epi16(in[0], final_rounding);
- in[1] = _mm_adds_epi16(in[1], final_rounding);
- in[2] = _mm_adds_epi16(in[2], final_rounding);
- in[3] = _mm_adds_epi16(in[3], final_rounding);
- in[4] = _mm_adds_epi16(in[4], final_rounding);
- in[5] = _mm_adds_epi16(in[5], final_rounding);
- in[6] = _mm_adds_epi16(in[6], final_rounding);
- in[7] = _mm_adds_epi16(in[7], final_rounding);
- in[8] = _mm_adds_epi16(in[8], final_rounding);
- in[9] = _mm_adds_epi16(in[9], final_rounding);
- in[10] = _mm_adds_epi16(in[10], final_rounding);
- in[11] = _mm_adds_epi16(in[11], final_rounding);
- in[12] = _mm_adds_epi16(in[12], final_rounding);
- in[13] = _mm_adds_epi16(in[13], final_rounding);
- in[14] = _mm_adds_epi16(in[14], final_rounding);
- in[15] = _mm_adds_epi16(in[15], final_rounding);
-
- in[0] = _mm_srai_epi16(in[0], 6);
- in[1] = _mm_srai_epi16(in[1], 6);
- in[2] = _mm_srai_epi16(in[2], 6);
- in[3] = _mm_srai_epi16(in[3], 6);
- in[4] = _mm_srai_epi16(in[4], 6);
- in[5] = _mm_srai_epi16(in[5], 6);
- in[6] = _mm_srai_epi16(in[6], 6);
- in[7] = _mm_srai_epi16(in[7], 6);
- in[8] = _mm_srai_epi16(in[8], 6);
- in[9] = _mm_srai_epi16(in[9], 6);
- in[10] = _mm_srai_epi16(in[10], 6);
- in[11] = _mm_srai_epi16(in[11], 6);
- in[12] = _mm_srai_epi16(in[12], 6);
- in[13] = _mm_srai_epi16(in[13], 6);
- in[14] = _mm_srai_epi16(in[14], 6);
- in[15] = _mm_srai_epi16(in[15], 6);
-
- RECON_AND_STORE(dest + 0 * stride, in[0]);
- RECON_AND_STORE(dest + 1 * stride, in[1]);
- RECON_AND_STORE(dest + 2 * stride, in[2]);
- RECON_AND_STORE(dest + 3 * stride, in[3]);
- RECON_AND_STORE(dest + 4 * stride, in[4]);
- RECON_AND_STORE(dest + 5 * stride, in[5]);
- RECON_AND_STORE(dest + 6 * stride, in[6]);
- RECON_AND_STORE(dest + 7 * stride, in[7]);
- RECON_AND_STORE(dest + 8 * stride, in[8]);
- RECON_AND_STORE(dest + 9 * stride, in[9]);
- RECON_AND_STORE(dest + 10 * stride, in[10]);
- RECON_AND_STORE(dest + 11 * stride, in[11]);
- RECON_AND_STORE(dest + 12 * stride, in[12]);
- RECON_AND_STORE(dest + 13 * stride, in[13]);
- RECON_AND_STORE(dest + 14 * stride, in[14]);
- RECON_AND_STORE(dest + 15 * stride, in[15]);
-}
-
-void idct4_sse2(__m128i *in);
-void idct8_sse2(__m128i *in);
-void idct16_sse2(__m128i *in0, __m128i *in1);
-void iadst4_sse2(__m128i *in);
-void iadst8_sse2(__m128i *in);
-void iadst16_sse2(__m128i *in0, __m128i *in1);
-
-#endif // VPX_DSP_X86_INV_TXFM_SSE2_H_
diff --git a/thirdparty/libvpx/vpx_dsp/x86/inv_txfm_ssse3_x86_64.asm b/thirdparty/libvpx/vpx_dsp/x86/inv_txfm_ssse3_x86_64.asm
deleted file mode 100644
index 20baf820f6..0000000000
--- a/thirdparty/libvpx/vpx_dsp/x86/inv_txfm_ssse3_x86_64.asm
+++ /dev/null
@@ -1,1793 +0,0 @@
-;
-; Copyright (c) 2014 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-%include "third_party/x86inc/x86inc.asm"
-
-; This file provides SSSE3 version of the inverse transformation. Part
-; of the functions are originally derived from the ffmpeg project.
-; Note that the current version applies to x86 64-bit only.
-
-SECTION_RODATA
-
-pw_11585x2: times 8 dw 23170
-
-pw_m2404x2: times 8 dw -2404*2
-pw_m4756x2: times 8 dw -4756*2
-pw_m5520x2: times 8 dw -5520*2
-pw_m8423x2: times 8 dw -8423*2
-pw_m9102x2: times 8 dw -9102*2
-pw_m10394x2: times 8 dw -10394*2
-pw_m11003x2: times 8 dw -11003*2
-
-pw_16364x2: times 8 dw 16364*2
-pw_16305x2: times 8 dw 16305*2
-pw_16207x2: times 8 dw 16207*2
-pw_16069x2: times 8 dw 16069*2
-pw_15893x2: times 8 dw 15893*2
-pw_15679x2: times 8 dw 15679*2
-pw_15426x2: times 8 dw 15426*2
-pw_15137x2: times 8 dw 15137*2
-pw_14811x2: times 8 dw 14811*2
-pw_14449x2: times 8 dw 14449*2
-pw_14053x2: times 8 dw 14053*2
-pw_13623x2: times 8 dw 13623*2
-pw_13160x2: times 8 dw 13160*2
-pw_12665x2: times 8 dw 12665*2
-pw_12140x2: times 8 dw 12140*2
-pw__9760x2: times 8 dw 9760*2
-pw__7723x2: times 8 dw 7723*2
-pw__7005x2: times 8 dw 7005*2
-pw__6270x2: times 8 dw 6270*2
-pw__3981x2: times 8 dw 3981*2
-pw__3196x2: times 8 dw 3196*2
-pw__1606x2: times 8 dw 1606*2
-pw___804x2: times 8 dw 804*2
-
-pd_8192: times 4 dd 8192
-pw_32: times 8 dw 32
-pw_16: times 8 dw 16
-
-%macro TRANSFORM_COEFFS 2
-pw_%1_%2: dw %1, %2, %1, %2, %1, %2, %1, %2
-pw_m%2_%1: dw -%2, %1, -%2, %1, -%2, %1, -%2, %1
-pw_m%1_m%2: dw -%1, -%2, -%1, -%2, -%1, -%2, -%1, -%2
-%endmacro
-
-TRANSFORM_COEFFS 6270, 15137
-TRANSFORM_COEFFS 3196, 16069
-TRANSFORM_COEFFS 13623, 9102
-
-; constants for 32x32_34
-TRANSFORM_COEFFS 804, 16364
-TRANSFORM_COEFFS 15426, 5520
-TRANSFORM_COEFFS 3981, 15893
-TRANSFORM_COEFFS 16207, 2404
-TRANSFORM_COEFFS 1606, 16305
-TRANSFORM_COEFFS 15679, 4756
-TRANSFORM_COEFFS 11585, 11585
-
-; constants for 32x32_1024
-TRANSFORM_COEFFS 12140, 11003
-TRANSFORM_COEFFS 7005, 14811
-TRANSFORM_COEFFS 14053, 8423
-TRANSFORM_COEFFS 9760, 13160
-TRANSFORM_COEFFS 12665, 10394
-TRANSFORM_COEFFS 7723, 14449
-
-%macro PAIR_PP_COEFFS 2
-dpw_%1_%2: dw %1, %1, %1, %1, %2, %2, %2, %2
-%endmacro
-
-%macro PAIR_MP_COEFFS 2
-dpw_m%1_%2: dw -%1, -%1, -%1, -%1, %2, %2, %2, %2
-%endmacro
-
-%macro PAIR_MM_COEFFS 2
-dpw_m%1_m%2: dw -%1, -%1, -%1, -%1, -%2, -%2, -%2, -%2
-%endmacro
-
-PAIR_PP_COEFFS 30274, 12540
-PAIR_PP_COEFFS 6392, 32138
-PAIR_MP_COEFFS 18204, 27246
-
-PAIR_PP_COEFFS 12540, 12540
-PAIR_PP_COEFFS 30274, 30274
-PAIR_PP_COEFFS 6392, 6392
-PAIR_PP_COEFFS 32138, 32138
-PAIR_MM_COEFFS 18204, 18204
-PAIR_PP_COEFFS 27246, 27246
-
-SECTION .text
-
-%if ARCH_X86_64
-%macro SUM_SUB 3
- psubw m%3, m%1, m%2
- paddw m%1, m%2
- SWAP %2, %3
-%endmacro
-
-; butterfly operation
-%macro MUL_ADD_2X 6 ; dst1, dst2, src, round, coefs1, coefs2
- pmaddwd m%1, m%3, %5
- pmaddwd m%2, m%3, %6
- paddd m%1, %4
- paddd m%2, %4
- psrad m%1, 14
- psrad m%2, 14
-%endmacro
-
-%macro BUTTERFLY_4X 7 ; dst1, dst2, coef1, coef2, round, tmp1, tmp2
- punpckhwd m%6, m%2, m%1
- MUL_ADD_2X %7, %6, %6, %5, [pw_m%4_%3], [pw_%3_%4]
- punpcklwd m%2, m%1
- MUL_ADD_2X %1, %2, %2, %5, [pw_m%4_%3], [pw_%3_%4]
- packssdw m%1, m%7
- packssdw m%2, m%6
-%endmacro
-
-%macro BUTTERFLY_4Xmm 7 ; dst1, dst2, coef1, coef2, round, tmp1, tmp2
- punpckhwd m%6, m%2, m%1
- MUL_ADD_2X %7, %6, %6, %5, [pw_m%4_%3], [pw_m%3_m%4]
- punpcklwd m%2, m%1
- MUL_ADD_2X %1, %2, %2, %5, [pw_m%4_%3], [pw_m%3_m%4]
- packssdw m%1, m%7
- packssdw m%2, m%6
-%endmacro
-
-; matrix transpose
-%macro INTERLEAVE_2X 4
- punpckh%1 m%4, m%2, m%3
- punpckl%1 m%2, m%3
- SWAP %3, %4
-%endmacro
-
-%macro TRANSPOSE8X8 9
- INTERLEAVE_2X wd, %1, %2, %9
- INTERLEAVE_2X wd, %3, %4, %9
- INTERLEAVE_2X wd, %5, %6, %9
- INTERLEAVE_2X wd, %7, %8, %9
-
- INTERLEAVE_2X dq, %1, %3, %9
- INTERLEAVE_2X dq, %2, %4, %9
- INTERLEAVE_2X dq, %5, %7, %9
- INTERLEAVE_2X dq, %6, %8, %9
-
- INTERLEAVE_2X qdq, %1, %5, %9
- INTERLEAVE_2X qdq, %3, %7, %9
- INTERLEAVE_2X qdq, %2, %6, %9
- INTERLEAVE_2X qdq, %4, %8, %9
-
- SWAP %2, %5
- SWAP %4, %7
-%endmacro
-
-%macro IDCT8_1D 0
- SUM_SUB 0, 4, 9
- BUTTERFLY_4X 2, 6, 6270, 15137, m8, 9, 10
- pmulhrsw m0, m12
- pmulhrsw m4, m12
- BUTTERFLY_4X 1, 7, 3196, 16069, m8, 9, 10
- BUTTERFLY_4X 5, 3, 13623, 9102, m8, 9, 10
-
- SUM_SUB 1, 5, 9
- SUM_SUB 7, 3, 9
- SUM_SUB 0, 6, 9
- SUM_SUB 4, 2, 9
- SUM_SUB 3, 5, 9
- pmulhrsw m3, m12
- pmulhrsw m5, m12
-
- SUM_SUB 0, 7, 9
- SUM_SUB 4, 3, 9
- SUM_SUB 2, 5, 9
- SUM_SUB 6, 1, 9
-
- SWAP 3, 6
- SWAP 1, 4
-%endmacro
-
-; This macro handles 8 pixels per line
-%macro ADD_STORE_8P_2X 5; src1, src2, tmp1, tmp2, zero
- paddw m%1, m11
- paddw m%2, m11
- psraw m%1, 5
- psraw m%2, 5
-
- movh m%3, [outputq]
- movh m%4, [outputq + strideq]
- punpcklbw m%3, m%5
- punpcklbw m%4, m%5
- paddw m%3, m%1
- paddw m%4, m%2
- packuswb m%3, m%5
- packuswb m%4, m%5
- movh [outputq], m%3
- movh [outputq + strideq], m%4
-%endmacro
-
-INIT_XMM ssse3
-; full inverse 8x8 2D-DCT transform
-cglobal idct8x8_64_add, 3, 5, 13, input, output, stride
- mova m8, [pd_8192]
- mova m11, [pw_16]
- mova m12, [pw_11585x2]
-
- lea r3, [2 * strideq]
-%if CONFIG_VP9_HIGHBITDEPTH
- mova m0, [inputq + 0]
- packssdw m0, [inputq + 16]
- mova m1, [inputq + 32]
- packssdw m1, [inputq + 48]
- mova m2, [inputq + 64]
- packssdw m2, [inputq + 80]
- mova m3, [inputq + 96]
- packssdw m3, [inputq + 112]
- mova m4, [inputq + 128]
- packssdw m4, [inputq + 144]
- mova m5, [inputq + 160]
- packssdw m5, [inputq + 176]
- mova m6, [inputq + 192]
- packssdw m6, [inputq + 208]
- mova m7, [inputq + 224]
- packssdw m7, [inputq + 240]
-%else
- mova m0, [inputq + 0]
- mova m1, [inputq + 16]
- mova m2, [inputq + 32]
- mova m3, [inputq + 48]
- mova m4, [inputq + 64]
- mova m5, [inputq + 80]
- mova m6, [inputq + 96]
- mova m7, [inputq + 112]
-%endif
- TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9
- IDCT8_1D
- TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9
- IDCT8_1D
-
- pxor m12, m12
- ADD_STORE_8P_2X 0, 1, 9, 10, 12
- lea outputq, [outputq + r3]
- ADD_STORE_8P_2X 2, 3, 9, 10, 12
- lea outputq, [outputq + r3]
- ADD_STORE_8P_2X 4, 5, 9, 10, 12
- lea outputq, [outputq + r3]
- ADD_STORE_8P_2X 6, 7, 9, 10, 12
-
- RET
-
-; inverse 8x8 2D-DCT transform with only first 10 coeffs non-zero
-cglobal idct8x8_12_add, 3, 5, 13, input, output, stride
- mova m8, [pd_8192]
- mova m11, [pw_16]
- mova m12, [pw_11585x2]
-
- lea r3, [2 * strideq]
-
-%if CONFIG_VP9_HIGHBITDEPTH
- mova m0, [inputq + 0]
- packssdw m0, [inputq + 16]
- mova m1, [inputq + 32]
- packssdw m1, [inputq + 48]
- mova m2, [inputq + 64]
- packssdw m2, [inputq + 80]
- mova m3, [inputq + 96]
- packssdw m3, [inputq + 112]
-%else
- mova m0, [inputq + 0]
- mova m1, [inputq + 16]
- mova m2, [inputq + 32]
- mova m3, [inputq + 48]
-%endif
-
- punpcklwd m0, m1
- punpcklwd m2, m3
- punpckhdq m9, m0, m2
- punpckldq m0, m2
- SWAP 2, 9
-
- ; m0 -> [0], [0]
- ; m1 -> [1], [1]
- ; m2 -> [2], [2]
- ; m3 -> [3], [3]
- punpckhqdq m10, m0, m0
- punpcklqdq m0, m0
- punpckhqdq m9, m2, m2
- punpcklqdq m2, m2
- SWAP 1, 10
- SWAP 3, 9
-
- pmulhrsw m0, m12
- pmulhrsw m2, [dpw_30274_12540]
- pmulhrsw m1, [dpw_6392_32138]
- pmulhrsw m3, [dpw_m18204_27246]
-
- SUM_SUB 0, 2, 9
- SUM_SUB 1, 3, 9
-
- punpcklqdq m9, m3, m3
- punpckhqdq m5, m3, m9
-
- SUM_SUB 3, 5, 9
- punpckhqdq m5, m3
- pmulhrsw m5, m12
-
- punpckhqdq m9, m1, m5
- punpcklqdq m1, m5
- SWAP 5, 9
-
- SUM_SUB 0, 5, 9
- SUM_SUB 2, 1, 9
-
- punpckhqdq m3, m0, m0
- punpckhqdq m4, m1, m1
- punpckhqdq m6, m5, m5
- punpckhqdq m7, m2, m2
-
- punpcklwd m0, m3
- punpcklwd m7, m2
- punpcklwd m1, m4
- punpcklwd m6, m5
-
- punpckhdq m4, m0, m7
- punpckldq m0, m7
- punpckhdq m10, m1, m6
- punpckldq m5, m1, m6
-
- punpckhqdq m1, m0, m5
- punpcklqdq m0, m5
- punpckhqdq m3, m4, m10
- punpcklqdq m2, m4, m10
-
-
- pmulhrsw m0, m12
- pmulhrsw m6, m2, [dpw_30274_30274]
- pmulhrsw m4, m2, [dpw_12540_12540]
-
- pmulhrsw m7, m1, [dpw_32138_32138]
- pmulhrsw m1, [dpw_6392_6392]
- pmulhrsw m5, m3, [dpw_m18204_m18204]
- pmulhrsw m3, [dpw_27246_27246]
-
- mova m2, m0
- SUM_SUB 0, 6, 9
- SUM_SUB 2, 4, 9
- SUM_SUB 1, 5, 9
- SUM_SUB 7, 3, 9
-
- SUM_SUB 3, 5, 9
- pmulhrsw m3, m12
- pmulhrsw m5, m12
-
- SUM_SUB 0, 7, 9
- SUM_SUB 2, 3, 9
- SUM_SUB 4, 5, 9
- SUM_SUB 6, 1, 9
-
- SWAP 3, 6
- SWAP 1, 2
- SWAP 2, 4
-
-
- pxor m12, m12
- ADD_STORE_8P_2X 0, 1, 9, 10, 12
- lea outputq, [outputq + r3]
- ADD_STORE_8P_2X 2, 3, 9, 10, 12
- lea outputq, [outputq + r3]
- ADD_STORE_8P_2X 4, 5, 9, 10, 12
- lea outputq, [outputq + r3]
- ADD_STORE_8P_2X 6, 7, 9, 10, 12
-
- RET
-
-%define idx0 16 * 0
-%define idx1 16 * 1
-%define idx2 16 * 2
-%define idx3 16 * 3
-%define idx4 16 * 4
-%define idx5 16 * 5
-%define idx6 16 * 6
-%define idx7 16 * 7
-%define idx8 16 * 0
-%define idx9 16 * 1
-%define idx10 16 * 2
-%define idx11 16 * 3
-%define idx12 16 * 4
-%define idx13 16 * 5
-%define idx14 16 * 6
-%define idx15 16 * 7
-%define idx16 16 * 0
-%define idx17 16 * 1
-%define idx18 16 * 2
-%define idx19 16 * 3
-%define idx20 16 * 4
-%define idx21 16 * 5
-%define idx22 16 * 6
-%define idx23 16 * 7
-%define idx24 16 * 0
-%define idx25 16 * 1
-%define idx26 16 * 2
-%define idx27 16 * 3
-%define idx28 16 * 4
-%define idx29 16 * 5
-%define idx30 16 * 6
-%define idx31 16 * 7
-
-; FROM idct32x32_add_neon.asm
-;
-; Instead of doing the transforms stage by stage, it is done by loading
-; some input values and doing as many stages as possible to minimize the
-; storing/loading of intermediate results. To fit within registers, the
-; final coefficients are cut into four blocks:
-; BLOCK A: 16-19,28-31
-; BLOCK B: 20-23,24-27
-; BLOCK C: 8-11,12-15
-; BLOCK D: 0-3,4-7
-; Blocks A and C are straight calculation through the various stages. In
-; block B, further calculations are performed using the results from
-; block A. In block D, further calculations are performed using the results
-; from block C and then the final calculations are done using results from
-; block A and B which have been combined at the end of block B.
-;
-
-%macro IDCT32X32_34 4
- ; BLOCK A STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- mova m11, m1
- pmulhrsw m1, [pw___804x2] ; stp1_16
- mova [r4 + 0], m0
- pmulhrsw m11, [pw_16364x2] ; stp2_31
- mova [r4 + 16 * 2], m2
- mova m12, m7
- pmulhrsw m7, [pw_15426x2] ; stp1_28
- mova [r4 + 16 * 4], m4
- pmulhrsw m12, [pw_m5520x2] ; stp2_19
- mova [r4 + 16 * 6], m6
-
- ; BLOCK A STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- mova m2, m1 ; stp1_16
- mova m0, m11 ; stp1_31
- mova m4, m7 ; stp1_28
- mova m15, m12 ; stp1_19
-
- ; BLOCK A STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- BUTTERFLY_4X 0, 2, 3196, 16069, m8, 9, 10 ; stp1_17, stp1_30
- BUTTERFLY_4Xmm 4, 15, 3196, 16069, m8, 9, 10 ; stp1_29, stp1_18
-
- ; BLOCK A STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- SUM_SUB 1, 12, 9 ; stp2_16, stp2_19
- SUM_SUB 0, 15, 9 ; stp2_17, stp2_18
- SUM_SUB 11, 7, 9 ; stp2_31, stp2_28
- SUM_SUB 2, 4, 9 ; stp2_30, stp2_29
-
- ; BLOCK A STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- BUTTERFLY_4X 4, 15, 6270, 15137, m8, 9, 10 ; stp1_18, stp1_29
- BUTTERFLY_4X 7, 12, 6270, 15137, m8, 9, 10 ; stp1_19, stp1_28
-
- ; BLOCK B STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- mova m6, m5
- pmulhrsw m5, [pw__3981x2] ; stp1_20
- mova [stp + %4 + idx28], m12
- mova [stp + %4 + idx29], m15
- pmulhrsw m6, [pw_15893x2] ; stp2_27
- mova [stp + %4 + idx30], m2
- mova m2, m3
- pmulhrsw m3, [pw_m2404x2] ; stp1_23
- mova [stp + %4 + idx31], m11
- pmulhrsw m2, [pw_16207x2] ; stp2_24
-
- ; BLOCK B STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- mova m13, m5 ; stp1_20
- mova m14, m6 ; stp1_27
- mova m15, m3 ; stp1_23
- mova m11, m2 ; stp1_24
-
- ; BLOCK B STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- BUTTERFLY_4X 14, 13, 13623, 9102, m8, 9, 10 ; stp1_21, stp1_26
- BUTTERFLY_4Xmm 11, 15, 13623, 9102, m8, 9, 10 ; stp1_25, stp1_22
-
- ; BLOCK B STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- SUM_SUB 3, 5, 9 ; stp2_23, stp2_20
- SUM_SUB 15, 14, 9 ; stp2_22, stp2_21
- SUM_SUB 2, 6, 9 ; stp2_24, stp2_27
- SUM_SUB 11, 13, 9 ; stp2_25, stp2_26
-
- ; BLOCK B STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- BUTTERFLY_4Xmm 6, 5, 6270, 15137, m8, 9, 10 ; stp1_27, stp1_20
- BUTTERFLY_4Xmm 13, 14, 6270, 15137, m8, 9, 10 ; stp1_26, stp1_21
-
- ; BLOCK B STAGE 6 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- SUM_SUB 1, 3, 9 ; stp2_16, stp2_23
- SUM_SUB 0, 15, 9 ; stp2_17, stp2_22
- SUM_SUB 4, 14, 9 ; stp2_18, stp2_21
- SUM_SUB 7, 5, 9 ; stp2_19, stp2_20
- mova [stp + %3 + idx16], m1
- mova [stp + %3 + idx17], m0
- mova [stp + %3 + idx18], m4
- mova [stp + %3 + idx19], m7
-
- mova m4, [stp + %4 + idx28]
- mova m7, [stp + %4 + idx29]
- mova m10, [stp + %4 + idx30]
- mova m12, [stp + %4 + idx31]
- SUM_SUB 4, 6, 9 ; stp2_28, stp2_27
- SUM_SUB 7, 13, 9 ; stp2_29, stp2_26
- SUM_SUB 10, 11, 9 ; stp2_30, stp2_25
- SUM_SUB 12, 2, 9 ; stp2_31, stp2_24
- mova [stp + %4 + idx28], m4
- mova [stp + %4 + idx29], m7
- mova [stp + %4 + idx30], m10
- mova [stp + %4 + idx31], m12
-
- ; BLOCK B STAGE 7 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-%if 0 ; overflow occurs in SUM_SUB when using test streams
- mova m10, [pw_11585x2]
- SUM_SUB 6, 5, 9
- pmulhrsw m6, m10 ; stp1_27
- pmulhrsw m5, m10 ; stp1_20
- SUM_SUB 13, 14, 9
- pmulhrsw m13, m10 ; stp1_26
- pmulhrsw m14, m10 ; stp1_21
- SUM_SUB 11, 15, 9
- pmulhrsw m11, m10 ; stp1_25
- pmulhrsw m15, m10 ; stp1_22
- SUM_SUB 2, 3, 9
- pmulhrsw m2, m10 ; stp1_24
- pmulhrsw m3, m10 ; stp1_23
-%else
- BUTTERFLY_4X 6, 5, 11585, 11585, m8, 9, 10 ; stp1_20, stp1_27
- SWAP 6, 5
- BUTTERFLY_4X 13, 14, 11585, 11585, m8, 9, 10 ; stp1_21, stp1_26
- SWAP 13, 14
- BUTTERFLY_4X 11, 15, 11585, 11585, m8, 9, 10 ; stp1_22, stp1_25
- SWAP 11, 15
- BUTTERFLY_4X 2, 3, 11585, 11585, m8, 9, 10 ; stp1_23, stp1_24
- SWAP 2, 3
-%endif
-
- mova [stp + %4 + idx24], m2
- mova [stp + %4 + idx25], m11
- mova [stp + %4 + idx26], m13
- mova [stp + %4 + idx27], m6
-
- ; BLOCK C STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- ;
- ; BLOCK C STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- mova m0, [rsp + transposed_in + 16 * 2]
- mova m6, [rsp + transposed_in + 16 * 6]
-
- mova m1, m0
- pmulhrsw m0, [pw__1606x2] ; stp1_8
- mova [stp + %3 + idx20], m5
- mova [stp + %3 + idx21], m14
- pmulhrsw m1, [pw_16305x2] ; stp2_15
- mova [stp + %3 + idx22], m15
- mova m7, m6
- pmulhrsw m7, [pw_m4756x2] ; stp2_11
- mova [stp + %3 + idx23], m3
- pmulhrsw m6, [pw_15679x2] ; stp1_12
-
- ; BLOCK C STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- mova m3, m0 ; stp1_8
- mova m2, m1 ; stp1_15
-
- ; BLOCK C STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- BUTTERFLY_4X 2, 3, 6270, 15137, m8, 9, 10 ; stp1_9, stp1_14
- mova m4, m7 ; stp1_11
- mova m5, m6 ; stp1_12
- BUTTERFLY_4Xmm 5, 4, 6270, 15137, m8, 9, 10 ; stp1_13, stp1_10
-
- ; BLOCK C STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- SUM_SUB 0, 7, 9 ; stp1_8, stp1_11
- SUM_SUB 2, 4, 9 ; stp1_9, stp1_10
- SUM_SUB 1, 6, 9 ; stp1_15, stp1_12
- SUM_SUB 3, 5, 9 ; stp1_14, stp1_13
-
- ; BLOCK C STAGE 6 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-%if 0 ; overflow occurs in SUM_SUB when using test streams
- mova m10, [pw_11585x2]
- SUM_SUB 5, 4, 9
- pmulhrsw m5, m10 ; stp1_13
- pmulhrsw m4, m10 ; stp1_10
- SUM_SUB 6, 7, 9
- pmulhrsw m6, m10 ; stp1_12
- pmulhrsw m7, m10 ; stp1_11
-%else
- BUTTERFLY_4X 5, 4, 11585, 11585, m8, 9, 10 ; stp1_10, stp1_13
- SWAP 5, 4
- BUTTERFLY_4X 6, 7, 11585, 11585, m8, 9, 10 ; stp1_11, stp1_12
- SWAP 6, 7
-%endif
-
- ; BLOCK C STAGE 7 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- mova [stp + %2 + idx8], m0
- mova [stp + %2 + idx9], m2
- mova [stp + %2 + idx10], m4
- mova [stp + %2 + idx11], m7
-
- ; BLOCK D STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- ;
- ; BLOCK D STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- ;
- ; BLOCK D STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- mova m11, [rsp + transposed_in + 16 * 4]
- mova m12, m11
- pmulhrsw m11, [pw__3196x2] ; stp1_4
- pmulhrsw m12, [pw_16069x2] ; stp1_7
-
- ; BLOCK D STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- mova m0, [rsp + transposed_in + 16 * 0]
- mova m10, [pw_11585x2]
- pmulhrsw m0, m10 ; stp1_1
-
- mova m14, m11 ; stp1_4
- mova m13, m12 ; stp1_7
-
- ; BLOCK D STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-%if 0 ; overflow occurs in SUM_SUB when using test streams
- SUM_SUB 13, 14, 9
- pmulhrsw m13, m10 ; stp1_6
- pmulhrsw m14, m10 ; stp1_5
-%else
- BUTTERFLY_4X 13, 14, 11585, 11585, m8, 9, 10 ; stp1_5, stp1_6
- SWAP 13, 14
-%endif
- mova m7, m0 ; stp1_0 = stp1_1
- mova m4, m0 ; stp1_1
- mova m2, m7 ; stp1_0
-
- ; BLOCK D STAGE 6 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- SUM_SUB 0, 12, 9 ; stp1_0, stp1_7
- SUM_SUB 7, 13, 9 ; stp1_1, stp1_6
- SUM_SUB 2, 14, 9 ; stp1_2, stp1_5
- SUM_SUB 4, 11, 9 ; stp1_3, stp1_4
-
- ; BLOCK D STAGE 7 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- SUM_SUB 0, 1, 9 ; stp1_0, stp1_15
- SUM_SUB 7, 3, 9 ; stp1_1, stp1_14
- SUM_SUB 2, 5, 9 ; stp1_2, stp1_13
- SUM_SUB 4, 6, 9 ; stp1_3, stp1_12
-
- ; 0-3, 28-31 final stage
- mova m15, [stp + %4 + idx30]
- mova m10, [stp + %4 + idx31]
- SUM_SUB 0, 10, 9 ; stp1_0, stp1_31
- SUM_SUB 7, 15, 9 ; stp1_1, stp1_30
- mova [stp + %1 + idx0], m0
- mova [stp + %1 + idx1], m7
- mova [stp + %4 + idx30], m15
- mova [stp + %4 + idx31], m10
- mova m7, [stp + %4 + idx28]
- mova m0, [stp + %4 + idx29]
- SUM_SUB 2, 0, 9 ; stp1_2, stp1_29
- SUM_SUB 4, 7, 9 ; stp1_3, stp1_28
- mova [stp + %1 + idx2], m2
- mova [stp + %1 + idx3], m4
- mova [stp + %4 + idx28], m7
- mova [stp + %4 + idx29], m0
-
- ; 12-15, 16-19 final stage
- mova m0, [stp + %3 + idx16]
- mova m7, [stp + %3 + idx17]
- mova m2, [stp + %3 + idx18]
- mova m4, [stp + %3 + idx19]
- SUM_SUB 1, 0, 9 ; stp1_15, stp1_16
- SUM_SUB 3, 7, 9 ; stp1_14, stp1_17
- SUM_SUB 5, 2, 9 ; stp1_13, stp1_18
- SUM_SUB 6, 4, 9 ; stp1_12, stp1_19
- mova [stp + %2 + idx12], m6
- mova [stp + %2 + idx13], m5
- mova [stp + %2 + idx14], m3
- mova [stp + %2 + idx15], m1
- mova [stp + %3 + idx16], m0
- mova [stp + %3 + idx17], m7
- mova [stp + %3 + idx18], m2
- mova [stp + %3 + idx19], m4
-
- mova m4, [stp + %2 + idx8]
- mova m5, [stp + %2 + idx9]
- mova m6, [stp + %2 + idx10]
- mova m7, [stp + %2 + idx11]
- SUM_SUB 11, 7, 9 ; stp1_4, stp1_11
- SUM_SUB 14, 6, 9 ; stp1_5, stp1_10
- SUM_SUB 13, 5, 9 ; stp1_6, stp1_9
- SUM_SUB 12, 4, 9 ; stp1_7, stp1_8
-
- ; 4-7, 24-27 final stage
- mova m0, [stp + %4 + idx27]
- mova m1, [stp + %4 + idx26]
- mova m2, [stp + %4 + idx25]
- mova m3, [stp + %4 + idx24]
- SUM_SUB 11, 0, 9 ; stp1_4, stp1_27
- SUM_SUB 14, 1, 9 ; stp1_5, stp1_26
- SUM_SUB 13, 2, 9 ; stp1_6, stp1_25
- SUM_SUB 12, 3, 9 ; stp1_7, stp1_24
- mova [stp + %4 + idx27], m0
- mova [stp + %4 + idx26], m1
- mova [stp + %4 + idx25], m2
- mova [stp + %4 + idx24], m3
- mova [stp + %1 + idx4], m11
- mova [stp + %1 + idx5], m14
- mova [stp + %1 + idx6], m13
- mova [stp + %1 + idx7], m12
-
- ; 8-11, 20-23 final stage
- mova m0, [stp + %3 + idx20]
- mova m1, [stp + %3 + idx21]
- mova m2, [stp + %3 + idx22]
- mova m3, [stp + %3 + idx23]
- SUM_SUB 7, 0, 9 ; stp1_11, stp_20
- SUM_SUB 6, 1, 9 ; stp1_10, stp_21
- SUM_SUB 5, 2, 9 ; stp1_9, stp_22
- SUM_SUB 4, 3, 9 ; stp1_8, stp_23
- mova [stp + %2 + idx8], m4
- mova [stp + %2 + idx9], m5
- mova [stp + %2 + idx10], m6
- mova [stp + %2 + idx11], m7
- mova [stp + %3 + idx20], m0
- mova [stp + %3 + idx21], m1
- mova [stp + %3 + idx22], m2
- mova [stp + %3 + idx23], m3
-%endmacro
-
-%macro RECON_AND_STORE 1
- mova m11, [pw_32]
- lea stp, [rsp + %1]
- mov r6, 32
- pxor m8, m8
-%%recon_and_store:
- mova m0, [stp + 16 * 32 * 0]
- mova m1, [stp + 16 * 32 * 1]
- mova m2, [stp + 16 * 32 * 2]
- mova m3, [stp + 16 * 32 * 3]
- add stp, 16
-
- paddw m0, m11
- paddw m1, m11
- paddw m2, m11
- paddw m3, m11
- psraw m0, 6
- psraw m1, 6
- psraw m2, 6
- psraw m3, 6
- movh m4, [outputq + 0]
- movh m5, [outputq + 8]
- movh m6, [outputq + 16]
- movh m7, [outputq + 24]
- punpcklbw m4, m8
- punpcklbw m5, m8
- punpcklbw m6, m8
- punpcklbw m7, m8
- paddw m0, m4
- paddw m1, m5
- paddw m2, m6
- paddw m3, m7
- packuswb m0, m1
- packuswb m2, m3
- mova [outputq + 0], m0
- mova [outputq + 16], m2
- lea outputq, [outputq + strideq]
- dec r6
- jnz %%recon_and_store
-%endmacro
-
-%define i32x32_size 16*32*5
-%define pass_two_start 16*32*0
-%define transposed_in 16*32*4
-%define pass_one_start 16*32*0
-%define stp r8
-
-INIT_XMM ssse3
-cglobal idct32x32_34_add, 3, 11, 16, i32x32_size, input, output, stride
- mova m8, [pd_8192]
- lea stp, [rsp + pass_one_start]
-
-idct32x32_34:
- mov r3, inputq
- lea r4, [rsp + transposed_in]
-
-idct32x32_34_transpose:
-%if CONFIG_VP9_HIGHBITDEPTH
- mova m0, [r3 + 0]
- packssdw m0, [r3 + 16]
- mova m1, [r3 + 32 * 4]
- packssdw m1, [r3 + 32 * 4 + 16]
- mova m2, [r3 + 32 * 8]
- packssdw m2, [r3 + 32 * 8 + 16]
- mova m3, [r3 + 32 * 12]
- packssdw m3, [r3 + 32 * 12 + 16]
- mova m4, [r3 + 32 * 16]
- packssdw m4, [r3 + 32 * 16 + 16]
- mova m5, [r3 + 32 * 20]
- packssdw m5, [r3 + 32 * 20 + 16]
- mova m6, [r3 + 32 * 24]
- packssdw m6, [r3 + 32 * 24 + 16]
- mova m7, [r3 + 32 * 28]
- packssdw m7, [r3 + 32 * 28 + 16]
-%else
- mova m0, [r3 + 0]
- mova m1, [r3 + 16 * 4]
- mova m2, [r3 + 16 * 8]
- mova m3, [r3 + 16 * 12]
- mova m4, [r3 + 16 * 16]
- mova m5, [r3 + 16 * 20]
- mova m6, [r3 + 16 * 24]
- mova m7, [r3 + 16 * 28]
-%endif
-
- TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9
-
- IDCT32X32_34 16*0, 16*32, 16*64, 16*96
- lea stp, [stp + 16 * 8]
- mov r6, 4
- lea stp, [rsp + pass_one_start]
- lea r9, [rsp + pass_one_start]
-
-idct32x32_34_2:
- lea r4, [rsp + transposed_in]
- mov r3, r9
-
-idct32x32_34_transpose_2:
- mova m0, [r3 + 0]
- mova m1, [r3 + 16 * 1]
- mova m2, [r3 + 16 * 2]
- mova m3, [r3 + 16 * 3]
- mova m4, [r3 + 16 * 4]
- mova m5, [r3 + 16 * 5]
- mova m6, [r3 + 16 * 6]
- mova m7, [r3 + 16 * 7]
-
- TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9
-
- IDCT32X32_34 16*0, 16*8, 16*16, 16*24
-
- lea stp, [stp + 16 * 32]
- add r9, 16 * 32
- dec r6
- jnz idct32x32_34_2
-
- RECON_AND_STORE pass_two_start
-
- RET
-
-%macro IDCT32X32_135 4
- ; BLOCK A STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- mova m1, [rsp + transposed_in + 16 * 1]
- mova m11, m1
- pmulhrsw m1, [pw___804x2] ; stp1_16
- pmulhrsw m11, [pw_16364x2] ; stp2_31
-
- mova m7, [rsp + transposed_in + 16 * 7]
- mova m12, m7
- pmulhrsw m7, [pw_15426x2] ; stp1_28
- pmulhrsw m12, [pw_m5520x2] ; stp2_19
-
- mova m3, [rsp + transposed_in + 16 * 9]
- mova m4, m3
- pmulhrsw m3, [pw__7005x2] ; stp1_18
- pmulhrsw m4, [pw_14811x2] ; stp2_29
-
- mova m0, [rsp + transposed_in + 16 * 15]
- mova m2, m0
- pmulhrsw m0, [pw_12140x2] ; stp1_30
- pmulhrsw m2, [pw_m11003x2] ; stp2_17
-
- ; BLOCK A STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- SUM_SUB 1, 2, 9 ; stp2_16, stp2_17
- SUM_SUB 12, 3, 9 ; stp2_19, stp2_18
- SUM_SUB 7, 4, 9 ; stp2_28, stp2_29
- SUM_SUB 11, 0, 9 ; stp2_31, stp2_30
-
- ; BLOCK A STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- BUTTERFLY_4X 0, 2, 3196, 16069, m8, 9, 10 ; stp1_17, stp1_30
- BUTTERFLY_4Xmm 4, 3, 3196, 16069, m8, 9, 10 ; stp1_29, stp1_18
-
- ; BLOCK A STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- SUM_SUB 1, 12, 9 ; stp2_16, stp2_19
- SUM_SUB 0, 3, 9 ; stp2_17, stp2_18
- SUM_SUB 11, 7, 9 ; stp2_31, stp2_28
- SUM_SUB 2, 4, 9 ; stp2_30, stp2_29
-
- ; BLOCK A STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- BUTTERFLY_4X 4, 3, 6270, 15137, m8, 9, 10 ; stp1_18, stp1_29
- BUTTERFLY_4X 7, 12, 6270, 15137, m8, 9, 10 ; stp1_19, stp1_28
-
- mova [stp + %3 + idx16], m1
- mova [stp + %3 + idx17], m0
- mova [stp + %3 + idx18], m4
- mova [stp + %3 + idx19], m7
- mova [stp + %4 + idx28], m12
- mova [stp + %4 + idx29], m3
- mova [stp + %4 + idx30], m2
- mova [stp + %4 + idx31], m11
-
- ; BLOCK B STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- mova m2, [rsp + transposed_in + 16 * 3]
- mova m3, m2
- pmulhrsw m3, [pw_m2404x2] ; stp1_23
- pmulhrsw m2, [pw_16207x2] ; stp2_24
-
- mova m5, [rsp + transposed_in + 16 * 5]
- mova m6, m5
- pmulhrsw m5, [pw__3981x2] ; stp1_20
- pmulhrsw m6, [pw_15893x2] ; stp2_27
-
- mova m14, [rsp + transposed_in + 16 * 11]
- mova m13, m14
- pmulhrsw m13, [pw_m8423x2] ; stp1_21
- pmulhrsw m14, [pw_14053x2] ; stp2_26
-
- mova m0, [rsp + transposed_in + 16 * 13]
- mova m1, m0
- pmulhrsw m0, [pw__9760x2] ; stp1_22
- pmulhrsw m1, [pw_13160x2] ; stp2_25
-
- ; BLOCK B STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- SUM_SUB 5, 13, 9 ; stp2_20, stp2_21
- SUM_SUB 3, 0, 9 ; stp2_23, stp2_22
- SUM_SUB 2, 1, 9 ; stp2_24, stp2_25
- SUM_SUB 6, 14, 9 ; stp2_27, stp2_26
-
- ; BLOCK B STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- BUTTERFLY_4X 14, 13, 13623, 9102, m8, 9, 10 ; stp1_21, stp1_26
- BUTTERFLY_4Xmm 1, 0, 13623, 9102, m8, 9, 10 ; stp1_25, stp1_22
-
- ; BLOCK B STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- SUM_SUB 3, 5, 9 ; stp2_23, stp2_20
- SUM_SUB 0, 14, 9 ; stp2_22, stp2_21
- SUM_SUB 2, 6, 9 ; stp2_24, stp2_27
- SUM_SUB 1, 13, 9 ; stp2_25, stp2_26
-
- ; BLOCK B STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- BUTTERFLY_4Xmm 6, 5, 6270, 15137, m8, 9, 10 ; stp1_27, stp1_20
- BUTTERFLY_4Xmm 13, 14, 6270, 15137, m8, 9, 10 ; stp1_26, stp1_21
-
- ; BLOCK B STAGE 6 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- mova m4, [stp + %3 + idx16]
- mova m7, [stp + %3 + idx17]
- mova m11, [stp + %3 + idx18]
- mova m12, [stp + %3 + idx19]
- SUM_SUB 4, 3, 9 ; stp2_16, stp2_23
- SUM_SUB 7, 0, 9 ; stp2_17, stp2_22
- SUM_SUB 11, 14, 9 ; stp2_18, stp2_21
- SUM_SUB 12, 5, 9 ; stp2_19, stp2_20
- mova [stp + %3 + idx16], m4
- mova [stp + %3 + idx17], m7
- mova [stp + %3 + idx18], m11
- mova [stp + %3 + idx19], m12
-
- mova m4, [stp + %4 + idx28]
- mova m7, [stp + %4 + idx29]
- mova m11, [stp + %4 + idx30]
- mova m12, [stp + %4 + idx31]
- SUM_SUB 4, 6, 9 ; stp2_28, stp2_27
- SUM_SUB 7, 13, 9 ; stp2_29, stp2_26
- SUM_SUB 11, 1, 9 ; stp2_30, stp2_25
- SUM_SUB 12, 2, 9 ; stp2_31, stp2_24
- mova [stp + %4 + idx28], m4
- mova [stp + %4 + idx29], m7
- mova [stp + %4 + idx30], m11
- mova [stp + %4 + idx31], m12
-
- ; BLOCK B STAGE 7 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-%if 0 ; overflow occurs in SUM_SUB when using test streams
- mova m10, [pw_11585x2]
- SUM_SUB 6, 5, 9
- pmulhrsw m6, m10 ; stp1_27
- pmulhrsw m5, m10 ; stp1_20
- SUM_SUB 13, 14, 9
- pmulhrsw m13, m10 ; stp1_26
- pmulhrsw m14, m10 ; stp1_21
- SUM_SUB 1, 0, 9
- pmulhrsw m1, m10 ; stp1_25
- pmulhrsw m0, m10 ; stp1_22
- SUM_SUB 2, 3, 9
- pmulhrsw m2, m10 ; stp1_25
- pmulhrsw m3, m10 ; stp1_22
-%else
- BUTTERFLY_4X 6, 5, 11585, 11585, m8, 9, 10 ; stp1_20, stp1_27
- SWAP 6, 5
- BUTTERFLY_4X 13, 14, 11585, 11585, m8, 9, 10 ; stp1_21, stp1_26
- SWAP 13, 14
- BUTTERFLY_4X 1, 0, 11585, 11585, m8, 9, 10 ; stp1_22, stp1_25
- SWAP 1, 0
- BUTTERFLY_4X 2, 3, 11585, 11585, m8, 9, 10 ; stp1_23, stp1_24
- SWAP 2, 3
-%endif
- mova [stp + %3 + idx20], m5
- mova [stp + %3 + idx21], m14
- mova [stp + %3 + idx22], m0
- mova [stp + %3 + idx23], m3
- mova [stp + %4 + idx24], m2
- mova [stp + %4 + idx25], m1
- mova [stp + %4 + idx26], m13
- mova [stp + %4 + idx27], m6
-
- ; BLOCK C STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- ;
- ; BLOCK C STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- mova m0, [rsp + transposed_in + 16 * 2]
- mova m1, m0
- pmulhrsw m0, [pw__1606x2] ; stp1_8
- pmulhrsw m1, [pw_16305x2] ; stp2_15
-
- mova m6, [rsp + transposed_in + 16 * 6]
- mova m7, m6
- pmulhrsw m7, [pw_m4756x2] ; stp2_11
- pmulhrsw m6, [pw_15679x2] ; stp1_12
-
- mova m4, [rsp + transposed_in + 16 * 10]
- mova m5, m4
- pmulhrsw m4, [pw__7723x2] ; stp1_10
- pmulhrsw m5, [pw_14449x2] ; stp2_13
-
- mova m2, [rsp + transposed_in + 16 * 14]
- mova m3, m2
- pmulhrsw m3, [pw_m10394x2] ; stp1_9
- pmulhrsw m2, [pw_12665x2] ; stp2_14
-
- ; BLOCK C STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- SUM_SUB 0, 3, 9 ; stp1_8, stp1_9
- SUM_SUB 7, 4, 9 ; stp1_11, stp1_10
- SUM_SUB 6, 5, 9 ; stp1_12, stp1_13
- SUM_SUB 1, 2, 9 ; stp1_15, stp1_14
-
- ; BLOCK C STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- BUTTERFLY_4X 2, 3, 6270, 15137, m8, 9, 10 ; stp1_9, stp1_14
- BUTTERFLY_4Xmm 5, 4, 6270, 15137, m8, 9, 10 ; stp1_13, stp1_10
-
- ; BLOCK C STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- SUM_SUB 0, 7, 9 ; stp1_8, stp1_11
- SUM_SUB 2, 4, 9 ; stp1_9, stp1_10
- SUM_SUB 1, 6, 9 ; stp1_15, stp1_12
- SUM_SUB 3, 5, 9 ; stp1_14, stp1_13
-
- ; BLOCK C STAGE 6 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-%if 0 ; overflow occurs in SUM_SUB when using test streams
- mova m10, [pw_11585x2]
- SUM_SUB 5, 4, 9
- pmulhrsw m5, m10 ; stp1_13
- pmulhrsw m4, m10 ; stp1_10
- SUM_SUB 6, 7, 9
- pmulhrsw m6, m10 ; stp1_12
- pmulhrsw m7, m10 ; stp1_11
-%else
- BUTTERFLY_4X 5, 4, 11585, 11585, m8, 9, 10 ; stp1_10, stp1_13
- SWAP 5, 4
- BUTTERFLY_4X 6, 7, 11585, 11585, m8, 9, 10 ; stp1_11, stp1_12
- SWAP 6, 7
-%endif
- ; BLOCK C STAGE 7 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- mova [stp + %2 + idx8], m0
- mova [stp + %2 + idx9], m2
- mova [stp + %2 + idx10], m4
- mova [stp + %2 + idx11], m7
- mova [stp + %2 + idx12], m6
- mova [stp + %2 + idx13], m5
- mova [stp + %2 + idx14], m3
- mova [stp + %2 + idx15], m1
-
- ; BLOCK D STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- ;
- ; BLOCK D STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- ;
- ; BLOCK D STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- mova m11, [rsp + transposed_in + 16 * 4]
- mova m12, m11
- pmulhrsw m11, [pw__3196x2] ; stp1_4
- pmulhrsw m12, [pw_16069x2] ; stp1_7
-
- mova m13, [rsp + transposed_in + 16 * 12]
- mova m14, m13
- pmulhrsw m13, [pw_13623x2] ; stp1_6
- pmulhrsw m14, [pw_m9102x2] ; stp1_5
-
- ; BLOCK D STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- mova m0, [rsp + transposed_in + 16 * 0]
- mova m2, [rsp + transposed_in + 16 * 8]
- pmulhrsw m0, [pw_11585x2] ; stp1_1
- mova m3, m2
- pmulhrsw m2, [pw__6270x2] ; stp1_2
- pmulhrsw m3, [pw_15137x2] ; stp1_3
-
- SUM_SUB 11, 14, 9 ; stp1_4, stp1_5
- SUM_SUB 12, 13, 9 ; stp1_7, stp1_6
-
- ; BLOCK D STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-%if 0 ; overflow occurs in SUM_SUB when using test streams
- mova m10, [pw_11585x2]
- SUM_SUB 13, 14, 9
- pmulhrsw m13, m10 ; stp1_6
- pmulhrsw m14, m10 ; stp1_5
-%else
- BUTTERFLY_4X 13, 14, 11585, 11585, m8, 9, 10 ; stp1_5, stp1_6
- SWAP 13, 14
-%endif
- mova m1, m0 ; stp1_0 = stp1_1
- SUM_SUB 0, 3, 9 ; stp1_0, stp1_3
- SUM_SUB 1, 2, 9 ; stp1_1, stp1_2
-
- ; BLOCK D STAGE 6 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- SUM_SUB 0, 12, 9 ; stp1_0, stp1_7
- SUM_SUB 1, 13, 9 ; stp1_1, stp1_6
- SUM_SUB 2, 14, 9 ; stp1_2, stp1_5
- SUM_SUB 3, 11, 9 ; stp1_3, stp1_4
-
- ; BLOCK D STAGE 7 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- mova m4, [stp + %2 + idx12]
- mova m5, [stp + %2 + idx13]
- mova m6, [stp + %2 + idx14]
- mova m7, [stp + %2 + idx15]
- SUM_SUB 0, 7, 9 ; stp1_0, stp1_15
- SUM_SUB 1, 6, 9 ; stp1_1, stp1_14
- SUM_SUB 2, 5, 9 ; stp1_2, stp1_13
- SUM_SUB 3, 4, 9 ; stp1_3, stp1_12
-
- ; 0-3, 28-31 final stage
- mova m10, [stp + %4 + idx31]
- mova m15, [stp + %4 + idx30]
- SUM_SUB 0, 10, 9 ; stp1_0, stp1_31
- SUM_SUB 1, 15, 9 ; stp1_1, stp1_30
- mova [stp + %1 + idx0], m0
- mova [stp + %1 + idx1], m1
- mova [stp + %4 + idx31], m10
- mova [stp + %4 + idx30], m15
- mova m0, [stp + %4 + idx29]
- mova m1, [stp + %4 + idx28]
- SUM_SUB 2, 0, 9 ; stp1_2, stp1_29
- SUM_SUB 3, 1, 9 ; stp1_3, stp1_28
- mova [stp + %1 + idx2], m2
- mova [stp + %1 + idx3], m3
- mova [stp + %4 + idx29], m0
- mova [stp + %4 + idx28], m1
-
- ; 12-15, 16-19 final stage
- mova m0, [stp + %3 + idx16]
- mova m1, [stp + %3 + idx17]
- mova m2, [stp + %3 + idx18]
- mova m3, [stp + %3 + idx19]
- SUM_SUB 7, 0, 9 ; stp1_15, stp1_16
- SUM_SUB 6, 1, 9 ; stp1_14, stp1_17
- SUM_SUB 5, 2, 9 ; stp1_13, stp1_18
- SUM_SUB 4, 3, 9 ; stp1_12, stp1_19
- mova [stp + %2 + idx12], m4
- mova [stp + %2 + idx13], m5
- mova [stp + %2 + idx14], m6
- mova [stp + %2 + idx15], m7
- mova [stp + %3 + idx16], m0
- mova [stp + %3 + idx17], m1
- mova [stp + %3 + idx18], m2
- mova [stp + %3 + idx19], m3
-
- mova m4, [stp + %2 + idx8]
- mova m5, [stp + %2 + idx9]
- mova m6, [stp + %2 + idx10]
- mova m7, [stp + %2 + idx11]
- SUM_SUB 11, 7, 9 ; stp1_4, stp1_11
- SUM_SUB 14, 6, 9 ; stp1_5, stp1_10
- SUM_SUB 13, 5, 9 ; stp1_6, stp1_9
- SUM_SUB 12, 4, 9 ; stp1_7, stp1_8
-
- ; 4-7, 24-27 final stage
- mova m3, [stp + %4 + idx24]
- mova m2, [stp + %4 + idx25]
- mova m1, [stp + %4 + idx26]
- mova m0, [stp + %4 + idx27]
- SUM_SUB 12, 3, 9 ; stp1_7, stp1_24
- SUM_SUB 13, 2, 9 ; stp1_6, stp1_25
- SUM_SUB 14, 1, 9 ; stp1_5, stp1_26
- SUM_SUB 11, 0, 9 ; stp1_4, stp1_27
- mova [stp + %4 + idx24], m3
- mova [stp + %4 + idx25], m2
- mova [stp + %4 + idx26], m1
- mova [stp + %4 + idx27], m0
- mova [stp + %1 + idx4], m11
- mova [stp + %1 + idx5], m14
- mova [stp + %1 + idx6], m13
- mova [stp + %1 + idx7], m12
-
- ; 8-11, 20-23 final stage
- mova m0, [stp + %3 + idx20]
- mova m1, [stp + %3 + idx21]
- mova m2, [stp + %3 + idx22]
- mova m3, [stp + %3 + idx23]
- SUM_SUB 7, 0, 9 ; stp1_11, stp_20
- SUM_SUB 6, 1, 9 ; stp1_10, stp_21
- SUM_SUB 5, 2, 9 ; stp1_9, stp_22
- SUM_SUB 4, 3, 9 ; stp1_8, stp_23
- mova [stp + %2 + idx8], m4
- mova [stp + %2 + idx9], m5
- mova [stp + %2 + idx10], m6
- mova [stp + %2 + idx11], m7
- mova [stp + %3 + idx20], m0
- mova [stp + %3 + idx21], m1
- mova [stp + %3 + idx22], m2
- mova [stp + %3 + idx23], m3
-%endmacro
-
-INIT_XMM ssse3
-cglobal idct32x32_135_add, 3, 11, 16, i32x32_size, input, output, stride
- mova m8, [pd_8192]
- mov r6, 2
- lea stp, [rsp + pass_one_start]
-
-idct32x32_135:
- mov r3, inputq
- lea r4, [rsp + transposed_in]
- mov r7, 2
-
-idct32x32_135_transpose:
-%if CONFIG_VP9_HIGHBITDEPTH
- mova m0, [r3 + 0]
- packssdw m0, [r3 + 16]
- mova m1, [r3 + 32 * 4]
- packssdw m1, [r3 + 32 * 4 + 16]
- mova m2, [r3 + 32 * 8]
- packssdw m2, [r3 + 32 * 8 + 16]
- mova m3, [r3 + 32 * 12]
- packssdw m3, [r3 + 32 * 12 + 16]
- mova m4, [r3 + 32 * 16]
- packssdw m4, [r3 + 32 * 16 + 16]
- mova m5, [r3 + 32 * 20]
- packssdw m5, [r3 + 32 * 20 + 16]
- mova m6, [r3 + 32 * 24]
- packssdw m6, [r3 + 32 * 24 + 16]
- mova m7, [r3 + 32 * 28]
- packssdw m7, [r3 + 32 * 28 + 16]
-%else
- mova m0, [r3 + 0]
- mova m1, [r3 + 16 * 4]
- mova m2, [r3 + 16 * 8]
- mova m3, [r3 + 16 * 12]
- mova m4, [r3 + 16 * 16]
- mova m5, [r3 + 16 * 20]
- mova m6, [r3 + 16 * 24]
- mova m7, [r3 + 16 * 28]
-%endif
- TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9
-
- mova [r4 + 0], m0
- mova [r4 + 16 * 1], m1
- mova [r4 + 16 * 2], m2
- mova [r4 + 16 * 3], m3
- mova [r4 + 16 * 4], m4
- mova [r4 + 16 * 5], m5
- mova [r4 + 16 * 6], m6
- mova [r4 + 16 * 7], m7
-
-%if CONFIG_VP9_HIGHBITDEPTH
- add r3, 32
-%else
- add r3, 16
-%endif
- add r4, 16 * 8
- dec r7
- jne idct32x32_135_transpose
-
- IDCT32X32_135 16*0, 16*32, 16*64, 16*96
- lea stp, [stp + 16 * 8]
-%if CONFIG_VP9_HIGHBITDEPTH
- lea inputq, [inputq + 32 * 32]
-%else
- lea inputq, [inputq + 16 * 32]
-%endif
- dec r6
- jnz idct32x32_135
-
- mov r6, 4
- lea stp, [rsp + pass_one_start]
- lea r9, [rsp + pass_one_start]
-
-idct32x32_135_2:
- lea r4, [rsp + transposed_in]
- mov r3, r9
- mov r7, 2
-
-idct32x32_135_transpose_2:
- mova m0, [r3 + 0]
- mova m1, [r3 + 16 * 1]
- mova m2, [r3 + 16 * 2]
- mova m3, [r3 + 16 * 3]
- mova m4, [r3 + 16 * 4]
- mova m5, [r3 + 16 * 5]
- mova m6, [r3 + 16 * 6]
- mova m7, [r3 + 16 * 7]
-
- TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9
-
- mova [r4 + 0], m0
- mova [r4 + 16 * 1], m1
- mova [r4 + 16 * 2], m2
- mova [r4 + 16 * 3], m3
- mova [r4 + 16 * 4], m4
- mova [r4 + 16 * 5], m5
- mova [r4 + 16 * 6], m6
- mova [r4 + 16 * 7], m7
-
- add r3, 16 * 8
- add r4, 16 * 8
- dec r7
- jne idct32x32_135_transpose_2
-
- IDCT32X32_135 16*0, 16*8, 16*16, 16*24
-
- lea stp, [stp + 16 * 32]
- add r9, 16 * 32
- dec r6
- jnz idct32x32_135_2
-
- RECON_AND_STORE pass_two_start
-
- RET
-
-%macro IDCT32X32_1024 4
- ; BLOCK A STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- mova m1, [rsp + transposed_in + 16 * 1]
- mova m11, [rsp + transposed_in + 16 * 31]
- BUTTERFLY_4X 1, 11, 804, 16364, m8, 9, 10 ; stp1_16, stp1_31
-
- mova m0, [rsp + transposed_in + 16 * 15]
- mova m2, [rsp + transposed_in + 16 * 17]
- BUTTERFLY_4X 2, 0, 12140, 11003, m8, 9, 10 ; stp1_17, stp1_30
-
- mova m7, [rsp + transposed_in + 16 * 7]
- mova m12, [rsp + transposed_in + 16 * 25]
- BUTTERFLY_4X 12, 7, 15426, 5520, m8, 9, 10 ; stp1_19, stp1_28
-
- mova m3, [rsp + transposed_in + 16 * 9]
- mova m4, [rsp + transposed_in + 16 * 23]
- BUTTERFLY_4X 3, 4, 7005, 14811, m8, 9, 10 ; stp1_18, stp1_29
-
- ; BLOCK A STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- SUM_SUB 1, 2, 9 ; stp2_16, stp2_17
- SUM_SUB 12, 3, 9 ; stp2_19, stp2_18
- SUM_SUB 7, 4, 9 ; stp2_28, stp2_29
- SUM_SUB 11, 0, 9 ; stp2_31, stp2_30
-
- ; BLOCK A STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- BUTTERFLY_4X 0, 2, 3196, 16069, m8, 9, 10 ; stp1_17, stp1_30
- BUTTERFLY_4Xmm 4, 3, 3196, 16069, m8, 9, 10 ; stp1_29, stp1_18
-
- ; BLOCK A STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- SUM_SUB 1, 12, 9 ; stp2_16, stp2_19
- SUM_SUB 0, 3, 9 ; stp2_17, stp2_18
- SUM_SUB 11, 7, 9 ; stp2_31, stp2_28
- SUM_SUB 2, 4, 9 ; stp2_30, stp2_29
-
- ; BLOCK A STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- BUTTERFLY_4X 4, 3, 6270, 15137, m8, 9, 10 ; stp1_18, stp1_29
- BUTTERFLY_4X 7, 12, 6270, 15137, m8, 9, 10 ; stp1_19, stp1_28
-
- mova [stp + %3 + idx16], m1
- mova [stp + %3 + idx17], m0
- mova [stp + %3 + idx18], m4
- mova [stp + %3 + idx19], m7
- mova [stp + %4 + idx28], m12
- mova [stp + %4 + idx29], m3
- mova [stp + %4 + idx30], m2
- mova [stp + %4 + idx31], m11
-
- ; BLOCK B STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- mova m5, [rsp + transposed_in + 16 * 5]
- mova m6, [rsp + transposed_in + 16 * 27]
- BUTTERFLY_4X 5, 6, 3981, 15893, m8, 9, 10 ; stp1_20, stp1_27
-
- mova m13, [rsp + transposed_in + 16 * 21]
- mova m14, [rsp + transposed_in + 16 * 11]
- BUTTERFLY_4X 13, 14, 14053, 8423, m8, 9, 10 ; stp1_21, stp1_26
-
- mova m0, [rsp + transposed_in + 16 * 13]
- mova m1, [rsp + transposed_in + 16 * 19]
- BUTTERFLY_4X 0, 1, 9760, 13160, m8, 9, 10 ; stp1_22, stp1_25
-
- mova m2, [rsp + transposed_in + 16 * 3]
- mova m3, [rsp + transposed_in + 16 * 29]
- BUTTERFLY_4X 3, 2, 16207, 2404, m8, 9, 10 ; stp1_23, stp1_24
-
- ; BLOCK B STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- SUM_SUB 5, 13, 9 ; stp2_20, stp2_21
- SUM_SUB 3, 0, 9 ; stp2_23, stp2_22
- SUM_SUB 2, 1, 9 ; stp2_24, stp2_25
- SUM_SUB 6, 14, 9 ; stp2_27, stp2_26
-
- ; BLOCK B STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- BUTTERFLY_4X 14, 13, 13623, 9102, m8, 9, 10 ; stp1_21, stp1_26
- BUTTERFLY_4Xmm 1, 0, 13623, 9102, m8, 9, 10 ; stp1_25, stp1_22
-
- ; BLOCK B STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- SUM_SUB 3, 5, 9 ; stp2_23, stp2_20
- SUM_SUB 0, 14, 9 ; stp2_22, stp2_21
- SUM_SUB 2, 6, 9 ; stp2_24, stp2_27
- SUM_SUB 1, 13, 9 ; stp2_25, stp2_26
-
- ; BLOCK B STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- BUTTERFLY_4Xmm 6, 5, 6270, 15137, m8, 9, 10 ; stp1_27, stp1_20
- BUTTERFLY_4Xmm 13, 14, 6270, 15137, m8, 9, 10 ; stp1_26, stp1_21
-
- ; BLOCK B STAGE 6 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- mova m4, [stp + %3 + idx16]
- mova m7, [stp + %3 + idx17]
- mova m11, [stp + %3 + idx18]
- mova m12, [stp + %3 + idx19]
- SUM_SUB 4, 3, 9 ; stp2_16, stp2_23
- SUM_SUB 7, 0, 9 ; stp2_17, stp2_22
- SUM_SUB 11, 14, 9 ; stp2_18, stp2_21
- SUM_SUB 12, 5, 9 ; stp2_19, stp2_20
- mova [stp + %3 + idx16], m4
- mova [stp + %3 + idx17], m7
- mova [stp + %3 + idx18], m11
- mova [stp + %3 + idx19], m12
-
- mova m4, [stp + %4 + idx28]
- mova m7, [stp + %4 + idx29]
- mova m11, [stp + %4 + idx30]
- mova m12, [stp + %4 + idx31]
- SUM_SUB 4, 6, 9 ; stp2_28, stp2_27
- SUM_SUB 7, 13, 9 ; stp2_29, stp2_26
- SUM_SUB 11, 1, 9 ; stp2_30, stp2_25
- SUM_SUB 12, 2, 9 ; stp2_31, stp2_24
- mova [stp + %4 + idx28], m4
- mova [stp + %4 + idx29], m7
- mova [stp + %4 + idx30], m11
- mova [stp + %4 + idx31], m12
-
- ; BLOCK B STAGE 7 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-%if 0 ; overflow occurs in SUM_SUB when using test streams
- mova m10, [pw_11585x2]
- SUM_SUB 6, 5, 9
- pmulhrsw m6, m10 ; stp1_27
- pmulhrsw m5, m10 ; stp1_20
- SUM_SUB 13, 14, 9
- pmulhrsw m13, m10 ; stp1_26
- pmulhrsw m14, m10 ; stp1_21
- SUM_SUB 1, 0, 9
- pmulhrsw m1, m10 ; stp1_25
- pmulhrsw m0, m10 ; stp1_22
- SUM_SUB 2, 3, 9
- pmulhrsw m2, m10 ; stp1_25
- pmulhrsw m3, m10 ; stp1_22
-%else
- BUTTERFLY_4X 6, 5, 11585, 11585, m8, 9, 10 ; stp1_20, stp1_27
- SWAP 6, 5
- BUTTERFLY_4X 13, 14, 11585, 11585, m8, 9, 10 ; stp1_21, stp1_26
- SWAP 13, 14
- BUTTERFLY_4X 1, 0, 11585, 11585, m8, 9, 10 ; stp1_22, stp1_25
- SWAP 1, 0
- BUTTERFLY_4X 2, 3, 11585, 11585, m8, 9, 10 ; stp1_23, stp1_24
- SWAP 2, 3
-%endif
- mova [stp + %3 + idx20], m5
- mova [stp + %3 + idx21], m14
- mova [stp + %3 + idx22], m0
- mova [stp + %3 + idx23], m3
- mova [stp + %4 + idx24], m2
- mova [stp + %4 + idx25], m1
- mova [stp + %4 + idx26], m13
- mova [stp + %4 + idx27], m6
-
- ; BLOCK C STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- ;
- ; BLOCK C STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- mova m0, [rsp + transposed_in + 16 * 2]
- mova m1, [rsp + transposed_in + 16 * 30]
- BUTTERFLY_4X 0, 1, 1606, 16305, m8, 9, 10 ; stp1_8, stp1_15
-
- mova m2, [rsp + transposed_in + 16 * 14]
- mova m3, [rsp + transposed_in + 16 * 18]
- BUTTERFLY_4X 3, 2, 12665, 10394, m8, 9, 10 ; stp1_9, stp1_14
-
- mova m4, [rsp + transposed_in + 16 * 10]
- mova m5, [rsp + transposed_in + 16 * 22]
- BUTTERFLY_4X 4, 5, 7723, 14449, m8, 9, 10 ; stp1_10, stp1_13
-
- mova m6, [rsp + transposed_in + 16 * 6]
- mova m7, [rsp + transposed_in + 16 * 26]
- BUTTERFLY_4X 7, 6, 15679, 4756, m8, 9, 10 ; stp1_11, stp1_12
-
- ; BLOCK C STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- SUM_SUB 0, 3, 9 ; stp1_8, stp1_9
- SUM_SUB 7, 4, 9 ; stp1_11, stp1_10
- SUM_SUB 6, 5, 9 ; stp1_12, stp1_13
- SUM_SUB 1, 2, 9 ; stp1_15, stp1_14
-
- ; BLOCK C STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- BUTTERFLY_4X 2, 3, 6270, 15137, m8, 9, 10 ; stp1_9, stp1_14
- BUTTERFLY_4Xmm 5, 4, 6270, 15137, m8, 9, 10 ; stp1_13, stp1_10
-
- ; BLOCK C STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- SUM_SUB 0, 7, 9 ; stp1_8, stp1_11
- SUM_SUB 2, 4, 9 ; stp1_9, stp1_10
- SUM_SUB 1, 6, 9 ; stp1_15, stp1_12
- SUM_SUB 3, 5, 9 ; stp1_14, stp1_13
-
- ; BLOCK C STAGE 6 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-%if 0 ; overflow occurs in SUM_SUB when using test streams
- mova m10, [pw_11585x2]
- SUM_SUB 5, 4, 9
- pmulhrsw m5, m10 ; stp1_13
- pmulhrsw m4, m10 ; stp1_10
- SUM_SUB 6, 7, 9
- pmulhrsw m6, m10 ; stp1_12
- pmulhrsw m7, m10 ; stp1_11
-%else
- BUTTERFLY_4X 5, 4, 11585, 11585, m8, 9, 10 ; stp1_10, stp1_13
- SWAP 5, 4
- BUTTERFLY_4X 6, 7, 11585, 11585, m8, 9, 10 ; stp1_11, stp1_12
- SWAP 6, 7
-%endif
- ; BLOCK C STAGE 7 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- mova [stp + %2 + idx8], m0
- mova [stp + %2 + idx9], m2
- mova [stp + %2 + idx10], m4
- mova [stp + %2 + idx11], m7
- mova [stp + %2 + idx12], m6
- mova [stp + %2 + idx13], m5
- mova [stp + %2 + idx14], m3
- mova [stp + %2 + idx15], m1
-
- ; BLOCK D STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- ;
- ; BLOCK D STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- ;
- ; BLOCK D STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- mova m11, [rsp + transposed_in + 16 * 4]
- mova m12, [rsp + transposed_in + 16 * 28]
- BUTTERFLY_4X 11, 12, 3196, 16069, m8, 9, 10 ; stp1_4, stp1_7
-
- mova m13, [rsp + transposed_in + 16 * 12]
- mova m14, [rsp + transposed_in + 16 * 20]
- BUTTERFLY_4X 14, 13, 13623, 9102, m8, 9, 10 ; stp1_5, stp1_6
-
- ; BLOCK D STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- mova m0, [rsp + transposed_in + 16 * 0]
- mova m1, [rsp + transposed_in + 16 * 16]
-
-%if 0 ; overflow occurs in SUM_SUB when using test streams
- mova m10, [pw_11585x2]
- SUM_SUB 0, 1, 9
- pmulhrsw m0, m10 ; stp1_1
- pmulhrsw m1, m10 ; stp1_0
-%else
- BUTTERFLY_4X 0, 1, 11585, 11585, m8, 9, 10 ; stp1_1, stp1_0
- SWAP 0, 1
-%endif
- mova m2, [rsp + transposed_in + 16 * 8]
- mova m3, [rsp + transposed_in + 16 * 24]
- BUTTERFLY_4X 2, 3, 6270, 15137, m8, 9, 10 ; stp1_2, stp1_3
-
- mova m10, [pw_11585x2]
- SUM_SUB 11, 14, 9 ; stp1_4, stp1_5
- SUM_SUB 12, 13, 9 ; stp1_7, stp1_6
-
- ; BLOCK D STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-%if 0 ; overflow occurs in SUM_SUB when using test streams
- SUM_SUB 13, 14, 9
- pmulhrsw m13, m10 ; stp1_6
- pmulhrsw m14, m10 ; stp1_5
-%else
- BUTTERFLY_4X 13, 14, 11585, 11585, m8, 9, 10 ; stp1_5, stp1_6
- SWAP 13, 14
-%endif
- SUM_SUB 0, 3, 9 ; stp1_0, stp1_3
- SUM_SUB 1, 2, 9 ; stp1_1, stp1_2
-
- ; BLOCK D STAGE 6 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- SUM_SUB 0, 12, 9 ; stp1_0, stp1_7
- SUM_SUB 1, 13, 9 ; stp1_1, stp1_6
- SUM_SUB 2, 14, 9 ; stp1_2, stp1_5
- SUM_SUB 3, 11, 9 ; stp1_3, stp1_4
-
- ; BLOCK D STAGE 7 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- mova m4, [stp + %2 + idx12]
- mova m5, [stp + %2 + idx13]
- mova m6, [stp + %2 + idx14]
- mova m7, [stp + %2 + idx15]
- SUM_SUB 0, 7, 9 ; stp1_0, stp1_15
- SUM_SUB 1, 6, 9 ; stp1_1, stp1_14
- SUM_SUB 2, 5, 9 ; stp1_2, stp1_13
- SUM_SUB 3, 4, 9 ; stp1_3, stp1_12
-
- ; 0-3, 28-31 final stage
- mova m10, [stp + %4 + idx31]
- mova m15, [stp + %4 + idx30]
- SUM_SUB 0, 10, 9 ; stp1_0, stp1_31
- SUM_SUB 1, 15, 9 ; stp1_1, stp1_30
- mova [stp + %1 + idx0], m0
- mova [stp + %1 + idx1], m1
- mova [stp + %4 + idx31], m10
- mova [stp + %4 + idx30], m15
- mova m0, [stp + %4 + idx29]
- mova m1, [stp + %4 + idx28]
- SUM_SUB 2, 0, 9 ; stp1_2, stp1_29
- SUM_SUB 3, 1, 9 ; stp1_3, stp1_28
- mova [stp + %1 + idx2], m2
- mova [stp + %1 + idx3], m3
- mova [stp + %4 + idx29], m0
- mova [stp + %4 + idx28], m1
-
- ; 12-15, 16-19 final stage
- mova m0, [stp + %3 + idx16]
- mova m1, [stp + %3 + idx17]
- mova m2, [stp + %3 + idx18]
- mova m3, [stp + %3 + idx19]
- SUM_SUB 7, 0, 9 ; stp1_15, stp1_16
- SUM_SUB 6, 1, 9 ; stp1_14, stp1_17
- SUM_SUB 5, 2, 9 ; stp1_13, stp1_18
- SUM_SUB 4, 3, 9 ; stp1_12, stp1_19
- mova [stp + %2 + idx12], m4
- mova [stp + %2 + idx13], m5
- mova [stp + %2 + idx14], m6
- mova [stp + %2 + idx15], m7
- mova [stp + %3 + idx16], m0
- mova [stp + %3 + idx17], m1
- mova [stp + %3 + idx18], m2
- mova [stp + %3 + idx19], m3
-
- mova m4, [stp + %2 + idx8]
- mova m5, [stp + %2 + idx9]
- mova m6, [stp + %2 + idx10]
- mova m7, [stp + %2 + idx11]
- SUM_SUB 11, 7, 9 ; stp1_4, stp1_11
- SUM_SUB 14, 6, 9 ; stp1_5, stp1_10
- SUM_SUB 13, 5, 9 ; stp1_6, stp1_9
- SUM_SUB 12, 4, 9 ; stp1_7, stp1_8
-
- ; 4-7, 24-27 final stage
- mova m3, [stp + %4 + idx24]
- mova m2, [stp + %4 + idx25]
- mova m1, [stp + %4 + idx26]
- mova m0, [stp + %4 + idx27]
- SUM_SUB 12, 3, 9 ; stp1_7, stp1_24
- SUM_SUB 13, 2, 9 ; stp1_6, stp1_25
- SUM_SUB 14, 1, 9 ; stp1_5, stp1_26
- SUM_SUB 11, 0, 9 ; stp1_4, stp1_27
- mova [stp + %4 + idx24], m3
- mova [stp + %4 + idx25], m2
- mova [stp + %4 + idx26], m1
- mova [stp + %4 + idx27], m0
- mova [stp + %1 + idx4], m11
- mova [stp + %1 + idx5], m14
- mova [stp + %1 + idx6], m13
- mova [stp + %1 + idx7], m12
-
- ; 8-11, 20-23 final stage
- mova m0, [stp + %3 + idx20]
- mova m1, [stp + %3 + idx21]
- mova m2, [stp + %3 + idx22]
- mova m3, [stp + %3 + idx23]
- SUM_SUB 7, 0, 9 ; stp1_11, stp_20
- SUM_SUB 6, 1, 9 ; stp1_10, stp_21
- SUM_SUB 5, 2, 9 ; stp1_9, stp_22
- SUM_SUB 4, 3, 9 ; stp1_8, stp_23
- mova [stp + %2 + idx8], m4
- mova [stp + %2 + idx9], m5
- mova [stp + %2 + idx10], m6
- mova [stp + %2 + idx11], m7
- mova [stp + %3 + idx20], m0
- mova [stp + %3 + idx21], m1
- mova [stp + %3 + idx22], m2
- mova [stp + %3 + idx23], m3
-%endmacro
-
-INIT_XMM ssse3
-cglobal idct32x32_1024_add, 3, 11, 16, i32x32_size, input, output, stride
- mova m8, [pd_8192]
- mov r6, 4
- lea stp, [rsp + pass_one_start]
-
-idct32x32_1024:
- mov r3, inputq
- lea r4, [rsp + transposed_in]
- mov r7, 4
-
-idct32x32_1024_transpose:
-%if CONFIG_VP9_HIGHBITDEPTH
- mova m0, [r3 + 0]
- packssdw m0, [r3 + 16]
- mova m1, [r3 + 32 * 4]
- packssdw m1, [r3 + 32 * 4 + 16]
- mova m2, [r3 + 32 * 8]
- packssdw m2, [r3 + 32 * 8 + 16]
- mova m3, [r3 + 32 * 12]
- packssdw m3, [r3 + 32 * 12 + 16]
- mova m4, [r3 + 32 * 16]
- packssdw m4, [r3 + 32 * 16 + 16]
- mova m5, [r3 + 32 * 20]
- packssdw m5, [r3 + 32 * 20 + 16]
- mova m6, [r3 + 32 * 24]
- packssdw m6, [r3 + 32 * 24 + 16]
- mova m7, [r3 + 32 * 28]
- packssdw m7, [r3 + 32 * 28 + 16]
-%else
- mova m0, [r3 + 0]
- mova m1, [r3 + 16 * 4]
- mova m2, [r3 + 16 * 8]
- mova m3, [r3 + 16 * 12]
- mova m4, [r3 + 16 * 16]
- mova m5, [r3 + 16 * 20]
- mova m6, [r3 + 16 * 24]
- mova m7, [r3 + 16 * 28]
-%endif
-
- TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9
-
- mova [r4 + 0], m0
- mova [r4 + 16 * 1], m1
- mova [r4 + 16 * 2], m2
- mova [r4 + 16 * 3], m3
- mova [r4 + 16 * 4], m4
- mova [r4 + 16 * 5], m5
- mova [r4 + 16 * 6], m6
- mova [r4 + 16 * 7], m7
-%if CONFIG_VP9_HIGHBITDEPTH
- add r3, 32
-%else
- add r3, 16
-%endif
- add r4, 16 * 8
- dec r7
- jne idct32x32_1024_transpose
-
- IDCT32X32_1024 16*0, 16*32, 16*64, 16*96
-
- lea stp, [stp + 16 * 8]
-%if CONFIG_VP9_HIGHBITDEPTH
- lea inputq, [inputq + 32 * 32]
-%else
- lea inputq, [inputq + 16 * 32]
-%endif
- dec r6
- jnz idct32x32_1024
-
- mov r6, 4
- lea stp, [rsp + pass_one_start]
- lea r9, [rsp + pass_one_start]
-
-idct32x32_1024_2:
- lea r4, [rsp + transposed_in]
- mov r3, r9
- mov r7, 4
-
-idct32x32_1024_transpose_2:
- mova m0, [r3 + 0]
- mova m1, [r3 + 16 * 1]
- mova m2, [r3 + 16 * 2]
- mova m3, [r3 + 16 * 3]
- mova m4, [r3 + 16 * 4]
- mova m5, [r3 + 16 * 5]
- mova m6, [r3 + 16 * 6]
- mova m7, [r3 + 16 * 7]
-
- TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9
-
- mova [r4 + 0], m0
- mova [r4 + 16 * 1], m1
- mova [r4 + 16 * 2], m2
- mova [r4 + 16 * 3], m3
- mova [r4 + 16 * 4], m4
- mova [r4 + 16 * 5], m5
- mova [r4 + 16 * 6], m6
- mova [r4 + 16 * 7], m7
-
- add r3, 16 * 8
- add r4, 16 * 8
- dec r7
- jne idct32x32_1024_transpose_2
-
- IDCT32X32_1024 16*0, 16*8, 16*16, 16*24
-
- lea stp, [stp + 16 * 32]
- add r9, 16 * 32
- dec r6
- jnz idct32x32_1024_2
-
- RECON_AND_STORE pass_two_start
-
- RET
-%endif
diff --git a/thirdparty/libvpx/vpx_dsp/x86/inv_wht_sse2.asm b/thirdparty/libvpx/vpx_dsp/x86/inv_wht_sse2.asm
deleted file mode 100644
index fbbcd76bd7..0000000000
--- a/thirdparty/libvpx/vpx_dsp/x86/inv_wht_sse2.asm
+++ /dev/null
@@ -1,109 +0,0 @@
-;
-; Copyright (c) 2015 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-%include "third_party/x86inc/x86inc.asm"
-
-SECTION .text
-
-%macro REORDER_INPUTS 0
- ; a c d b to a b c d
- SWAP 1, 3, 2
-%endmacro
-
-%macro TRANSFORM_COLS 0
- ; input:
- ; m0 a
- ; m1 b
- ; m2 c
- ; m3 d
- paddw m0, m2
- psubw m3, m1
-
- ; wide subtract
- punpcklwd m4, m0
- punpcklwd m5, m3
- psrad m4, 16
- psrad m5, 16
- psubd m4, m5
- psrad m4, 1
- packssdw m4, m4 ; e
-
- psubw m5, m4, m1 ; b
- psubw m4, m2 ; c
- psubw m0, m5
- paddw m3, m4
- ; m0 a
- SWAP 1, 5 ; m1 b
- SWAP 2, 4 ; m2 c
- ; m3 d
-%endmacro
-
-%macro TRANSPOSE_4X4 0
- punpcklwd m0, m2
- punpcklwd m1, m3
- mova m2, m0
- punpcklwd m0, m1
- punpckhwd m2, m1
- pshufd m1, m0, 0x0e
- pshufd m3, m2, 0x0e
-%endmacro
-
-; transpose a 4x4 int16 matrix in xmm0 and xmm1 to the bottom half of xmm0-xmm3
-%macro TRANSPOSE_4X4_WIDE 0
- mova m3, m0
- punpcklwd m0, m1
- punpckhwd m3, m1
- mova m2, m0
- punpcklwd m0, m3
- punpckhwd m2, m3
- pshufd m1, m0, 0x0e
- pshufd m3, m2, 0x0e
-%endmacro
-
-%macro ADD_STORE_4P_2X 5 ; src1, src2, tmp1, tmp2, zero
- movd m%3, [outputq]
- movd m%4, [outputq + strideq]
- punpcklbw m%3, m%5
- punpcklbw m%4, m%5
- paddw m%1, m%3
- paddw m%2, m%4
- packuswb m%1, m%5
- packuswb m%2, m%5
- movd [outputq], m%1
- movd [outputq + strideq], m%2
-%endmacro
-
-INIT_XMM sse2
-cglobal iwht4x4_16_add, 3, 3, 7, input, output, stride
-%if CONFIG_VP9_HIGHBITDEPTH
- mova m0, [inputq + 0]
- packssdw m0, [inputq + 16]
- mova m1, [inputq + 32]
- packssdw m1, [inputq + 48]
-%else
- mova m0, [inputq + 0]
- mova m1, [inputq + 16]
-%endif
- psraw m0, 2
- psraw m1, 2
-
- TRANSPOSE_4X4_WIDE
- REORDER_INPUTS
- TRANSFORM_COLS
- TRANSPOSE_4X4
- REORDER_INPUTS
- TRANSFORM_COLS
-
- pxor m4, m4
- ADD_STORE_4P_2X 0, 1, 5, 6, 4
- lea outputq, [outputq + 2 * strideq]
- ADD_STORE_4P_2X 2, 3, 5, 6, 4
-
- RET
diff --git a/thirdparty/libvpx/vpx_dsp/x86/loopfilter_avx2.c b/thirdparty/libvpx/vpx_dsp/x86/loopfilter_avx2.c
deleted file mode 100644
index be1087c1e9..0000000000
--- a/thirdparty/libvpx/vpx_dsp/x86/loopfilter_avx2.c
+++ /dev/null
@@ -1,979 +0,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include <immintrin.h> /* AVX2 */
-
-#include "./vpx_dsp_rtcd.h"
-#include "vpx_ports/mem.h"
-
-void vpx_lpf_horizontal_edge_8_avx2(unsigned char *s, int p,
- const unsigned char *_blimit,
- const unsigned char *_limit,
- const unsigned char *_thresh) {
- __m128i mask, hev, flat, flat2;
- const __m128i zero = _mm_set1_epi16(0);
- const __m128i one = _mm_set1_epi8(1);
- __m128i q7p7, q6p6, q5p5, q4p4, q3p3, q2p2, q1p1, q0p0, p0q0, p1q1;
- __m128i abs_p1p0;
-
- const __m128i thresh = _mm_broadcastb_epi8(
- _mm_cvtsi32_si128((int) _thresh[0]));
- const __m128i limit = _mm_broadcastb_epi8(
- _mm_cvtsi32_si128((int) _limit[0]));
- const __m128i blimit = _mm_broadcastb_epi8(
- _mm_cvtsi32_si128((int) _blimit[0]));
-
- q4p4 = _mm_loadl_epi64((__m128i *) (s - 5 * p));
- q4p4 = _mm_castps_si128(
- _mm_loadh_pi(_mm_castsi128_ps(q4p4), (__m64 *) (s + 4 * p)));
- q3p3 = _mm_loadl_epi64((__m128i *) (s - 4 * p));
- q3p3 = _mm_castps_si128(
- _mm_loadh_pi(_mm_castsi128_ps(q3p3), (__m64 *) (s + 3 * p)));
- q2p2 = _mm_loadl_epi64((__m128i *) (s - 3 * p));
- q2p2 = _mm_castps_si128(
- _mm_loadh_pi(_mm_castsi128_ps(q2p2), (__m64 *) (s + 2 * p)));
- q1p1 = _mm_loadl_epi64((__m128i *) (s - 2 * p));
- q1p1 = _mm_castps_si128(
- _mm_loadh_pi(_mm_castsi128_ps(q1p1), (__m64 *) (s + 1 * p)));
- p1q1 = _mm_shuffle_epi32(q1p1, 78);
- q0p0 = _mm_loadl_epi64((__m128i *) (s - 1 * p));
- q0p0 = _mm_castps_si128(
- _mm_loadh_pi(_mm_castsi128_ps(q0p0), (__m64 *) (s - 0 * p)));
- p0q0 = _mm_shuffle_epi32(q0p0, 78);
-
- {
- __m128i abs_p1q1, abs_p0q0, abs_q1q0, fe, ff, work;
- abs_p1p0 = _mm_or_si128(_mm_subs_epu8(q1p1, q0p0),
- _mm_subs_epu8(q0p0, q1p1));
- abs_q1q0 = _mm_srli_si128(abs_p1p0, 8);
- fe = _mm_set1_epi8(0xfe);
- ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0);
- abs_p0q0 = _mm_or_si128(_mm_subs_epu8(q0p0, p0q0),
- _mm_subs_epu8(p0q0, q0p0));
- abs_p1q1 = _mm_or_si128(_mm_subs_epu8(q1p1, p1q1),
- _mm_subs_epu8(p1q1, q1p1));
- flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
- hev = _mm_subs_epu8(flat, thresh);
- hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
-
- abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
- abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
- mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit);
- mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
- // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
- mask = _mm_max_epu8(abs_p1p0, mask);
- // mask |= (abs(p1 - p0) > limit) * -1;
- // mask |= (abs(q1 - q0) > limit) * -1;
-
- work = _mm_max_epu8(
- _mm_or_si128(_mm_subs_epu8(q2p2, q1p1),
- _mm_subs_epu8(q1p1, q2p2)),
- _mm_or_si128(_mm_subs_epu8(q3p3, q2p2),
- _mm_subs_epu8(q2p2, q3p3)));
- mask = _mm_max_epu8(work, mask);
- mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 8));
- mask = _mm_subs_epu8(mask, limit);
- mask = _mm_cmpeq_epi8(mask, zero);
- }
-
- // lp filter
- {
- const __m128i t4 = _mm_set1_epi8(4);
- const __m128i t3 = _mm_set1_epi8(3);
- const __m128i t80 = _mm_set1_epi8(0x80);
- const __m128i t1 = _mm_set1_epi16(0x1);
- __m128i qs1ps1 = _mm_xor_si128(q1p1, t80);
- __m128i qs0ps0 = _mm_xor_si128(q0p0, t80);
- __m128i qs0 = _mm_xor_si128(p0q0, t80);
- __m128i qs1 = _mm_xor_si128(p1q1, t80);
- __m128i filt;
- __m128i work_a;
- __m128i filter1, filter2;
- __m128i flat2_q6p6, flat2_q5p5, flat2_q4p4, flat2_q3p3, flat2_q2p2;
- __m128i flat2_q1p1, flat2_q0p0, flat_q2p2, flat_q1p1, flat_q0p0;
-
- filt = _mm_and_si128(_mm_subs_epi8(qs1ps1, qs1), hev);
- work_a = _mm_subs_epi8(qs0, qs0ps0);
- filt = _mm_adds_epi8(filt, work_a);
- filt = _mm_adds_epi8(filt, work_a);
- filt = _mm_adds_epi8(filt, work_a);
- /* (vpx_filter + 3 * (qs0 - ps0)) & mask */
- filt = _mm_and_si128(filt, mask);
-
- filter1 = _mm_adds_epi8(filt, t4);
- filter2 = _mm_adds_epi8(filt, t3);
-
- filter1 = _mm_unpacklo_epi8(zero, filter1);
- filter1 = _mm_srai_epi16(filter1, 0xB);
- filter2 = _mm_unpacklo_epi8(zero, filter2);
- filter2 = _mm_srai_epi16(filter2, 0xB);
-
- /* Filter1 >> 3 */
- filt = _mm_packs_epi16(filter2, _mm_subs_epi16(zero, filter1));
- qs0ps0 = _mm_xor_si128(_mm_adds_epi8(qs0ps0, filt), t80);
-
- /* filt >> 1 */
- filt = _mm_adds_epi16(filter1, t1);
- filt = _mm_srai_epi16(filt, 1);
- filt = _mm_andnot_si128(
- _mm_srai_epi16(_mm_unpacklo_epi8(zero, hev), 0x8), filt);
- filt = _mm_packs_epi16(filt, _mm_subs_epi16(zero, filt));
- qs1ps1 = _mm_xor_si128(_mm_adds_epi8(qs1ps1, filt), t80);
- // loopfilter done
-
- {
- __m128i work;
- flat = _mm_max_epu8(
- _mm_or_si128(_mm_subs_epu8(q2p2, q0p0),
- _mm_subs_epu8(q0p0, q2p2)),
- _mm_or_si128(_mm_subs_epu8(q3p3, q0p0),
- _mm_subs_epu8(q0p0, q3p3)));
- flat = _mm_max_epu8(abs_p1p0, flat);
- flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8));
- flat = _mm_subs_epu8(flat, one);
- flat = _mm_cmpeq_epi8(flat, zero);
- flat = _mm_and_si128(flat, mask);
-
- q5p5 = _mm_loadl_epi64((__m128i *) (s - 6 * p));
- q5p5 = _mm_castps_si128(
- _mm_loadh_pi(_mm_castsi128_ps(q5p5),
- (__m64 *) (s + 5 * p)));
-
- q6p6 = _mm_loadl_epi64((__m128i *) (s - 7 * p));
- q6p6 = _mm_castps_si128(
- _mm_loadh_pi(_mm_castsi128_ps(q6p6),
- (__m64 *) (s + 6 * p)));
-
- flat2 = _mm_max_epu8(
- _mm_or_si128(_mm_subs_epu8(q4p4, q0p0),
- _mm_subs_epu8(q0p0, q4p4)),
- _mm_or_si128(_mm_subs_epu8(q5p5, q0p0),
- _mm_subs_epu8(q0p0, q5p5)));
-
- q7p7 = _mm_loadl_epi64((__m128i *) (s - 8 * p));
- q7p7 = _mm_castps_si128(
- _mm_loadh_pi(_mm_castsi128_ps(q7p7),
- (__m64 *) (s + 7 * p)));
-
- work = _mm_max_epu8(
- _mm_or_si128(_mm_subs_epu8(q6p6, q0p0),
- _mm_subs_epu8(q0p0, q6p6)),
- _mm_or_si128(_mm_subs_epu8(q7p7, q0p0),
- _mm_subs_epu8(q0p0, q7p7)));
-
- flat2 = _mm_max_epu8(work, flat2);
- flat2 = _mm_max_epu8(flat2, _mm_srli_si128(flat2, 8));
- flat2 = _mm_subs_epu8(flat2, one);
- flat2 = _mm_cmpeq_epi8(flat2, zero);
- flat2 = _mm_and_si128(flat2, flat); // flat2 & flat & mask
- }
-
- // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- // flat and wide flat calculations
- {
- const __m128i eight = _mm_set1_epi16(8);
- const __m128i four = _mm_set1_epi16(4);
- __m128i p7_16, p6_16, p5_16, p4_16, p3_16, p2_16, p1_16, p0_16;
- __m128i q7_16, q6_16, q5_16, q4_16, q3_16, q2_16, q1_16, q0_16;
- __m128i pixelFilter_p, pixelFilter_q;
- __m128i pixetFilter_p2p1p0, pixetFilter_q2q1q0;
- __m128i sum_p7, sum_q7, sum_p3, sum_q3, res_p, res_q;
-
- p7_16 = _mm_unpacklo_epi8(q7p7, zero);
- p6_16 = _mm_unpacklo_epi8(q6p6, zero);
- p5_16 = _mm_unpacklo_epi8(q5p5, zero);
- p4_16 = _mm_unpacklo_epi8(q4p4, zero);
- p3_16 = _mm_unpacklo_epi8(q3p3, zero);
- p2_16 = _mm_unpacklo_epi8(q2p2, zero);
- p1_16 = _mm_unpacklo_epi8(q1p1, zero);
- p0_16 = _mm_unpacklo_epi8(q0p0, zero);
- q0_16 = _mm_unpackhi_epi8(q0p0, zero);
- q1_16 = _mm_unpackhi_epi8(q1p1, zero);
- q2_16 = _mm_unpackhi_epi8(q2p2, zero);
- q3_16 = _mm_unpackhi_epi8(q3p3, zero);
- q4_16 = _mm_unpackhi_epi8(q4p4, zero);
- q5_16 = _mm_unpackhi_epi8(q5p5, zero);
- q6_16 = _mm_unpackhi_epi8(q6p6, zero);
- q7_16 = _mm_unpackhi_epi8(q7p7, zero);
-
- pixelFilter_p = _mm_add_epi16(_mm_add_epi16(p6_16, p5_16),
- _mm_add_epi16(p4_16, p3_16));
- pixelFilter_q = _mm_add_epi16(_mm_add_epi16(q6_16, q5_16),
- _mm_add_epi16(q4_16, q3_16));
-
- pixetFilter_p2p1p0 = _mm_add_epi16(p0_16,
- _mm_add_epi16(p2_16, p1_16));
- pixelFilter_p = _mm_add_epi16(pixelFilter_p, pixetFilter_p2p1p0);
-
- pixetFilter_q2q1q0 = _mm_add_epi16(q0_16,
- _mm_add_epi16(q2_16, q1_16));
- pixelFilter_q = _mm_add_epi16(pixelFilter_q, pixetFilter_q2q1q0);
- pixelFilter_p = _mm_add_epi16(eight,
- _mm_add_epi16(pixelFilter_p, pixelFilter_q));
- pixetFilter_p2p1p0 = _mm_add_epi16(four,
- _mm_add_epi16(pixetFilter_p2p1p0, pixetFilter_q2q1q0));
- res_p = _mm_srli_epi16(
- _mm_add_epi16(pixelFilter_p, _mm_add_epi16(p7_16, p0_16)),
- 4);
- res_q = _mm_srli_epi16(
- _mm_add_epi16(pixelFilter_p, _mm_add_epi16(q7_16, q0_16)),
- 4);
- flat2_q0p0 = _mm_packus_epi16(res_p, res_q);
- res_p = _mm_srli_epi16(
- _mm_add_epi16(pixetFilter_p2p1p0,
- _mm_add_epi16(p3_16, p0_16)), 3);
- res_q = _mm_srli_epi16(
- _mm_add_epi16(pixetFilter_p2p1p0,
- _mm_add_epi16(q3_16, q0_16)), 3);
-
- flat_q0p0 = _mm_packus_epi16(res_p, res_q);
-
- sum_p7 = _mm_add_epi16(p7_16, p7_16);
- sum_q7 = _mm_add_epi16(q7_16, q7_16);
- sum_p3 = _mm_add_epi16(p3_16, p3_16);
- sum_q3 = _mm_add_epi16(q3_16, q3_16);
-
- pixelFilter_q = _mm_sub_epi16(pixelFilter_p, p6_16);
- pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q6_16);
- res_p = _mm_srli_epi16(
- _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p1_16)),
- 4);
- res_q = _mm_srli_epi16(
- _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q1_16)),
- 4);
- flat2_q1p1 = _mm_packus_epi16(res_p, res_q);
-
- pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_p2p1p0, p2_16);
- pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q2_16);
- res_p = _mm_srli_epi16(
- _mm_add_epi16(pixetFilter_p2p1p0,
- _mm_add_epi16(sum_p3, p1_16)), 3);
- res_q = _mm_srli_epi16(
- _mm_add_epi16(pixetFilter_q2q1q0,
- _mm_add_epi16(sum_q3, q1_16)), 3);
- flat_q1p1 = _mm_packus_epi16(res_p, res_q);
-
- sum_p7 = _mm_add_epi16(sum_p7, p7_16);
- sum_q7 = _mm_add_epi16(sum_q7, q7_16);
- sum_p3 = _mm_add_epi16(sum_p3, p3_16);
- sum_q3 = _mm_add_epi16(sum_q3, q3_16);
-
- pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q5_16);
- pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p5_16);
- res_p = _mm_srli_epi16(
- _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p2_16)),
- 4);
- res_q = _mm_srli_epi16(
- _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q2_16)),
- 4);
- flat2_q2p2 = _mm_packus_epi16(res_p, res_q);
-
- pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q1_16);
- pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_q2q1q0, p1_16);
-
- res_p = _mm_srli_epi16(
- _mm_add_epi16(pixetFilter_p2p1p0,
- _mm_add_epi16(sum_p3, p2_16)), 3);
- res_q = _mm_srli_epi16(
- _mm_add_epi16(pixetFilter_q2q1q0,
- _mm_add_epi16(sum_q3, q2_16)), 3);
- flat_q2p2 = _mm_packus_epi16(res_p, res_q);
-
- sum_p7 = _mm_add_epi16(sum_p7, p7_16);
- sum_q7 = _mm_add_epi16(sum_q7, q7_16);
- pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q4_16);
- pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p4_16);
- res_p = _mm_srli_epi16(
- _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p3_16)),
- 4);
- res_q = _mm_srli_epi16(
- _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q3_16)),
- 4);
- flat2_q3p3 = _mm_packus_epi16(res_p, res_q);
-
- sum_p7 = _mm_add_epi16(sum_p7, p7_16);
- sum_q7 = _mm_add_epi16(sum_q7, q7_16);
- pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q3_16);
- pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p3_16);
- res_p = _mm_srli_epi16(
- _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p4_16)),
- 4);
- res_q = _mm_srli_epi16(
- _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q4_16)),
- 4);
- flat2_q4p4 = _mm_packus_epi16(res_p, res_q);
-
- sum_p7 = _mm_add_epi16(sum_p7, p7_16);
- sum_q7 = _mm_add_epi16(sum_q7, q7_16);
- pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q2_16);
- pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p2_16);
- res_p = _mm_srli_epi16(
- _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p5_16)),
- 4);
- res_q = _mm_srli_epi16(
- _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q5_16)),
- 4);
- flat2_q5p5 = _mm_packus_epi16(res_p, res_q);
-
- sum_p7 = _mm_add_epi16(sum_p7, p7_16);
- sum_q7 = _mm_add_epi16(sum_q7, q7_16);
- pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q1_16);
- pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p1_16);
- res_p = _mm_srli_epi16(
- _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p6_16)),
- 4);
- res_q = _mm_srli_epi16(
- _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q6_16)),
- 4);
- flat2_q6p6 = _mm_packus_epi16(res_p, res_q);
- }
- // wide flat
- // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
- flat = _mm_shuffle_epi32(flat, 68);
- flat2 = _mm_shuffle_epi32(flat2, 68);
-
- q2p2 = _mm_andnot_si128(flat, q2p2);
- flat_q2p2 = _mm_and_si128(flat, flat_q2p2);
- q2p2 = _mm_or_si128(q2p2, flat_q2p2);
-
- qs1ps1 = _mm_andnot_si128(flat, qs1ps1);
- flat_q1p1 = _mm_and_si128(flat, flat_q1p1);
- q1p1 = _mm_or_si128(qs1ps1, flat_q1p1);
-
- qs0ps0 = _mm_andnot_si128(flat, qs0ps0);
- flat_q0p0 = _mm_and_si128(flat, flat_q0p0);
- q0p0 = _mm_or_si128(qs0ps0, flat_q0p0);
-
- q6p6 = _mm_andnot_si128(flat2, q6p6);
- flat2_q6p6 = _mm_and_si128(flat2, flat2_q6p6);
- q6p6 = _mm_or_si128(q6p6, flat2_q6p6);
- _mm_storel_epi64((__m128i *) (s - 7 * p), q6p6);
- _mm_storeh_pi((__m64 *) (s + 6 * p), _mm_castsi128_ps(q6p6));
-
- q5p5 = _mm_andnot_si128(flat2, q5p5);
- flat2_q5p5 = _mm_and_si128(flat2, flat2_q5p5);
- q5p5 = _mm_or_si128(q5p5, flat2_q5p5);
- _mm_storel_epi64((__m128i *) (s - 6 * p), q5p5);
- _mm_storeh_pi((__m64 *) (s + 5 * p), _mm_castsi128_ps(q5p5));
-
- q4p4 = _mm_andnot_si128(flat2, q4p4);
- flat2_q4p4 = _mm_and_si128(flat2, flat2_q4p4);
- q4p4 = _mm_or_si128(q4p4, flat2_q4p4);
- _mm_storel_epi64((__m128i *) (s - 5 * p), q4p4);
- _mm_storeh_pi((__m64 *) (s + 4 * p), _mm_castsi128_ps(q4p4));
-
- q3p3 = _mm_andnot_si128(flat2, q3p3);
- flat2_q3p3 = _mm_and_si128(flat2, flat2_q3p3);
- q3p3 = _mm_or_si128(q3p3, flat2_q3p3);
- _mm_storel_epi64((__m128i *) (s - 4 * p), q3p3);
- _mm_storeh_pi((__m64 *) (s + 3 * p), _mm_castsi128_ps(q3p3));
-
- q2p2 = _mm_andnot_si128(flat2, q2p2);
- flat2_q2p2 = _mm_and_si128(flat2, flat2_q2p2);
- q2p2 = _mm_or_si128(q2p2, flat2_q2p2);
- _mm_storel_epi64((__m128i *) (s - 3 * p), q2p2);
- _mm_storeh_pi((__m64 *) (s + 2 * p), _mm_castsi128_ps(q2p2));
-
- q1p1 = _mm_andnot_si128(flat2, q1p1);
- flat2_q1p1 = _mm_and_si128(flat2, flat2_q1p1);
- q1p1 = _mm_or_si128(q1p1, flat2_q1p1);
- _mm_storel_epi64((__m128i *) (s - 2 * p), q1p1);
- _mm_storeh_pi((__m64 *) (s + 1 * p), _mm_castsi128_ps(q1p1));
-
- q0p0 = _mm_andnot_si128(flat2, q0p0);
- flat2_q0p0 = _mm_and_si128(flat2, flat2_q0p0);
- q0p0 = _mm_or_si128(q0p0, flat2_q0p0);
- _mm_storel_epi64((__m128i *) (s - 1 * p), q0p0);
- _mm_storeh_pi((__m64 *) (s - 0 * p), _mm_castsi128_ps(q0p0));
- }
-}
-
-DECLARE_ALIGNED(32, static const uint8_t, filt_loopfilter_avx2[32]) = {
- 0, 128, 1, 128, 2, 128, 3, 128, 4, 128, 5, 128, 6, 128, 7, 128,
- 8, 128, 9, 128, 10, 128, 11, 128, 12, 128, 13, 128, 14, 128, 15, 128
-};
-
-void vpx_lpf_horizontal_edge_16_avx2(unsigned char *s, int p,
- const unsigned char *_blimit,
- const unsigned char *_limit,
- const unsigned char *_thresh) {
- __m128i mask, hev, flat, flat2;
- const __m128i zero = _mm_set1_epi16(0);
- const __m128i one = _mm_set1_epi8(1);
- __m128i p7, p6, p5;
- __m128i p4, p3, p2, p1, p0, q0, q1, q2, q3, q4;
- __m128i q5, q6, q7;
- __m256i p256_7, q256_7, p256_6, q256_6, p256_5, q256_5, p256_4,
- q256_4, p256_3, q256_3, p256_2, q256_2, p256_1, q256_1,
- p256_0, q256_0;
-
- const __m128i thresh = _mm_broadcastb_epi8(
- _mm_cvtsi32_si128((int) _thresh[0]));
- const __m128i limit = _mm_broadcastb_epi8(
- _mm_cvtsi32_si128((int) _limit[0]));
- const __m128i blimit = _mm_broadcastb_epi8(
- _mm_cvtsi32_si128((int) _blimit[0]));
-
- p256_4 = _mm256_castpd_si256(_mm256_broadcast_pd(
- (__m128d const *)(s - 5 * p)));
- p256_3 = _mm256_castpd_si256(_mm256_broadcast_pd(
- (__m128d const *)(s - 4 * p)));
- p256_2 = _mm256_castpd_si256(_mm256_broadcast_pd(
- (__m128d const *)(s - 3 * p)));
- p256_1 = _mm256_castpd_si256(_mm256_broadcast_pd(
- (__m128d const *)(s - 2 * p)));
- p256_0 = _mm256_castpd_si256(_mm256_broadcast_pd(
- (__m128d const *)(s - 1 * p)));
- q256_0 = _mm256_castpd_si256(_mm256_broadcast_pd(
- (__m128d const *)(s - 0 * p)));
- q256_1 = _mm256_castpd_si256(_mm256_broadcast_pd(
- (__m128d const *)(s + 1 * p)));
- q256_2 = _mm256_castpd_si256(_mm256_broadcast_pd(
- (__m128d const *)(s + 2 * p)));
- q256_3 = _mm256_castpd_si256(_mm256_broadcast_pd(
- (__m128d const *)(s + 3 * p)));
- q256_4 = _mm256_castpd_si256(_mm256_broadcast_pd(
- (__m128d const *)(s + 4 * p)));
-
- p4 = _mm256_castsi256_si128(p256_4);
- p3 = _mm256_castsi256_si128(p256_3);
- p2 = _mm256_castsi256_si128(p256_2);
- p1 = _mm256_castsi256_si128(p256_1);
- p0 = _mm256_castsi256_si128(p256_0);
- q0 = _mm256_castsi256_si128(q256_0);
- q1 = _mm256_castsi256_si128(q256_1);
- q2 = _mm256_castsi256_si128(q256_2);
- q3 = _mm256_castsi256_si128(q256_3);
- q4 = _mm256_castsi256_si128(q256_4);
-
- {
- const __m128i abs_p1p0 = _mm_or_si128(_mm_subs_epu8(p1, p0),
- _mm_subs_epu8(p0, p1));
- const __m128i abs_q1q0 = _mm_or_si128(_mm_subs_epu8(q1, q0),
- _mm_subs_epu8(q0, q1));
- const __m128i fe = _mm_set1_epi8(0xfe);
- const __m128i ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0);
- __m128i abs_p0q0 = _mm_or_si128(_mm_subs_epu8(p0, q0),
- _mm_subs_epu8(q0, p0));
- __m128i abs_p1q1 = _mm_or_si128(_mm_subs_epu8(p1, q1),
- _mm_subs_epu8(q1, p1));
- __m128i work;
- flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
- hev = _mm_subs_epu8(flat, thresh);
- hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
-
- abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
- abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
- mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit);
- mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
- // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
- mask = _mm_max_epu8(flat, mask);
- // mask |= (abs(p1 - p0) > limit) * -1;
- // mask |= (abs(q1 - q0) > limit) * -1;
- work = _mm_max_epu8(
- _mm_or_si128(_mm_subs_epu8(p2, p1), _mm_subs_epu8(p1, p2)),
- _mm_or_si128(_mm_subs_epu8(p3, p2), _mm_subs_epu8(p2, p3)));
- mask = _mm_max_epu8(work, mask);
- work = _mm_max_epu8(
- _mm_or_si128(_mm_subs_epu8(q2, q1), _mm_subs_epu8(q1, q2)),
- _mm_or_si128(_mm_subs_epu8(q3, q2), _mm_subs_epu8(q2, q3)));
- mask = _mm_max_epu8(work, mask);
- mask = _mm_subs_epu8(mask, limit);
- mask = _mm_cmpeq_epi8(mask, zero);
- }
-
- // lp filter
- {
- const __m128i t4 = _mm_set1_epi8(4);
- const __m128i t3 = _mm_set1_epi8(3);
- const __m128i t80 = _mm_set1_epi8(0x80);
- const __m128i te0 = _mm_set1_epi8(0xe0);
- const __m128i t1f = _mm_set1_epi8(0x1f);
- const __m128i t1 = _mm_set1_epi8(0x1);
- const __m128i t7f = _mm_set1_epi8(0x7f);
-
- __m128i ps1 = _mm_xor_si128(p1, t80);
- __m128i ps0 = _mm_xor_si128(p0, t80);
- __m128i qs0 = _mm_xor_si128(q0, t80);
- __m128i qs1 = _mm_xor_si128(q1, t80);
- __m128i filt;
- __m128i work_a;
- __m128i filter1, filter2;
- __m128i flat2_p6, flat2_p5, flat2_p4, flat2_p3, flat2_p2, flat2_p1,
- flat2_p0, flat2_q0, flat2_q1, flat2_q2, flat2_q3, flat2_q4,
- flat2_q5, flat2_q6, flat_p2, flat_p1, flat_p0, flat_q0, flat_q1,
- flat_q2;
-
- filt = _mm_and_si128(_mm_subs_epi8(ps1, qs1), hev);
- work_a = _mm_subs_epi8(qs0, ps0);
- filt = _mm_adds_epi8(filt, work_a);
- filt = _mm_adds_epi8(filt, work_a);
- filt = _mm_adds_epi8(filt, work_a);
- /* (vpx_filter + 3 * (qs0 - ps0)) & mask */
- filt = _mm_and_si128(filt, mask);
-
- filter1 = _mm_adds_epi8(filt, t4);
- filter2 = _mm_adds_epi8(filt, t3);
-
- /* Filter1 >> 3 */
- work_a = _mm_cmpgt_epi8(zero, filter1);
- filter1 = _mm_srli_epi16(filter1, 3);
- work_a = _mm_and_si128(work_a, te0);
- filter1 = _mm_and_si128(filter1, t1f);
- filter1 = _mm_or_si128(filter1, work_a);
- qs0 = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80);
-
- /* Filter2 >> 3 */
- work_a = _mm_cmpgt_epi8(zero, filter2);
- filter2 = _mm_srli_epi16(filter2, 3);
- work_a = _mm_and_si128(work_a, te0);
- filter2 = _mm_and_si128(filter2, t1f);
- filter2 = _mm_or_si128(filter2, work_a);
- ps0 = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80);
-
- /* filt >> 1 */
- filt = _mm_adds_epi8(filter1, t1);
- work_a = _mm_cmpgt_epi8(zero, filt);
- filt = _mm_srli_epi16(filt, 1);
- work_a = _mm_and_si128(work_a, t80);
- filt = _mm_and_si128(filt, t7f);
- filt = _mm_or_si128(filt, work_a);
- filt = _mm_andnot_si128(hev, filt);
- ps1 = _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80);
- qs1 = _mm_xor_si128(_mm_subs_epi8(qs1, filt), t80);
- // loopfilter done
-
- {
- __m128i work;
- work = _mm_max_epu8(
- _mm_or_si128(_mm_subs_epu8(p2, p0), _mm_subs_epu8(p0, p2)),
- _mm_or_si128(_mm_subs_epu8(q2, q0), _mm_subs_epu8(q0, q2)));
- flat = _mm_max_epu8(work, flat);
- work = _mm_max_epu8(
- _mm_or_si128(_mm_subs_epu8(p3, p0), _mm_subs_epu8(p0, p3)),
- _mm_or_si128(_mm_subs_epu8(q3, q0), _mm_subs_epu8(q0, q3)));
- flat = _mm_max_epu8(work, flat);
- work = _mm_max_epu8(
- _mm_or_si128(_mm_subs_epu8(p4, p0), _mm_subs_epu8(p0, p4)),
- _mm_or_si128(_mm_subs_epu8(q4, q0), _mm_subs_epu8(q0, q4)));
- flat = _mm_subs_epu8(flat, one);
- flat = _mm_cmpeq_epi8(flat, zero);
- flat = _mm_and_si128(flat, mask);
-
- p256_5 = _mm256_castpd_si256(_mm256_broadcast_pd(
- (__m128d const *)(s - 6 * p)));
- q256_5 = _mm256_castpd_si256(_mm256_broadcast_pd(
- (__m128d const *)(s + 5 * p)));
- p5 = _mm256_castsi256_si128(p256_5);
- q5 = _mm256_castsi256_si128(q256_5);
- flat2 = _mm_max_epu8(
- _mm_or_si128(_mm_subs_epu8(p5, p0), _mm_subs_epu8(p0, p5)),
- _mm_or_si128(_mm_subs_epu8(q5, q0), _mm_subs_epu8(q0, q5)));
-
- flat2 = _mm_max_epu8(work, flat2);
- p256_6 = _mm256_castpd_si256(_mm256_broadcast_pd(
- (__m128d const *)(s - 7 * p)));
- q256_6 = _mm256_castpd_si256(_mm256_broadcast_pd(
- (__m128d const *)(s + 6 * p)));
- p6 = _mm256_castsi256_si128(p256_6);
- q6 = _mm256_castsi256_si128(q256_6);
- work = _mm_max_epu8(
- _mm_or_si128(_mm_subs_epu8(p6, p0), _mm_subs_epu8(p0, p6)),
- _mm_or_si128(_mm_subs_epu8(q6, q0), _mm_subs_epu8(q0, q6)));
-
- flat2 = _mm_max_epu8(work, flat2);
-
- p256_7 = _mm256_castpd_si256(_mm256_broadcast_pd(
- (__m128d const *)(s - 8 * p)));
- q256_7 = _mm256_castpd_si256(_mm256_broadcast_pd(
- (__m128d const *)(s + 7 * p)));
- p7 = _mm256_castsi256_si128(p256_7);
- q7 = _mm256_castsi256_si128(q256_7);
- work = _mm_max_epu8(
- _mm_or_si128(_mm_subs_epu8(p7, p0), _mm_subs_epu8(p0, p7)),
- _mm_or_si128(_mm_subs_epu8(q7, q0), _mm_subs_epu8(q0, q7)));
-
- flat2 = _mm_max_epu8(work, flat2);
- flat2 = _mm_subs_epu8(flat2, one);
- flat2 = _mm_cmpeq_epi8(flat2, zero);
- flat2 = _mm_and_si128(flat2, flat); // flat2 & flat & mask
- }
-
- // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- // flat and wide flat calculations
- {
- const __m256i eight = _mm256_set1_epi16(8);
- const __m256i four = _mm256_set1_epi16(4);
- __m256i pixelFilter_p, pixelFilter_q, pixetFilter_p2p1p0,
- pixetFilter_q2q1q0, sum_p7, sum_q7, sum_p3, sum_q3, res_p,
- res_q;
-
- const __m256i filter = _mm256_load_si256(
- (__m256i const *)filt_loopfilter_avx2);
- p256_7 = _mm256_shuffle_epi8(p256_7, filter);
- p256_6 = _mm256_shuffle_epi8(p256_6, filter);
- p256_5 = _mm256_shuffle_epi8(p256_5, filter);
- p256_4 = _mm256_shuffle_epi8(p256_4, filter);
- p256_3 = _mm256_shuffle_epi8(p256_3, filter);
- p256_2 = _mm256_shuffle_epi8(p256_2, filter);
- p256_1 = _mm256_shuffle_epi8(p256_1, filter);
- p256_0 = _mm256_shuffle_epi8(p256_0, filter);
- q256_0 = _mm256_shuffle_epi8(q256_0, filter);
- q256_1 = _mm256_shuffle_epi8(q256_1, filter);
- q256_2 = _mm256_shuffle_epi8(q256_2, filter);
- q256_3 = _mm256_shuffle_epi8(q256_3, filter);
- q256_4 = _mm256_shuffle_epi8(q256_4, filter);
- q256_5 = _mm256_shuffle_epi8(q256_5, filter);
- q256_6 = _mm256_shuffle_epi8(q256_6, filter);
- q256_7 = _mm256_shuffle_epi8(q256_7, filter);
-
- pixelFilter_p = _mm256_add_epi16(_mm256_add_epi16(p256_6, p256_5),
- _mm256_add_epi16(p256_4, p256_3));
- pixelFilter_q = _mm256_add_epi16(_mm256_add_epi16(q256_6, q256_5),
- _mm256_add_epi16(q256_4, q256_3));
-
- pixetFilter_p2p1p0 = _mm256_add_epi16(p256_0,
- _mm256_add_epi16(p256_2, p256_1));
- pixelFilter_p = _mm256_add_epi16(pixelFilter_p, pixetFilter_p2p1p0);
-
- pixetFilter_q2q1q0 = _mm256_add_epi16(q256_0,
- _mm256_add_epi16(q256_2, q256_1));
- pixelFilter_q = _mm256_add_epi16(pixelFilter_q, pixetFilter_q2q1q0);
-
- pixelFilter_p = _mm256_add_epi16(eight,
- _mm256_add_epi16(pixelFilter_p, pixelFilter_q));
-
- pixetFilter_p2p1p0 = _mm256_add_epi16(four,
- _mm256_add_epi16(pixetFilter_p2p1p0, pixetFilter_q2q1q0));
-
- res_p = _mm256_srli_epi16(
- _mm256_add_epi16(pixelFilter_p,
- _mm256_add_epi16(p256_7, p256_0)), 4);
-
- flat2_p0 = _mm256_castsi256_si128(
- _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p),
- 168));
-
- res_q = _mm256_srli_epi16(
- _mm256_add_epi16(pixelFilter_p,
- _mm256_add_epi16(q256_7, q256_0)), 4);
-
- flat2_q0 = _mm256_castsi256_si128(
- _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q),
- 168));
-
- res_p = _mm256_srli_epi16(
- _mm256_add_epi16(pixetFilter_p2p1p0,
- _mm256_add_epi16(p256_3, p256_0)), 3);
-
- flat_p0 = _mm256_castsi256_si128(
- _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p),
- 168));
-
- res_q = _mm256_srli_epi16(
- _mm256_add_epi16(pixetFilter_p2p1p0,
- _mm256_add_epi16(q256_3, q256_0)), 3);
-
- flat_q0 = _mm256_castsi256_si128(
- _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q),
- 168));
-
- sum_p7 = _mm256_add_epi16(p256_7, p256_7);
-
- sum_q7 = _mm256_add_epi16(q256_7, q256_7);
-
- sum_p3 = _mm256_add_epi16(p256_3, p256_3);
-
- sum_q3 = _mm256_add_epi16(q256_3, q256_3);
-
- pixelFilter_q = _mm256_sub_epi16(pixelFilter_p, p256_6);
-
- pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_6);
-
- res_p = _mm256_srli_epi16(
- _mm256_add_epi16(pixelFilter_p,
- _mm256_add_epi16(sum_p7, p256_1)), 4);
-
- flat2_p1 = _mm256_castsi256_si128(
- _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p),
- 168));
-
- res_q = _mm256_srli_epi16(
- _mm256_add_epi16(pixelFilter_q,
- _mm256_add_epi16(sum_q7, q256_1)), 4);
-
- flat2_q1 = _mm256_castsi256_si128(
- _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q),
- 168));
-
- pixetFilter_q2q1q0 = _mm256_sub_epi16(pixetFilter_p2p1p0, p256_2);
-
- pixetFilter_p2p1p0 = _mm256_sub_epi16(pixetFilter_p2p1p0, q256_2);
-
- res_p = _mm256_srli_epi16(
- _mm256_add_epi16(pixetFilter_p2p1p0,
- _mm256_add_epi16(sum_p3, p256_1)), 3);
-
- flat_p1 = _mm256_castsi256_si128(
- _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p),
- 168));
-
- res_q = _mm256_srli_epi16(
- _mm256_add_epi16(pixetFilter_q2q1q0,
- _mm256_add_epi16(sum_q3, q256_1)), 3);
-
- flat_q1 = _mm256_castsi256_si128(
- _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q),
- 168));
-
- sum_p7 = _mm256_add_epi16(sum_p7, p256_7);
-
- sum_q7 = _mm256_add_epi16(sum_q7, q256_7);
-
- sum_p3 = _mm256_add_epi16(sum_p3, p256_3);
-
- sum_q3 = _mm256_add_epi16(sum_q3, q256_3);
-
- pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_5);
-
- pixelFilter_q = _mm256_sub_epi16(pixelFilter_q, p256_5);
-
- res_p = _mm256_srli_epi16(
- _mm256_add_epi16(pixelFilter_p,
- _mm256_add_epi16(sum_p7, p256_2)), 4);
-
- flat2_p2 = _mm256_castsi256_si128(
- _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p),
- 168));
-
- res_q = _mm256_srli_epi16(
- _mm256_add_epi16(pixelFilter_q,
- _mm256_add_epi16(sum_q7, q256_2)), 4);
-
- flat2_q2 = _mm256_castsi256_si128(
- _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q),
- 168));
-
- pixetFilter_p2p1p0 = _mm256_sub_epi16(pixetFilter_p2p1p0, q256_1);
-
- pixetFilter_q2q1q0 = _mm256_sub_epi16(pixetFilter_q2q1q0, p256_1);
-
- res_p = _mm256_srli_epi16(
- _mm256_add_epi16(pixetFilter_p2p1p0,
- _mm256_add_epi16(sum_p3, p256_2)), 3);
-
- flat_p2 = _mm256_castsi256_si128(
- _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p),
- 168));
-
- res_q = _mm256_srli_epi16(
- _mm256_add_epi16(pixetFilter_q2q1q0,
- _mm256_add_epi16(sum_q3, q256_2)), 3);
-
- flat_q2 = _mm256_castsi256_si128(
- _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q),
- 168));
-
- sum_p7 = _mm256_add_epi16(sum_p7, p256_7);
-
- sum_q7 = _mm256_add_epi16(sum_q7, q256_7);
-
- pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_4);
-
- pixelFilter_q = _mm256_sub_epi16(pixelFilter_q, p256_4);
-
- res_p = _mm256_srli_epi16(
- _mm256_add_epi16(pixelFilter_p,
- _mm256_add_epi16(sum_p7, p256_3)), 4);
-
- flat2_p3 = _mm256_castsi256_si128(
- _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p),
- 168));
-
- res_q = _mm256_srli_epi16(
- _mm256_add_epi16(pixelFilter_q,
- _mm256_add_epi16(sum_q7, q256_3)), 4);
-
- flat2_q3 = _mm256_castsi256_si128(
- _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q),
- 168));
-
- sum_p7 = _mm256_add_epi16(sum_p7, p256_7);
-
- sum_q7 = _mm256_add_epi16(sum_q7, q256_7);
-
- pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_3);
-
- pixelFilter_q = _mm256_sub_epi16(pixelFilter_q, p256_3);
-
- res_p = _mm256_srli_epi16(
- _mm256_add_epi16(pixelFilter_p,
- _mm256_add_epi16(sum_p7, p256_4)), 4);
-
- flat2_p4 = _mm256_castsi256_si128(
- _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p),
- 168));
-
- res_q = _mm256_srli_epi16(
- _mm256_add_epi16(pixelFilter_q,
- _mm256_add_epi16(sum_q7, q256_4)), 4);
-
- flat2_q4 = _mm256_castsi256_si128(
- _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q),
- 168));
-
- sum_p7 = _mm256_add_epi16(sum_p7, p256_7);
-
- sum_q7 = _mm256_add_epi16(sum_q7, q256_7);
-
- pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_2);
-
- pixelFilter_q = _mm256_sub_epi16(pixelFilter_q, p256_2);
-
- res_p = _mm256_srli_epi16(
- _mm256_add_epi16(pixelFilter_p,
- _mm256_add_epi16(sum_p7, p256_5)), 4);
-
- flat2_p5 = _mm256_castsi256_si128(
- _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p),
- 168));
-
- res_q = _mm256_srli_epi16(
- _mm256_add_epi16(pixelFilter_q,
- _mm256_add_epi16(sum_q7, q256_5)), 4);
-
- flat2_q5 = _mm256_castsi256_si128(
- _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q),
- 168));
-
- sum_p7 = _mm256_add_epi16(sum_p7, p256_7);
-
- sum_q7 = _mm256_add_epi16(sum_q7, q256_7);
-
- pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_1);
-
- pixelFilter_q = _mm256_sub_epi16(pixelFilter_q, p256_1);
-
- res_p = _mm256_srli_epi16(
- _mm256_add_epi16(pixelFilter_p,
- _mm256_add_epi16(sum_p7, p256_6)), 4);
-
- flat2_p6 = _mm256_castsi256_si128(
- _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p),
- 168));
-
- res_q = _mm256_srli_epi16(
- _mm256_add_epi16(pixelFilter_q,
- _mm256_add_epi16(sum_q7, q256_6)), 4);
-
- flat2_q6 = _mm256_castsi256_si128(
- _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q),
- 168));
- }
-
- // wide flat
- // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
- p2 = _mm_andnot_si128(flat, p2);
- flat_p2 = _mm_and_si128(flat, flat_p2);
- p2 = _mm_or_si128(flat_p2, p2);
-
- p1 = _mm_andnot_si128(flat, ps1);
- flat_p1 = _mm_and_si128(flat, flat_p1);
- p1 = _mm_or_si128(flat_p1, p1);
-
- p0 = _mm_andnot_si128(flat, ps0);
- flat_p0 = _mm_and_si128(flat, flat_p0);
- p0 = _mm_or_si128(flat_p0, p0);
-
- q0 = _mm_andnot_si128(flat, qs0);
- flat_q0 = _mm_and_si128(flat, flat_q0);
- q0 = _mm_or_si128(flat_q0, q0);
-
- q1 = _mm_andnot_si128(flat, qs1);
- flat_q1 = _mm_and_si128(flat, flat_q1);
- q1 = _mm_or_si128(flat_q1, q1);
-
- q2 = _mm_andnot_si128(flat, q2);
- flat_q2 = _mm_and_si128(flat, flat_q2);
- q2 = _mm_or_si128(flat_q2, q2);
-
- p6 = _mm_andnot_si128(flat2, p6);
- flat2_p6 = _mm_and_si128(flat2, flat2_p6);
- p6 = _mm_or_si128(flat2_p6, p6);
- _mm_storeu_si128((__m128i *) (s - 7 * p), p6);
-
- p5 = _mm_andnot_si128(flat2, p5);
- flat2_p5 = _mm_and_si128(flat2, flat2_p5);
- p5 = _mm_or_si128(flat2_p5, p5);
- _mm_storeu_si128((__m128i *) (s - 6 * p), p5);
-
- p4 = _mm_andnot_si128(flat2, p4);
- flat2_p4 = _mm_and_si128(flat2, flat2_p4);
- p4 = _mm_or_si128(flat2_p4, p4);
- _mm_storeu_si128((__m128i *) (s - 5 * p), p4);
-
- p3 = _mm_andnot_si128(flat2, p3);
- flat2_p3 = _mm_and_si128(flat2, flat2_p3);
- p3 = _mm_or_si128(flat2_p3, p3);
- _mm_storeu_si128((__m128i *) (s - 4 * p), p3);
-
- p2 = _mm_andnot_si128(flat2, p2);
- flat2_p2 = _mm_and_si128(flat2, flat2_p2);
- p2 = _mm_or_si128(flat2_p2, p2);
- _mm_storeu_si128((__m128i *) (s - 3 * p), p2);
-
- p1 = _mm_andnot_si128(flat2, p1);
- flat2_p1 = _mm_and_si128(flat2, flat2_p1);
- p1 = _mm_or_si128(flat2_p1, p1);
- _mm_storeu_si128((__m128i *) (s - 2 * p), p1);
-
- p0 = _mm_andnot_si128(flat2, p0);
- flat2_p0 = _mm_and_si128(flat2, flat2_p0);
- p0 = _mm_or_si128(flat2_p0, p0);
- _mm_storeu_si128((__m128i *) (s - 1 * p), p0);
-
- q0 = _mm_andnot_si128(flat2, q0);
- flat2_q0 = _mm_and_si128(flat2, flat2_q0);
- q0 = _mm_or_si128(flat2_q0, q0);
- _mm_storeu_si128((__m128i *) (s - 0 * p), q0);
-
- q1 = _mm_andnot_si128(flat2, q1);
- flat2_q1 = _mm_and_si128(flat2, flat2_q1);
- q1 = _mm_or_si128(flat2_q1, q1);
- _mm_storeu_si128((__m128i *) (s + 1 * p), q1);
-
- q2 = _mm_andnot_si128(flat2, q2);
- flat2_q2 = _mm_and_si128(flat2, flat2_q2);
- q2 = _mm_or_si128(flat2_q2, q2);
- _mm_storeu_si128((__m128i *) (s + 2 * p), q2);
-
- q3 = _mm_andnot_si128(flat2, q3);
- flat2_q3 = _mm_and_si128(flat2, flat2_q3);
- q3 = _mm_or_si128(flat2_q3, q3);
- _mm_storeu_si128((__m128i *) (s + 3 * p), q3);
-
- q4 = _mm_andnot_si128(flat2, q4);
- flat2_q4 = _mm_and_si128(flat2, flat2_q4);
- q4 = _mm_or_si128(flat2_q4, q4);
- _mm_storeu_si128((__m128i *) (s + 4 * p), q4);
-
- q5 = _mm_andnot_si128(flat2, q5);
- flat2_q5 = _mm_and_si128(flat2, flat2_q5);
- q5 = _mm_or_si128(flat2_q5, q5);
- _mm_storeu_si128((__m128i *) (s + 5 * p), q5);
-
- q6 = _mm_andnot_si128(flat2, q6);
- flat2_q6 = _mm_and_si128(flat2, flat2_q6);
- q6 = _mm_or_si128(flat2_q6, q6);
- _mm_storeu_si128((__m128i *) (s + 6 * p), q6);
- }
-}
diff --git a/thirdparty/libvpx/vpx_dsp/x86/loopfilter_sse2.c b/thirdparty/libvpx/vpx_dsp/x86/loopfilter_sse2.c
deleted file mode 100644
index 739adf31d0..0000000000
--- a/thirdparty/libvpx/vpx_dsp/x86/loopfilter_sse2.c
+++ /dev/null
@@ -1,1776 +0,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include <emmintrin.h> // SSE2
-
-#include "./vpx_dsp_rtcd.h"
-#include "vpx_ports/mem.h"
-#include "vpx_ports/emmintrin_compat.h"
-
-static INLINE __m128i abs_diff(__m128i a, __m128i b) {
- return _mm_or_si128(_mm_subs_epu8(a, b), _mm_subs_epu8(b, a));
-}
-
-// filter_mask and hev_mask
-#define FILTER_HEV_MASK do { \
- /* (abs(q1 - q0), abs(p1 - p0) */ \
- __m128i flat = abs_diff(q1p1, q0p0); \
- /* abs(p1 - q1), abs(p0 - q0) */ \
- const __m128i abs_p1q1p0q0 = abs_diff(p1p0, q1q0); \
- __m128i abs_p0q0, abs_p1q1, work; \
- \
- /* const uint8_t hev = hev_mask(thresh, *op1, *op0, *oq0, *oq1); */ \
- hev = _mm_unpacklo_epi8(_mm_max_epu8(flat, _mm_srli_si128(flat, 8)), zero); \
- hev = _mm_cmpgt_epi16(hev, thresh); \
- hev = _mm_packs_epi16(hev, hev); \
- \
- /* const int8_t mask = filter_mask(*limit, *blimit, */ \
- /* p3, p2, p1, p0, q0, q1, q2, q3); */ \
- abs_p0q0 = _mm_adds_epu8(abs_p1q1p0q0, abs_p1q1p0q0); /* abs(p0 - q0) * 2 */\
- abs_p1q1 = _mm_unpackhi_epi8(abs_p1q1p0q0, abs_p1q1p0q0); /* abs(p1 - q1) */\
- abs_p1q1 = _mm_srli_epi16(abs_p1q1, 9); \
- abs_p1q1 = _mm_packs_epi16(abs_p1q1, abs_p1q1); /* abs(p1 - q1) / 2 */ \
- /* abs(p0 - q0) * 2 + abs(p1 - q1) / 2 */ \
- mask = _mm_adds_epu8(abs_p0q0, abs_p1q1); \
- /* abs(p3 - p2), abs(p2 - p1) */ \
- work = abs_diff(p3p2, p2p1); \
- flat = _mm_max_epu8(work, flat); \
- /* abs(q3 - q2), abs(q2 - q1) */ \
- work = abs_diff(q3q2, q2q1); \
- flat = _mm_max_epu8(work, flat); \
- flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8)); \
- mask = _mm_unpacklo_epi64(mask, flat); \
- mask = _mm_subs_epu8(mask, limit); \
- mask = _mm_cmpeq_epi8(mask, zero); \
- mask = _mm_and_si128(mask, _mm_srli_si128(mask, 8)); \
-} while (0)
-
-#define FILTER4 do { \
- const __m128i t3t4 = _mm_set_epi8(3, 3, 3, 3, 3, 3, 3, 3, \
- 4, 4, 4, 4, 4, 4, 4, 4); \
- const __m128i t80 = _mm_set1_epi8(0x80); \
- __m128i filter, filter2filter1, work; \
- \
- ps1ps0 = _mm_xor_si128(p1p0, t80); /* ^ 0x80 */ \
- qs1qs0 = _mm_xor_si128(q1q0, t80); \
- \
- /* int8_t filter = signed_char_clamp(ps1 - qs1) & hev; */ \
- work = _mm_subs_epi8(ps1ps0, qs1qs0); \
- filter = _mm_and_si128(_mm_srli_si128(work, 8), hev); \
- /* filter = signed_char_clamp(filter + 3 * (qs0 - ps0)) & mask; */ \
- filter = _mm_subs_epi8(filter, work); \
- filter = _mm_subs_epi8(filter, work); \
- filter = _mm_subs_epi8(filter, work); /* + 3 * (qs0 - ps0) */ \
- filter = _mm_and_si128(filter, mask); /* & mask */ \
- filter = _mm_unpacklo_epi64(filter, filter); \
- \
- /* filter1 = signed_char_clamp(filter + 4) >> 3; */ \
- /* filter2 = signed_char_clamp(filter + 3) >> 3; */ \
- filter2filter1 = _mm_adds_epi8(filter, t3t4); /* signed_char_clamp */ \
- filter = _mm_unpackhi_epi8(filter2filter1, filter2filter1); \
- filter2filter1 = _mm_unpacklo_epi8(filter2filter1, filter2filter1); \
- filter2filter1 = _mm_srai_epi16(filter2filter1, 11); /* >> 3 */ \
- filter = _mm_srai_epi16(filter, 11); /* >> 3 */ \
- filter2filter1 = _mm_packs_epi16(filter2filter1, filter); \
- \
- /* filter = ROUND_POWER_OF_TWO(filter1, 1) & ~hev; */ \
- filter = _mm_subs_epi8(filter2filter1, ff); /* + 1 */ \
- filter = _mm_unpacklo_epi8(filter, filter); \
- filter = _mm_srai_epi16(filter, 9); /* round */ \
- filter = _mm_packs_epi16(filter, filter); \
- filter = _mm_andnot_si128(hev, filter); \
- \
- hev = _mm_unpackhi_epi64(filter2filter1, filter); \
- filter2filter1 = _mm_unpacklo_epi64(filter2filter1, filter); \
- \
- /* signed_char_clamp(qs1 - filter), signed_char_clamp(qs0 - filter1) */ \
- qs1qs0 = _mm_subs_epi8(qs1qs0, filter2filter1); \
- /* signed_char_clamp(ps1 + filter), signed_char_clamp(ps0 + filter2) */ \
- ps1ps0 = _mm_adds_epi8(ps1ps0, hev); \
- qs1qs0 = _mm_xor_si128(qs1qs0, t80); /* ^ 0x80 */ \
- ps1ps0 = _mm_xor_si128(ps1ps0, t80); /* ^ 0x80 */ \
-} while (0)
-
-void vpx_lpf_horizontal_4_sse2(uint8_t *s, int p /* pitch */,
- const uint8_t *_blimit, const uint8_t *_limit,
- const uint8_t *_thresh) {
- const __m128i zero = _mm_set1_epi16(0);
- const __m128i limit =
- _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)_blimit),
- _mm_loadl_epi64((const __m128i *)_limit));
- const __m128i thresh =
- _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)_thresh), zero);
- const __m128i ff = _mm_cmpeq_epi8(zero, zero);
- __m128i q1p1, q0p0, p3p2, p2p1, p1p0, q3q2, q2q1, q1q0, ps1ps0, qs1qs0;
- __m128i mask, hev;
-
- p3p2 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 3 * p)),
- _mm_loadl_epi64((__m128i *)(s - 4 * p)));
- q1p1 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 2 * p)),
- _mm_loadl_epi64((__m128i *)(s + 1 * p)));
- q0p0 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 1 * p)),
- _mm_loadl_epi64((__m128i *)(s + 0 * p)));
- q3q2 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s + 2 * p)),
- _mm_loadl_epi64((__m128i *)(s + 3 * p)));
- p1p0 = _mm_unpacklo_epi64(q0p0, q1p1);
- p2p1 = _mm_unpacklo_epi64(q1p1, p3p2);
- q1q0 = _mm_unpackhi_epi64(q0p0, q1p1);
- q2q1 = _mm_unpacklo_epi64(_mm_srli_si128(q1p1, 8), q3q2);
-
- FILTER_HEV_MASK;
- FILTER4;
-
- _mm_storeh_pi((__m64 *)(s - 2 * p), _mm_castsi128_ps(ps1ps0)); // *op1
- _mm_storel_epi64((__m128i *)(s - 1 * p), ps1ps0); // *op0
- _mm_storel_epi64((__m128i *)(s + 0 * p), qs1qs0); // *oq0
- _mm_storeh_pi((__m64 *)(s + 1 * p), _mm_castsi128_ps(qs1qs0)); // *oq1
-}
-
-void vpx_lpf_vertical_4_sse2(uint8_t *s, int p /* pitch */,
- const uint8_t *_blimit, const uint8_t *_limit,
- const uint8_t *_thresh) {
- const __m128i zero = _mm_set1_epi16(0);
- const __m128i limit =
- _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)_blimit),
- _mm_loadl_epi64((const __m128i *)_limit));
- const __m128i thresh =
- _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)_thresh), zero);
- const __m128i ff = _mm_cmpeq_epi8(zero, zero);
- __m128i x0, x1, x2, x3;
- __m128i q1p1, q0p0, p3p2, p2p1, p1p0, q3q2, q2q1, q1q0, ps1ps0, qs1qs0;
- __m128i mask, hev;
-
- // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
- q1q0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(s + 0 * p - 4)),
- _mm_loadl_epi64((__m128i *)(s + 1 * p - 4)));
-
- // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
- x1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(s + 2 * p - 4)),
- _mm_loadl_epi64((__m128i *)(s + 3 * p - 4)));
-
- // 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57
- x2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(s + 4 * p - 4)),
- _mm_loadl_epi64((__m128i *)(s + 5 * p - 4)));
-
- // 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77
- x3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(s + 6 * p - 4)),
- _mm_loadl_epi64((__m128i *)(s + 7 * p - 4)));
-
- // Transpose 8x8
- // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
- p1p0 = _mm_unpacklo_epi16(q1q0, x1);
- // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73
- x0 = _mm_unpacklo_epi16(x2, x3);
- // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71
- p3p2 = _mm_unpacklo_epi32(p1p0, x0);
- // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73
- p1p0 = _mm_unpackhi_epi32(p1p0, x0);
- p3p2 = _mm_unpackhi_epi64(p3p2, _mm_slli_si128(p3p2, 8)); // swap lo and high
- p1p0 = _mm_unpackhi_epi64(p1p0, _mm_slli_si128(p1p0, 8)); // swap lo and high
-
- // 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37
- q1q0 = _mm_unpackhi_epi16(q1q0, x1);
- // 44 54 64 74 45 55 65 75 46 56 66 76 47 57 67 77
- x2 = _mm_unpackhi_epi16(x2, x3);
- // 06 16 26 36 46 56 66 76 07 17 27 37 47 57 67 77
- q3q2 = _mm_unpackhi_epi32(q1q0, x2);
- // 04 14 24 34 44 54 64 74 05 15 25 35 45 55 65 75
- q1q0 = _mm_unpacklo_epi32(q1q0, x2);
-
- q0p0 = _mm_unpacklo_epi64(p1p0, q1q0);
- q1p1 = _mm_unpackhi_epi64(p1p0, q1q0);
- p1p0 = _mm_unpacklo_epi64(q0p0, q1p1);
- p2p1 = _mm_unpacklo_epi64(q1p1, p3p2);
- q2q1 = _mm_unpacklo_epi64(_mm_srli_si128(q1p1, 8), q3q2);
-
- FILTER_HEV_MASK;
- FILTER4;
-
- // Transpose 8x4 to 4x8
- // qs1qs0: 20 21 22 23 24 25 26 27 30 31 32 33 34 34 36 37
- // ps1ps0: 10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07
- // 00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17
- ps1ps0 = _mm_unpackhi_epi64(ps1ps0, _mm_slli_si128(ps1ps0, 8));
- // 10 30 11 31 12 32 13 33 14 34 15 35 16 36 17 37
- x0 = _mm_unpackhi_epi8(ps1ps0, qs1qs0);
- // 00 20 01 21 02 22 03 23 04 24 05 25 06 26 07 27
- ps1ps0 = _mm_unpacklo_epi8(ps1ps0, qs1qs0);
- // 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37
- qs1qs0 = _mm_unpackhi_epi8(ps1ps0, x0);
- // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
- ps1ps0 = _mm_unpacklo_epi8(ps1ps0, x0);
-
- *(int *)(s + 0 * p - 2) = _mm_cvtsi128_si32(ps1ps0);
- ps1ps0 = _mm_srli_si128(ps1ps0, 4);
- *(int *)(s + 1 * p - 2) = _mm_cvtsi128_si32(ps1ps0);
- ps1ps0 = _mm_srli_si128(ps1ps0, 4);
- *(int *)(s + 2 * p - 2) = _mm_cvtsi128_si32(ps1ps0);
- ps1ps0 = _mm_srli_si128(ps1ps0, 4);
- *(int *)(s + 3 * p - 2) = _mm_cvtsi128_si32(ps1ps0);
-
- *(int *)(s + 4 * p - 2) = _mm_cvtsi128_si32(qs1qs0);
- qs1qs0 = _mm_srli_si128(qs1qs0, 4);
- *(int *)(s + 5 * p - 2) = _mm_cvtsi128_si32(qs1qs0);
- qs1qs0 = _mm_srli_si128(qs1qs0, 4);
- *(int *)(s + 6 * p - 2) = _mm_cvtsi128_si32(qs1qs0);
- qs1qs0 = _mm_srli_si128(qs1qs0, 4);
- *(int *)(s + 7 * p - 2) = _mm_cvtsi128_si32(qs1qs0);
-}
-
-void vpx_lpf_horizontal_edge_8_sse2(unsigned char *s, int p,
- const unsigned char *_blimit,
- const unsigned char *_limit,
- const unsigned char *_thresh) {
- const __m128i zero = _mm_set1_epi16(0);
- const __m128i one = _mm_set1_epi8(1);
- const __m128i blimit = _mm_load_si128((const __m128i *)_blimit);
- const __m128i limit = _mm_load_si128((const __m128i *)_limit);
- const __m128i thresh = _mm_load_si128((const __m128i *)_thresh);
- __m128i mask, hev, flat, flat2;
- __m128i q7p7, q6p6, q5p5, q4p4, q3p3, q2p2, q1p1, q0p0, p0q0, p1q1;
- __m128i abs_p1p0;
-
- q4p4 = _mm_loadl_epi64((__m128i *)(s - 5 * p));
- q4p4 = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(q4p4),
- (__m64 *)(s + 4 * p)));
- q3p3 = _mm_loadl_epi64((__m128i *)(s - 4 * p));
- q3p3 = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(q3p3),
- (__m64 *)(s + 3 * p)));
- q2p2 = _mm_loadl_epi64((__m128i *)(s - 3 * p));
- q2p2 = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(q2p2),
- (__m64 *)(s + 2 * p)));
- q1p1 = _mm_loadl_epi64((__m128i *)(s - 2 * p));
- q1p1 = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(q1p1),
- (__m64 *)(s + 1 * p)));
- p1q1 = _mm_shuffle_epi32(q1p1, 78);
- q0p0 = _mm_loadl_epi64((__m128i *)(s - 1 * p));
- q0p0 = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(q0p0),
- (__m64 *)(s - 0 * p)));
- p0q0 = _mm_shuffle_epi32(q0p0, 78);
-
- {
- __m128i abs_p1q1, abs_p0q0, abs_q1q0, fe, ff, work;
- abs_p1p0 = abs_diff(q1p1, q0p0);
- abs_q1q0 = _mm_srli_si128(abs_p1p0, 8);
- fe = _mm_set1_epi8(0xfe);
- ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0);
- abs_p0q0 = abs_diff(q0p0, p0q0);
- abs_p1q1 = abs_diff(q1p1, p1q1);
- flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
- hev = _mm_subs_epu8(flat, thresh);
- hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
-
- abs_p0q0 =_mm_adds_epu8(abs_p0q0, abs_p0q0);
- abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
- mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit);
- mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
- // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
- mask = _mm_max_epu8(abs_p1p0, mask);
- // mask |= (abs(p1 - p0) > limit) * -1;
- // mask |= (abs(q1 - q0) > limit) * -1;
-
- work = _mm_max_epu8(abs_diff(q2p2, q1p1),
- abs_diff(q3p3, q2p2));
- mask = _mm_max_epu8(work, mask);
- mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 8));
- mask = _mm_subs_epu8(mask, limit);
- mask = _mm_cmpeq_epi8(mask, zero);
- }
-
- // lp filter
- {
- const __m128i t4 = _mm_set1_epi8(4);
- const __m128i t3 = _mm_set1_epi8(3);
- const __m128i t80 = _mm_set1_epi8(0x80);
- const __m128i t1 = _mm_set1_epi16(0x1);
- __m128i qs1ps1 = _mm_xor_si128(q1p1, t80);
- __m128i qs0ps0 = _mm_xor_si128(q0p0, t80);
- __m128i qs0 = _mm_xor_si128(p0q0, t80);
- __m128i qs1 = _mm_xor_si128(p1q1, t80);
- __m128i filt;
- __m128i work_a;
- __m128i filter1, filter2;
- __m128i flat2_q6p6, flat2_q5p5, flat2_q4p4, flat2_q3p3, flat2_q2p2;
- __m128i flat2_q1p1, flat2_q0p0, flat_q2p2, flat_q1p1, flat_q0p0;
-
- filt = _mm_and_si128(_mm_subs_epi8(qs1ps1, qs1), hev);
- work_a = _mm_subs_epi8(qs0, qs0ps0);
- filt = _mm_adds_epi8(filt, work_a);
- filt = _mm_adds_epi8(filt, work_a);
- filt = _mm_adds_epi8(filt, work_a);
- // (vpx_filter + 3 * (qs0 - ps0)) & mask
- filt = _mm_and_si128(filt, mask);
-
- filter1 = _mm_adds_epi8(filt, t4);
- filter2 = _mm_adds_epi8(filt, t3);
-
- filter1 = _mm_unpacklo_epi8(zero, filter1);
- filter1 = _mm_srai_epi16(filter1, 0xB);
- filter2 = _mm_unpacklo_epi8(zero, filter2);
- filter2 = _mm_srai_epi16(filter2, 0xB);
-
- // Filter1 >> 3
- filt = _mm_packs_epi16(filter2, _mm_subs_epi16(zero, filter1));
- qs0ps0 = _mm_xor_si128(_mm_adds_epi8(qs0ps0, filt), t80);
-
- // filt >> 1
- filt = _mm_adds_epi16(filter1, t1);
- filt = _mm_srai_epi16(filt, 1);
- filt = _mm_andnot_si128(_mm_srai_epi16(_mm_unpacklo_epi8(zero, hev), 0x8),
- filt);
- filt = _mm_packs_epi16(filt, _mm_subs_epi16(zero, filt));
- qs1ps1 = _mm_xor_si128(_mm_adds_epi8(qs1ps1, filt), t80);
- // loopfilter done
-
- {
- __m128i work;
- flat = _mm_max_epu8(abs_diff(q2p2, q0p0), abs_diff(q3p3, q0p0));
- flat = _mm_max_epu8(abs_p1p0, flat);
- flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8));
- flat = _mm_subs_epu8(flat, one);
- flat = _mm_cmpeq_epi8(flat, zero);
- flat = _mm_and_si128(flat, mask);
-
- q5p5 = _mm_loadl_epi64((__m128i *)(s - 6 * p));
- q5p5 = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(q5p5),
- (__m64 *)(s + 5 * p)));
-
- q6p6 = _mm_loadl_epi64((__m128i *)(s - 7 * p));
- q6p6 = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(q6p6),
- (__m64 *)(s + 6 * p)));
- flat2 = _mm_max_epu8(abs_diff(q4p4, q0p0), abs_diff(q5p5, q0p0));
-
- q7p7 = _mm_loadl_epi64((__m128i *)(s - 8 * p));
- q7p7 = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(q7p7),
- (__m64 *)(s + 7 * p)));
- work = _mm_max_epu8(abs_diff(q6p6, q0p0), abs_diff(q7p7, q0p0));
- flat2 = _mm_max_epu8(work, flat2);
- flat2 = _mm_max_epu8(flat2, _mm_srli_si128(flat2, 8));
- flat2 = _mm_subs_epu8(flat2, one);
- flat2 = _mm_cmpeq_epi8(flat2, zero);
- flat2 = _mm_and_si128(flat2, flat); // flat2 & flat & mask
- }
-
- // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- // flat and wide flat calculations
- {
- const __m128i eight = _mm_set1_epi16(8);
- const __m128i four = _mm_set1_epi16(4);
- __m128i p7_16, p6_16, p5_16, p4_16, p3_16, p2_16, p1_16, p0_16;
- __m128i q7_16, q6_16, q5_16, q4_16, q3_16, q2_16, q1_16, q0_16;
- __m128i pixelFilter_p, pixelFilter_q;
- __m128i pixetFilter_p2p1p0, pixetFilter_q2q1q0;
- __m128i sum_p7, sum_q7, sum_p3, sum_q3, res_p, res_q;
-
- p7_16 = _mm_unpacklo_epi8(q7p7, zero);;
- p6_16 = _mm_unpacklo_epi8(q6p6, zero);
- p5_16 = _mm_unpacklo_epi8(q5p5, zero);
- p4_16 = _mm_unpacklo_epi8(q4p4, zero);
- p3_16 = _mm_unpacklo_epi8(q3p3, zero);
- p2_16 = _mm_unpacklo_epi8(q2p2, zero);
- p1_16 = _mm_unpacklo_epi8(q1p1, zero);
- p0_16 = _mm_unpacklo_epi8(q0p0, zero);
- q0_16 = _mm_unpackhi_epi8(q0p0, zero);
- q1_16 = _mm_unpackhi_epi8(q1p1, zero);
- q2_16 = _mm_unpackhi_epi8(q2p2, zero);
- q3_16 = _mm_unpackhi_epi8(q3p3, zero);
- q4_16 = _mm_unpackhi_epi8(q4p4, zero);
- q5_16 = _mm_unpackhi_epi8(q5p5, zero);
- q6_16 = _mm_unpackhi_epi8(q6p6, zero);
- q7_16 = _mm_unpackhi_epi8(q7p7, zero);
-
- pixelFilter_p = _mm_add_epi16(_mm_add_epi16(p6_16, p5_16),
- _mm_add_epi16(p4_16, p3_16));
- pixelFilter_q = _mm_add_epi16(_mm_add_epi16(q6_16, q5_16),
- _mm_add_epi16(q4_16, q3_16));
-
- pixetFilter_p2p1p0 = _mm_add_epi16(p0_16, _mm_add_epi16(p2_16, p1_16));
- pixelFilter_p = _mm_add_epi16(pixelFilter_p, pixetFilter_p2p1p0);
-
- pixetFilter_q2q1q0 = _mm_add_epi16(q0_16, _mm_add_epi16(q2_16, q1_16));
- pixelFilter_q = _mm_add_epi16(pixelFilter_q, pixetFilter_q2q1q0);
- pixelFilter_p = _mm_add_epi16(eight, _mm_add_epi16(pixelFilter_p,
- pixelFilter_q));
- pixetFilter_p2p1p0 = _mm_add_epi16(four,
- _mm_add_epi16(pixetFilter_p2p1p0,
- pixetFilter_q2q1q0));
- res_p = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p,
- _mm_add_epi16(p7_16, p0_16)), 4);
- res_q = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p,
- _mm_add_epi16(q7_16, q0_16)), 4);
- flat2_q0p0 = _mm_packus_epi16(res_p, res_q);
- res_p = _mm_srli_epi16(_mm_add_epi16(pixetFilter_p2p1p0,
- _mm_add_epi16(p3_16, p0_16)), 3);
- res_q = _mm_srli_epi16(_mm_add_epi16(pixetFilter_p2p1p0,
- _mm_add_epi16(q3_16, q0_16)), 3);
-
- flat_q0p0 = _mm_packus_epi16(res_p, res_q);
-
- sum_p7 = _mm_add_epi16(p7_16, p7_16);
- sum_q7 = _mm_add_epi16(q7_16, q7_16);
- sum_p3 = _mm_add_epi16(p3_16, p3_16);
- sum_q3 = _mm_add_epi16(q3_16, q3_16);
-
- pixelFilter_q = _mm_sub_epi16(pixelFilter_p, p6_16);
- pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q6_16);
- res_p = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p,
- _mm_add_epi16(sum_p7, p1_16)), 4);
- res_q = _mm_srli_epi16(_mm_add_epi16(pixelFilter_q,
- _mm_add_epi16(sum_q7, q1_16)), 4);
- flat2_q1p1 = _mm_packus_epi16(res_p, res_q);
-
- pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_p2p1p0, p2_16);
- pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q2_16);
- res_p = _mm_srli_epi16(_mm_add_epi16(pixetFilter_p2p1p0,
- _mm_add_epi16(sum_p3, p1_16)), 3);
- res_q = _mm_srli_epi16(_mm_add_epi16(pixetFilter_q2q1q0,
- _mm_add_epi16(sum_q3, q1_16)), 3);
- flat_q1p1 = _mm_packus_epi16(res_p, res_q);
-
- sum_p7 = _mm_add_epi16(sum_p7, p7_16);
- sum_q7 = _mm_add_epi16(sum_q7, q7_16);
- sum_p3 = _mm_add_epi16(sum_p3, p3_16);
- sum_q3 = _mm_add_epi16(sum_q3, q3_16);
-
- pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q5_16);
- pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p5_16);
- res_p = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p,
- _mm_add_epi16(sum_p7, p2_16)), 4);
- res_q = _mm_srli_epi16(_mm_add_epi16(pixelFilter_q,
- _mm_add_epi16(sum_q7, q2_16)), 4);
- flat2_q2p2 = _mm_packus_epi16(res_p, res_q);
-
- pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q1_16);
- pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_q2q1q0, p1_16);
-
- res_p = _mm_srli_epi16(_mm_add_epi16(pixetFilter_p2p1p0,
- _mm_add_epi16(sum_p3, p2_16)), 3);
- res_q = _mm_srli_epi16(_mm_add_epi16(pixetFilter_q2q1q0,
- _mm_add_epi16(sum_q3, q2_16)), 3);
- flat_q2p2 = _mm_packus_epi16(res_p, res_q);
-
- sum_p7 = _mm_add_epi16(sum_p7, p7_16);
- sum_q7 = _mm_add_epi16(sum_q7, q7_16);
- pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q4_16);
- pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p4_16);
- res_p = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p,
- _mm_add_epi16(sum_p7, p3_16)), 4);
- res_q = _mm_srli_epi16(_mm_add_epi16(pixelFilter_q,
- _mm_add_epi16(sum_q7, q3_16)), 4);
- flat2_q3p3 = _mm_packus_epi16(res_p, res_q);
-
- sum_p7 = _mm_add_epi16(sum_p7, p7_16);
- sum_q7 = _mm_add_epi16(sum_q7, q7_16);
- pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q3_16);
- pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p3_16);
- res_p = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p,
- _mm_add_epi16(sum_p7, p4_16)), 4);
- res_q = _mm_srli_epi16(_mm_add_epi16(pixelFilter_q,
- _mm_add_epi16(sum_q7, q4_16)), 4);
- flat2_q4p4 = _mm_packus_epi16(res_p, res_q);
-
- sum_p7 = _mm_add_epi16(sum_p7, p7_16);
- sum_q7 = _mm_add_epi16(sum_q7, q7_16);
- pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q2_16);
- pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p2_16);
- res_p = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p,
- _mm_add_epi16(sum_p7, p5_16)), 4);
- res_q = _mm_srli_epi16(_mm_add_epi16(pixelFilter_q,
- _mm_add_epi16(sum_q7, q5_16)), 4);
- flat2_q5p5 = _mm_packus_epi16(res_p, res_q);
-
- sum_p7 = _mm_add_epi16(sum_p7, p7_16);
- sum_q7 = _mm_add_epi16(sum_q7, q7_16);
- pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q1_16);
- pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p1_16);
- res_p = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p,
- _mm_add_epi16(sum_p7, p6_16)), 4);
- res_q = _mm_srli_epi16(_mm_add_epi16(pixelFilter_q,
- _mm_add_epi16(sum_q7, q6_16)), 4);
- flat2_q6p6 = _mm_packus_epi16(res_p, res_q);
- }
- // wide flat
- // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
- flat = _mm_shuffle_epi32(flat, 68);
- flat2 = _mm_shuffle_epi32(flat2, 68);
-
- q2p2 = _mm_andnot_si128(flat, q2p2);
- flat_q2p2 = _mm_and_si128(flat, flat_q2p2);
- q2p2 = _mm_or_si128(q2p2, flat_q2p2);
-
- qs1ps1 = _mm_andnot_si128(flat, qs1ps1);
- flat_q1p1 = _mm_and_si128(flat, flat_q1p1);
- q1p1 = _mm_or_si128(qs1ps1, flat_q1p1);
-
- qs0ps0 = _mm_andnot_si128(flat, qs0ps0);
- flat_q0p0 = _mm_and_si128(flat, flat_q0p0);
- q0p0 = _mm_or_si128(qs0ps0, flat_q0p0);
-
- q6p6 = _mm_andnot_si128(flat2, q6p6);
- flat2_q6p6 = _mm_and_si128(flat2, flat2_q6p6);
- q6p6 = _mm_or_si128(q6p6, flat2_q6p6);
- _mm_storel_epi64((__m128i *)(s - 7 * p), q6p6);
- _mm_storeh_pi((__m64 *)(s + 6 * p), _mm_castsi128_ps(q6p6));
-
- q5p5 = _mm_andnot_si128(flat2, q5p5);
- flat2_q5p5 = _mm_and_si128(flat2, flat2_q5p5);
- q5p5 = _mm_or_si128(q5p5, flat2_q5p5);
- _mm_storel_epi64((__m128i *)(s - 6 * p), q5p5);
- _mm_storeh_pi((__m64 *)(s + 5 * p), _mm_castsi128_ps(q5p5));
-
- q4p4 = _mm_andnot_si128(flat2, q4p4);
- flat2_q4p4 = _mm_and_si128(flat2, flat2_q4p4);
- q4p4 = _mm_or_si128(q4p4, flat2_q4p4);
- _mm_storel_epi64((__m128i *)(s - 5 * p), q4p4);
- _mm_storeh_pi((__m64 *)(s + 4 * p), _mm_castsi128_ps(q4p4));
-
- q3p3 = _mm_andnot_si128(flat2, q3p3);
- flat2_q3p3 = _mm_and_si128(flat2, flat2_q3p3);
- q3p3 = _mm_or_si128(q3p3, flat2_q3p3);
- _mm_storel_epi64((__m128i *)(s - 4 * p), q3p3);
- _mm_storeh_pi((__m64 *)(s + 3 * p), _mm_castsi128_ps(q3p3));
-
- q2p2 = _mm_andnot_si128(flat2, q2p2);
- flat2_q2p2 = _mm_and_si128(flat2, flat2_q2p2);
- q2p2 = _mm_or_si128(q2p2, flat2_q2p2);
- _mm_storel_epi64((__m128i *)(s - 3 * p), q2p2);
- _mm_storeh_pi((__m64 *)(s + 2 * p), _mm_castsi128_ps(q2p2));
-
- q1p1 = _mm_andnot_si128(flat2, q1p1);
- flat2_q1p1 = _mm_and_si128(flat2, flat2_q1p1);
- q1p1 = _mm_or_si128(q1p1, flat2_q1p1);
- _mm_storel_epi64((__m128i *)(s - 2 * p), q1p1);
- _mm_storeh_pi((__m64 *)(s + 1 * p), _mm_castsi128_ps(q1p1));
-
- q0p0 = _mm_andnot_si128(flat2, q0p0);
- flat2_q0p0 = _mm_and_si128(flat2, flat2_q0p0);
- q0p0 = _mm_or_si128(q0p0, flat2_q0p0);
- _mm_storel_epi64((__m128i *)(s - 1 * p), q0p0);
- _mm_storeh_pi((__m64 *)(s - 0 * p), _mm_castsi128_ps(q0p0));
- }
-}
-
-static INLINE __m128i filter_add2_sub2(const __m128i *const total,
- const __m128i *const a1,
- const __m128i *const a2,
- const __m128i *const s1,
- const __m128i *const s2) {
- __m128i x = _mm_add_epi16(*a1, *total);
- x = _mm_add_epi16(_mm_sub_epi16(x, _mm_add_epi16(*s1, *s2)), *a2);
- return x;
-}
-
-static INLINE __m128i filter8_mask(const __m128i *const flat,
- const __m128i *const other_filt,
- const __m128i *const f8_lo,
- const __m128i *const f8_hi) {
- const __m128i f8 = _mm_packus_epi16(_mm_srli_epi16(*f8_lo, 3),
- _mm_srli_epi16(*f8_hi, 3));
- const __m128i result = _mm_and_si128(*flat, f8);
- return _mm_or_si128(_mm_andnot_si128(*flat, *other_filt), result);
-}
-
-static INLINE __m128i filter16_mask(const __m128i *const flat,
- const __m128i *const other_filt,
- const __m128i *const f_lo,
- const __m128i *const f_hi) {
- const __m128i f = _mm_packus_epi16(_mm_srli_epi16(*f_lo, 4),
- _mm_srli_epi16(*f_hi, 4));
- const __m128i result = _mm_and_si128(*flat, f);
- return _mm_or_si128(_mm_andnot_si128(*flat, *other_filt), result);
-}
-
-void vpx_lpf_horizontal_edge_16_sse2(unsigned char *s, int p,
- const unsigned char *_blimit,
- const unsigned char *_limit,
- const unsigned char *_thresh) {
- const __m128i zero = _mm_set1_epi16(0);
- const __m128i one = _mm_set1_epi8(1);
- const __m128i blimit = _mm_load_si128((const __m128i *)_blimit);
- const __m128i limit = _mm_load_si128((const __m128i *)_limit);
- const __m128i thresh = _mm_load_si128((const __m128i *)_thresh);
- __m128i mask, hev, flat, flat2;
- __m128i p7, p6, p5;
- __m128i p4, p3, p2, p1, p0, q0, q1, q2, q3, q4;
- __m128i q5, q6, q7;
-
- __m128i op2, op1, op0, oq0, oq1, oq2;
-
- __m128i max_abs_p1p0q1q0;
-
- p7 = _mm_loadu_si128((__m128i *)(s - 8 * p));
- p6 = _mm_loadu_si128((__m128i *)(s - 7 * p));
- p5 = _mm_loadu_si128((__m128i *)(s - 6 * p));
- p4 = _mm_loadu_si128((__m128i *)(s - 5 * p));
- p3 = _mm_loadu_si128((__m128i *)(s - 4 * p));
- p2 = _mm_loadu_si128((__m128i *)(s - 3 * p));
- p1 = _mm_loadu_si128((__m128i *)(s - 2 * p));
- p0 = _mm_loadu_si128((__m128i *)(s - 1 * p));
- q0 = _mm_loadu_si128((__m128i *)(s - 0 * p));
- q1 = _mm_loadu_si128((__m128i *)(s + 1 * p));
- q2 = _mm_loadu_si128((__m128i *)(s + 2 * p));
- q3 = _mm_loadu_si128((__m128i *)(s + 3 * p));
- q4 = _mm_loadu_si128((__m128i *)(s + 4 * p));
- q5 = _mm_loadu_si128((__m128i *)(s + 5 * p));
- q6 = _mm_loadu_si128((__m128i *)(s + 6 * p));
- q7 = _mm_loadu_si128((__m128i *)(s + 7 * p));
-
- {
- const __m128i abs_p1p0 = abs_diff(p1, p0);
- const __m128i abs_q1q0 = abs_diff(q1, q0);
- const __m128i fe = _mm_set1_epi8(0xfe);
- const __m128i ff = _mm_cmpeq_epi8(zero, zero);
- __m128i abs_p0q0 = abs_diff(p0, q0);
- __m128i abs_p1q1 = abs_diff(p1, q1);
- __m128i work;
- max_abs_p1p0q1q0 = _mm_max_epu8(abs_p1p0, abs_q1q0);
-
- abs_p0q0 =_mm_adds_epu8(abs_p0q0, abs_p0q0);
- abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
- mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit);
- mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
- // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
- mask = _mm_max_epu8(max_abs_p1p0q1q0, mask);
- // mask |= (abs(p1 - p0) > limit) * -1;
- // mask |= (abs(q1 - q0) > limit) * -1;
- work = _mm_max_epu8(abs_diff(p2, p1), abs_diff(p3, p2));
- mask = _mm_max_epu8(work, mask);
- work = _mm_max_epu8(abs_diff(q2, q1), abs_diff(q3, q2));
- mask = _mm_max_epu8(work, mask);
- mask = _mm_subs_epu8(mask, limit);
- mask = _mm_cmpeq_epi8(mask, zero);
- }
-
- {
- __m128i work;
- work = _mm_max_epu8(abs_diff(p2, p0), abs_diff(q2, q0));
- flat = _mm_max_epu8(work, max_abs_p1p0q1q0);
- work = _mm_max_epu8(abs_diff(p3, p0), abs_diff(q3, q0));
- flat = _mm_max_epu8(work, flat);
- work = _mm_max_epu8(abs_diff(p4, p0), abs_diff(q4, q0));
- flat = _mm_subs_epu8(flat, one);
- flat = _mm_cmpeq_epi8(flat, zero);
- flat = _mm_and_si128(flat, mask);
- flat2 = _mm_max_epu8(abs_diff(p5, p0), abs_diff(q5, q0));
- flat2 = _mm_max_epu8(work, flat2);
- work = _mm_max_epu8(abs_diff(p6, p0), abs_diff(q6, q0));
- flat2 = _mm_max_epu8(work, flat2);
- work = _mm_max_epu8(abs_diff(p7, p0), abs_diff(q7, q0));
- flat2 = _mm_max_epu8(work, flat2);
- flat2 = _mm_subs_epu8(flat2, one);
- flat2 = _mm_cmpeq_epi8(flat2, zero);
- flat2 = _mm_and_si128(flat2, flat); // flat2 & flat & mask
- }
-
- // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- // filter4
- {
- const __m128i t4 = _mm_set1_epi8(4);
- const __m128i t3 = _mm_set1_epi8(3);
- const __m128i t80 = _mm_set1_epi8(0x80);
- const __m128i te0 = _mm_set1_epi8(0xe0);
- const __m128i t1f = _mm_set1_epi8(0x1f);
- const __m128i t1 = _mm_set1_epi8(0x1);
- const __m128i t7f = _mm_set1_epi8(0x7f);
- const __m128i ff = _mm_cmpeq_epi8(t4, t4);
-
- __m128i filt;
- __m128i work_a;
- __m128i filter1, filter2;
-
- op1 = _mm_xor_si128(p1, t80);
- op0 = _mm_xor_si128(p0, t80);
- oq0 = _mm_xor_si128(q0, t80);
- oq1 = _mm_xor_si128(q1, t80);
-
- hev = _mm_subs_epu8(max_abs_p1p0q1q0, thresh);
- hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
- filt = _mm_and_si128(_mm_subs_epi8(op1, oq1), hev);
-
- work_a = _mm_subs_epi8(oq0, op0);
- filt = _mm_adds_epi8(filt, work_a);
- filt = _mm_adds_epi8(filt, work_a);
- filt = _mm_adds_epi8(filt, work_a);
- // (vpx_filter + 3 * (qs0 - ps0)) & mask
- filt = _mm_and_si128(filt, mask);
- filter1 = _mm_adds_epi8(filt, t4);
- filter2 = _mm_adds_epi8(filt, t3);
-
- // Filter1 >> 3
- work_a = _mm_cmpgt_epi8(zero, filter1);
- filter1 = _mm_srli_epi16(filter1, 3);
- work_a = _mm_and_si128(work_a, te0);
- filter1 = _mm_and_si128(filter1, t1f);
- filter1 = _mm_or_si128(filter1, work_a);
- oq0 = _mm_xor_si128(_mm_subs_epi8(oq0, filter1), t80);
-
- // Filter2 >> 3
- work_a = _mm_cmpgt_epi8(zero, filter2);
- filter2 = _mm_srli_epi16(filter2, 3);
- work_a = _mm_and_si128(work_a, te0);
- filter2 = _mm_and_si128(filter2, t1f);
- filter2 = _mm_or_si128(filter2, work_a);
- op0 = _mm_xor_si128(_mm_adds_epi8(op0, filter2), t80);
-
- // filt >> 1
- filt = _mm_adds_epi8(filter1, t1);
- work_a = _mm_cmpgt_epi8(zero, filt);
- filt = _mm_srli_epi16(filt, 1);
- work_a = _mm_and_si128(work_a, t80);
- filt = _mm_and_si128(filt, t7f);
- filt = _mm_or_si128(filt, work_a);
- filt = _mm_andnot_si128(hev, filt);
- op1 = _mm_xor_si128(_mm_adds_epi8(op1, filt), t80);
- oq1 = _mm_xor_si128(_mm_subs_epi8(oq1, filt), t80);
- // loopfilter done
-
- // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- // filter8
- {
- const __m128i four = _mm_set1_epi16(4);
- const __m128i p3_lo = _mm_unpacklo_epi8(p3, zero);
- const __m128i p2_lo = _mm_unpacklo_epi8(p2, zero);
- const __m128i p1_lo = _mm_unpacklo_epi8(p1, zero);
- const __m128i p0_lo = _mm_unpacklo_epi8(p0, zero);
- const __m128i q0_lo = _mm_unpacklo_epi8(q0, zero);
- const __m128i q1_lo = _mm_unpacklo_epi8(q1, zero);
- const __m128i q2_lo = _mm_unpacklo_epi8(q2, zero);
- const __m128i q3_lo = _mm_unpacklo_epi8(q3, zero);
-
- const __m128i p3_hi = _mm_unpackhi_epi8(p3, zero);
- const __m128i p2_hi = _mm_unpackhi_epi8(p2, zero);
- const __m128i p1_hi = _mm_unpackhi_epi8(p1, zero);
- const __m128i p0_hi = _mm_unpackhi_epi8(p0, zero);
- const __m128i q0_hi = _mm_unpackhi_epi8(q0, zero);
- const __m128i q1_hi = _mm_unpackhi_epi8(q1, zero);
- const __m128i q2_hi = _mm_unpackhi_epi8(q2, zero);
- const __m128i q3_hi = _mm_unpackhi_epi8(q3, zero);
- __m128i f8_lo, f8_hi;
-
- f8_lo = _mm_add_epi16(_mm_add_epi16(p3_lo, four),
- _mm_add_epi16(p3_lo, p2_lo));
- f8_lo = _mm_add_epi16(_mm_add_epi16(p3_lo, f8_lo),
- _mm_add_epi16(p2_lo, p1_lo));
- f8_lo = _mm_add_epi16(_mm_add_epi16(p0_lo, q0_lo), f8_lo);
-
- f8_hi = _mm_add_epi16(_mm_add_epi16(p3_hi, four),
- _mm_add_epi16(p3_hi, p2_hi));
- f8_hi = _mm_add_epi16(_mm_add_epi16(p3_hi, f8_hi),
- _mm_add_epi16(p2_hi, p1_hi));
- f8_hi = _mm_add_epi16(_mm_add_epi16(p0_hi, q0_hi), f8_hi);
-
- op2 = filter8_mask(&flat, &p2, &f8_lo, &f8_hi);
-
- f8_lo = filter_add2_sub2(&f8_lo, &q1_lo, &p1_lo, &p2_lo, &p3_lo);
- f8_hi = filter_add2_sub2(&f8_hi, &q1_hi, &p1_hi, &p2_hi, &p3_hi);
- op1 = filter8_mask(&flat, &op1, &f8_lo, &f8_hi);
-
- f8_lo = filter_add2_sub2(&f8_lo, &q2_lo, &p0_lo, &p1_lo, &p3_lo);
- f8_hi = filter_add2_sub2(&f8_hi, &q2_hi, &p0_hi, &p1_hi, &p3_hi);
- op0 = filter8_mask(&flat, &op0, &f8_lo, &f8_hi);
-
- f8_lo = filter_add2_sub2(&f8_lo, &q3_lo, &q0_lo, &p0_lo, &p3_lo);
- f8_hi = filter_add2_sub2(&f8_hi, &q3_hi, &q0_hi, &p0_hi, &p3_hi);
- oq0 = filter8_mask(&flat, &oq0, &f8_lo, &f8_hi);
-
- f8_lo = filter_add2_sub2(&f8_lo, &q3_lo, &q1_lo, &q0_lo, &p2_lo);
- f8_hi = filter_add2_sub2(&f8_hi, &q3_hi, &q1_hi, &q0_hi, &p2_hi);
- oq1 = filter8_mask(&flat, &oq1, &f8_lo, &f8_hi);
-
- f8_lo = filter_add2_sub2(&f8_lo, &q3_lo, &q2_lo, &q1_lo, &p1_lo);
- f8_hi = filter_add2_sub2(&f8_hi, &q3_hi, &q2_hi, &q1_hi, &p1_hi);
- oq2 = filter8_mask(&flat, &q2, &f8_lo, &f8_hi);
- }
-
- // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- // wide flat calculations
- {
- const __m128i eight = _mm_set1_epi16(8);
- const __m128i p7_lo = _mm_unpacklo_epi8(p7, zero);
- const __m128i p6_lo = _mm_unpacklo_epi8(p6, zero);
- const __m128i p5_lo = _mm_unpacklo_epi8(p5, zero);
- const __m128i p4_lo = _mm_unpacklo_epi8(p4, zero);
- const __m128i p3_lo = _mm_unpacklo_epi8(p3, zero);
- const __m128i p2_lo = _mm_unpacklo_epi8(p2, zero);
- const __m128i p1_lo = _mm_unpacklo_epi8(p1, zero);
- const __m128i p0_lo = _mm_unpacklo_epi8(p0, zero);
- const __m128i q0_lo = _mm_unpacklo_epi8(q0, zero);
- const __m128i q1_lo = _mm_unpacklo_epi8(q1, zero);
- const __m128i q2_lo = _mm_unpacklo_epi8(q2, zero);
- const __m128i q3_lo = _mm_unpacklo_epi8(q3, zero);
- const __m128i q4_lo = _mm_unpacklo_epi8(q4, zero);
- const __m128i q5_lo = _mm_unpacklo_epi8(q5, zero);
- const __m128i q6_lo = _mm_unpacklo_epi8(q6, zero);
- const __m128i q7_lo = _mm_unpacklo_epi8(q7, zero);
-
- const __m128i p7_hi = _mm_unpackhi_epi8(p7, zero);
- const __m128i p6_hi = _mm_unpackhi_epi8(p6, zero);
- const __m128i p5_hi = _mm_unpackhi_epi8(p5, zero);
- const __m128i p4_hi = _mm_unpackhi_epi8(p4, zero);
- const __m128i p3_hi = _mm_unpackhi_epi8(p3, zero);
- const __m128i p2_hi = _mm_unpackhi_epi8(p2, zero);
- const __m128i p1_hi = _mm_unpackhi_epi8(p1, zero);
- const __m128i p0_hi = _mm_unpackhi_epi8(p0, zero);
- const __m128i q0_hi = _mm_unpackhi_epi8(q0, zero);
- const __m128i q1_hi = _mm_unpackhi_epi8(q1, zero);
- const __m128i q2_hi = _mm_unpackhi_epi8(q2, zero);
- const __m128i q3_hi = _mm_unpackhi_epi8(q3, zero);
- const __m128i q4_hi = _mm_unpackhi_epi8(q4, zero);
- const __m128i q5_hi = _mm_unpackhi_epi8(q5, zero);
- const __m128i q6_hi = _mm_unpackhi_epi8(q6, zero);
- const __m128i q7_hi = _mm_unpackhi_epi8(q7, zero);
-
- __m128i f_lo;
- __m128i f_hi;
-
- f_lo = _mm_sub_epi16(_mm_slli_epi16(p7_lo, 3), p7_lo); // p7 * 7
- f_lo = _mm_add_epi16(_mm_slli_epi16(p6_lo, 1),
- _mm_add_epi16(p4_lo, f_lo));
- f_lo = _mm_add_epi16(_mm_add_epi16(p3_lo, f_lo),
- _mm_add_epi16(p2_lo, p1_lo));
- f_lo = _mm_add_epi16(_mm_add_epi16(p0_lo, q0_lo), f_lo);
- f_lo = _mm_add_epi16(_mm_add_epi16(p5_lo, eight), f_lo);
-
- f_hi = _mm_sub_epi16(_mm_slli_epi16(p7_hi, 3), p7_hi); // p7 * 7
- f_hi = _mm_add_epi16(_mm_slli_epi16(p6_hi, 1),
- _mm_add_epi16(p4_hi, f_hi));
- f_hi = _mm_add_epi16(_mm_add_epi16(p3_hi, f_hi),
- _mm_add_epi16(p2_hi, p1_hi));
- f_hi = _mm_add_epi16(_mm_add_epi16(p0_hi, q0_hi), f_hi);
- f_hi = _mm_add_epi16(_mm_add_epi16(p5_hi, eight), f_hi);
-
- p6 = filter16_mask(&flat2, &p6, &f_lo, &f_hi);
- _mm_storeu_si128((__m128i *)(s - 7 * p), p6);
-
- f_lo = filter_add2_sub2(&f_lo, &q1_lo, &p5_lo, &p6_lo, &p7_lo);
- f_hi = filter_add2_sub2(&f_hi, &q1_hi, &p5_hi, &p6_hi, &p7_hi);
- p5 = filter16_mask(&flat2, &p5, &f_lo, &f_hi);
- _mm_storeu_si128((__m128i *)(s - 6 * p), p5);
-
- f_lo = filter_add2_sub2(&f_lo, &q2_lo, &p4_lo, &p5_lo, &p7_lo);
- f_hi = filter_add2_sub2(&f_hi, &q2_hi, &p4_hi, &p5_hi, &p7_hi);
- p4 = filter16_mask(&flat2, &p4, &f_lo, &f_hi);
- _mm_storeu_si128((__m128i *)(s - 5 * p), p4);
-
- f_lo = filter_add2_sub2(&f_lo, &q3_lo, &p3_lo, &p4_lo, &p7_lo);
- f_hi = filter_add2_sub2(&f_hi, &q3_hi, &p3_hi, &p4_hi, &p7_hi);
- p3 = filter16_mask(&flat2, &p3, &f_lo, &f_hi);
- _mm_storeu_si128((__m128i *)(s - 4 * p), p3);
-
- f_lo = filter_add2_sub2(&f_lo, &q4_lo, &p2_lo, &p3_lo, &p7_lo);
- f_hi = filter_add2_sub2(&f_hi, &q4_hi, &p2_hi, &p3_hi, &p7_hi);
- op2 = filter16_mask(&flat2, &op2, &f_lo, &f_hi);
- _mm_storeu_si128((__m128i *)(s - 3 * p), op2);
-
- f_lo = filter_add2_sub2(&f_lo, &q5_lo, &p1_lo, &p2_lo, &p7_lo);
- f_hi = filter_add2_sub2(&f_hi, &q5_hi, &p1_hi, &p2_hi, &p7_hi);
- op1 = filter16_mask(&flat2, &op1, &f_lo, &f_hi);
- _mm_storeu_si128((__m128i *)(s - 2 * p), op1);
-
- f_lo = filter_add2_sub2(&f_lo, &q6_lo, &p0_lo, &p1_lo, &p7_lo);
- f_hi = filter_add2_sub2(&f_hi, &q6_hi, &p0_hi, &p1_hi, &p7_hi);
- op0 = filter16_mask(&flat2, &op0, &f_lo, &f_hi);
- _mm_storeu_si128((__m128i *)(s - 1 * p), op0);
-
- f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q0_lo, &p0_lo, &p7_lo);
- f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q0_hi, &p0_hi, &p7_hi);
- oq0 = filter16_mask(&flat2, &oq0, &f_lo, &f_hi);
- _mm_storeu_si128((__m128i *)(s - 0 * p), oq0);
-
- f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q1_lo, &p6_lo, &q0_lo);
- f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q1_hi, &p6_hi, &q0_hi);
- oq1 = filter16_mask(&flat2, &oq1, &f_lo, &f_hi);
- _mm_storeu_si128((__m128i *)(s + 1 * p), oq1);
-
- f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q2_lo, &p5_lo, &q1_lo);
- f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q2_hi, &p5_hi, &q1_hi);
- oq2 = filter16_mask(&flat2, &oq2, &f_lo, &f_hi);
- _mm_storeu_si128((__m128i *)(s + 2 * p), oq2);
-
- f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q3_lo, &p4_lo, &q2_lo);
- f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q3_hi, &p4_hi, &q2_hi);
- q3 = filter16_mask(&flat2, &q3, &f_lo, &f_hi);
- _mm_storeu_si128((__m128i *)(s + 3 * p), q3);
-
- f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q4_lo, &p3_lo, &q3_lo);
- f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q4_hi, &p3_hi, &q3_hi);
- q4 = filter16_mask(&flat2, &q4, &f_lo, &f_hi);
- _mm_storeu_si128((__m128i *)(s + 4 * p), q4);
-
- f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q5_lo, &p2_lo, &q4_lo);
- f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q5_hi, &p2_hi, &q4_hi);
- q5 = filter16_mask(&flat2, &q5, &f_lo, &f_hi);
- _mm_storeu_si128((__m128i *)(s + 5 * p), q5);
-
- f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q6_lo, &p1_lo, &q5_lo);
- f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q6_hi, &p1_hi, &q5_hi);
- q6 = filter16_mask(&flat2, &q6, &f_lo, &f_hi);
- _mm_storeu_si128((__m128i *)(s + 6 * p), q6);
- }
- // wide flat
- // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- }
-}
-
-void vpx_lpf_horizontal_8_sse2(unsigned char *s, int p,
- const unsigned char *_blimit,
- const unsigned char *_limit,
- const unsigned char *_thresh) {
- DECLARE_ALIGNED(16, unsigned char, flat_op2[16]);
- DECLARE_ALIGNED(16, unsigned char, flat_op1[16]);
- DECLARE_ALIGNED(16, unsigned char, flat_op0[16]);
- DECLARE_ALIGNED(16, unsigned char, flat_oq2[16]);
- DECLARE_ALIGNED(16, unsigned char, flat_oq1[16]);
- DECLARE_ALIGNED(16, unsigned char, flat_oq0[16]);
- const __m128i zero = _mm_set1_epi16(0);
- const __m128i blimit = _mm_load_si128((const __m128i *)_blimit);
- const __m128i limit = _mm_load_si128((const __m128i *)_limit);
- const __m128i thresh = _mm_load_si128((const __m128i *)_thresh);
- __m128i mask, hev, flat;
- __m128i p3, p2, p1, p0, q0, q1, q2, q3;
- __m128i q3p3, q2p2, q1p1, q0p0, p1q1, p0q0;
-
- q3p3 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 4 * p)),
- _mm_loadl_epi64((__m128i *)(s + 3 * p)));
- q2p2 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 3 * p)),
- _mm_loadl_epi64((__m128i *)(s + 2 * p)));
- q1p1 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 2 * p)),
- _mm_loadl_epi64((__m128i *)(s + 1 * p)));
- q0p0 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 1 * p)),
- _mm_loadl_epi64((__m128i *)(s - 0 * p)));
- p1q1 = _mm_shuffle_epi32(q1p1, 78);
- p0q0 = _mm_shuffle_epi32(q0p0, 78);
-
- {
- // filter_mask and hev_mask
- const __m128i one = _mm_set1_epi8(1);
- const __m128i fe = _mm_set1_epi8(0xfe);
- const __m128i ff = _mm_cmpeq_epi8(fe, fe);
- __m128i abs_p1q1, abs_p0q0, abs_q1q0, abs_p1p0, work;
- abs_p1p0 = abs_diff(q1p1, q0p0);
- abs_q1q0 = _mm_srli_si128(abs_p1p0, 8);
-
- abs_p0q0 = abs_diff(q0p0, p0q0);
- abs_p1q1 = abs_diff(q1p1, p1q1);
- flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
- hev = _mm_subs_epu8(flat, thresh);
- hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
-
- abs_p0q0 =_mm_adds_epu8(abs_p0q0, abs_p0q0);
- abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
- mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit);
- mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
- // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
- mask = _mm_max_epu8(abs_p1p0, mask);
- // mask |= (abs(p1 - p0) > limit) * -1;
- // mask |= (abs(q1 - q0) > limit) * -1;
-
- work = _mm_max_epu8(abs_diff(q2p2, q1p1),
- abs_diff(q3p3, q2p2));
- mask = _mm_max_epu8(work, mask);
- mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 8));
- mask = _mm_subs_epu8(mask, limit);
- mask = _mm_cmpeq_epi8(mask, zero);
-
- // flat_mask4
-
- flat = _mm_max_epu8(abs_diff(q2p2, q0p0),
- abs_diff(q3p3, q0p0));
- flat = _mm_max_epu8(abs_p1p0, flat);
- flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8));
- flat = _mm_subs_epu8(flat, one);
- flat = _mm_cmpeq_epi8(flat, zero);
- flat = _mm_and_si128(flat, mask);
- }
-
- {
- const __m128i four = _mm_set1_epi16(4);
- unsigned char *src = s;
- {
- __m128i workp_a, workp_b, workp_shft;
- p3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 4 * p)), zero);
- p2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 3 * p)), zero);
- p1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 2 * p)), zero);
- p0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 1 * p)), zero);
- q0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 0 * p)), zero);
- q1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 1 * p)), zero);
- q2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 2 * p)), zero);
- q3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 3 * p)), zero);
-
- workp_a = _mm_add_epi16(_mm_add_epi16(p3, p3), _mm_add_epi16(p2, p1));
- workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), p0);
- workp_b = _mm_add_epi16(_mm_add_epi16(q0, p2), p3);
- workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
- _mm_storel_epi64((__m128i *)&flat_op2[0],
- _mm_packus_epi16(workp_shft, workp_shft));
-
- workp_b = _mm_add_epi16(_mm_add_epi16(q0, q1), p1);
- workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
- _mm_storel_epi64((__m128i *)&flat_op1[0],
- _mm_packus_epi16(workp_shft, workp_shft));
-
- workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q2);
- workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p1), p0);
- workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
- _mm_storel_epi64((__m128i *)&flat_op0[0],
- _mm_packus_epi16(workp_shft, workp_shft));
-
- workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q3);
- workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p0), q0);
- workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
- _mm_storel_epi64((__m128i *)&flat_oq0[0],
- _mm_packus_epi16(workp_shft, workp_shft));
-
- workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p2), q3);
- workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q0), q1);
- workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
- _mm_storel_epi64((__m128i *)&flat_oq1[0],
- _mm_packus_epi16(workp_shft, workp_shft));
-
- workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p1), q3);
- workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q1), q2);
- workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
- _mm_storel_epi64((__m128i *)&flat_oq2[0],
- _mm_packus_epi16(workp_shft, workp_shft));
- }
- }
- // lp filter
- {
- const __m128i t4 = _mm_set1_epi8(4);
- const __m128i t3 = _mm_set1_epi8(3);
- const __m128i t80 = _mm_set1_epi8(0x80);
- const __m128i t1 = _mm_set1_epi8(0x1);
- const __m128i ps1 = _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s - 2 * p)),
- t80);
- const __m128i ps0 = _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s - 1 * p)),
- t80);
- const __m128i qs0 = _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s + 0 * p)),
- t80);
- const __m128i qs1 = _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s + 1 * p)),
- t80);
- __m128i filt;
- __m128i work_a;
- __m128i filter1, filter2;
-
- filt = _mm_and_si128(_mm_subs_epi8(ps1, qs1), hev);
- work_a = _mm_subs_epi8(qs0, ps0);
- filt = _mm_adds_epi8(filt, work_a);
- filt = _mm_adds_epi8(filt, work_a);
- filt = _mm_adds_epi8(filt, work_a);
- // (vpx_filter + 3 * (qs0 - ps0)) & mask
- filt = _mm_and_si128(filt, mask);
-
- filter1 = _mm_adds_epi8(filt, t4);
- filter2 = _mm_adds_epi8(filt, t3);
-
- // Filter1 >> 3
- filter1 = _mm_unpacklo_epi8(zero, filter1);
- filter1 = _mm_srai_epi16(filter1, 11);
- filter1 = _mm_packs_epi16(filter1, filter1);
-
- // Filter2 >> 3
- filter2 = _mm_unpacklo_epi8(zero, filter2);
- filter2 = _mm_srai_epi16(filter2, 11);
- filter2 = _mm_packs_epi16(filter2, zero);
-
- // filt >> 1
- filt = _mm_adds_epi8(filter1, t1);
- filt = _mm_unpacklo_epi8(zero, filt);
- filt = _mm_srai_epi16(filt, 9);
- filt = _mm_packs_epi16(filt, zero);
-
- filt = _mm_andnot_si128(hev, filt);
-
- work_a = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80);
- q0 = _mm_loadl_epi64((__m128i *)flat_oq0);
- work_a = _mm_andnot_si128(flat, work_a);
- q0 = _mm_and_si128(flat, q0);
- q0 = _mm_or_si128(work_a, q0);
-
- work_a = _mm_xor_si128(_mm_subs_epi8(qs1, filt), t80);
- q1 = _mm_loadl_epi64((__m128i *)flat_oq1);
- work_a = _mm_andnot_si128(flat, work_a);
- q1 = _mm_and_si128(flat, q1);
- q1 = _mm_or_si128(work_a, q1);
-
- work_a = _mm_loadu_si128((__m128i *)(s + 2 * p));
- q2 = _mm_loadl_epi64((__m128i *)flat_oq2);
- work_a = _mm_andnot_si128(flat, work_a);
- q2 = _mm_and_si128(flat, q2);
- q2 = _mm_or_si128(work_a, q2);
-
- work_a = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80);
- p0 = _mm_loadl_epi64((__m128i *)flat_op0);
- work_a = _mm_andnot_si128(flat, work_a);
- p0 = _mm_and_si128(flat, p0);
- p0 = _mm_or_si128(work_a, p0);
-
- work_a = _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80);
- p1 = _mm_loadl_epi64((__m128i *)flat_op1);
- work_a = _mm_andnot_si128(flat, work_a);
- p1 = _mm_and_si128(flat, p1);
- p1 = _mm_or_si128(work_a, p1);
-
- work_a = _mm_loadu_si128((__m128i *)(s - 3 * p));
- p2 = _mm_loadl_epi64((__m128i *)flat_op2);
- work_a = _mm_andnot_si128(flat, work_a);
- p2 = _mm_and_si128(flat, p2);
- p2 = _mm_or_si128(work_a, p2);
-
- _mm_storel_epi64((__m128i *)(s - 3 * p), p2);
- _mm_storel_epi64((__m128i *)(s - 2 * p), p1);
- _mm_storel_epi64((__m128i *)(s - 1 * p), p0);
- _mm_storel_epi64((__m128i *)(s + 0 * p), q0);
- _mm_storel_epi64((__m128i *)(s + 1 * p), q1);
- _mm_storel_epi64((__m128i *)(s + 2 * p), q2);
- }
-}
-
-void vpx_lpf_horizontal_8_dual_sse2(uint8_t *s, int p,
- const uint8_t *_blimit0,
- const uint8_t *_limit0,
- const uint8_t *_thresh0,
- const uint8_t *_blimit1,
- const uint8_t *_limit1,
- const uint8_t *_thresh1) {
- DECLARE_ALIGNED(16, unsigned char, flat_op2[16]);
- DECLARE_ALIGNED(16, unsigned char, flat_op1[16]);
- DECLARE_ALIGNED(16, unsigned char, flat_op0[16]);
- DECLARE_ALIGNED(16, unsigned char, flat_oq2[16]);
- DECLARE_ALIGNED(16, unsigned char, flat_oq1[16]);
- DECLARE_ALIGNED(16, unsigned char, flat_oq0[16]);
- const __m128i zero = _mm_set1_epi16(0);
- const __m128i blimit =
- _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_blimit0),
- _mm_load_si128((const __m128i *)_blimit1));
- const __m128i limit =
- _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_limit0),
- _mm_load_si128((const __m128i *)_limit1));
- const __m128i thresh =
- _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_thresh0),
- _mm_load_si128((const __m128i *)_thresh1));
-
- __m128i mask, hev, flat;
- __m128i p3, p2, p1, p0, q0, q1, q2, q3;
-
- p3 = _mm_loadu_si128((__m128i *)(s - 4 * p));
- p2 = _mm_loadu_si128((__m128i *)(s - 3 * p));
- p1 = _mm_loadu_si128((__m128i *)(s - 2 * p));
- p0 = _mm_loadu_si128((__m128i *)(s - 1 * p));
- q0 = _mm_loadu_si128((__m128i *)(s - 0 * p));
- q1 = _mm_loadu_si128((__m128i *)(s + 1 * p));
- q2 = _mm_loadu_si128((__m128i *)(s + 2 * p));
- q3 = _mm_loadu_si128((__m128i *)(s + 3 * p));
- {
- const __m128i abs_p1p0 = _mm_or_si128(_mm_subs_epu8(p1, p0),
- _mm_subs_epu8(p0, p1));
- const __m128i abs_q1q0 = _mm_or_si128(_mm_subs_epu8(q1, q0),
- _mm_subs_epu8(q0, q1));
- const __m128i one = _mm_set1_epi8(1);
- const __m128i fe = _mm_set1_epi8(0xfe);
- const __m128i ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0);
- __m128i abs_p0q0 = _mm_or_si128(_mm_subs_epu8(p0, q0),
- _mm_subs_epu8(q0, p0));
- __m128i abs_p1q1 = _mm_or_si128(_mm_subs_epu8(p1, q1),
- _mm_subs_epu8(q1, p1));
- __m128i work;
-
- // filter_mask and hev_mask
- flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
- hev = _mm_subs_epu8(flat, thresh);
- hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
-
- abs_p0q0 =_mm_adds_epu8(abs_p0q0, abs_p0q0);
- abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
- mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit);
- mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
- // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
- mask = _mm_max_epu8(flat, mask);
- // mask |= (abs(p1 - p0) > limit) * -1;
- // mask |= (abs(q1 - q0) > limit) * -1;
- work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p2, p1),
- _mm_subs_epu8(p1, p2)),
- _mm_or_si128(_mm_subs_epu8(p3, p2),
- _mm_subs_epu8(p2, p3)));
- mask = _mm_max_epu8(work, mask);
- work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(q2, q1),
- _mm_subs_epu8(q1, q2)),
- _mm_or_si128(_mm_subs_epu8(q3, q2),
- _mm_subs_epu8(q2, q3)));
- mask = _mm_max_epu8(work, mask);
- mask = _mm_subs_epu8(mask, limit);
- mask = _mm_cmpeq_epi8(mask, zero);
-
- // flat_mask4
- work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p2, p0),
- _mm_subs_epu8(p0, p2)),
- _mm_or_si128(_mm_subs_epu8(q2, q0),
- _mm_subs_epu8(q0, q2)));
- flat = _mm_max_epu8(work, flat);
- work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p3, p0),
- _mm_subs_epu8(p0, p3)),
- _mm_or_si128(_mm_subs_epu8(q3, q0),
- _mm_subs_epu8(q0, q3)));
- flat = _mm_max_epu8(work, flat);
- flat = _mm_subs_epu8(flat, one);
- flat = _mm_cmpeq_epi8(flat, zero);
- flat = _mm_and_si128(flat, mask);
- }
- {
- const __m128i four = _mm_set1_epi16(4);
- unsigned char *src = s;
- int i = 0;
-
- do {
- __m128i workp_a, workp_b, workp_shft;
- p3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 4 * p)), zero);
- p2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 3 * p)), zero);
- p1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 2 * p)), zero);
- p0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 1 * p)), zero);
- q0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 0 * p)), zero);
- q1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 1 * p)), zero);
- q2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 2 * p)), zero);
- q3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 3 * p)), zero);
-
- workp_a = _mm_add_epi16(_mm_add_epi16(p3, p3), _mm_add_epi16(p2, p1));
- workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), p0);
- workp_b = _mm_add_epi16(_mm_add_epi16(q0, p2), p3);
- workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
- _mm_storel_epi64((__m128i *)&flat_op2[i * 8],
- _mm_packus_epi16(workp_shft, workp_shft));
-
- workp_b = _mm_add_epi16(_mm_add_epi16(q0, q1), p1);
- workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
- _mm_storel_epi64((__m128i *)&flat_op1[i * 8],
- _mm_packus_epi16(workp_shft, workp_shft));
-
- workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q2);
- workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p1), p0);
- workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
- _mm_storel_epi64((__m128i *)&flat_op0[i * 8],
- _mm_packus_epi16(workp_shft, workp_shft));
-
- workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q3);
- workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p0), q0);
- workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
- _mm_storel_epi64((__m128i *)&flat_oq0[i * 8],
- _mm_packus_epi16(workp_shft, workp_shft));
-
- workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p2), q3);
- workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q0), q1);
- workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
- _mm_storel_epi64((__m128i *)&flat_oq1[i * 8],
- _mm_packus_epi16(workp_shft, workp_shft));
-
- workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p1), q3);
- workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q1), q2);
- workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
- _mm_storel_epi64((__m128i *)&flat_oq2[i * 8],
- _mm_packus_epi16(workp_shft, workp_shft));
-
- src += 8;
- } while (++i < 2);
- }
- // lp filter
- {
- const __m128i t4 = _mm_set1_epi8(4);
- const __m128i t3 = _mm_set1_epi8(3);
- const __m128i t80 = _mm_set1_epi8(0x80);
- const __m128i te0 = _mm_set1_epi8(0xe0);
- const __m128i t1f = _mm_set1_epi8(0x1f);
- const __m128i t1 = _mm_set1_epi8(0x1);
- const __m128i t7f = _mm_set1_epi8(0x7f);
-
- const __m128i ps1 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 2 * p)),
- t80);
- const __m128i ps0 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 1 * p)),
- t80);
- const __m128i qs0 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 0 * p)),
- t80);
- const __m128i qs1 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 1 * p)),
- t80);
- __m128i filt;
- __m128i work_a;
- __m128i filter1, filter2;
-
- filt = _mm_and_si128(_mm_subs_epi8(ps1, qs1), hev);
- work_a = _mm_subs_epi8(qs0, ps0);
- filt = _mm_adds_epi8(filt, work_a);
- filt = _mm_adds_epi8(filt, work_a);
- filt = _mm_adds_epi8(filt, work_a);
- // (vpx_filter + 3 * (qs0 - ps0)) & mask
- filt = _mm_and_si128(filt, mask);
-
- filter1 = _mm_adds_epi8(filt, t4);
- filter2 = _mm_adds_epi8(filt, t3);
-
- // Filter1 >> 3
- work_a = _mm_cmpgt_epi8(zero, filter1);
- filter1 = _mm_srli_epi16(filter1, 3);
- work_a = _mm_and_si128(work_a, te0);
- filter1 = _mm_and_si128(filter1, t1f);
- filter1 = _mm_or_si128(filter1, work_a);
-
- // Filter2 >> 3
- work_a = _mm_cmpgt_epi8(zero, filter2);
- filter2 = _mm_srli_epi16(filter2, 3);
- work_a = _mm_and_si128(work_a, te0);
- filter2 = _mm_and_si128(filter2, t1f);
- filter2 = _mm_or_si128(filter2, work_a);
-
- // filt >> 1
- filt = _mm_adds_epi8(filter1, t1);
- work_a = _mm_cmpgt_epi8(zero, filt);
- filt = _mm_srli_epi16(filt, 1);
- work_a = _mm_and_si128(work_a, t80);
- filt = _mm_and_si128(filt, t7f);
- filt = _mm_or_si128(filt, work_a);
-
- filt = _mm_andnot_si128(hev, filt);
-
- work_a = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80);
- q0 = _mm_load_si128((__m128i *)flat_oq0);
- work_a = _mm_andnot_si128(flat, work_a);
- q0 = _mm_and_si128(flat, q0);
- q0 = _mm_or_si128(work_a, q0);
-
- work_a = _mm_xor_si128(_mm_subs_epi8(qs1, filt), t80);
- q1 = _mm_load_si128((__m128i *)flat_oq1);
- work_a = _mm_andnot_si128(flat, work_a);
- q1 = _mm_and_si128(flat, q1);
- q1 = _mm_or_si128(work_a, q1);
-
- work_a = _mm_loadu_si128((__m128i *)(s + 2 * p));
- q2 = _mm_load_si128((__m128i *)flat_oq2);
- work_a = _mm_andnot_si128(flat, work_a);
- q2 = _mm_and_si128(flat, q2);
- q2 = _mm_or_si128(work_a, q2);
-
- work_a = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80);
- p0 = _mm_load_si128((__m128i *)flat_op0);
- work_a = _mm_andnot_si128(flat, work_a);
- p0 = _mm_and_si128(flat, p0);
- p0 = _mm_or_si128(work_a, p0);
-
- work_a = _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80);
- p1 = _mm_load_si128((__m128i *)flat_op1);
- work_a = _mm_andnot_si128(flat, work_a);
- p1 = _mm_and_si128(flat, p1);
- p1 = _mm_or_si128(work_a, p1);
-
- work_a = _mm_loadu_si128((__m128i *)(s - 3 * p));
- p2 = _mm_load_si128((__m128i *)flat_op2);
- work_a = _mm_andnot_si128(flat, work_a);
- p2 = _mm_and_si128(flat, p2);
- p2 = _mm_or_si128(work_a, p2);
-
- _mm_storeu_si128((__m128i *)(s - 3 * p), p2);
- _mm_storeu_si128((__m128i *)(s - 2 * p), p1);
- _mm_storeu_si128((__m128i *)(s - 1 * p), p0);
- _mm_storeu_si128((__m128i *)(s + 0 * p), q0);
- _mm_storeu_si128((__m128i *)(s + 1 * p), q1);
- _mm_storeu_si128((__m128i *)(s + 2 * p), q2);
- }
-}
-
-void vpx_lpf_horizontal_4_dual_sse2(unsigned char *s, int p,
- const unsigned char *_blimit0,
- const unsigned char *_limit0,
- const unsigned char *_thresh0,
- const unsigned char *_blimit1,
- const unsigned char *_limit1,
- const unsigned char *_thresh1) {
- const __m128i blimit =
- _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_blimit0),
- _mm_load_si128((const __m128i *)_blimit1));
- const __m128i limit =
- _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_limit0),
- _mm_load_si128((const __m128i *)_limit1));
- const __m128i thresh =
- _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_thresh0),
- _mm_load_si128((const __m128i *)_thresh1));
- const __m128i zero = _mm_set1_epi16(0);
- __m128i p3, p2, p1, p0, q0, q1, q2, q3;
- __m128i mask, hev, flat;
-
- p3 = _mm_loadu_si128((__m128i *)(s - 4 * p));
- p2 = _mm_loadu_si128((__m128i *)(s - 3 * p));
- p1 = _mm_loadu_si128((__m128i *)(s - 2 * p));
- p0 = _mm_loadu_si128((__m128i *)(s - 1 * p));
- q0 = _mm_loadu_si128((__m128i *)(s - 0 * p));
- q1 = _mm_loadu_si128((__m128i *)(s + 1 * p));
- q2 = _mm_loadu_si128((__m128i *)(s + 2 * p));
- q3 = _mm_loadu_si128((__m128i *)(s + 3 * p));
-
- // filter_mask and hev_mask
- {
- const __m128i abs_p1p0 = _mm_or_si128(_mm_subs_epu8(p1, p0),
- _mm_subs_epu8(p0, p1));
- const __m128i abs_q1q0 = _mm_or_si128(_mm_subs_epu8(q1, q0),
- _mm_subs_epu8(q0, q1));
- const __m128i fe = _mm_set1_epi8(0xfe);
- const __m128i ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0);
- __m128i abs_p0q0 = _mm_or_si128(_mm_subs_epu8(p0, q0),
- _mm_subs_epu8(q0, p0));
- __m128i abs_p1q1 = _mm_or_si128(_mm_subs_epu8(p1, q1),
- _mm_subs_epu8(q1, p1));
- __m128i work;
-
- flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
- hev = _mm_subs_epu8(flat, thresh);
- hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
-
- abs_p0q0 =_mm_adds_epu8(abs_p0q0, abs_p0q0);
- abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
- mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit);
- mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
- // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
- mask = _mm_max_epu8(flat, mask);
- // mask |= (abs(p1 - p0) > limit) * -1;
- // mask |= (abs(q1 - q0) > limit) * -1;
- work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p2, p1),
- _mm_subs_epu8(p1, p2)),
- _mm_or_si128(_mm_subs_epu8(p3, p2),
- _mm_subs_epu8(p2, p3)));
- mask = _mm_max_epu8(work, mask);
- work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(q2, q1),
- _mm_subs_epu8(q1, q2)),
- _mm_or_si128(_mm_subs_epu8(q3, q2),
- _mm_subs_epu8(q2, q3)));
- mask = _mm_max_epu8(work, mask);
- mask = _mm_subs_epu8(mask, limit);
- mask = _mm_cmpeq_epi8(mask, zero);
- }
-
- // filter4
- {
- const __m128i t4 = _mm_set1_epi8(4);
- const __m128i t3 = _mm_set1_epi8(3);
- const __m128i t80 = _mm_set1_epi8(0x80);
- const __m128i te0 = _mm_set1_epi8(0xe0);
- const __m128i t1f = _mm_set1_epi8(0x1f);
- const __m128i t1 = _mm_set1_epi8(0x1);
- const __m128i t7f = _mm_set1_epi8(0x7f);
-
- const __m128i ps1 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 2 * p)),
- t80);
- const __m128i ps0 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 1 * p)),
- t80);
- const __m128i qs0 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 0 * p)),
- t80);
- const __m128i qs1 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 1 * p)),
- t80);
- __m128i filt;
- __m128i work_a;
- __m128i filter1, filter2;
-
- filt = _mm_and_si128(_mm_subs_epi8(ps1, qs1), hev);
- work_a = _mm_subs_epi8(qs0, ps0);
- filt = _mm_adds_epi8(filt, work_a);
- filt = _mm_adds_epi8(filt, work_a);
- filt = _mm_adds_epi8(filt, work_a);
- // (vpx_filter + 3 * (qs0 - ps0)) & mask
- filt = _mm_and_si128(filt, mask);
-
- filter1 = _mm_adds_epi8(filt, t4);
- filter2 = _mm_adds_epi8(filt, t3);
-
- // Filter1 >> 3
- work_a = _mm_cmpgt_epi8(zero, filter1);
- filter1 = _mm_srli_epi16(filter1, 3);
- work_a = _mm_and_si128(work_a, te0);
- filter1 = _mm_and_si128(filter1, t1f);
- filter1 = _mm_or_si128(filter1, work_a);
-
- // Filter2 >> 3
- work_a = _mm_cmpgt_epi8(zero, filter2);
- filter2 = _mm_srli_epi16(filter2, 3);
- work_a = _mm_and_si128(work_a, te0);
- filter2 = _mm_and_si128(filter2, t1f);
- filter2 = _mm_or_si128(filter2, work_a);
-
- // filt >> 1
- filt = _mm_adds_epi8(filter1, t1);
- work_a = _mm_cmpgt_epi8(zero, filt);
- filt = _mm_srli_epi16(filt, 1);
- work_a = _mm_and_si128(work_a, t80);
- filt = _mm_and_si128(filt, t7f);
- filt = _mm_or_si128(filt, work_a);
-
- filt = _mm_andnot_si128(hev, filt);
-
- q0 = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80);
- q1 = _mm_xor_si128(_mm_subs_epi8(qs1, filt), t80);
- p0 = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80);
- p1 = _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80);
-
- _mm_storeu_si128((__m128i *)(s - 2 * p), p1);
- _mm_storeu_si128((__m128i *)(s - 1 * p), p0);
- _mm_storeu_si128((__m128i *)(s + 0 * p), q0);
- _mm_storeu_si128((__m128i *)(s + 1 * p), q1);
- }
-}
-
-static INLINE void transpose8x16(unsigned char *in0, unsigned char *in1,
- int in_p, unsigned char *out, int out_p) {
- __m128i x0, x1, x2, x3, x4, x5, x6, x7;
- __m128i x8, x9, x10, x11, x12, x13, x14, x15;
-
- // 2-way interleave w/hoisting of unpacks
- x0 = _mm_loadl_epi64((__m128i *)in0); // 1
- x1 = _mm_loadl_epi64((__m128i *)(in0 + in_p)); // 3
- x0 = _mm_unpacklo_epi8(x0, x1); // 1
-
- x2 = _mm_loadl_epi64((__m128i *)(in0 + 2 * in_p)); // 5
- x3 = _mm_loadl_epi64((__m128i *)(in0 + 3*in_p)); // 7
- x1 = _mm_unpacklo_epi8(x2, x3); // 2
-
- x4 = _mm_loadl_epi64((__m128i *)(in0 + 4*in_p)); // 9
- x5 = _mm_loadl_epi64((__m128i *)(in0 + 5*in_p)); // 11
- x2 = _mm_unpacklo_epi8(x4, x5); // 3
-
- x6 = _mm_loadl_epi64((__m128i *)(in0 + 6*in_p)); // 13
- x7 = _mm_loadl_epi64((__m128i *)(in0 + 7*in_p)); // 15
- x3 = _mm_unpacklo_epi8(x6, x7); // 4
- x4 = _mm_unpacklo_epi16(x0, x1); // 9
-
- x8 = _mm_loadl_epi64((__m128i *)in1); // 2
- x9 = _mm_loadl_epi64((__m128i *)(in1 + in_p)); // 4
- x8 = _mm_unpacklo_epi8(x8, x9); // 5
- x5 = _mm_unpacklo_epi16(x2, x3); // 10
-
- x10 = _mm_loadl_epi64((__m128i *)(in1 + 2 * in_p)); // 6
- x11 = _mm_loadl_epi64((__m128i *)(in1 + 3*in_p)); // 8
- x9 = _mm_unpacklo_epi8(x10, x11); // 6
-
- x12 = _mm_loadl_epi64((__m128i *)(in1 + 4*in_p)); // 10
- x13 = _mm_loadl_epi64((__m128i *)(in1 + 5*in_p)); // 12
- x10 = _mm_unpacklo_epi8(x12, x13); // 7
- x12 = _mm_unpacklo_epi16(x8, x9); // 11
-
- x14 = _mm_loadl_epi64((__m128i *)(in1 + 6*in_p)); // 14
- x15 = _mm_loadl_epi64((__m128i *)(in1 + 7*in_p)); // 16
- x11 = _mm_unpacklo_epi8(x14, x15); // 8
- x13 = _mm_unpacklo_epi16(x10, x11); // 12
-
- x6 = _mm_unpacklo_epi32(x4, x5); // 13
- x7 = _mm_unpackhi_epi32(x4, x5); // 14
- x14 = _mm_unpacklo_epi32(x12, x13); // 15
- x15 = _mm_unpackhi_epi32(x12, x13); // 16
-
- // Store first 4-line result
- _mm_storeu_si128((__m128i *)out, _mm_unpacklo_epi64(x6, x14));
- _mm_storeu_si128((__m128i *)(out + out_p), _mm_unpackhi_epi64(x6, x14));
- _mm_storeu_si128((__m128i *)(out + 2 * out_p), _mm_unpacklo_epi64(x7, x15));
- _mm_storeu_si128((__m128i *)(out + 3 * out_p), _mm_unpackhi_epi64(x7, x15));
-
- x4 = _mm_unpackhi_epi16(x0, x1);
- x5 = _mm_unpackhi_epi16(x2, x3);
- x12 = _mm_unpackhi_epi16(x8, x9);
- x13 = _mm_unpackhi_epi16(x10, x11);
-
- x6 = _mm_unpacklo_epi32(x4, x5);
- x7 = _mm_unpackhi_epi32(x4, x5);
- x14 = _mm_unpacklo_epi32(x12, x13);
- x15 = _mm_unpackhi_epi32(x12, x13);
-
- // Store second 4-line result
- _mm_storeu_si128((__m128i *)(out + 4 * out_p), _mm_unpacklo_epi64(x6, x14));
- _mm_storeu_si128((__m128i *)(out + 5 * out_p), _mm_unpackhi_epi64(x6, x14));
- _mm_storeu_si128((__m128i *)(out + 6 * out_p), _mm_unpacklo_epi64(x7, x15));
- _mm_storeu_si128((__m128i *)(out + 7 * out_p), _mm_unpackhi_epi64(x7, x15));
-}
-
-static INLINE void transpose(unsigned char *src[], int in_p,
- unsigned char *dst[], int out_p,
- int num_8x8_to_transpose) {
- int idx8x8 = 0;
- __m128i x0, x1, x2, x3, x4, x5, x6, x7;
- do {
- unsigned char *in = src[idx8x8];
- unsigned char *out = dst[idx8x8];
-
- x0 = _mm_loadl_epi64((__m128i *)(in + 0*in_p)); // 00 01 02 03 04 05 06 07
- x1 = _mm_loadl_epi64((__m128i *)(in + 1*in_p)); // 10 11 12 13 14 15 16 17
- // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
- x0 = _mm_unpacklo_epi8(x0, x1);
-
- x2 = _mm_loadl_epi64((__m128i *)(in + 2*in_p)); // 20 21 22 23 24 25 26 27
- x3 = _mm_loadl_epi64((__m128i *)(in + 3*in_p)); // 30 31 32 33 34 35 36 37
- // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
- x1 = _mm_unpacklo_epi8(x2, x3);
-
- x4 = _mm_loadl_epi64((__m128i *)(in + 4*in_p)); // 40 41 42 43 44 45 46 47
- x5 = _mm_loadl_epi64((__m128i *)(in + 5*in_p)); // 50 51 52 53 54 55 56 57
- // 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57
- x2 = _mm_unpacklo_epi8(x4, x5);
-
- x6 = _mm_loadl_epi64((__m128i *)(in + 6*in_p)); // 60 61 62 63 64 65 66 67
- x7 = _mm_loadl_epi64((__m128i *)(in + 7*in_p)); // 70 71 72 73 74 75 76 77
- // 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77
- x3 = _mm_unpacklo_epi8(x6, x7);
-
- // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
- x4 = _mm_unpacklo_epi16(x0, x1);
- // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73
- x5 = _mm_unpacklo_epi16(x2, x3);
- // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71
- x6 = _mm_unpacklo_epi32(x4, x5);
- _mm_storel_pd((double *)(out + 0*out_p),
- _mm_castsi128_pd(x6)); // 00 10 20 30 40 50 60 70
- _mm_storeh_pd((double *)(out + 1*out_p),
- _mm_castsi128_pd(x6)); // 01 11 21 31 41 51 61 71
- // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73
- x7 = _mm_unpackhi_epi32(x4, x5);
- _mm_storel_pd((double *)(out + 2*out_p),
- _mm_castsi128_pd(x7)); // 02 12 22 32 42 52 62 72
- _mm_storeh_pd((double *)(out + 3*out_p),
- _mm_castsi128_pd(x7)); // 03 13 23 33 43 53 63 73
-
- // 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37
- x4 = _mm_unpackhi_epi16(x0, x1);
- // 44 54 64 74 45 55 65 75 46 56 66 76 47 57 67 77
- x5 = _mm_unpackhi_epi16(x2, x3);
- // 04 14 24 34 44 54 64 74 05 15 25 35 45 55 65 75
- x6 = _mm_unpacklo_epi32(x4, x5);
- _mm_storel_pd((double *)(out + 4*out_p),
- _mm_castsi128_pd(x6)); // 04 14 24 34 44 54 64 74
- _mm_storeh_pd((double *)(out + 5*out_p),
- _mm_castsi128_pd(x6)); // 05 15 25 35 45 55 65 75
- // 06 16 26 36 46 56 66 76 07 17 27 37 47 57 67 77
- x7 = _mm_unpackhi_epi32(x4, x5);
-
- _mm_storel_pd((double *)(out + 6*out_p),
- _mm_castsi128_pd(x7)); // 06 16 26 36 46 56 66 76
- _mm_storeh_pd((double *)(out + 7*out_p),
- _mm_castsi128_pd(x7)); // 07 17 27 37 47 57 67 77
- } while (++idx8x8 < num_8x8_to_transpose);
-}
-
-void vpx_lpf_vertical_4_dual_sse2(uint8_t *s, int p, const uint8_t *blimit0,
- const uint8_t *limit0,
- const uint8_t *thresh0,
- const uint8_t *blimit1,
- const uint8_t *limit1,
- const uint8_t *thresh1) {
- DECLARE_ALIGNED(16, unsigned char, t_dst[16 * 8]);
- unsigned char *src[2];
- unsigned char *dst[2];
-
- // Transpose 8x16
- transpose8x16(s - 4, s - 4 + p * 8, p, t_dst, 16);
-
- // Loop filtering
- vpx_lpf_horizontal_4_dual_sse2(t_dst + 4 * 16, 16, blimit0, limit0, thresh0,
- blimit1, limit1, thresh1);
- src[0] = t_dst;
- src[1] = t_dst + 8;
- dst[0] = s - 4;
- dst[1] = s - 4 + p * 8;
-
- // Transpose back
- transpose(src, 16, dst, p, 2);
-}
-
-void vpx_lpf_vertical_8_sse2(unsigned char *s, int p,
- const unsigned char *blimit,
- const unsigned char *limit,
- const unsigned char *thresh) {
- DECLARE_ALIGNED(8, unsigned char, t_dst[8 * 8]);
- unsigned char *src[1];
- unsigned char *dst[1];
-
- // Transpose 8x8
- src[0] = s - 4;
- dst[0] = t_dst;
-
- transpose(src, p, dst, 8, 1);
-
- // Loop filtering
- vpx_lpf_horizontal_8_sse2(t_dst + 4 * 8, 8, blimit, limit, thresh);
-
- src[0] = t_dst;
- dst[0] = s - 4;
-
- // Transpose back
- transpose(src, 8, dst, p, 1);
-}
-
-void vpx_lpf_vertical_8_dual_sse2(uint8_t *s, int p, const uint8_t *blimit0,
- const uint8_t *limit0,
- const uint8_t *thresh0,
- const uint8_t *blimit1,
- const uint8_t *limit1,
- const uint8_t *thresh1) {
- DECLARE_ALIGNED(16, unsigned char, t_dst[16 * 8]);
- unsigned char *src[2];
- unsigned char *dst[2];
-
- // Transpose 8x16
- transpose8x16(s - 4, s - 4 + p * 8, p, t_dst, 16);
-
- // Loop filtering
- vpx_lpf_horizontal_8_dual_sse2(t_dst + 4 * 16, 16, blimit0, limit0, thresh0,
- blimit1, limit1, thresh1);
- src[0] = t_dst;
- src[1] = t_dst + 8;
-
- dst[0] = s - 4;
- dst[1] = s - 4 + p * 8;
-
- // Transpose back
- transpose(src, 16, dst, p, 2);
-}
-
-void vpx_lpf_vertical_16_sse2(unsigned char *s, int p,
- const unsigned char *blimit,
- const unsigned char *limit,
- const unsigned char *thresh) {
- DECLARE_ALIGNED(8, unsigned char, t_dst[8 * 16]);
- unsigned char *src[2];
- unsigned char *dst[2];
-
- src[0] = s - 8;
- src[1] = s;
- dst[0] = t_dst;
- dst[1] = t_dst + 8 * 8;
-
- // Transpose 16x8
- transpose(src, p, dst, 8, 2);
-
- // Loop filtering
- vpx_lpf_horizontal_edge_8_sse2(t_dst + 8 * 8, 8, blimit, limit, thresh);
-
- src[0] = t_dst;
- src[1] = t_dst + 8 * 8;
- dst[0] = s - 8;
- dst[1] = s;
-
- // Transpose back
- transpose(src, 8, dst, p, 2);
-}
-
-void vpx_lpf_vertical_16_dual_sse2(unsigned char *s, int p,
- const uint8_t *blimit, const uint8_t *limit,
- const uint8_t *thresh) {
- DECLARE_ALIGNED(16, unsigned char, t_dst[256]);
-
- // Transpose 16x16
- transpose8x16(s - 8, s - 8 + 8 * p, p, t_dst, 16);
- transpose8x16(s, s + 8 * p, p, t_dst + 8 * 16, 16);
-
- // Loop filtering
- vpx_lpf_horizontal_edge_16_sse2(t_dst + 8 * 16, 16, blimit, limit, thresh);
-
- // Transpose back
- transpose8x16(t_dst, t_dst + 8 * 16, 16, s - 8, p);
- transpose8x16(t_dst + 8, t_dst + 8 + 8 * 16, 16, s - 8 + 8 * p, p);
-}
diff --git a/thirdparty/libvpx/vpx_dsp/x86/txfm_common_sse2.h b/thirdparty/libvpx/vpx_dsp/x86/txfm_common_sse2.h
deleted file mode 100644
index 536b206876..0000000000
--- a/thirdparty/libvpx/vpx_dsp/x86/txfm_common_sse2.h
+++ /dev/null
@@ -1,29 +0,0 @@
-/*
- * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#ifndef VPX_DSP_X86_TXFM_COMMON_SSE2_H_
-#define VPX_DSP_X86_TXFM_COMMON_SSE2_H_
-
-#include <emmintrin.h>
-#include "vpx/vpx_integer.h"
-
-#define pair_set_epi16(a, b) \
- _mm_set_epi16((int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a), \
- (int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a))
-
-#define dual_set_epi16(a, b) \
- _mm_set_epi16((int16_t)(b), (int16_t)(b), (int16_t)(b), (int16_t)(b), \
- (int16_t)(a), (int16_t)(a), (int16_t)(a), (int16_t)(a))
-
-#define octa_set_epi16(a, b, c, d, e, f, g, h) \
- _mm_setr_epi16((int16_t)(a), (int16_t)(b), (int16_t)(c), (int16_t)(d), \
- (int16_t)(e), (int16_t)(f), (int16_t)(g), (int16_t)(h))
-
-#endif // VPX_DSP_X86_TXFM_COMMON_SSE2_H_
diff --git a/thirdparty/libvpx/vpx_dsp/x86/vpx_asm_stubs.c b/thirdparty/libvpx/vpx_dsp/x86/vpx_asm_stubs.c
deleted file mode 100644
index 422b0fc422..0000000000
--- a/thirdparty/libvpx/vpx_dsp/x86/vpx_asm_stubs.c
+++ /dev/null
@@ -1,162 +0,0 @@
-/*
- * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "./vpx_config.h"
-#include "./vpx_dsp_rtcd.h"
-#include "vpx_dsp/x86/convolve.h"
-
-#if HAVE_SSE2
-filter8_1dfunction vpx_filter_block1d16_v8_sse2;
-filter8_1dfunction vpx_filter_block1d16_h8_sse2;
-filter8_1dfunction vpx_filter_block1d8_v8_sse2;
-filter8_1dfunction vpx_filter_block1d8_h8_sse2;
-filter8_1dfunction vpx_filter_block1d4_v8_sse2;
-filter8_1dfunction vpx_filter_block1d4_h8_sse2;
-filter8_1dfunction vpx_filter_block1d16_v8_avg_sse2;
-filter8_1dfunction vpx_filter_block1d16_h8_avg_sse2;
-filter8_1dfunction vpx_filter_block1d8_v8_avg_sse2;
-filter8_1dfunction vpx_filter_block1d8_h8_avg_sse2;
-filter8_1dfunction vpx_filter_block1d4_v8_avg_sse2;
-filter8_1dfunction vpx_filter_block1d4_h8_avg_sse2;
-
-filter8_1dfunction vpx_filter_block1d16_v2_sse2;
-filter8_1dfunction vpx_filter_block1d16_h2_sse2;
-filter8_1dfunction vpx_filter_block1d8_v2_sse2;
-filter8_1dfunction vpx_filter_block1d8_h2_sse2;
-filter8_1dfunction vpx_filter_block1d4_v2_sse2;
-filter8_1dfunction vpx_filter_block1d4_h2_sse2;
-filter8_1dfunction vpx_filter_block1d16_v2_avg_sse2;
-filter8_1dfunction vpx_filter_block1d16_h2_avg_sse2;
-filter8_1dfunction vpx_filter_block1d8_v2_avg_sse2;
-filter8_1dfunction vpx_filter_block1d8_h2_avg_sse2;
-filter8_1dfunction vpx_filter_block1d4_v2_avg_sse2;
-filter8_1dfunction vpx_filter_block1d4_h2_avg_sse2;
-
-// void vpx_convolve8_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride,
-// uint8_t *dst, ptrdiff_t dst_stride,
-// const int16_t *filter_x, int x_step_q4,
-// const int16_t *filter_y, int y_step_q4,
-// int w, int h);
-// void vpx_convolve8_vert_sse2(const uint8_t *src, ptrdiff_t src_stride,
-// uint8_t *dst, ptrdiff_t dst_stride,
-// const int16_t *filter_x, int x_step_q4,
-// const int16_t *filter_y, int y_step_q4,
-// int w, int h);
-// void vpx_convolve8_avg_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride,
-// uint8_t *dst, ptrdiff_t dst_stride,
-// const int16_t *filter_x, int x_step_q4,
-// const int16_t *filter_y, int y_step_q4,
-// int w, int h);
-// void vpx_convolve8_avg_vert_sse2(const uint8_t *src, ptrdiff_t src_stride,
-// uint8_t *dst, ptrdiff_t dst_stride,
-// const int16_t *filter_x, int x_step_q4,
-// const int16_t *filter_y, int y_step_q4,
-// int w, int h);
-FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , sse2);
-FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , sse2);
-FUN_CONV_1D(avg_horiz, x_step_q4, filter_x, h, src, avg_, sse2);
-FUN_CONV_1D(avg_vert, y_step_q4, filter_y, v, src - src_stride * 3, avg_, sse2);
-
-// void vpx_convolve8_sse2(const uint8_t *src, ptrdiff_t src_stride,
-// uint8_t *dst, ptrdiff_t dst_stride,
-// const int16_t *filter_x, int x_step_q4,
-// const int16_t *filter_y, int y_step_q4,
-// int w, int h);
-// void vpx_convolve8_avg_sse2(const uint8_t *src, ptrdiff_t src_stride,
-// uint8_t *dst, ptrdiff_t dst_stride,
-// const int16_t *filter_x, int x_step_q4,
-// const int16_t *filter_y, int y_step_q4,
-// int w, int h);
-FUN_CONV_2D(, sse2);
-FUN_CONV_2D(avg_ , sse2);
-
-#if CONFIG_VP9_HIGHBITDEPTH && ARCH_X86_64
-highbd_filter8_1dfunction vpx_highbd_filter_block1d16_v8_sse2;
-highbd_filter8_1dfunction vpx_highbd_filter_block1d16_h8_sse2;
-highbd_filter8_1dfunction vpx_highbd_filter_block1d8_v8_sse2;
-highbd_filter8_1dfunction vpx_highbd_filter_block1d8_h8_sse2;
-highbd_filter8_1dfunction vpx_highbd_filter_block1d4_v8_sse2;
-highbd_filter8_1dfunction vpx_highbd_filter_block1d4_h8_sse2;
-highbd_filter8_1dfunction vpx_highbd_filter_block1d16_v8_avg_sse2;
-highbd_filter8_1dfunction vpx_highbd_filter_block1d16_h8_avg_sse2;
-highbd_filter8_1dfunction vpx_highbd_filter_block1d8_v8_avg_sse2;
-highbd_filter8_1dfunction vpx_highbd_filter_block1d8_h8_avg_sse2;
-highbd_filter8_1dfunction vpx_highbd_filter_block1d4_v8_avg_sse2;
-highbd_filter8_1dfunction vpx_highbd_filter_block1d4_h8_avg_sse2;
-
-highbd_filter8_1dfunction vpx_highbd_filter_block1d16_v2_sse2;
-highbd_filter8_1dfunction vpx_highbd_filter_block1d16_h2_sse2;
-highbd_filter8_1dfunction vpx_highbd_filter_block1d8_v2_sse2;
-highbd_filter8_1dfunction vpx_highbd_filter_block1d8_h2_sse2;
-highbd_filter8_1dfunction vpx_highbd_filter_block1d4_v2_sse2;
-highbd_filter8_1dfunction vpx_highbd_filter_block1d4_h2_sse2;
-highbd_filter8_1dfunction vpx_highbd_filter_block1d16_v2_avg_sse2;
-highbd_filter8_1dfunction vpx_highbd_filter_block1d16_h2_avg_sse2;
-highbd_filter8_1dfunction vpx_highbd_filter_block1d8_v2_avg_sse2;
-highbd_filter8_1dfunction vpx_highbd_filter_block1d8_h2_avg_sse2;
-highbd_filter8_1dfunction vpx_highbd_filter_block1d4_v2_avg_sse2;
-highbd_filter8_1dfunction vpx_highbd_filter_block1d4_h2_avg_sse2;
-
-// void vpx_highbd_convolve8_horiz_sse2(const uint8_t *src,
-// ptrdiff_t src_stride,
-// uint8_t *dst,
-// ptrdiff_t dst_stride,
-// const int16_t *filter_x,
-// int x_step_q4,
-// const int16_t *filter_y,
-// int y_step_q4,
-// int w, int h, int bd);
-// void vpx_highbd_convolve8_vert_sse2(const uint8_t *src,
-// ptrdiff_t src_stride,
-// uint8_t *dst,
-// ptrdiff_t dst_stride,
-// const int16_t *filter_x,
-// int x_step_q4,
-// const int16_t *filter_y,
-// int y_step_q4,
-// int w, int h, int bd);
-// void vpx_highbd_convolve8_avg_horiz_sse2(const uint8_t *src,
-// ptrdiff_t src_stride,
-// uint8_t *dst,
-// ptrdiff_t dst_stride,
-// const int16_t *filter_x,
-// int x_step_q4,
-// const int16_t *filter_y,
-// int y_step_q4,
-// int w, int h, int bd);
-// void vpx_highbd_convolve8_avg_vert_sse2(const uint8_t *src,
-// ptrdiff_t src_stride,
-// uint8_t *dst,
-// ptrdiff_t dst_stride,
-// const int16_t *filter_x,
-// int x_step_q4,
-// const int16_t *filter_y,
-// int y_step_q4,
-// int w, int h, int bd);
-HIGH_FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , sse2);
-HIGH_FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , sse2);
-HIGH_FUN_CONV_1D(avg_horiz, x_step_q4, filter_x, h, src, avg_, sse2);
-HIGH_FUN_CONV_1D(avg_vert, y_step_q4, filter_y, v, src - src_stride * 3, avg_,
- sse2);
-
-// void vpx_highbd_convolve8_sse2(const uint8_t *src, ptrdiff_t src_stride,
-// uint8_t *dst, ptrdiff_t dst_stride,
-// const int16_t *filter_x, int x_step_q4,
-// const int16_t *filter_y, int y_step_q4,
-// int w, int h, int bd);
-// void vpx_highbd_convolve8_avg_sse2(const uint8_t *src, ptrdiff_t src_stride,
-// uint8_t *dst, ptrdiff_t dst_stride,
-// const int16_t *filter_x, int x_step_q4,
-// const int16_t *filter_y, int y_step_q4,
-// int w, int h, int bd);
-HIGH_FUN_CONV_2D(, sse2);
-HIGH_FUN_CONV_2D(avg_ , sse2);
-#endif // CONFIG_VP9_HIGHBITDEPTH && ARCH_X86_64
-#endif // HAVE_SSE2
diff --git a/thirdparty/libvpx/vpx_dsp/x86/vpx_convolve_copy_sse2.asm b/thirdparty/libvpx/vpx_dsp/x86/vpx_convolve_copy_sse2.asm
deleted file mode 100644
index abc0270655..0000000000
--- a/thirdparty/libvpx/vpx_dsp/x86/vpx_convolve_copy_sse2.asm
+++ /dev/null
@@ -1,228 +0,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-%include "third_party/x86inc/x86inc.asm"
-
-SECTION .text
-
-%macro convolve_fn 1-2
-%ifidn %1, avg
-%define AUX_XMM_REGS 4
-%else
-%define AUX_XMM_REGS 0
-%endif
-%ifidn %2, highbd
-%define pavg pavgw
-cglobal %2_convolve_%1, 4, 7, 4+AUX_XMM_REGS, src, src_stride, \
- dst, dst_stride, \
- fx, fxs, fy, fys, w, h, bd
-%else
-%define pavg pavgb
-cglobal convolve_%1, 4, 7, 4+AUX_XMM_REGS, src, src_stride, \
- dst, dst_stride, \
- fx, fxs, fy, fys, w, h
-%endif
- mov r4d, dword wm
-%ifidn %2, highbd
- shl r4d, 1
- shl srcq, 1
- shl src_strideq, 1
- shl dstq, 1
- shl dst_strideq, 1
-%else
- cmp r4d, 4
- je .w4
-%endif
- cmp r4d, 8
- je .w8
- cmp r4d, 16
- je .w16
- cmp r4d, 32
- je .w32
-%ifidn %2, highbd
- cmp r4d, 64
- je .w64
-
- mov r4d, dword hm
-.loop128:
- movu m0, [srcq]
- movu m1, [srcq+16]
- movu m2, [srcq+32]
- movu m3, [srcq+48]
-%ifidn %1, avg
- pavg m0, [dstq]
- pavg m1, [dstq+16]
- pavg m2, [dstq+32]
- pavg m3, [dstq+48]
-%endif
- mova [dstq ], m0
- mova [dstq+16], m1
- mova [dstq+32], m2
- mova [dstq+48], m3
- movu m0, [srcq+64]
- movu m1, [srcq+80]
- movu m2, [srcq+96]
- movu m3, [srcq+112]
- add srcq, src_strideq
-%ifidn %1, avg
- pavg m0, [dstq+64]
- pavg m1, [dstq+80]
- pavg m2, [dstq+96]
- pavg m3, [dstq+112]
-%endif
- mova [dstq+64], m0
- mova [dstq+80], m1
- mova [dstq+96], m2
- mova [dstq+112], m3
- add dstq, dst_strideq
- dec r4d
- jnz .loop128
- RET
-%endif
-
-.w64
- mov r4d, dword hm
-.loop64:
- movu m0, [srcq]
- movu m1, [srcq+16]
- movu m2, [srcq+32]
- movu m3, [srcq+48]
- add srcq, src_strideq
-%ifidn %1, avg
- pavg m0, [dstq]
- pavg m1, [dstq+16]
- pavg m2, [dstq+32]
- pavg m3, [dstq+48]
-%endif
- mova [dstq ], m0
- mova [dstq+16], m1
- mova [dstq+32], m2
- mova [dstq+48], m3
- add dstq, dst_strideq
- dec r4d
- jnz .loop64
- RET
-
-.w32:
- mov r4d, dword hm
-.loop32:
- movu m0, [srcq]
- movu m1, [srcq+16]
- movu m2, [srcq+src_strideq]
- movu m3, [srcq+src_strideq+16]
- lea srcq, [srcq+src_strideq*2]
-%ifidn %1, avg
- pavg m0, [dstq]
- pavg m1, [dstq +16]
- pavg m2, [dstq+dst_strideq]
- pavg m3, [dstq+dst_strideq+16]
-%endif
- mova [dstq ], m0
- mova [dstq +16], m1
- mova [dstq+dst_strideq ], m2
- mova [dstq+dst_strideq+16], m3
- lea dstq, [dstq+dst_strideq*2]
- sub r4d, 2
- jnz .loop32
- RET
-
-.w16:
- mov r4d, dword hm
- lea r5q, [src_strideq*3]
- lea r6q, [dst_strideq*3]
-.loop16:
- movu m0, [srcq]
- movu m1, [srcq+src_strideq]
- movu m2, [srcq+src_strideq*2]
- movu m3, [srcq+r5q]
- lea srcq, [srcq+src_strideq*4]
-%ifidn %1, avg
- pavg m0, [dstq]
- pavg m1, [dstq+dst_strideq]
- pavg m2, [dstq+dst_strideq*2]
- pavg m3, [dstq+r6q]
-%endif
- mova [dstq ], m0
- mova [dstq+dst_strideq ], m1
- mova [dstq+dst_strideq*2], m2
- mova [dstq+r6q ], m3
- lea dstq, [dstq+dst_strideq*4]
- sub r4d, 4
- jnz .loop16
- RET
-
-.w8:
- mov r4d, dword hm
- lea r5q, [src_strideq*3]
- lea r6q, [dst_strideq*3]
-.loop8:
- movh m0, [srcq]
- movh m1, [srcq+src_strideq]
- movh m2, [srcq+src_strideq*2]
- movh m3, [srcq+r5q]
- lea srcq, [srcq+src_strideq*4]
-%ifidn %1, avg
- movh m4, [dstq]
- movh m5, [dstq+dst_strideq]
- movh m6, [dstq+dst_strideq*2]
- movh m7, [dstq+r6q]
- pavg m0, m4
- pavg m1, m5
- pavg m2, m6
- pavg m3, m7
-%endif
- movh [dstq ], m0
- movh [dstq+dst_strideq ], m1
- movh [dstq+dst_strideq*2], m2
- movh [dstq+r6q ], m3
- lea dstq, [dstq+dst_strideq*4]
- sub r4d, 4
- jnz .loop8
- RET
-
-%ifnidn %2, highbd
-.w4:
- mov r4d, dword hm
- lea r5q, [src_strideq*3]
- lea r6q, [dst_strideq*3]
-.loop4:
- movd m0, [srcq]
- movd m1, [srcq+src_strideq]
- movd m2, [srcq+src_strideq*2]
- movd m3, [srcq+r5q]
- lea srcq, [srcq+src_strideq*4]
-%ifidn %1, avg
- movd m4, [dstq]
- movd m5, [dstq+dst_strideq]
- movd m6, [dstq+dst_strideq*2]
- movd m7, [dstq+r6q]
- pavg m0, m4
- pavg m1, m5
- pavg m2, m6
- pavg m3, m7
-%endif
- movd [dstq ], m0
- movd [dstq+dst_strideq ], m1
- movd [dstq+dst_strideq*2], m2
- movd [dstq+r6q ], m3
- lea dstq, [dstq+dst_strideq*4]
- sub r4d, 4
- jnz .loop4
- RET
-%endif
-%endmacro
-
-INIT_XMM sse2
-convolve_fn copy
-convolve_fn avg
-%if CONFIG_VP9_HIGHBITDEPTH
-convolve_fn copy, highbd
-convolve_fn avg, highbd
-%endif
diff --git a/thirdparty/libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c b/thirdparty/libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c
deleted file mode 100644
index d8a92354c9..0000000000
--- a/thirdparty/libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c
+++ /dev/null
@@ -1,606 +0,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-// Due to a header conflict between math.h and intrinsics includes with ceil()
-// in certain configurations under vs9 this include needs to precede
-// immintrin.h.
-
-#include <immintrin.h>
-
-#include "./vpx_dsp_rtcd.h"
-#include "vpx_dsp/x86/convolve.h"
-#include "vpx_ports/mem.h"
-
-// filters for 16_h8 and 16_v8
-DECLARE_ALIGNED(32, static const uint8_t, filt1_global_avx2[32]) = {
- 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
- 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8
-};
-
-DECLARE_ALIGNED(32, static const uint8_t, filt2_global_avx2[32]) = {
- 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10,
- 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10
-};
-
-DECLARE_ALIGNED(32, static const uint8_t, filt3_global_avx2[32]) = {
- 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12,
- 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12
-};
-
-DECLARE_ALIGNED(32, static const uint8_t, filt4_global_avx2[32]) = {
- 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14,
- 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14
-};
-
-#if defined(__clang__)
-// -- GODOT start -
-# if __clang_major__ < 3 || (__clang_major__ == 3 && __clang_minor__ <= 3) || \
- (!defined(__MACPORTS__) && defined(__APPLE__) && \
- ((__clang_major__ == 4 && __clang_minor__ <= 2) || \
- (__clang_major__ == 5 && __clang_minor__ == 0)))
-// -- GODOT end --
-# define MM256_BROADCASTSI128_SI256(x) \
- _mm_broadcastsi128_si256((__m128i const *)&(x))
-# else // clang > 3.3, and not 5.0 on macosx.
-# define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x)
-# endif // clang <= 3.3
-#elif defined(__GNUC__)
-# if __GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ <= 6)
-# define MM256_BROADCASTSI128_SI256(x) \
- _mm_broadcastsi128_si256((__m128i const *)&(x))
-# elif __GNUC__ == 4 && __GNUC_MINOR__ == 7
-# define MM256_BROADCASTSI128_SI256(x) _mm_broadcastsi128_si256(x)
-# else // gcc > 4.7
-# define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x)
-# endif // gcc <= 4.6
-#else // !(gcc || clang)
-# define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x)
-#endif // __clang__
-
-static void vpx_filter_block1d16_h8_avx2(const uint8_t *src_ptr,
- ptrdiff_t src_pixels_per_line,
- uint8_t *output_ptr,
- ptrdiff_t output_pitch,
- uint32_t output_height,
- const int16_t *filter) {
- __m128i filtersReg;
- __m256i addFilterReg64, filt1Reg, filt2Reg, filt3Reg, filt4Reg;
- __m256i firstFilters, secondFilters, thirdFilters, forthFilters;
- __m256i srcRegFilt32b1_1, srcRegFilt32b2_1, srcRegFilt32b2, srcRegFilt32b3;
- __m256i srcReg32b1, srcReg32b2, filtersReg32;
- unsigned int i;
- ptrdiff_t src_stride, dst_stride;
-
- // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
- addFilterReg64 = _mm256_set1_epi32((int)0x0400040u);
- filtersReg = _mm_loadu_si128((const __m128i *)filter);
- // converting the 16 bit (short) to 8 bit (byte) and have the same data
- // in both lanes of 128 bit register.
- filtersReg =_mm_packs_epi16(filtersReg, filtersReg);
- // have the same data in both lanes of a 256 bit register
- filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg);
-
- // duplicate only the first 16 bits (first and second byte)
- // across 256 bit register
- firstFilters = _mm256_shuffle_epi8(filtersReg32,
- _mm256_set1_epi16(0x100u));
- // duplicate only the second 16 bits (third and forth byte)
- // across 256 bit register
- secondFilters = _mm256_shuffle_epi8(filtersReg32,
- _mm256_set1_epi16(0x302u));
- // duplicate only the third 16 bits (fifth and sixth byte)
- // across 256 bit register
- thirdFilters = _mm256_shuffle_epi8(filtersReg32,
- _mm256_set1_epi16(0x504u));
- // duplicate only the forth 16 bits (seventh and eighth byte)
- // across 256 bit register
- forthFilters = _mm256_shuffle_epi8(filtersReg32,
- _mm256_set1_epi16(0x706u));
-
- filt1Reg = _mm256_load_si256((__m256i const *)filt1_global_avx2);
- filt2Reg = _mm256_load_si256((__m256i const *)filt2_global_avx2);
- filt3Reg = _mm256_load_si256((__m256i const *)filt3_global_avx2);
- filt4Reg = _mm256_load_si256((__m256i const *)filt4_global_avx2);
-
- // multiple the size of the source and destination stride by two
- src_stride = src_pixels_per_line << 1;
- dst_stride = output_pitch << 1;
- for (i = output_height; i > 1; i-=2) {
- // load the 2 strides of source
- srcReg32b1 = _mm256_castsi128_si256(
- _mm_loadu_si128((const __m128i *)(src_ptr - 3)));
- srcReg32b1 = _mm256_inserti128_si256(srcReg32b1,
- _mm_loadu_si128((const __m128i *)
- (src_ptr+src_pixels_per_line-3)), 1);
-
- // filter the source buffer
- srcRegFilt32b1_1= _mm256_shuffle_epi8(srcReg32b1, filt1Reg);
- srcRegFilt32b2= _mm256_shuffle_epi8(srcReg32b1, filt4Reg);
-
- // multiply 2 adjacent elements with the filter and add the result
- srcRegFilt32b1_1 = _mm256_maddubs_epi16(srcRegFilt32b1_1, firstFilters);
- srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, forthFilters);
-
- // add and saturate the results together
- srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, srcRegFilt32b2);
-
- // filter the source buffer
- srcRegFilt32b3= _mm256_shuffle_epi8(srcReg32b1, filt2Reg);
- srcRegFilt32b2= _mm256_shuffle_epi8(srcReg32b1, filt3Reg);
-
- // multiply 2 adjacent elements with the filter and add the result
- srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters);
- srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters);
-
- // add and saturate the results together
- srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1,
- _mm256_min_epi16(srcRegFilt32b3, srcRegFilt32b2));
-
- // reading 2 strides of the next 16 bytes
- // (part of it was being read by earlier read)
- srcReg32b2 = _mm256_castsi128_si256(
- _mm_loadu_si128((const __m128i *)(src_ptr + 5)));
- srcReg32b2 = _mm256_inserti128_si256(srcReg32b2,
- _mm_loadu_si128((const __m128i *)
- (src_ptr+src_pixels_per_line+5)), 1);
-
- // add and saturate the results together
- srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1,
- _mm256_max_epi16(srcRegFilt32b3, srcRegFilt32b2));
-
- // filter the source buffer
- srcRegFilt32b2_1 = _mm256_shuffle_epi8(srcReg32b2, filt1Reg);
- srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b2, filt4Reg);
-
- // multiply 2 adjacent elements with the filter and add the result
- srcRegFilt32b2_1 = _mm256_maddubs_epi16(srcRegFilt32b2_1, firstFilters);
- srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, forthFilters);
-
- // add and saturate the results together
- srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1, srcRegFilt32b2);
-
- // filter the source buffer
- srcRegFilt32b3= _mm256_shuffle_epi8(srcReg32b2, filt2Reg);
- srcRegFilt32b2= _mm256_shuffle_epi8(srcReg32b2, filt3Reg);
-
- // multiply 2 adjacent elements with the filter and add the result
- srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters);
- srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters);
-
- // add and saturate the results together
- srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1,
- _mm256_min_epi16(srcRegFilt32b3, srcRegFilt32b2));
- srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1,
- _mm256_max_epi16(srcRegFilt32b3, srcRegFilt32b2));
-
-
- srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, addFilterReg64);
-
- srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1, addFilterReg64);
-
- // shift by 7 bit each 16 bit
- srcRegFilt32b1_1 = _mm256_srai_epi16(srcRegFilt32b1_1, 7);
- srcRegFilt32b2_1 = _mm256_srai_epi16(srcRegFilt32b2_1, 7);
-
- // shrink to 8 bit each 16 bits, the first lane contain the first
- // convolve result and the second lane contain the second convolve
- // result
- srcRegFilt32b1_1 = _mm256_packus_epi16(srcRegFilt32b1_1,
- srcRegFilt32b2_1);
-
- src_ptr+=src_stride;
-
- // save 16 bytes
- _mm_store_si128((__m128i*)output_ptr,
- _mm256_castsi256_si128(srcRegFilt32b1_1));
-
- // save the next 16 bits
- _mm_store_si128((__m128i*)(output_ptr+output_pitch),
- _mm256_extractf128_si256(srcRegFilt32b1_1, 1));
- output_ptr+=dst_stride;
- }
-
- // if the number of strides is odd.
- // process only 16 bytes
- if (i > 0) {
- __m128i srcReg1, srcReg2, srcRegFilt1_1, srcRegFilt2_1;
- __m128i srcRegFilt2, srcRegFilt3;
-
- srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr - 3));
-
- // filter the source buffer
- srcRegFilt1_1 = _mm_shuffle_epi8(srcReg1,
- _mm256_castsi256_si128(filt1Reg));
- srcRegFilt2 = _mm_shuffle_epi8(srcReg1,
- _mm256_castsi256_si128(filt4Reg));
-
- // multiply 2 adjacent elements with the filter and add the result
- srcRegFilt1_1 = _mm_maddubs_epi16(srcRegFilt1_1,
- _mm256_castsi256_si128(firstFilters));
- srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2,
- _mm256_castsi256_si128(forthFilters));
-
- // add and saturate the results together
- srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, srcRegFilt2);
-
- // filter the source buffer
- srcRegFilt3= _mm_shuffle_epi8(srcReg1,
- _mm256_castsi256_si128(filt2Reg));
- srcRegFilt2= _mm_shuffle_epi8(srcReg1,
- _mm256_castsi256_si128(filt3Reg));
-
- // multiply 2 adjacent elements with the filter and add the result
- srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3,
- _mm256_castsi256_si128(secondFilters));
- srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2,
- _mm256_castsi256_si128(thirdFilters));
-
- // add and saturate the results together
- srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1,
- _mm_min_epi16(srcRegFilt3, srcRegFilt2));
-
- // reading the next 16 bytes
- // (part of it was being read by earlier read)
- srcReg2 = _mm_loadu_si128((const __m128i *)(src_ptr + 5));
-
- // add and saturate the results together
- srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1,
- _mm_max_epi16(srcRegFilt3, srcRegFilt2));
-
- // filter the source buffer
- srcRegFilt2_1 = _mm_shuffle_epi8(srcReg2,
- _mm256_castsi256_si128(filt1Reg));
- srcRegFilt2 = _mm_shuffle_epi8(srcReg2,
- _mm256_castsi256_si128(filt4Reg));
-
- // multiply 2 adjacent elements with the filter and add the result
- srcRegFilt2_1 = _mm_maddubs_epi16(srcRegFilt2_1,
- _mm256_castsi256_si128(firstFilters));
- srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2,
- _mm256_castsi256_si128(forthFilters));
-
- // add and saturate the results together
- srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, srcRegFilt2);
-
- // filter the source buffer
- srcRegFilt3 = _mm_shuffle_epi8(srcReg2,
- _mm256_castsi256_si128(filt2Reg));
- srcRegFilt2 = _mm_shuffle_epi8(srcReg2,
- _mm256_castsi256_si128(filt3Reg));
-
- // multiply 2 adjacent elements with the filter and add the result
- srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3,
- _mm256_castsi256_si128(secondFilters));
- srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2,
- _mm256_castsi256_si128(thirdFilters));
-
- // add and saturate the results together
- srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1,
- _mm_min_epi16(srcRegFilt3, srcRegFilt2));
- srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1,
- _mm_max_epi16(srcRegFilt3, srcRegFilt2));
-
-
- srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1,
- _mm256_castsi256_si128(addFilterReg64));
-
- srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1,
- _mm256_castsi256_si128(addFilterReg64));
-
- // shift by 7 bit each 16 bit
- srcRegFilt1_1 = _mm_srai_epi16(srcRegFilt1_1, 7);
- srcRegFilt2_1 = _mm_srai_epi16(srcRegFilt2_1, 7);
-
- // shrink to 8 bit each 16 bits, the first lane contain the first
- // convolve result and the second lane contain the second convolve
- // result
- srcRegFilt1_1 = _mm_packus_epi16(srcRegFilt1_1, srcRegFilt2_1);
-
- // save 16 bytes
- _mm_store_si128((__m128i*)output_ptr, srcRegFilt1_1);
- }
-}
-
-static void vpx_filter_block1d16_v8_avx2(const uint8_t *src_ptr,
- ptrdiff_t src_pitch,
- uint8_t *output_ptr,
- ptrdiff_t out_pitch,
- uint32_t output_height,
- const int16_t *filter) {
- __m128i filtersReg;
- __m256i addFilterReg64;
- __m256i srcReg32b1, srcReg32b2, srcReg32b3, srcReg32b4, srcReg32b5;
- __m256i srcReg32b6, srcReg32b7, srcReg32b8, srcReg32b9, srcReg32b10;
- __m256i srcReg32b11, srcReg32b12, filtersReg32;
- __m256i firstFilters, secondFilters, thirdFilters, forthFilters;
- unsigned int i;
- ptrdiff_t src_stride, dst_stride;
-
- // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
- addFilterReg64 = _mm256_set1_epi32((int)0x0400040u);
- filtersReg = _mm_loadu_si128((const __m128i *)filter);
- // converting the 16 bit (short) to 8 bit (byte) and have the
- // same data in both lanes of 128 bit register.
- filtersReg =_mm_packs_epi16(filtersReg, filtersReg);
- // have the same data in both lanes of a 256 bit register
- filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg);
-
- // duplicate only the first 16 bits (first and second byte)
- // across 256 bit register
- firstFilters = _mm256_shuffle_epi8(filtersReg32,
- _mm256_set1_epi16(0x100u));
- // duplicate only the second 16 bits (third and forth byte)
- // across 256 bit register
- secondFilters = _mm256_shuffle_epi8(filtersReg32,
- _mm256_set1_epi16(0x302u));
- // duplicate only the third 16 bits (fifth and sixth byte)
- // across 256 bit register
- thirdFilters = _mm256_shuffle_epi8(filtersReg32,
- _mm256_set1_epi16(0x504u));
- // duplicate only the forth 16 bits (seventh and eighth byte)
- // across 256 bit register
- forthFilters = _mm256_shuffle_epi8(filtersReg32,
- _mm256_set1_epi16(0x706u));
-
- // multiple the size of the source and destination stride by two
- src_stride = src_pitch << 1;
- dst_stride = out_pitch << 1;
-
- // load 16 bytes 7 times in stride of src_pitch
- srcReg32b1 = _mm256_castsi128_si256(
- _mm_loadu_si128((const __m128i *)(src_ptr)));
- srcReg32b2 = _mm256_castsi128_si256(
- _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch)));
- srcReg32b3 = _mm256_castsi128_si256(
- _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 2)));
- srcReg32b4 = _mm256_castsi128_si256(
- _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 3)));
- srcReg32b5 = _mm256_castsi128_si256(
- _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 4)));
- srcReg32b6 = _mm256_castsi128_si256(
- _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 5)));
- srcReg32b7 = _mm256_castsi128_si256(
- _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 6)));
-
- // have each consecutive loads on the same 256 register
- srcReg32b1 = _mm256_inserti128_si256(srcReg32b1,
- _mm256_castsi256_si128(srcReg32b2), 1);
- srcReg32b2 = _mm256_inserti128_si256(srcReg32b2,
- _mm256_castsi256_si128(srcReg32b3), 1);
- srcReg32b3 = _mm256_inserti128_si256(srcReg32b3,
- _mm256_castsi256_si128(srcReg32b4), 1);
- srcReg32b4 = _mm256_inserti128_si256(srcReg32b4,
- _mm256_castsi256_si128(srcReg32b5), 1);
- srcReg32b5 = _mm256_inserti128_si256(srcReg32b5,
- _mm256_castsi256_si128(srcReg32b6), 1);
- srcReg32b6 = _mm256_inserti128_si256(srcReg32b6,
- _mm256_castsi256_si128(srcReg32b7), 1);
-
- // merge every two consecutive registers except the last one
- srcReg32b10 = _mm256_unpacklo_epi8(srcReg32b1, srcReg32b2);
- srcReg32b1 = _mm256_unpackhi_epi8(srcReg32b1, srcReg32b2);
-
- // save
- srcReg32b11 = _mm256_unpacklo_epi8(srcReg32b3, srcReg32b4);
-
- // save
- srcReg32b3 = _mm256_unpackhi_epi8(srcReg32b3, srcReg32b4);
-
- // save
- srcReg32b2 = _mm256_unpacklo_epi8(srcReg32b5, srcReg32b6);
-
- // save
- srcReg32b5 = _mm256_unpackhi_epi8(srcReg32b5, srcReg32b6);
-
-
- for (i = output_height; i > 1; i-=2) {
- // load the last 2 loads of 16 bytes and have every two
- // consecutive loads in the same 256 bit register
- srcReg32b8 = _mm256_castsi128_si256(
- _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 7)));
- srcReg32b7 = _mm256_inserti128_si256(srcReg32b7,
- _mm256_castsi256_si128(srcReg32b8), 1);
- srcReg32b9 = _mm256_castsi128_si256(
- _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 8)));
- srcReg32b8 = _mm256_inserti128_si256(srcReg32b8,
- _mm256_castsi256_si128(srcReg32b9), 1);
-
- // merge every two consecutive registers
- // save
- srcReg32b4 = _mm256_unpacklo_epi8(srcReg32b7, srcReg32b8);
- srcReg32b7 = _mm256_unpackhi_epi8(srcReg32b7, srcReg32b8);
-
- // multiply 2 adjacent elements with the filter and add the result
- srcReg32b10 = _mm256_maddubs_epi16(srcReg32b10, firstFilters);
- srcReg32b6 = _mm256_maddubs_epi16(srcReg32b4, forthFilters);
-
- // add and saturate the results together
- srcReg32b10 = _mm256_adds_epi16(srcReg32b10, srcReg32b6);
-
- // multiply 2 adjacent elements with the filter and add the result
- srcReg32b8 = _mm256_maddubs_epi16(srcReg32b11, secondFilters);
- srcReg32b12 = _mm256_maddubs_epi16(srcReg32b2, thirdFilters);
-
- // add and saturate the results together
- srcReg32b10 = _mm256_adds_epi16(srcReg32b10,
- _mm256_min_epi16(srcReg32b8, srcReg32b12));
- srcReg32b10 = _mm256_adds_epi16(srcReg32b10,
- _mm256_max_epi16(srcReg32b8, srcReg32b12));
-
- // multiply 2 adjacent elements with the filter and add the result
- srcReg32b1 = _mm256_maddubs_epi16(srcReg32b1, firstFilters);
- srcReg32b6 = _mm256_maddubs_epi16(srcReg32b7, forthFilters);
-
- srcReg32b1 = _mm256_adds_epi16(srcReg32b1, srcReg32b6);
-
- // multiply 2 adjacent elements with the filter and add the result
- srcReg32b8 = _mm256_maddubs_epi16(srcReg32b3, secondFilters);
- srcReg32b12 = _mm256_maddubs_epi16(srcReg32b5, thirdFilters);
-
- // add and saturate the results together
- srcReg32b1 = _mm256_adds_epi16(srcReg32b1,
- _mm256_min_epi16(srcReg32b8, srcReg32b12));
- srcReg32b1 = _mm256_adds_epi16(srcReg32b1,
- _mm256_max_epi16(srcReg32b8, srcReg32b12));
-
- srcReg32b10 = _mm256_adds_epi16(srcReg32b10, addFilterReg64);
- srcReg32b1 = _mm256_adds_epi16(srcReg32b1, addFilterReg64);
-
- // shift by 7 bit each 16 bit
- srcReg32b10 = _mm256_srai_epi16(srcReg32b10, 7);
- srcReg32b1 = _mm256_srai_epi16(srcReg32b1, 7);
-
- // shrink to 8 bit each 16 bits, the first lane contain the first
- // convolve result and the second lane contain the second convolve
- // result
- srcReg32b1 = _mm256_packus_epi16(srcReg32b10, srcReg32b1);
-
- src_ptr+=src_stride;
-
- // save 16 bytes
- _mm_store_si128((__m128i*)output_ptr,
- _mm256_castsi256_si128(srcReg32b1));
-
- // save the next 16 bits
- _mm_store_si128((__m128i*)(output_ptr+out_pitch),
- _mm256_extractf128_si256(srcReg32b1, 1));
-
- output_ptr+=dst_stride;
-
- // save part of the registers for next strides
- srcReg32b10 = srcReg32b11;
- srcReg32b1 = srcReg32b3;
- srcReg32b11 = srcReg32b2;
- srcReg32b3 = srcReg32b5;
- srcReg32b2 = srcReg32b4;
- srcReg32b5 = srcReg32b7;
- srcReg32b7 = srcReg32b9;
- }
- if (i > 0) {
- __m128i srcRegFilt1, srcRegFilt3, srcRegFilt4, srcRegFilt5;
- __m128i srcRegFilt6, srcRegFilt7, srcRegFilt8;
- // load the last 16 bytes
- srcRegFilt8 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 7));
-
- // merge the last 2 results together
- srcRegFilt4 = _mm_unpacklo_epi8(
- _mm256_castsi256_si128(srcReg32b7), srcRegFilt8);
- srcRegFilt7 = _mm_unpackhi_epi8(
- _mm256_castsi256_si128(srcReg32b7), srcRegFilt8);
-
- // multiply 2 adjacent elements with the filter and add the result
- srcRegFilt1 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b10),
- _mm256_castsi256_si128(firstFilters));
- srcRegFilt4 = _mm_maddubs_epi16(srcRegFilt4,
- _mm256_castsi256_si128(forthFilters));
- srcRegFilt3 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b1),
- _mm256_castsi256_si128(firstFilters));
- srcRegFilt7 = _mm_maddubs_epi16(srcRegFilt7,
- _mm256_castsi256_si128(forthFilters));
-
- // add and saturate the results together
- srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4);
- srcRegFilt3 = _mm_adds_epi16(srcRegFilt3, srcRegFilt7);
-
-
- // multiply 2 adjacent elements with the filter and add the result
- srcRegFilt4 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b11),
- _mm256_castsi256_si128(secondFilters));
- srcRegFilt5 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b3),
- _mm256_castsi256_si128(secondFilters));
-
- // multiply 2 adjacent elements with the filter and add the result
- srcRegFilt6 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b2),
- _mm256_castsi256_si128(thirdFilters));
- srcRegFilt7 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b5),
- _mm256_castsi256_si128(thirdFilters));
-
- // add and saturate the results together
- srcRegFilt1 = _mm_adds_epi16(srcRegFilt1,
- _mm_min_epi16(srcRegFilt4, srcRegFilt6));
- srcRegFilt3 = _mm_adds_epi16(srcRegFilt3,
- _mm_min_epi16(srcRegFilt5, srcRegFilt7));
-
- // add and saturate the results together
- srcRegFilt1 = _mm_adds_epi16(srcRegFilt1,
- _mm_max_epi16(srcRegFilt4, srcRegFilt6));
- srcRegFilt3 = _mm_adds_epi16(srcRegFilt3,
- _mm_max_epi16(srcRegFilt5, srcRegFilt7));
-
-
- srcRegFilt1 = _mm_adds_epi16(srcRegFilt1,
- _mm256_castsi256_si128(addFilterReg64));
- srcRegFilt3 = _mm_adds_epi16(srcRegFilt3,
- _mm256_castsi256_si128(addFilterReg64));
-
- // shift by 7 bit each 16 bit
- srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7);
- srcRegFilt3 = _mm_srai_epi16(srcRegFilt3, 7);
-
- // shrink to 8 bit each 16 bits, the first lane contain the first
- // convolve result and the second lane contain the second convolve
- // result
- srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt3);
-
- // save 16 bytes
- _mm_store_si128((__m128i*)output_ptr, srcRegFilt1);
- }
-}
-
-#if HAVE_AVX2 && HAVE_SSSE3
-filter8_1dfunction vpx_filter_block1d4_v8_ssse3;
-#if ARCH_X86_64
-filter8_1dfunction vpx_filter_block1d8_v8_intrin_ssse3;
-filter8_1dfunction vpx_filter_block1d8_h8_intrin_ssse3;
-filter8_1dfunction vpx_filter_block1d4_h8_intrin_ssse3;
-#define vpx_filter_block1d8_v8_avx2 vpx_filter_block1d8_v8_intrin_ssse3
-#define vpx_filter_block1d8_h8_avx2 vpx_filter_block1d8_h8_intrin_ssse3
-#define vpx_filter_block1d4_h8_avx2 vpx_filter_block1d4_h8_intrin_ssse3
-#else // ARCH_X86
-filter8_1dfunction vpx_filter_block1d8_v8_ssse3;
-filter8_1dfunction vpx_filter_block1d8_h8_ssse3;
-filter8_1dfunction vpx_filter_block1d4_h8_ssse3;
-#define vpx_filter_block1d8_v8_avx2 vpx_filter_block1d8_v8_ssse3
-#define vpx_filter_block1d8_h8_avx2 vpx_filter_block1d8_h8_ssse3
-#define vpx_filter_block1d4_h8_avx2 vpx_filter_block1d4_h8_ssse3
-#endif // ARCH_X86_64
-filter8_1dfunction vpx_filter_block1d16_v2_ssse3;
-filter8_1dfunction vpx_filter_block1d16_h2_ssse3;
-filter8_1dfunction vpx_filter_block1d8_v2_ssse3;
-filter8_1dfunction vpx_filter_block1d8_h2_ssse3;
-filter8_1dfunction vpx_filter_block1d4_v2_ssse3;
-filter8_1dfunction vpx_filter_block1d4_h2_ssse3;
-#define vpx_filter_block1d4_v8_avx2 vpx_filter_block1d4_v8_ssse3
-#define vpx_filter_block1d16_v2_avx2 vpx_filter_block1d16_v2_ssse3
-#define vpx_filter_block1d16_h2_avx2 vpx_filter_block1d16_h2_ssse3
-#define vpx_filter_block1d8_v2_avx2 vpx_filter_block1d8_v2_ssse3
-#define vpx_filter_block1d8_h2_avx2 vpx_filter_block1d8_h2_ssse3
-#define vpx_filter_block1d4_v2_avx2 vpx_filter_block1d4_v2_ssse3
-#define vpx_filter_block1d4_h2_avx2 vpx_filter_block1d4_h2_ssse3
-// void vpx_convolve8_horiz_avx2(const uint8_t *src, ptrdiff_t src_stride,
-// uint8_t *dst, ptrdiff_t dst_stride,
-// const int16_t *filter_x, int x_step_q4,
-// const int16_t *filter_y, int y_step_q4,
-// int w, int h);
-// void vpx_convolve8_vert_avx2(const uint8_t *src, ptrdiff_t src_stride,
-// uint8_t *dst, ptrdiff_t dst_stride,
-// const int16_t *filter_x, int x_step_q4,
-// const int16_t *filter_y, int y_step_q4,
-// int w, int h);
-FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , avx2);
-FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , avx2);
-
-// void vpx_convolve8_avx2(const uint8_t *src, ptrdiff_t src_stride,
-// uint8_t *dst, ptrdiff_t dst_stride,
-// const int16_t *filter_x, int x_step_q4,
-// const int16_t *filter_y, int y_step_q4,
-// int w, int h);
-FUN_CONV_2D(, avx2);
-#endif // HAVE_AX2 && HAVE_SSSE3
diff --git a/thirdparty/libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c b/thirdparty/libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c
deleted file mode 100644
index 6fd52087c7..0000000000
--- a/thirdparty/libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c
+++ /dev/null
@@ -1,915 +0,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-// Due to a header conflict between math.h and intrinsics includes with ceil()
-// in certain configurations under vs9 this include needs to precede
-// tmmintrin.h.
-
-#include <tmmintrin.h>
-
-#include "./vpx_dsp_rtcd.h"
-#include "vpx_dsp/vpx_filter.h"
-#include "vpx_dsp/x86/convolve.h"
-#include "vpx_mem/vpx_mem.h"
-#include "vpx_ports/mem.h"
-#include "vpx_ports/emmintrin_compat.h"
-
-// filters only for the 4_h8 convolution
-DECLARE_ALIGNED(16, static const uint8_t, filt1_4_h8[16]) = {
- 0, 1, 1, 2, 2, 3, 3, 4, 2, 3, 3, 4, 4, 5, 5, 6
-};
-
-DECLARE_ALIGNED(16, static const uint8_t, filt2_4_h8[16]) = {
- 4, 5, 5, 6, 6, 7, 7, 8, 6, 7, 7, 8, 8, 9, 9, 10
-};
-
-// filters for 8_h8 and 16_h8
-DECLARE_ALIGNED(16, static const uint8_t, filt1_global[16]) = {
- 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8
-};
-
-DECLARE_ALIGNED(16, static const uint8_t, filt2_global[16]) = {
- 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10
-};
-
-DECLARE_ALIGNED(16, static const uint8_t, filt3_global[16]) = {
- 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12
-};
-
-DECLARE_ALIGNED(16, static const uint8_t, filt4_global[16]) = {
- 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14
-};
-
-// These are reused by the avx2 intrinsics.
-filter8_1dfunction vpx_filter_block1d8_v8_intrin_ssse3;
-filter8_1dfunction vpx_filter_block1d8_h8_intrin_ssse3;
-filter8_1dfunction vpx_filter_block1d4_h8_intrin_ssse3;
-
-void vpx_filter_block1d4_h8_intrin_ssse3(const uint8_t *src_ptr,
- ptrdiff_t src_pixels_per_line,
- uint8_t *output_ptr,
- ptrdiff_t output_pitch,
- uint32_t output_height,
- const int16_t *filter) {
- __m128i firstFilters, secondFilters, shuffle1, shuffle2;
- __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt4;
- __m128i addFilterReg64, filtersReg, srcReg, minReg;
- unsigned int i;
-
- // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
- addFilterReg64 =_mm_set1_epi32((int)0x0400040u);
- filtersReg = _mm_loadu_si128((const __m128i *)filter);
- // converting the 16 bit (short) to 8 bit (byte) and have the same data
- // in both lanes of 128 bit register.
- filtersReg =_mm_packs_epi16(filtersReg, filtersReg);
-
- // duplicate only the first 16 bits in the filter into the first lane
- firstFilters = _mm_shufflelo_epi16(filtersReg, 0);
- // duplicate only the third 16 bit in the filter into the first lane
- secondFilters = _mm_shufflelo_epi16(filtersReg, 0xAAu);
- // duplicate only the seconds 16 bits in the filter into the second lane
- // firstFilters: k0 k1 k0 k1 k0 k1 k0 k1 k2 k3 k2 k3 k2 k3 k2 k3
- firstFilters = _mm_shufflehi_epi16(firstFilters, 0x55u);
- // duplicate only the forth 16 bits in the filter into the second lane
- // secondFilters: k4 k5 k4 k5 k4 k5 k4 k5 k6 k7 k6 k7 k6 k7 k6 k7
- secondFilters = _mm_shufflehi_epi16(secondFilters, 0xFFu);
-
- // loading the local filters
- shuffle1 =_mm_load_si128((__m128i const *)filt1_4_h8);
- shuffle2 = _mm_load_si128((__m128i const *)filt2_4_h8);
-
- for (i = 0; i < output_height; i++) {
- srcReg = _mm_loadu_si128((const __m128i *)(src_ptr - 3));
-
- // filter the source buffer
- srcRegFilt1= _mm_shuffle_epi8(srcReg, shuffle1);
- srcRegFilt2= _mm_shuffle_epi8(srcReg, shuffle2);
-
- // multiply 2 adjacent elements with the filter and add the result
- srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters);
- srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, secondFilters);
-
- // extract the higher half of the lane
- srcRegFilt3 = _mm_srli_si128(srcRegFilt1, 8);
- srcRegFilt4 = _mm_srli_si128(srcRegFilt2, 8);
-
- minReg = _mm_min_epi16(srcRegFilt3, srcRegFilt2);
-
- // add and saturate all the results together
- srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4);
- srcRegFilt3 = _mm_max_epi16(srcRegFilt3, srcRegFilt2);
- srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, minReg);
- srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt3);
- srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64);
-
- // shift by 7 bit each 16 bits
- srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7);
-
- // shrink to 8 bit each 16 bits
- srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1);
- src_ptr+=src_pixels_per_line;
-
- // save only 4 bytes
- *((int*)&output_ptr[0])= _mm_cvtsi128_si32(srcRegFilt1);
-
- output_ptr+=output_pitch;
- }
-}
-
-void vpx_filter_block1d8_h8_intrin_ssse3(const uint8_t *src_ptr,
- ptrdiff_t src_pixels_per_line,
- uint8_t *output_ptr,
- ptrdiff_t output_pitch,
- uint32_t output_height,
- const int16_t *filter) {
- __m128i firstFilters, secondFilters, thirdFilters, forthFilters, srcReg;
- __m128i filt1Reg, filt2Reg, filt3Reg, filt4Reg;
- __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt4;
- __m128i addFilterReg64, filtersReg, minReg;
- unsigned int i;
-
- // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
- addFilterReg64 = _mm_set1_epi32((int)0x0400040u);
- filtersReg = _mm_loadu_si128((const __m128i *)filter);
- // converting the 16 bit (short) to 8 bit (byte) and have the same data
- // in both lanes of 128 bit register.
- filtersReg =_mm_packs_epi16(filtersReg, filtersReg);
-
- // duplicate only the first 16 bits (first and second byte)
- // across 128 bit register
- firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u));
- // duplicate only the second 16 bits (third and forth byte)
- // across 128 bit register
- secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u));
- // duplicate only the third 16 bits (fifth and sixth byte)
- // across 128 bit register
- thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u));
- // duplicate only the forth 16 bits (seventh and eighth byte)
- // across 128 bit register
- forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u));
-
- filt1Reg = _mm_load_si128((__m128i const *)filt1_global);
- filt2Reg = _mm_load_si128((__m128i const *)filt2_global);
- filt3Reg = _mm_load_si128((__m128i const *)filt3_global);
- filt4Reg = _mm_load_si128((__m128i const *)filt4_global);
-
- for (i = 0; i < output_height; i++) {
- srcReg = _mm_loadu_si128((const __m128i *)(src_ptr - 3));
-
- // filter the source buffer
- srcRegFilt1= _mm_shuffle_epi8(srcReg, filt1Reg);
- srcRegFilt2= _mm_shuffle_epi8(srcReg, filt2Reg);
-
- // multiply 2 adjacent elements with the filter and add the result
- srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters);
- srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, secondFilters);
-
- // filter the source buffer
- srcRegFilt3= _mm_shuffle_epi8(srcReg, filt3Reg);
- srcRegFilt4= _mm_shuffle_epi8(srcReg, filt4Reg);
-
- // multiply 2 adjacent elements with the filter and add the result
- srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, thirdFilters);
- srcRegFilt4 = _mm_maddubs_epi16(srcRegFilt4, forthFilters);
-
- // add and saturate all the results together
- minReg = _mm_min_epi16(srcRegFilt2, srcRegFilt3);
- srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4);
-
- srcRegFilt2= _mm_max_epi16(srcRegFilt2, srcRegFilt3);
- srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, minReg);
- srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt2);
- srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64);
-
- // shift by 7 bit each 16 bits
- srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7);
-
- // shrink to 8 bit each 16 bits
- srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1);
-
- src_ptr+=src_pixels_per_line;
-
- // save only 8 bytes
- _mm_storel_epi64((__m128i*)&output_ptr[0], srcRegFilt1);
-
- output_ptr+=output_pitch;
- }
-}
-
-void vpx_filter_block1d8_v8_intrin_ssse3(const uint8_t *src_ptr,
- ptrdiff_t src_pitch,
- uint8_t *output_ptr,
- ptrdiff_t out_pitch,
- uint32_t output_height,
- const int16_t *filter) {
- __m128i addFilterReg64, filtersReg, minReg;
- __m128i firstFilters, secondFilters, thirdFilters, forthFilters;
- __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt5;
- __m128i srcReg1, srcReg2, srcReg3, srcReg4, srcReg5, srcReg6, srcReg7;
- __m128i srcReg8;
- unsigned int i;
-
- // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
- addFilterReg64 = _mm_set1_epi32((int)0x0400040u);
- filtersReg = _mm_loadu_si128((const __m128i *)filter);
- // converting the 16 bit (short) to 8 bit (byte) and have the same data
- // in both lanes of 128 bit register.
- filtersReg =_mm_packs_epi16(filtersReg, filtersReg);
-
- // duplicate only the first 16 bits in the filter
- firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u));
- // duplicate only the second 16 bits in the filter
- secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u));
- // duplicate only the third 16 bits in the filter
- thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u));
- // duplicate only the forth 16 bits in the filter
- forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u));
-
- // load the first 7 rows of 8 bytes
- srcReg1 = _mm_loadl_epi64((const __m128i *)src_ptr);
- srcReg2 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch));
- srcReg3 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 2));
- srcReg4 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 3));
- srcReg5 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 4));
- srcReg6 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 5));
- srcReg7 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 6));
-
- for (i = 0; i < output_height; i++) {
- // load the last 8 bytes
- srcReg8 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 7));
-
- // merge the result together
- srcRegFilt1 = _mm_unpacklo_epi8(srcReg1, srcReg2);
- srcRegFilt3 = _mm_unpacklo_epi8(srcReg3, srcReg4);
-
- // merge the result together
- srcRegFilt2 = _mm_unpacklo_epi8(srcReg5, srcReg6);
- srcRegFilt5 = _mm_unpacklo_epi8(srcReg7, srcReg8);
-
- // multiply 2 adjacent elements with the filter and add the result
- srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters);
- srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, secondFilters);
- srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, thirdFilters);
- srcRegFilt5 = _mm_maddubs_epi16(srcRegFilt5, forthFilters);
-
- // add and saturate the results together
- minReg = _mm_min_epi16(srcRegFilt2, srcRegFilt3);
- srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt5);
- srcRegFilt2 = _mm_max_epi16(srcRegFilt2, srcRegFilt3);
- srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, minReg);
- srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt2);
- srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64);
-
- // shift by 7 bit each 16 bit
- srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7);
-
- // shrink to 8 bit each 16 bits
- srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1);
-
- src_ptr+=src_pitch;
-
- // shift down a row
- srcReg1 = srcReg2;
- srcReg2 = srcReg3;
- srcReg3 = srcReg4;
- srcReg4 = srcReg5;
- srcReg5 = srcReg6;
- srcReg6 = srcReg7;
- srcReg7 = srcReg8;
-
- // save only 8 bytes convolve result
- _mm_storel_epi64((__m128i*)&output_ptr[0], srcRegFilt1);
-
- output_ptr+=out_pitch;
- }
-}
-
-filter8_1dfunction vpx_filter_block1d16_v8_ssse3;
-filter8_1dfunction vpx_filter_block1d16_h8_ssse3;
-filter8_1dfunction vpx_filter_block1d8_v8_ssse3;
-filter8_1dfunction vpx_filter_block1d8_h8_ssse3;
-filter8_1dfunction vpx_filter_block1d4_v8_ssse3;
-filter8_1dfunction vpx_filter_block1d4_h8_ssse3;
-filter8_1dfunction vpx_filter_block1d16_v8_avg_ssse3;
-filter8_1dfunction vpx_filter_block1d16_h8_avg_ssse3;
-filter8_1dfunction vpx_filter_block1d8_v8_avg_ssse3;
-filter8_1dfunction vpx_filter_block1d8_h8_avg_ssse3;
-filter8_1dfunction vpx_filter_block1d4_v8_avg_ssse3;
-filter8_1dfunction vpx_filter_block1d4_h8_avg_ssse3;
-
-filter8_1dfunction vpx_filter_block1d16_v2_ssse3;
-filter8_1dfunction vpx_filter_block1d16_h2_ssse3;
-filter8_1dfunction vpx_filter_block1d8_v2_ssse3;
-filter8_1dfunction vpx_filter_block1d8_h2_ssse3;
-filter8_1dfunction vpx_filter_block1d4_v2_ssse3;
-filter8_1dfunction vpx_filter_block1d4_h2_ssse3;
-filter8_1dfunction vpx_filter_block1d16_v2_avg_ssse3;
-filter8_1dfunction vpx_filter_block1d16_h2_avg_ssse3;
-filter8_1dfunction vpx_filter_block1d8_v2_avg_ssse3;
-filter8_1dfunction vpx_filter_block1d8_h2_avg_ssse3;
-filter8_1dfunction vpx_filter_block1d4_v2_avg_ssse3;
-filter8_1dfunction vpx_filter_block1d4_h2_avg_ssse3;
-
-// void vpx_convolve8_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride,
-// uint8_t *dst, ptrdiff_t dst_stride,
-// const int16_t *filter_x, int x_step_q4,
-// const int16_t *filter_y, int y_step_q4,
-// int w, int h);
-// void vpx_convolve8_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride,
-// uint8_t *dst, ptrdiff_t dst_stride,
-// const int16_t *filter_x, int x_step_q4,
-// const int16_t *filter_y, int y_step_q4,
-// int w, int h);
-// void vpx_convolve8_avg_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride,
-// uint8_t *dst, ptrdiff_t dst_stride,
-// const int16_t *filter_x, int x_step_q4,
-// const int16_t *filter_y, int y_step_q4,
-// int w, int h);
-// void vpx_convolve8_avg_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride,
-// uint8_t *dst, ptrdiff_t dst_stride,
-// const int16_t *filter_x, int x_step_q4,
-// const int16_t *filter_y, int y_step_q4,
-// int w, int h);
-FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , ssse3);
-FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , ssse3);
-FUN_CONV_1D(avg_horiz, x_step_q4, filter_x, h, src, avg_, ssse3);
-FUN_CONV_1D(avg_vert, y_step_q4, filter_y, v, src - src_stride * 3, avg_,
- ssse3);
-
-#define TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, \
- out0, out1, out2, out3, out4, out5, out6, out7) { \
- const __m128i tr0_0 = _mm_unpacklo_epi8(in0, in1); \
- const __m128i tr0_1 = _mm_unpacklo_epi8(in2, in3); \
- const __m128i tr0_2 = _mm_unpacklo_epi8(in4, in5); \
- const __m128i tr0_3 = _mm_unpacklo_epi8(in6, in7); \
- \
- const __m128i tr1_0 = _mm_unpacklo_epi16(tr0_0, tr0_1); \
- const __m128i tr1_1 = _mm_unpackhi_epi16(tr0_0, tr0_1); \
- const __m128i tr1_2 = _mm_unpacklo_epi16(tr0_2, tr0_3); \
- const __m128i tr1_3 = _mm_unpackhi_epi16(tr0_2, tr0_3); \
- \
- const __m128i tr2_0 = _mm_unpacklo_epi32(tr1_0, tr1_2); \
- const __m128i tr2_1 = _mm_unpackhi_epi32(tr1_0, tr1_2); \
- const __m128i tr2_2 = _mm_unpacklo_epi32(tr1_1, tr1_3); \
- const __m128i tr2_3 = _mm_unpackhi_epi32(tr1_1, tr1_3); \
- \
- out0 = _mm_unpacklo_epi64(tr2_0, tr2_0); \
- out1 = _mm_unpackhi_epi64(tr2_0, tr2_0); \
- out2 = _mm_unpacklo_epi64(tr2_1, tr2_1); \
- out3 = _mm_unpackhi_epi64(tr2_1, tr2_1); \
- out4 = _mm_unpacklo_epi64(tr2_2, tr2_2); \
- out5 = _mm_unpackhi_epi64(tr2_2, tr2_2); \
- out6 = _mm_unpacklo_epi64(tr2_3, tr2_3); \
- out7 = _mm_unpackhi_epi64(tr2_3, tr2_3); \
-}
-
-static void filter_horiz_w8_ssse3(const uint8_t *src_x, ptrdiff_t src_pitch,
- uint8_t *dst, const int16_t *x_filter) {
- const __m128i k_256 = _mm_set1_epi16(1 << 8);
- const __m128i f_values = _mm_load_si128((const __m128i *)x_filter);
- // pack and duplicate the filter values
- const __m128i f1f0 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0200u));
- const __m128i f3f2 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0604u));
- const __m128i f5f4 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0a08u));
- const __m128i f7f6 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0e0cu));
- const __m128i A = _mm_loadl_epi64((const __m128i *)src_x);
- const __m128i B = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch));
- const __m128i C = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch * 2));
- const __m128i D = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch * 3));
- const __m128i E = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch * 4));
- const __m128i F = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch * 5));
- const __m128i G = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch * 6));
- const __m128i H = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch * 7));
- // 00 01 10 11 02 03 12 13 04 05 14 15 06 07 16 17
- const __m128i tr0_0 = _mm_unpacklo_epi16(A, B);
- // 20 21 30 31 22 23 32 33 24 25 34 35 26 27 36 37
- const __m128i tr0_1 = _mm_unpacklo_epi16(C, D);
- // 40 41 50 51 42 43 52 53 44 45 54 55 46 47 56 57
- const __m128i tr0_2 = _mm_unpacklo_epi16(E, F);
- // 60 61 70 71 62 63 72 73 64 65 74 75 66 67 76 77
- const __m128i tr0_3 = _mm_unpacklo_epi16(G, H);
- // 00 01 10 11 20 21 30 31 02 03 12 13 22 23 32 33
- const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
- // 04 05 14 15 24 25 34 35 06 07 16 17 26 27 36 37
- const __m128i tr1_1 = _mm_unpackhi_epi32(tr0_0, tr0_1);
- // 40 41 50 51 60 61 70 71 42 43 52 53 62 63 72 73
- const __m128i tr1_2 = _mm_unpacklo_epi32(tr0_2, tr0_3);
- // 44 45 54 55 64 65 74 75 46 47 56 57 66 67 76 77
- const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3);
- // 00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71
- const __m128i s1s0 = _mm_unpacklo_epi64(tr1_0, tr1_2);
- const __m128i s3s2 = _mm_unpackhi_epi64(tr1_0, tr1_2);
- const __m128i s5s4 = _mm_unpacklo_epi64(tr1_1, tr1_3);
- const __m128i s7s6 = _mm_unpackhi_epi64(tr1_1, tr1_3);
- // multiply 2 adjacent elements with the filter and add the result
- const __m128i x0 = _mm_maddubs_epi16(s1s0, f1f0);
- const __m128i x1 = _mm_maddubs_epi16(s3s2, f3f2);
- const __m128i x2 = _mm_maddubs_epi16(s5s4, f5f4);
- const __m128i x3 = _mm_maddubs_epi16(s7s6, f7f6);
- // add and saturate the results together
- const __m128i min_x2x1 = _mm_min_epi16(x2, x1);
- const __m128i max_x2x1 = _mm_max_epi16(x2, x1);
- __m128i temp = _mm_adds_epi16(x0, x3);
- temp = _mm_adds_epi16(temp, min_x2x1);
- temp = _mm_adds_epi16(temp, max_x2x1);
- // round and shift by 7 bit each 16 bit
- temp = _mm_mulhrs_epi16(temp, k_256);
- // shrink to 8 bit each 16 bits
- temp = _mm_packus_epi16(temp, temp);
- // save only 8 bytes convolve result
- _mm_storel_epi64((__m128i*)dst, temp);
-}
-
-static void transpose8x8_to_dst(const uint8_t *src, ptrdiff_t src_stride,
- uint8_t *dst, ptrdiff_t dst_stride) {
- __m128i A, B, C, D, E, F, G, H;
-
- A = _mm_loadl_epi64((const __m128i *)src);
- B = _mm_loadl_epi64((const __m128i *)(src + src_stride));
- C = _mm_loadl_epi64((const __m128i *)(src + src_stride * 2));
- D = _mm_loadl_epi64((const __m128i *)(src + src_stride * 3));
- E = _mm_loadl_epi64((const __m128i *)(src + src_stride * 4));
- F = _mm_loadl_epi64((const __m128i *)(src + src_stride * 5));
- G = _mm_loadl_epi64((const __m128i *)(src + src_stride * 6));
- H = _mm_loadl_epi64((const __m128i *)(src + src_stride * 7));
-
- TRANSPOSE_8X8(A, B, C, D, E, F, G, H,
- A, B, C, D, E, F, G, H);
-
- _mm_storel_epi64((__m128i*)dst, A);
- _mm_storel_epi64((__m128i*)(dst + dst_stride * 1), B);
- _mm_storel_epi64((__m128i*)(dst + dst_stride * 2), C);
- _mm_storel_epi64((__m128i*)(dst + dst_stride * 3), D);
- _mm_storel_epi64((__m128i*)(dst + dst_stride * 4), E);
- _mm_storel_epi64((__m128i*)(dst + dst_stride * 5), F);
- _mm_storel_epi64((__m128i*)(dst + dst_stride * 6), G);
- _mm_storel_epi64((__m128i*)(dst + dst_stride * 7), H);
-}
-
-static void scaledconvolve_horiz_w8(const uint8_t *src, ptrdiff_t src_stride,
- uint8_t *dst, ptrdiff_t dst_stride,
- const InterpKernel *x_filters,
- int x0_q4, int x_step_q4, int w, int h) {
- DECLARE_ALIGNED(16, uint8_t, temp[8 * 8]);
- int x, y, z;
- src -= SUBPEL_TAPS / 2 - 1;
-
- // This function processes 8x8 areas. The intermediate height is not always
- // a multiple of 8, so force it to be a multiple of 8 here.
- y = h + (8 - (h & 0x7));
-
- do {
- int x_q4 = x0_q4;
- for (x = 0; x < w; x += 8) {
- // process 8 src_x steps
- for (z = 0; z < 8; ++z) {
- const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
- const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
- if (x_q4 & SUBPEL_MASK) {
- filter_horiz_w8_ssse3(src_x, src_stride, temp + (z * 8), x_filter);
- } else {
- int i;
- for (i = 0; i < 8; ++i) {
- temp[z * 8 + i] = src_x[i * src_stride + 3];
- }
- }
- x_q4 += x_step_q4;
- }
-
- // transpose the 8x8 filters values back to dst
- transpose8x8_to_dst(temp, 8, dst + x, dst_stride);
- }
-
- src += src_stride * 8;
- dst += dst_stride * 8;
- } while (y -= 8);
-}
-
-static void filter_horiz_w4_ssse3(const uint8_t *src_ptr, ptrdiff_t src_pitch,
- uint8_t *dst, const int16_t *filter) {
- const __m128i k_256 = _mm_set1_epi16(1 << 8);
- const __m128i f_values = _mm_load_si128((const __m128i *)filter);
- // pack and duplicate the filter values
- const __m128i f1f0 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0200u));
- const __m128i f3f2 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0604u));
- const __m128i f5f4 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0a08u));
- const __m128i f7f6 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0e0cu));
- const __m128i A = _mm_loadl_epi64((const __m128i *)src_ptr);
- const __m128i B = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch));
- const __m128i C = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 2));
- const __m128i D = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 3));
- // TRANSPOSE...
- // 00 01 02 03 04 05 06 07
- // 10 11 12 13 14 15 16 17
- // 20 21 22 23 24 25 26 27
- // 30 31 32 33 34 35 36 37
- //
- // TO
- //
- // 00 10 20 30
- // 01 11 21 31
- // 02 12 22 32
- // 03 13 23 33
- // 04 14 24 34
- // 05 15 25 35
- // 06 16 26 36
- // 07 17 27 37
- //
- // 00 01 10 11 02 03 12 13 04 05 14 15 06 07 16 17
- const __m128i tr0_0 = _mm_unpacklo_epi16(A, B);
- // 20 21 30 31 22 23 32 33 24 25 34 35 26 27 36 37
- const __m128i tr0_1 = _mm_unpacklo_epi16(C, D);
- // 00 01 10 11 20 21 30 31 02 03 12 13 22 23 32 33
- const __m128i s1s0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
- // 04 05 14 15 24 25 34 35 06 07 16 17 26 27 36 37
- const __m128i s5s4 = _mm_unpackhi_epi32(tr0_0, tr0_1);
- // 02 03 12 13 22 23 32 33
- const __m128i s3s2 = _mm_srli_si128(s1s0, 8);
- // 06 07 16 17 26 27 36 37
- const __m128i s7s6 = _mm_srli_si128(s5s4, 8);
- // multiply 2 adjacent elements with the filter and add the result
- const __m128i x0 = _mm_maddubs_epi16(s1s0, f1f0);
- const __m128i x1 = _mm_maddubs_epi16(s3s2, f3f2);
- const __m128i x2 = _mm_maddubs_epi16(s5s4, f5f4);
- const __m128i x3 = _mm_maddubs_epi16(s7s6, f7f6);
- // add and saturate the results together
- const __m128i min_x2x1 = _mm_min_epi16(x2, x1);
- const __m128i max_x2x1 = _mm_max_epi16(x2, x1);
- __m128i temp = _mm_adds_epi16(x0, x3);
- temp = _mm_adds_epi16(temp, min_x2x1);
- temp = _mm_adds_epi16(temp, max_x2x1);
- // round and shift by 7 bit each 16 bit
- temp = _mm_mulhrs_epi16(temp, k_256);
- // shrink to 8 bit each 16 bits
- temp = _mm_packus_epi16(temp, temp);
- // save only 4 bytes
- *(int *)dst = _mm_cvtsi128_si32(temp);
-}
-
-static void transpose4x4_to_dst(const uint8_t *src, ptrdiff_t src_stride,
- uint8_t *dst, ptrdiff_t dst_stride) {
- __m128i A = _mm_cvtsi32_si128(*(const int *)src);
- __m128i B = _mm_cvtsi32_si128(*(const int *)(src + src_stride));
- __m128i C = _mm_cvtsi32_si128(*(const int *)(src + src_stride * 2));
- __m128i D = _mm_cvtsi32_si128(*(const int *)(src + src_stride * 3));
- // 00 10 01 11 02 12 03 13
- const __m128i tr0_0 = _mm_unpacklo_epi8(A, B);
- // 20 30 21 31 22 32 23 33
- const __m128i tr0_1 = _mm_unpacklo_epi8(C, D);
- // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
- A = _mm_unpacklo_epi16(tr0_0, tr0_1);
- B = _mm_srli_si128(A, 4);
- C = _mm_srli_si128(A, 8);
- D = _mm_srli_si128(A, 12);
-
- *(int *)(dst) = _mm_cvtsi128_si32(A);
- *(int *)(dst + dst_stride) = _mm_cvtsi128_si32(B);
- *(int *)(dst + dst_stride * 2) = _mm_cvtsi128_si32(C);
- *(int *)(dst + dst_stride * 3) = _mm_cvtsi128_si32(D);
-}
-
-static void scaledconvolve_horiz_w4(const uint8_t *src, ptrdiff_t src_stride,
- uint8_t *dst, ptrdiff_t dst_stride,
- const InterpKernel *x_filters,
- int x0_q4, int x_step_q4, int w, int h) {
- DECLARE_ALIGNED(16, uint8_t, temp[4 * 4]);
- int x, y, z;
- src -= SUBPEL_TAPS / 2 - 1;
-
- for (y = 0; y < h; y += 4) {
- int x_q4 = x0_q4;
- for (x = 0; x < w; x += 4) {
- // process 4 src_x steps
- for (z = 0; z < 4; ++z) {
- const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
- const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
- if (x_q4 & SUBPEL_MASK) {
- filter_horiz_w4_ssse3(src_x, src_stride, temp + (z * 4), x_filter);
- } else {
- int i;
- for (i = 0; i < 4; ++i) {
- temp[z * 4 + i] = src_x[i * src_stride + 3];
- }
- }
- x_q4 += x_step_q4;
- }
-
- // transpose the 4x4 filters values back to dst
- transpose4x4_to_dst(temp, 4, dst + x, dst_stride);
- }
-
- src += src_stride * 4;
- dst += dst_stride * 4;
- }
-}
-
-static void filter_vert_w4_ssse3(const uint8_t *src_ptr, ptrdiff_t src_pitch,
- uint8_t *dst, const int16_t *filter) {
- const __m128i k_256 = _mm_set1_epi16(1 << 8);
- const __m128i f_values = _mm_load_si128((const __m128i *)filter);
- // pack and duplicate the filter values
- const __m128i f1f0 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0200u));
- const __m128i f3f2 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0604u));
- const __m128i f5f4 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0a08u));
- const __m128i f7f6 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0e0cu));
- const __m128i A = _mm_cvtsi32_si128(*(const int *)src_ptr);
- const __m128i B = _mm_cvtsi32_si128(*(const int *)(src_ptr + src_pitch));
- const __m128i C = _mm_cvtsi32_si128(*(const int *)(src_ptr + src_pitch * 2));
- const __m128i D = _mm_cvtsi32_si128(*(const int *)(src_ptr + src_pitch * 3));
- const __m128i E = _mm_cvtsi32_si128(*(const int *)(src_ptr + src_pitch * 4));
- const __m128i F = _mm_cvtsi32_si128(*(const int *)(src_ptr + src_pitch * 5));
- const __m128i G = _mm_cvtsi32_si128(*(const int *)(src_ptr + src_pitch * 6));
- const __m128i H = _mm_cvtsi32_si128(*(const int *)(src_ptr + src_pitch * 7));
- const __m128i s1s0 = _mm_unpacklo_epi8(A, B);
- const __m128i s3s2 = _mm_unpacklo_epi8(C, D);
- const __m128i s5s4 = _mm_unpacklo_epi8(E, F);
- const __m128i s7s6 = _mm_unpacklo_epi8(G, H);
- // multiply 2 adjacent elements with the filter and add the result
- const __m128i x0 = _mm_maddubs_epi16(s1s0, f1f0);
- const __m128i x1 = _mm_maddubs_epi16(s3s2, f3f2);
- const __m128i x2 = _mm_maddubs_epi16(s5s4, f5f4);
- const __m128i x3 = _mm_maddubs_epi16(s7s6, f7f6);
- // add and saturate the results together
- const __m128i min_x2x1 = _mm_min_epi16(x2, x1);
- const __m128i max_x2x1 = _mm_max_epi16(x2, x1);
- __m128i temp = _mm_adds_epi16(x0, x3);
- temp = _mm_adds_epi16(temp, min_x2x1);
- temp = _mm_adds_epi16(temp, max_x2x1);
- // round and shift by 7 bit each 16 bit
- temp = _mm_mulhrs_epi16(temp, k_256);
- // shrink to 8 bit each 16 bits
- temp = _mm_packus_epi16(temp, temp);
- // save only 4 bytes
- *(int *)dst = _mm_cvtsi128_si32(temp);
-}
-
-static void scaledconvolve_vert_w4(const uint8_t *src, ptrdiff_t src_stride,
- uint8_t *dst, ptrdiff_t dst_stride,
- const InterpKernel *y_filters,
- int y0_q4, int y_step_q4, int w, int h) {
- int y;
- int y_q4 = y0_q4;
-
- src -= src_stride * (SUBPEL_TAPS / 2 - 1);
- for (y = 0; y < h; ++y) {
- const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
- const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
-
- if (y_q4 & SUBPEL_MASK) {
- filter_vert_w4_ssse3(src_y, src_stride, &dst[y * dst_stride], y_filter);
- } else {
- memcpy(&dst[y * dst_stride], &src_y[3 * src_stride], w);
- }
-
- y_q4 += y_step_q4;
- }
-}
-
-static void filter_vert_w8_ssse3(const uint8_t *src_ptr, ptrdiff_t src_pitch,
- uint8_t *dst, const int16_t *filter) {
- const __m128i k_256 = _mm_set1_epi16(1 << 8);
- const __m128i f_values = _mm_load_si128((const __m128i *)filter);
- // pack and duplicate the filter values
- const __m128i f1f0 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0200u));
- const __m128i f3f2 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0604u));
- const __m128i f5f4 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0a08u));
- const __m128i f7f6 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0e0cu));
- const __m128i A = _mm_loadl_epi64((const __m128i *)src_ptr);
- const __m128i B = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch));
- const __m128i C = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 2));
- const __m128i D = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 3));
- const __m128i E = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 4));
- const __m128i F = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 5));
- const __m128i G = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 6));
- const __m128i H = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 7));
- const __m128i s1s0 = _mm_unpacklo_epi8(A, B);
- const __m128i s3s2 = _mm_unpacklo_epi8(C, D);
- const __m128i s5s4 = _mm_unpacklo_epi8(E, F);
- const __m128i s7s6 = _mm_unpacklo_epi8(G, H);
- // multiply 2 adjacent elements with the filter and add the result
- const __m128i x0 = _mm_maddubs_epi16(s1s0, f1f0);
- const __m128i x1 = _mm_maddubs_epi16(s3s2, f3f2);
- const __m128i x2 = _mm_maddubs_epi16(s5s4, f5f4);
- const __m128i x3 = _mm_maddubs_epi16(s7s6, f7f6);
- // add and saturate the results together
- const __m128i min_x2x1 = _mm_min_epi16(x2, x1);
- const __m128i max_x2x1 = _mm_max_epi16(x2, x1);
- __m128i temp = _mm_adds_epi16(x0, x3);
- temp = _mm_adds_epi16(temp, min_x2x1);
- temp = _mm_adds_epi16(temp, max_x2x1);
- // round and shift by 7 bit each 16 bit
- temp = _mm_mulhrs_epi16(temp, k_256);
- // shrink to 8 bit each 16 bits
- temp = _mm_packus_epi16(temp, temp);
- // save only 8 bytes convolve result
- _mm_storel_epi64((__m128i*)dst, temp);
-}
-
-static void scaledconvolve_vert_w8(const uint8_t *src, ptrdiff_t src_stride,
- uint8_t *dst, ptrdiff_t dst_stride,
- const InterpKernel *y_filters,
- int y0_q4, int y_step_q4, int w, int h) {
- int y;
- int y_q4 = y0_q4;
-
- src -= src_stride * (SUBPEL_TAPS / 2 - 1);
- for (y = 0; y < h; ++y) {
- const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
- const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
- if (y_q4 & SUBPEL_MASK) {
- filter_vert_w8_ssse3(src_y, src_stride, &dst[y * dst_stride], y_filter);
- } else {
- memcpy(&dst[y * dst_stride], &src_y[3 * src_stride], w);
- }
- y_q4 += y_step_q4;
- }
-}
-
-static void filter_vert_w16_ssse3(const uint8_t *src_ptr, ptrdiff_t src_pitch,
- uint8_t *dst, const int16_t *filter, int w) {
- const __m128i k_256 = _mm_set1_epi16(1 << 8);
- const __m128i f_values = _mm_load_si128((const __m128i *)filter);
- // pack and duplicate the filter values
- const __m128i f1f0 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0200u));
- const __m128i f3f2 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0604u));
- const __m128i f5f4 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0a08u));
- const __m128i f7f6 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0e0cu));
- int i;
-
- for (i = 0; i < w; i += 16) {
- const __m128i A = _mm_loadu_si128((const __m128i *)src_ptr);
- const __m128i B = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch));
- const __m128i C =
- _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 2));
- const __m128i D =
- _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 3));
- const __m128i E =
- _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 4));
- const __m128i F =
- _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 5));
- const __m128i G =
- _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 6));
- const __m128i H =
- _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 7));
- // merge the result together
- const __m128i s1s0_lo = _mm_unpacklo_epi8(A, B);
- const __m128i s7s6_lo = _mm_unpacklo_epi8(G, H);
- const __m128i s1s0_hi = _mm_unpackhi_epi8(A, B);
- const __m128i s7s6_hi = _mm_unpackhi_epi8(G, H);
- // multiply 2 adjacent elements with the filter and add the result
- const __m128i x0_lo = _mm_maddubs_epi16(s1s0_lo, f1f0);
- const __m128i x3_lo = _mm_maddubs_epi16(s7s6_lo, f7f6);
- const __m128i x0_hi = _mm_maddubs_epi16(s1s0_hi, f1f0);
- const __m128i x3_hi = _mm_maddubs_epi16(s7s6_hi, f7f6);
- // add and saturate the results together
- const __m128i x3x0_lo = _mm_adds_epi16(x0_lo, x3_lo);
- const __m128i x3x0_hi = _mm_adds_epi16(x0_hi, x3_hi);
- // merge the result together
- const __m128i s3s2_lo = _mm_unpacklo_epi8(C, D);
- const __m128i s3s2_hi = _mm_unpackhi_epi8(C, D);
- // multiply 2 adjacent elements with the filter and add the result
- const __m128i x1_lo = _mm_maddubs_epi16(s3s2_lo, f3f2);
- const __m128i x1_hi = _mm_maddubs_epi16(s3s2_hi, f3f2);
- // merge the result together
- const __m128i s5s4_lo = _mm_unpacklo_epi8(E, F);
- const __m128i s5s4_hi = _mm_unpackhi_epi8(E, F);
- // multiply 2 adjacent elements with the filter and add the result
- const __m128i x2_lo = _mm_maddubs_epi16(s5s4_lo, f5f4);
- const __m128i x2_hi = _mm_maddubs_epi16(s5s4_hi, f5f4);
- // add and saturate the results together
- __m128i temp_lo = _mm_adds_epi16(x3x0_lo, _mm_min_epi16(x1_lo, x2_lo));
- __m128i temp_hi = _mm_adds_epi16(x3x0_hi, _mm_min_epi16(x1_hi, x2_hi));
-
- // add and saturate the results together
- temp_lo = _mm_adds_epi16(temp_lo, _mm_max_epi16(x1_lo, x2_lo));
- temp_hi = _mm_adds_epi16(temp_hi, _mm_max_epi16(x1_hi, x2_hi));
- // round and shift by 7 bit each 16 bit
- temp_lo = _mm_mulhrs_epi16(temp_lo, k_256);
- temp_hi = _mm_mulhrs_epi16(temp_hi, k_256);
- // shrink to 8 bit each 16 bits, the first lane contain the first
- // convolve result and the second lane contain the second convolve
- // result
- temp_hi = _mm_packus_epi16(temp_lo, temp_hi);
- src_ptr += 16;
- // save 16 bytes convolve result
- _mm_store_si128((__m128i*)&dst[i], temp_hi);
- }
-}
-
-static void scaledconvolve_vert_w16(const uint8_t *src, ptrdiff_t src_stride,
- uint8_t *dst, ptrdiff_t dst_stride,
- const InterpKernel *y_filters,
- int y0_q4, int y_step_q4, int w, int h) {
- int y;
- int y_q4 = y0_q4;
-
- src -= src_stride * (SUBPEL_TAPS / 2 - 1);
- for (y = 0; y < h; ++y) {
- const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
- const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
- if (y_q4 & SUBPEL_MASK) {
- filter_vert_w16_ssse3(src_y, src_stride, &dst[y * dst_stride], y_filter,
- w);
- } else {
- memcpy(&dst[y * dst_stride], &src_y[3 * src_stride], w);
- }
- y_q4 += y_step_q4;
- }
-}
-
-static void scaledconvolve2d(const uint8_t *src, ptrdiff_t src_stride,
- uint8_t *dst, ptrdiff_t dst_stride,
- const InterpKernel *const x_filters,
- int x0_q4, int x_step_q4,
- const InterpKernel *const y_filters,
- int y0_q4, int y_step_q4,
- int w, int h) {
- // Note: Fixed size intermediate buffer, temp, places limits on parameters.
- // 2d filtering proceeds in 2 steps:
- // (1) Interpolate horizontally into an intermediate buffer, temp.
- // (2) Interpolate temp vertically to derive the sub-pixel result.
- // Deriving the maximum number of rows in the temp buffer (135):
- // --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative).
- // --Largest block size is 64x64 pixels.
- // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the
- // original frame (in 1/16th pixel units).
- // --Must round-up because block may be located at sub-pixel position.
- // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails.
- // --((64 - 1) * 32 + 15) >> 4 + 8 = 135.
- // --Require an additional 8 rows for the horiz_w8 transpose tail.
- DECLARE_ALIGNED(16, uint8_t, temp[(135 + 8) * 64]);
- const int intermediate_height =
- (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
-
- assert(w <= 64);
- assert(h <= 64);
- assert(y_step_q4 <= 32);
- assert(x_step_q4 <= 32);
-
- if (w >= 8) {
- scaledconvolve_horiz_w8(src - src_stride * (SUBPEL_TAPS / 2 - 1),
- src_stride, temp, 64, x_filters, x0_q4, x_step_q4,
- w, intermediate_height);
- } else {
- scaledconvolve_horiz_w4(src - src_stride * (SUBPEL_TAPS / 2 - 1),
- src_stride, temp, 64, x_filters, x0_q4, x_step_q4,
- w, intermediate_height);
- }
-
- if (w >= 16) {
- scaledconvolve_vert_w16(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst,
- dst_stride, y_filters, y0_q4, y_step_q4, w, h);
- } else if (w == 8) {
- scaledconvolve_vert_w8(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst,
- dst_stride, y_filters, y0_q4, y_step_q4, w, h);
- } else {
- scaledconvolve_vert_w4(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst,
- dst_stride, y_filters, y0_q4, y_step_q4, w, h);
- }
-}
-
-static const InterpKernel *get_filter_base(const int16_t *filter) {
- // NOTE: This assumes that the filter table is 256-byte aligned.
- // TODO(agrange) Modify to make independent of table alignment.
- return (const InterpKernel *)(((intptr_t)filter) & ~((intptr_t)0xFF));
-}
-
-static int get_filter_offset(const int16_t *f, const InterpKernel *base) {
- return (int)((const InterpKernel *)(intptr_t)f - base);
-}
-
-void vpx_scaled_2d_ssse3(const uint8_t *src, ptrdiff_t src_stride,
- uint8_t *dst, ptrdiff_t dst_stride,
- const int16_t *filter_x, int x_step_q4,
- const int16_t *filter_y, int y_step_q4,
- int w, int h) {
- const InterpKernel *const filters_x = get_filter_base(filter_x);
- const int x0_q4 = get_filter_offset(filter_x, filters_x);
-
- const InterpKernel *const filters_y = get_filter_base(filter_y);
- const int y0_q4 = get_filter_offset(filter_y, filters_y);
-
- scaledconvolve2d(src, src_stride, dst, dst_stride,
- filters_x, x0_q4, x_step_q4,
- filters_y, y0_q4, y_step_q4, w, h);
-}
-
-// void vp9_convolve8_ssse3(const uint8_t *src, ptrdiff_t src_stride,
-// uint8_t *dst, ptrdiff_t dst_stride,
-// const int16_t *filter_x, int x_step_q4,
-// const int16_t *filter_y, int y_step_q4,
-// int w, int h);
-// void vpx_convolve8_avg_ssse3(const uint8_t *src, ptrdiff_t src_stride,
-// uint8_t *dst, ptrdiff_t dst_stride,
-// const int16_t *filter_x, int x_step_q4,
-// const int16_t *filter_y, int y_step_q4,
-// int w, int h);
-FUN_CONV_2D(, ssse3);
-FUN_CONV_2D(avg_ , ssse3);
diff --git a/thirdparty/libvpx/vpx_dsp/x86/vpx_subpixel_8t_sse2.asm b/thirdparty/libvpx/vpx_dsp/x86/vpx_subpixel_8t_sse2.asm
deleted file mode 100644
index 08f3d6a6cf..0000000000
--- a/thirdparty/libvpx/vpx_dsp/x86/vpx_subpixel_8t_sse2.asm
+++ /dev/null
@@ -1,987 +0,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-
-;Note: tap3 and tap4 have to be applied and added after other taps to avoid
-;overflow.
-
-%macro GET_FILTERS_4 0
- mov rdx, arg(5) ;filter ptr
- mov rcx, 0x0400040
-
- movdqa xmm7, [rdx] ;load filters
- pshuflw xmm0, xmm7, 0b ;k0
- pshuflw xmm1, xmm7, 01010101b ;k1
- pshuflw xmm2, xmm7, 10101010b ;k2
- pshuflw xmm3, xmm7, 11111111b ;k3
- psrldq xmm7, 8
- pshuflw xmm4, xmm7, 0b ;k4
- pshuflw xmm5, xmm7, 01010101b ;k5
- pshuflw xmm6, xmm7, 10101010b ;k6
- pshuflw xmm7, xmm7, 11111111b ;k7
-
- punpcklqdq xmm0, xmm1
- punpcklqdq xmm2, xmm3
- punpcklqdq xmm5, xmm4
- punpcklqdq xmm6, xmm7
-
- movdqa k0k1, xmm0
- movdqa k2k3, xmm2
- movdqa k5k4, xmm5
- movdqa k6k7, xmm6
-
- movq xmm6, rcx
- pshufd xmm6, xmm6, 0
- movdqa krd, xmm6
-
- pxor xmm7, xmm7
- movdqa zero, xmm7
-%endm
-
-%macro APPLY_FILTER_4 1
- punpckldq xmm0, xmm1 ;two row in one register
- punpckldq xmm6, xmm7
- punpckldq xmm2, xmm3
- punpckldq xmm5, xmm4
-
- punpcklbw xmm0, zero ;unpack to word
- punpcklbw xmm6, zero
- punpcklbw xmm2, zero
- punpcklbw xmm5, zero
-
- pmullw xmm0, k0k1 ;multiply the filter factors
- pmullw xmm6, k6k7
- pmullw xmm2, k2k3
- pmullw xmm5, k5k4
-
- paddsw xmm0, xmm6 ;sum
- movdqa xmm1, xmm0
- psrldq xmm1, 8
- paddsw xmm0, xmm1
- paddsw xmm0, xmm2
- psrldq xmm2, 8
- paddsw xmm0, xmm5
- psrldq xmm5, 8
- paddsw xmm0, xmm2
- paddsw xmm0, xmm5
-
- paddsw xmm0, krd ;rounding
- psraw xmm0, 7 ;shift
- packuswb xmm0, xmm0 ;pack to byte
-
-%if %1
- movd xmm1, [rdi]
- pavgb xmm0, xmm1
-%endif
- movd [rdi], xmm0
-%endm
-
-%macro GET_FILTERS 0
- mov rdx, arg(5) ;filter ptr
- mov rsi, arg(0) ;src_ptr
- mov rdi, arg(2) ;output_ptr
- mov rcx, 0x0400040
-
- movdqa xmm7, [rdx] ;load filters
- pshuflw xmm0, xmm7, 0b ;k0
- pshuflw xmm1, xmm7, 01010101b ;k1
- pshuflw xmm2, xmm7, 10101010b ;k2
- pshuflw xmm3, xmm7, 11111111b ;k3
- pshufhw xmm4, xmm7, 0b ;k4
- pshufhw xmm5, xmm7, 01010101b ;k5
- pshufhw xmm6, xmm7, 10101010b ;k6
- pshufhw xmm7, xmm7, 11111111b ;k7
-
- punpcklwd xmm0, xmm0
- punpcklwd xmm1, xmm1
- punpcklwd xmm2, xmm2
- punpcklwd xmm3, xmm3
- punpckhwd xmm4, xmm4
- punpckhwd xmm5, xmm5
- punpckhwd xmm6, xmm6
- punpckhwd xmm7, xmm7
-
- movdqa k0, xmm0 ;store filter factors on stack
- movdqa k1, xmm1
- movdqa k2, xmm2
- movdqa k3, xmm3
- movdqa k4, xmm4
- movdqa k5, xmm5
- movdqa k6, xmm6
- movdqa k7, xmm7
-
- movq xmm6, rcx
- pshufd xmm6, xmm6, 0
- movdqa krd, xmm6 ;rounding
-
- pxor xmm7, xmm7
- movdqa zero, xmm7
-%endm
-
-%macro LOAD_VERT_8 1
- movq xmm0, [rsi + %1] ;0
- movq xmm1, [rsi + rax + %1] ;1
- movq xmm6, [rsi + rdx * 2 + %1] ;6
- lea rsi, [rsi + rax]
- movq xmm7, [rsi + rdx * 2 + %1] ;7
- movq xmm2, [rsi + rax + %1] ;2
- movq xmm3, [rsi + rax * 2 + %1] ;3
- movq xmm4, [rsi + rdx + %1] ;4
- movq xmm5, [rsi + rax * 4 + %1] ;5
-%endm
-
-%macro APPLY_FILTER_8 2
- punpcklbw xmm0, zero
- punpcklbw xmm1, zero
- punpcklbw xmm6, zero
- punpcklbw xmm7, zero
- punpcklbw xmm2, zero
- punpcklbw xmm5, zero
- punpcklbw xmm3, zero
- punpcklbw xmm4, zero
-
- pmullw xmm0, k0
- pmullw xmm1, k1
- pmullw xmm6, k6
- pmullw xmm7, k7
- pmullw xmm2, k2
- pmullw xmm5, k5
- pmullw xmm3, k3
- pmullw xmm4, k4
-
- paddsw xmm0, xmm1
- paddsw xmm0, xmm6
- paddsw xmm0, xmm7
- paddsw xmm0, xmm2
- paddsw xmm0, xmm5
- paddsw xmm0, xmm3
- paddsw xmm0, xmm4
-
- paddsw xmm0, krd ;rounding
- psraw xmm0, 7 ;shift
- packuswb xmm0, xmm0 ;pack back to byte
-%if %1
- movq xmm1, [rdi + %2]
- pavgb xmm0, xmm1
-%endif
- movq [rdi + %2], xmm0
-%endm
-
-;void vpx_filter_block1d4_v8_sse2
-;(
-; unsigned char *src_ptr,
-; unsigned int src_pitch,
-; unsigned char *output_ptr,
-; unsigned int out_pitch,
-; unsigned int output_height,
-; short *filter
-;)
-global sym(vpx_filter_block1d4_v8_sse2) PRIVATE
-sym(vpx_filter_block1d4_v8_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 6
- SAVE_XMM 7
- push rsi
- push rdi
- push rbx
- ; end prolog
-
- ALIGN_STACK 16, rax
- sub rsp, 16 * 6
- %define k0k1 [rsp + 16 * 0]
- %define k2k3 [rsp + 16 * 1]
- %define k5k4 [rsp + 16 * 2]
- %define k6k7 [rsp + 16 * 3]
- %define krd [rsp + 16 * 4]
- %define zero [rsp + 16 * 5]
-
- GET_FILTERS_4
-
- mov rsi, arg(0) ;src_ptr
- mov rdi, arg(2) ;output_ptr
-
- movsxd rax, DWORD PTR arg(1) ;pixels_per_line
- movsxd rbx, DWORD PTR arg(3) ;out_pitch
- lea rdx, [rax + rax * 2]
- movsxd rcx, DWORD PTR arg(4) ;output_height
-
-.loop:
- movd xmm0, [rsi] ;load src: row 0
- movd xmm1, [rsi + rax] ;1
- movd xmm6, [rsi + rdx * 2] ;6
- lea rsi, [rsi + rax]
- movd xmm7, [rsi + rdx * 2] ;7
- movd xmm2, [rsi + rax] ;2
- movd xmm3, [rsi + rax * 2] ;3
- movd xmm4, [rsi + rdx] ;4
- movd xmm5, [rsi + rax * 4] ;5
-
- APPLY_FILTER_4 0
-
- lea rdi, [rdi + rbx]
- dec rcx
- jnz .loop
-
- add rsp, 16 * 6
- pop rsp
- pop rbx
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-;void vpx_filter_block1d8_v8_sse2
-;(
-; unsigned char *src_ptr,
-; unsigned int src_pitch,
-; unsigned char *output_ptr,
-; unsigned int out_pitch,
-; unsigned int output_height,
-; short *filter
-;)
-global sym(vpx_filter_block1d8_v8_sse2) PRIVATE
-sym(vpx_filter_block1d8_v8_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 6
- SAVE_XMM 7
- push rsi
- push rdi
- push rbx
- ; end prolog
-
- ALIGN_STACK 16, rax
- sub rsp, 16 * 10
- %define k0 [rsp + 16 * 0]
- %define k1 [rsp + 16 * 1]
- %define k2 [rsp + 16 * 2]
- %define k3 [rsp + 16 * 3]
- %define k4 [rsp + 16 * 4]
- %define k5 [rsp + 16 * 5]
- %define k6 [rsp + 16 * 6]
- %define k7 [rsp + 16 * 7]
- %define krd [rsp + 16 * 8]
- %define zero [rsp + 16 * 9]
-
- GET_FILTERS
-
- movsxd rax, DWORD PTR arg(1) ;pixels_per_line
- movsxd rbx, DWORD PTR arg(3) ;out_pitch
- lea rdx, [rax + rax * 2]
- movsxd rcx, DWORD PTR arg(4) ;output_height
-
-.loop:
- LOAD_VERT_8 0
- APPLY_FILTER_8 0, 0
-
- lea rdi, [rdi + rbx]
- dec rcx
- jnz .loop
-
- add rsp, 16 * 10
- pop rsp
- pop rbx
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-;void vpx_filter_block1d16_v8_sse2
-;(
-; unsigned char *src_ptr,
-; unsigned int src_pitch,
-; unsigned char *output_ptr,
-; unsigned int out_pitch,
-; unsigned int output_height,
-; short *filter
-;)
-global sym(vpx_filter_block1d16_v8_sse2) PRIVATE
-sym(vpx_filter_block1d16_v8_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 6
- SAVE_XMM 7
- push rsi
- push rdi
- push rbx
- ; end prolog
-
- ALIGN_STACK 16, rax
- sub rsp, 16 * 10
- %define k0 [rsp + 16 * 0]
- %define k1 [rsp + 16 * 1]
- %define k2 [rsp + 16 * 2]
- %define k3 [rsp + 16 * 3]
- %define k4 [rsp + 16 * 4]
- %define k5 [rsp + 16 * 5]
- %define k6 [rsp + 16 * 6]
- %define k7 [rsp + 16 * 7]
- %define krd [rsp + 16 * 8]
- %define zero [rsp + 16 * 9]
-
- GET_FILTERS
-
- movsxd rax, DWORD PTR arg(1) ;pixels_per_line
- movsxd rbx, DWORD PTR arg(3) ;out_pitch
- lea rdx, [rax + rax * 2]
- movsxd rcx, DWORD PTR arg(4) ;output_height
-
-.loop:
- LOAD_VERT_8 0
- APPLY_FILTER_8 0, 0
- sub rsi, rax
-
- LOAD_VERT_8 8
- APPLY_FILTER_8 0, 8
- add rdi, rbx
-
- dec rcx
- jnz .loop
-
- add rsp, 16 * 10
- pop rsp
- pop rbx
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-global sym(vpx_filter_block1d4_v8_avg_sse2) PRIVATE
-sym(vpx_filter_block1d4_v8_avg_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 6
- SAVE_XMM 7
- push rsi
- push rdi
- push rbx
- ; end prolog
-
- ALIGN_STACK 16, rax
- sub rsp, 16 * 6
- %define k0k1 [rsp + 16 * 0]
- %define k2k3 [rsp + 16 * 1]
- %define k5k4 [rsp + 16 * 2]
- %define k6k7 [rsp + 16 * 3]
- %define krd [rsp + 16 * 4]
- %define zero [rsp + 16 * 5]
-
- GET_FILTERS_4
-
- mov rsi, arg(0) ;src_ptr
- mov rdi, arg(2) ;output_ptr
-
- movsxd rax, DWORD PTR arg(1) ;pixels_per_line
- movsxd rbx, DWORD PTR arg(3) ;out_pitch
- lea rdx, [rax + rax * 2]
- movsxd rcx, DWORD PTR arg(4) ;output_height
-
-.loop:
- movd xmm0, [rsi] ;load src: row 0
- movd xmm1, [rsi + rax] ;1
- movd xmm6, [rsi + rdx * 2] ;6
- lea rsi, [rsi + rax]
- movd xmm7, [rsi + rdx * 2] ;7
- movd xmm2, [rsi + rax] ;2
- movd xmm3, [rsi + rax * 2] ;3
- movd xmm4, [rsi + rdx] ;4
- movd xmm5, [rsi + rax * 4] ;5
-
- APPLY_FILTER_4 1
-
- lea rdi, [rdi + rbx]
- dec rcx
- jnz .loop
-
- add rsp, 16 * 6
- pop rsp
- pop rbx
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-global sym(vpx_filter_block1d8_v8_avg_sse2) PRIVATE
-sym(vpx_filter_block1d8_v8_avg_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 6
- SAVE_XMM 7
- push rsi
- push rdi
- push rbx
- ; end prolog
-
- ALIGN_STACK 16, rax
- sub rsp, 16 * 10
- %define k0 [rsp + 16 * 0]
- %define k1 [rsp + 16 * 1]
- %define k2 [rsp + 16 * 2]
- %define k3 [rsp + 16 * 3]
- %define k4 [rsp + 16 * 4]
- %define k5 [rsp + 16 * 5]
- %define k6 [rsp + 16 * 6]
- %define k7 [rsp + 16 * 7]
- %define krd [rsp + 16 * 8]
- %define zero [rsp + 16 * 9]
-
- GET_FILTERS
-
- movsxd rax, DWORD PTR arg(1) ;pixels_per_line
- movsxd rbx, DWORD PTR arg(3) ;out_pitch
- lea rdx, [rax + rax * 2]
- movsxd rcx, DWORD PTR arg(4) ;output_height
-.loop:
- LOAD_VERT_8 0
- APPLY_FILTER_8 1, 0
-
- lea rdi, [rdi + rbx]
- dec rcx
- jnz .loop
-
- add rsp, 16 * 10
- pop rsp
- pop rbx
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-global sym(vpx_filter_block1d16_v8_avg_sse2) PRIVATE
-sym(vpx_filter_block1d16_v8_avg_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 6
- SAVE_XMM 7
- push rsi
- push rdi
- push rbx
- ; end prolog
-
- ALIGN_STACK 16, rax
- sub rsp, 16 * 10
- %define k0 [rsp + 16 * 0]
- %define k1 [rsp + 16 * 1]
- %define k2 [rsp + 16 * 2]
- %define k3 [rsp + 16 * 3]
- %define k4 [rsp + 16 * 4]
- %define k5 [rsp + 16 * 5]
- %define k6 [rsp + 16 * 6]
- %define k7 [rsp + 16 * 7]
- %define krd [rsp + 16 * 8]
- %define zero [rsp + 16 * 9]
-
- GET_FILTERS
-
- movsxd rax, DWORD PTR arg(1) ;pixels_per_line
- movsxd rbx, DWORD PTR arg(3) ;out_pitch
- lea rdx, [rax + rax * 2]
- movsxd rcx, DWORD PTR arg(4) ;output_height
-.loop:
- LOAD_VERT_8 0
- APPLY_FILTER_8 1, 0
- sub rsi, rax
-
- LOAD_VERT_8 8
- APPLY_FILTER_8 1, 8
- add rdi, rbx
-
- dec rcx
- jnz .loop
-
- add rsp, 16 * 10
- pop rsp
- pop rbx
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-;void vpx_filter_block1d4_h8_sse2
-;(
-; unsigned char *src_ptr,
-; unsigned int src_pixels_per_line,
-; unsigned char *output_ptr,
-; unsigned int output_pitch,
-; unsigned int output_height,
-; short *filter
-;)
-global sym(vpx_filter_block1d4_h8_sse2) PRIVATE
-sym(vpx_filter_block1d4_h8_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 6
- SAVE_XMM 7
- push rsi
- push rdi
- ; end prolog
-
- ALIGN_STACK 16, rax
- sub rsp, 16 * 6
- %define k0k1 [rsp + 16 * 0]
- %define k2k3 [rsp + 16 * 1]
- %define k5k4 [rsp + 16 * 2]
- %define k6k7 [rsp + 16 * 3]
- %define krd [rsp + 16 * 4]
- %define zero [rsp + 16 * 5]
-
- GET_FILTERS_4
-
- mov rsi, arg(0) ;src_ptr
- mov rdi, arg(2) ;output_ptr
-
- movsxd rax, DWORD PTR arg(1) ;pixels_per_line
- movsxd rdx, DWORD PTR arg(3) ;out_pitch
- movsxd rcx, DWORD PTR arg(4) ;output_height
-
-.loop:
- movdqu xmm0, [rsi - 3] ;load src
-
- movdqa xmm1, xmm0
- movdqa xmm6, xmm0
- movdqa xmm7, xmm0
- movdqa xmm2, xmm0
- movdqa xmm3, xmm0
- movdqa xmm5, xmm0
- movdqa xmm4, xmm0
-
- psrldq xmm1, 1
- psrldq xmm6, 6
- psrldq xmm7, 7
- psrldq xmm2, 2
- psrldq xmm3, 3
- psrldq xmm5, 5
- psrldq xmm4, 4
-
- APPLY_FILTER_4 0
-
- lea rsi, [rsi + rax]
- lea rdi, [rdi + rdx]
- dec rcx
- jnz .loop
-
- add rsp, 16 * 6
- pop rsp
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-;void vpx_filter_block1d8_h8_sse2
-;(
-; unsigned char *src_ptr,
-; unsigned int src_pixels_per_line,
-; unsigned char *output_ptr,
-; unsigned int output_pitch,
-; unsigned int output_height,
-; short *filter
-;)
-global sym(vpx_filter_block1d8_h8_sse2) PRIVATE
-sym(vpx_filter_block1d8_h8_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 6
- SAVE_XMM 7
- push rsi
- push rdi
- ; end prolog
-
- ALIGN_STACK 16, rax
- sub rsp, 16 * 10
- %define k0 [rsp + 16 * 0]
- %define k1 [rsp + 16 * 1]
- %define k2 [rsp + 16 * 2]
- %define k3 [rsp + 16 * 3]
- %define k4 [rsp + 16 * 4]
- %define k5 [rsp + 16 * 5]
- %define k6 [rsp + 16 * 6]
- %define k7 [rsp + 16 * 7]
- %define krd [rsp + 16 * 8]
- %define zero [rsp + 16 * 9]
-
- GET_FILTERS
-
- movsxd rax, DWORD PTR arg(1) ;pixels_per_line
- movsxd rdx, DWORD PTR arg(3) ;out_pitch
- movsxd rcx, DWORD PTR arg(4) ;output_height
-
-.loop:
- movdqu xmm0, [rsi - 3] ;load src
-
- movdqa xmm1, xmm0
- movdqa xmm6, xmm0
- movdqa xmm7, xmm0
- movdqa xmm2, xmm0
- movdqa xmm5, xmm0
- movdqa xmm3, xmm0
- movdqa xmm4, xmm0
-
- psrldq xmm1, 1
- psrldq xmm6, 6
- psrldq xmm7, 7
- psrldq xmm2, 2
- psrldq xmm5, 5
- psrldq xmm3, 3
- psrldq xmm4, 4
-
- APPLY_FILTER_8 0, 0
-
- lea rsi, [rsi + rax]
- lea rdi, [rdi + rdx]
- dec rcx
- jnz .loop
-
- add rsp, 16 * 10
- pop rsp
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-;void vpx_filter_block1d16_h8_sse2
-;(
-; unsigned char *src_ptr,
-; unsigned int src_pixels_per_line,
-; unsigned char *output_ptr,
-; unsigned int output_pitch,
-; unsigned int output_height,
-; short *filter
-;)
-global sym(vpx_filter_block1d16_h8_sse2) PRIVATE
-sym(vpx_filter_block1d16_h8_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 6
- SAVE_XMM 7
- push rsi
- push rdi
- ; end prolog
-
- ALIGN_STACK 16, rax
- sub rsp, 16 * 10
- %define k0 [rsp + 16 * 0]
- %define k1 [rsp + 16 * 1]
- %define k2 [rsp + 16 * 2]
- %define k3 [rsp + 16 * 3]
- %define k4 [rsp + 16 * 4]
- %define k5 [rsp + 16 * 5]
- %define k6 [rsp + 16 * 6]
- %define k7 [rsp + 16 * 7]
- %define krd [rsp + 16 * 8]
- %define zero [rsp + 16 * 9]
-
- GET_FILTERS
-
- movsxd rax, DWORD PTR arg(1) ;pixels_per_line
- movsxd rdx, DWORD PTR arg(3) ;out_pitch
- movsxd rcx, DWORD PTR arg(4) ;output_height
-
-.loop:
- movdqu xmm0, [rsi - 3] ;load src
-
- movdqa xmm1, xmm0
- movdqa xmm6, xmm0
- movdqa xmm7, xmm0
- movdqa xmm2, xmm0
- movdqa xmm5, xmm0
- movdqa xmm3, xmm0
- movdqa xmm4, xmm0
-
- psrldq xmm1, 1
- psrldq xmm6, 6
- psrldq xmm7, 7
- psrldq xmm2, 2
- psrldq xmm5, 5
- psrldq xmm3, 3
- psrldq xmm4, 4
-
- APPLY_FILTER_8 0, 0
-
- movdqu xmm0, [rsi + 5] ;load src
-
- movdqa xmm1, xmm0
- movdqa xmm6, xmm0
- movdqa xmm7, xmm0
- movdqa xmm2, xmm0
- movdqa xmm5, xmm0
- movdqa xmm3, xmm0
- movdqa xmm4, xmm0
-
- psrldq xmm1, 1
- psrldq xmm6, 6
- psrldq xmm7, 7
- psrldq xmm2, 2
- psrldq xmm5, 5
- psrldq xmm3, 3
- psrldq xmm4, 4
-
- APPLY_FILTER_8 0, 8
-
- lea rsi, [rsi + rax]
- lea rdi, [rdi + rdx]
- dec rcx
- jnz .loop
-
- add rsp, 16 * 10
- pop rsp
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-global sym(vpx_filter_block1d4_h8_avg_sse2) PRIVATE
-sym(vpx_filter_block1d4_h8_avg_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 6
- SAVE_XMM 7
- push rsi
- push rdi
- ; end prolog
-
- ALIGN_STACK 16, rax
- sub rsp, 16 * 6
- %define k0k1 [rsp + 16 * 0]
- %define k2k3 [rsp + 16 * 1]
- %define k5k4 [rsp + 16 * 2]
- %define k6k7 [rsp + 16 * 3]
- %define krd [rsp + 16 * 4]
- %define zero [rsp + 16 * 5]
-
- GET_FILTERS_4
-
- mov rsi, arg(0) ;src_ptr
- mov rdi, arg(2) ;output_ptr
-
- movsxd rax, DWORD PTR arg(1) ;pixels_per_line
- movsxd rdx, DWORD PTR arg(3) ;out_pitch
- movsxd rcx, DWORD PTR arg(4) ;output_height
-
-.loop:
- movdqu xmm0, [rsi - 3] ;load src
-
- movdqa xmm1, xmm0
- movdqa xmm6, xmm0
- movdqa xmm7, xmm0
- movdqa xmm2, xmm0
- movdqa xmm3, xmm0
- movdqa xmm5, xmm0
- movdqa xmm4, xmm0
-
- psrldq xmm1, 1
- psrldq xmm6, 6
- psrldq xmm7, 7
- psrldq xmm2, 2
- psrldq xmm3, 3
- psrldq xmm5, 5
- psrldq xmm4, 4
-
- APPLY_FILTER_4 1
-
- lea rsi, [rsi + rax]
- lea rdi, [rdi + rdx]
- dec rcx
- jnz .loop
-
- add rsp, 16 * 6
- pop rsp
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-global sym(vpx_filter_block1d8_h8_avg_sse2) PRIVATE
-sym(vpx_filter_block1d8_h8_avg_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 6
- SAVE_XMM 7
- push rsi
- push rdi
- ; end prolog
-
- ALIGN_STACK 16, rax
- sub rsp, 16 * 10
- %define k0 [rsp + 16 * 0]
- %define k1 [rsp + 16 * 1]
- %define k2 [rsp + 16 * 2]
- %define k3 [rsp + 16 * 3]
- %define k4 [rsp + 16 * 4]
- %define k5 [rsp + 16 * 5]
- %define k6 [rsp + 16 * 6]
- %define k7 [rsp + 16 * 7]
- %define krd [rsp + 16 * 8]
- %define zero [rsp + 16 * 9]
-
- GET_FILTERS
-
- movsxd rax, DWORD PTR arg(1) ;pixels_per_line
- movsxd rdx, DWORD PTR arg(3) ;out_pitch
- movsxd rcx, DWORD PTR arg(4) ;output_height
-
-.loop:
- movdqu xmm0, [rsi - 3] ;load src
-
- movdqa xmm1, xmm0
- movdqa xmm6, xmm0
- movdqa xmm7, xmm0
- movdqa xmm2, xmm0
- movdqa xmm5, xmm0
- movdqa xmm3, xmm0
- movdqa xmm4, xmm0
-
- psrldq xmm1, 1
- psrldq xmm6, 6
- psrldq xmm7, 7
- psrldq xmm2, 2
- psrldq xmm5, 5
- psrldq xmm3, 3
- psrldq xmm4, 4
-
- APPLY_FILTER_8 1, 0
-
- lea rsi, [rsi + rax]
- lea rdi, [rdi + rdx]
- dec rcx
- jnz .loop
-
- add rsp, 16 * 10
- pop rsp
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-global sym(vpx_filter_block1d16_h8_avg_sse2) PRIVATE
-sym(vpx_filter_block1d16_h8_avg_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 6
- SAVE_XMM 7
- push rsi
- push rdi
- ; end prolog
-
- ALIGN_STACK 16, rax
- sub rsp, 16 * 10
- %define k0 [rsp + 16 * 0]
- %define k1 [rsp + 16 * 1]
- %define k2 [rsp + 16 * 2]
- %define k3 [rsp + 16 * 3]
- %define k4 [rsp + 16 * 4]
- %define k5 [rsp + 16 * 5]
- %define k6 [rsp + 16 * 6]
- %define k7 [rsp + 16 * 7]
- %define krd [rsp + 16 * 8]
- %define zero [rsp + 16 * 9]
-
- GET_FILTERS
-
- movsxd rax, DWORD PTR arg(1) ;pixels_per_line
- movsxd rdx, DWORD PTR arg(3) ;out_pitch
- movsxd rcx, DWORD PTR arg(4) ;output_height
-
-.loop:
- movdqu xmm0, [rsi - 3] ;load src
-
- movdqa xmm1, xmm0
- movdqa xmm6, xmm0
- movdqa xmm7, xmm0
- movdqa xmm2, xmm0
- movdqa xmm5, xmm0
- movdqa xmm3, xmm0
- movdqa xmm4, xmm0
-
- psrldq xmm1, 1
- psrldq xmm6, 6
- psrldq xmm7, 7
- psrldq xmm2, 2
- psrldq xmm5, 5
- psrldq xmm3, 3
- psrldq xmm4, 4
-
- APPLY_FILTER_8 1, 0
-
- movdqu xmm0, [rsi + 5] ;load src
-
- movdqa xmm1, xmm0
- movdqa xmm6, xmm0
- movdqa xmm7, xmm0
- movdqa xmm2, xmm0
- movdqa xmm5, xmm0
- movdqa xmm3, xmm0
- movdqa xmm4, xmm0
-
- psrldq xmm1, 1
- psrldq xmm6, 6
- psrldq xmm7, 7
- psrldq xmm2, 2
- psrldq xmm5, 5
- psrldq xmm3, 3
- psrldq xmm4, 4
-
- APPLY_FILTER_8 1, 8
-
- lea rsi, [rsi + rax]
- lea rdi, [rdi + rdx]
- dec rcx
- jnz .loop
-
- add rsp, 16 * 10
- pop rsp
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
diff --git a/thirdparty/libvpx/vpx_dsp/x86/vpx_subpixel_8t_ssse3.asm b/thirdparty/libvpx/vpx_dsp/x86/vpx_subpixel_8t_ssse3.asm
deleted file mode 100644
index d2cb8ea292..0000000000
--- a/thirdparty/libvpx/vpx_dsp/x86/vpx_subpixel_8t_ssse3.asm
+++ /dev/null
@@ -1,629 +0,0 @@
-;
-; Copyright (c) 2015 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-%include "third_party/x86inc/x86inc.asm"
-
-SECTION_RODATA
-pw_64: times 8 dw 64
-
-; %define USE_PMULHRSW
-; NOTE: pmulhrsw has a latency of 5 cycles. Tests showed a performance loss
-; when using this instruction.
-;
-; The add order below (based on ffvp9) must be followed to prevent outranges.
-; x = k0k1 + k4k5
-; y = k2k3 + k6k7
-; z = signed SAT(x + y)
-
-SECTION .text
-%if ARCH_X86_64
- %define LOCAL_VARS_SIZE 16*4
-%else
- %define LOCAL_VARS_SIZE 16*6
-%endif
-
-%macro SETUP_LOCAL_VARS 0
- ; TODO(slavarnway): using xmm registers for these on ARCH_X86_64 +
- ; pmaddubsw has a higher latency on some platforms, this might be eased by
- ; interleaving the instructions.
- %define k0k1 [rsp + 16*0]
- %define k2k3 [rsp + 16*1]
- %define k4k5 [rsp + 16*2]
- %define k6k7 [rsp + 16*3]
- packsswb m4, m4
- ; TODO(slavarnway): multiple pshufb instructions had a higher latency on
- ; some platforms.
- pshuflw m0, m4, 0b ;k0_k1
- pshuflw m1, m4, 01010101b ;k2_k3
- pshuflw m2, m4, 10101010b ;k4_k5
- pshuflw m3, m4, 11111111b ;k6_k7
- punpcklqdq m0, m0
- punpcklqdq m1, m1
- punpcklqdq m2, m2
- punpcklqdq m3, m3
- mova k0k1, m0
- mova k2k3, m1
- mova k4k5, m2
- mova k6k7, m3
-%if ARCH_X86_64
- %define krd m12
- %define tmp m13
- mova krd, [GLOBAL(pw_64)]
-%else
- %define tmp [rsp + 16*4]
- %define krd [rsp + 16*5]
-%if CONFIG_PIC=0
- mova m6, [GLOBAL(pw_64)]
-%else
- ; build constants without accessing global memory
- pcmpeqb m6, m6 ;all ones
- psrlw m6, 15
- psllw m6, 6 ;aka pw_64
-%endif
- mova krd, m6
-%endif
-%endm
-
-%macro HORIZx4_ROW 2
- mova %2, %1
- punpcklbw %1, %1
- punpckhbw %2, %2
-
- mova m3, %2
- palignr %2, %1, 1
- palignr m3, %1, 5
-
- pmaddubsw %2, k0k1k4k5
- pmaddubsw m3, k2k3k6k7
- mova m4, %2 ;k0k1
- mova m5, m3 ;k2k3
- psrldq %2, 8 ;k4k5
- psrldq m3, 8 ;k6k7
- paddsw %2, m4
- paddsw m5, m3
- paddsw %2, m5
- paddsw %2, krd
- psraw %2, 7
- packuswb %2, %2
-%endm
-
-;-------------------------------------------------------------------------------
-%macro SUBPIX_HFILTER4 1
-cglobal filter_block1d4_%1, 6, 6+(ARCH_X86_64*2), 11, LOCAL_VARS_SIZE, \
- src, sstride, dst, dstride, height, filter
- mova m4, [filterq]
- packsswb m4, m4
-%if ARCH_X86_64
- %define k0k1k4k5 m8
- %define k2k3k6k7 m9
- %define krd m10
- %define orig_height r7d
- mova krd, [GLOBAL(pw_64)]
- pshuflw k0k1k4k5, m4, 0b ;k0_k1
- pshufhw k0k1k4k5, k0k1k4k5, 10101010b ;k0_k1_k4_k5
- pshuflw k2k3k6k7, m4, 01010101b ;k2_k3
- pshufhw k2k3k6k7, k2k3k6k7, 11111111b ;k2_k3_k6_k7
-%else
- %define k0k1k4k5 [rsp + 16*0]
- %define k2k3k6k7 [rsp + 16*1]
- %define krd [rsp + 16*2]
- %define orig_height [rsp + 16*3]
- pshuflw m6, m4, 0b ;k0_k1
- pshufhw m6, m6, 10101010b ;k0_k1_k4_k5
- pshuflw m7, m4, 01010101b ;k2_k3
- pshufhw m7, m7, 11111111b ;k2_k3_k6_k7
-%if CONFIG_PIC=0
- mova m1, [GLOBAL(pw_64)]
-%else
- ; build constants without accessing global memory
- pcmpeqb m1, m1 ;all ones
- psrlw m1, 15
- psllw m1, 6 ;aka pw_64
-%endif
- mova k0k1k4k5, m6
- mova k2k3k6k7, m7
- mova krd, m1
-%endif
- mov orig_height, heightd
- shr heightd, 1
-.loop:
- ;Do two rows at once
- movh m0, [srcq - 3]
- movh m1, [srcq + 5]
- punpcklqdq m0, m1
- mova m1, m0
- movh m2, [srcq + sstrideq - 3]
- movh m3, [srcq + sstrideq + 5]
- punpcklqdq m2, m3
- mova m3, m2
- punpcklbw m0, m0
- punpckhbw m1, m1
- punpcklbw m2, m2
- punpckhbw m3, m3
- mova m4, m1
- palignr m4, m0, 1
- pmaddubsw m4, k0k1k4k5
- palignr m1, m0, 5
- pmaddubsw m1, k2k3k6k7
- mova m7, m3
- palignr m7, m2, 1
- pmaddubsw m7, k0k1k4k5
- palignr m3, m2, 5
- pmaddubsw m3, k2k3k6k7
- mova m0, m4 ;k0k1
- mova m5, m1 ;k2k3
- mova m2, m7 ;k0k1 upper
- psrldq m4, 8 ;k4k5
- psrldq m1, 8 ;k6k7
- paddsw m4, m0
- paddsw m5, m1
- mova m1, m3 ;k2k3 upper
- psrldq m7, 8 ;k4k5 upper
- psrldq m3, 8 ;k6k7 upper
- paddsw m7, m2
- paddsw m4, m5
- paddsw m1, m3
- paddsw m7, m1
- paddsw m4, krd
- psraw m4, 7
- packuswb m4, m4
- paddsw m7, krd
- psraw m7, 7
- packuswb m7, m7
-
-%ifidn %1, h8_avg
- movd m0, [dstq]
- pavgb m4, m0
- movd m2, [dstq + dstrideq]
- pavgb m7, m2
-%endif
- movd [dstq], m4
- movd [dstq + dstrideq], m7
-
- lea srcq, [srcq + sstrideq ]
- prefetcht0 [srcq + 4 * sstrideq - 3]
- lea srcq, [srcq + sstrideq ]
- lea dstq, [dstq + 2 * dstrideq ]
- prefetcht0 [srcq + 2 * sstrideq - 3]
-
- dec heightd
- jnz .loop
-
- ; Do last row if output_height is odd
- mov heightd, orig_height
- and heightd, 1
- je .done
-
- movh m0, [srcq - 3] ; load src
- movh m1, [srcq + 5]
- punpcklqdq m0, m1
-
- HORIZx4_ROW m0, m1
-%ifidn %1, h8_avg
- movd m0, [dstq]
- pavgb m1, m0
-%endif
- movd [dstq], m1
-.done
- RET
-%endm
-
-%macro HORIZx8_ROW 5
- mova %2, %1
- punpcklbw %1, %1
- punpckhbw %2, %2
-
- mova %3, %2
- mova %4, %2
- mova %5, %2
-
- palignr %2, %1, 1
- palignr %3, %1, 5
- palignr %4, %1, 9
- palignr %5, %1, 13
-
- pmaddubsw %2, k0k1
- pmaddubsw %3, k2k3
- pmaddubsw %4, k4k5
- pmaddubsw %5, k6k7
- paddsw %2, %4
- paddsw %5, %3
- paddsw %2, %5
- paddsw %2, krd
- psraw %2, 7
- packuswb %2, %2
- SWAP %1, %2
-%endm
-
-;-------------------------------------------------------------------------------
-%macro SUBPIX_HFILTER8 1
-cglobal filter_block1d8_%1, 6, 6+(ARCH_X86_64*1), 14, LOCAL_VARS_SIZE, \
- src, sstride, dst, dstride, height, filter
- mova m4, [filterq]
- SETUP_LOCAL_VARS
-%if ARCH_X86_64
- %define orig_height r7d
-%else
- %define orig_height heightmp
-%endif
- mov orig_height, heightd
- shr heightd, 1
-
-.loop:
- movh m0, [srcq - 3]
- movh m3, [srcq + 5]
- movh m4, [srcq + sstrideq - 3]
- movh m7, [srcq + sstrideq + 5]
- punpcklqdq m0, m3
- mova m1, m0
- punpcklbw m0, m0
- punpckhbw m1, m1
- mova m5, m1
- palignr m5, m0, 13
- pmaddubsw m5, k6k7
- mova m2, m1
- mova m3, m1
- palignr m1, m0, 1
- pmaddubsw m1, k0k1
- punpcklqdq m4, m7
- mova m6, m4
- punpcklbw m4, m4
- palignr m2, m0, 5
- punpckhbw m6, m6
- palignr m3, m0, 9
- mova m7, m6
- pmaddubsw m2, k2k3
- pmaddubsw m3, k4k5
-
- palignr m7, m4, 13
- mova m0, m6
- palignr m0, m4, 5
- pmaddubsw m7, k6k7
- paddsw m1, m3
- paddsw m2, m5
- paddsw m1, m2
- mova m5, m6
- palignr m6, m4, 1
- pmaddubsw m0, k2k3
- pmaddubsw m6, k0k1
- palignr m5, m4, 9
- paddsw m1, krd
- pmaddubsw m5, k4k5
- psraw m1, 7
- paddsw m0, m7
-%ifidn %1, h8_avg
- movh m7, [dstq]
- movh m2, [dstq + dstrideq]
-%endif
- packuswb m1, m1
- paddsw m6, m5
- paddsw m6, m0
- paddsw m6, krd
- psraw m6, 7
- packuswb m6, m6
-%ifidn %1, h8_avg
- pavgb m1, m7
- pavgb m6, m2
-%endif
- movh [dstq], m1
- movh [dstq + dstrideq], m6
-
- lea srcq, [srcq + sstrideq ]
- prefetcht0 [srcq + 4 * sstrideq - 3]
- lea srcq, [srcq + sstrideq ]
- lea dstq, [dstq + 2 * dstrideq ]
- prefetcht0 [srcq + 2 * sstrideq - 3]
- dec heightd
- jnz .loop
-
- ;Do last row if output_height is odd
- mov heightd, orig_height
- and heightd, 1
- je .done
-
- movh m0, [srcq - 3]
- movh m3, [srcq + 5]
- punpcklqdq m0, m3
-
- HORIZx8_ROW m0, m1, m2, m3, m4
-
-%ifidn %1, h8_avg
- movh m1, [dstq]
- pavgb m0, m1
-%endif
- movh [dstq], m0
-.done:
- RET
-%endm
-
-;-------------------------------------------------------------------------------
-%macro SUBPIX_HFILTER16 1
-cglobal filter_block1d16_%1, 6, 6+(ARCH_X86_64*0), 14, LOCAL_VARS_SIZE, \
- src, sstride, dst, dstride, height, filter
- mova m4, [filterq]
- SETUP_LOCAL_VARS
-.loop:
- prefetcht0 [srcq + 2 * sstrideq -3]
-
- movh m0, [srcq - 3]
- movh m4, [srcq + 5]
- movh m6, [srcq + 13]
- punpcklqdq m0, m4
- mova m7, m0
- punpckhbw m0, m0
- mova m1, m0
- punpcklqdq m4, m6
- mova m3, m0
- punpcklbw m7, m7
-
- palignr m3, m7, 13
- mova m2, m0
- pmaddubsw m3, k6k7
- palignr m0, m7, 1
- pmaddubsw m0, k0k1
- palignr m1, m7, 5
- pmaddubsw m1, k2k3
- palignr m2, m7, 9
- pmaddubsw m2, k4k5
- paddsw m1, m3
- mova m3, m4
- punpckhbw m4, m4
- mova m5, m4
- punpcklbw m3, m3
- mova m7, m4
- palignr m5, m3, 5
- mova m6, m4
- palignr m4, m3, 1
- pmaddubsw m4, k0k1
- pmaddubsw m5, k2k3
- palignr m6, m3, 9
- pmaddubsw m6, k4k5
- palignr m7, m3, 13
- pmaddubsw m7, k6k7
- paddsw m0, m2
- paddsw m0, m1
-%ifidn %1, h8_avg
- mova m1, [dstq]
-%endif
- paddsw m4, m6
- paddsw m5, m7
- paddsw m4, m5
- paddsw m0, krd
- paddsw m4, krd
- psraw m0, 7
- psraw m4, 7
- packuswb m0, m4
-%ifidn %1, h8_avg
- pavgb m0, m1
-%endif
- lea srcq, [srcq + sstrideq]
- mova [dstq], m0
- lea dstq, [dstq + dstrideq]
- dec heightd
- jnz .loop
- RET
-%endm
-
-INIT_XMM ssse3
-SUBPIX_HFILTER16 h8
-SUBPIX_HFILTER16 h8_avg
-SUBPIX_HFILTER8 h8
-SUBPIX_HFILTER8 h8_avg
-SUBPIX_HFILTER4 h8
-SUBPIX_HFILTER4 h8_avg
-
-;-------------------------------------------------------------------------------
-%macro SUBPIX_VFILTER 2
-cglobal filter_block1d%2_%1, 6, 6+(ARCH_X86_64*3), 14, LOCAL_VARS_SIZE, \
- src, sstride, dst, dstride, height, filter
- mova m4, [filterq]
- SETUP_LOCAL_VARS
-%if ARCH_X86_64
- %define src1q r7
- %define sstride6q r8
- %define dst_stride dstrideq
-%else
- %define src1q filterq
- %define sstride6q dstrideq
- %define dst_stride dstridemp
-%endif
- mov src1q, srcq
- add src1q, sstrideq
- lea sstride6q, [sstrideq + sstrideq * 4]
- add sstride6q, sstrideq ;pitch * 6
-
-%ifidn %2, 8
- %define movx movh
-%else
- %define movx movd
-%endif
-.loop:
- movx m0, [srcq ] ;A
- movx m1, [srcq + sstrideq ] ;B
- punpcklbw m0, m1 ;A B
- movx m2, [srcq + sstrideq * 2 ] ;C
- pmaddubsw m0, k0k1
- mova m6, m2
- movx m3, [src1q + sstrideq * 2] ;D
- punpcklbw m2, m3 ;C D
- pmaddubsw m2, k2k3
- movx m4, [srcq + sstrideq * 4 ] ;E
- mova m7, m4
- movx m5, [src1q + sstrideq * 4] ;F
- punpcklbw m4, m5 ;E F
- pmaddubsw m4, k4k5
- punpcklbw m1, m6 ;A B next iter
- movx m6, [srcq + sstride6q ] ;G
- punpcklbw m5, m6 ;E F next iter
- punpcklbw m3, m7 ;C D next iter
- pmaddubsw m5, k4k5
- movx m7, [src1q + sstride6q ] ;H
- punpcklbw m6, m7 ;G H
- pmaddubsw m6, k6k7
- pmaddubsw m3, k2k3
- pmaddubsw m1, k0k1
- paddsw m0, m4
- paddsw m2, m6
- movx m6, [srcq + sstrideq * 8 ] ;H next iter
- punpcklbw m7, m6
- pmaddubsw m7, k6k7
- paddsw m0, m2
- paddsw m0, krd
- psraw m0, 7
- paddsw m1, m5
- packuswb m0, m0
-
- paddsw m3, m7
- paddsw m1, m3
- paddsw m1, krd
- psraw m1, 7
- lea srcq, [srcq + sstrideq * 2 ]
- lea src1q, [src1q + sstrideq * 2]
- packuswb m1, m1
-
-%ifidn %1, v8_avg
- movx m2, [dstq]
- pavgb m0, m2
-%endif
- movx [dstq], m0
- add dstq, dst_stride
-%ifidn %1, v8_avg
- movx m3, [dstq]
- pavgb m1, m3
-%endif
- movx [dstq], m1
- add dstq, dst_stride
- sub heightd, 2
- cmp heightd, 1
- jg .loop
-
- cmp heightd, 0
- je .done
-
- movx m0, [srcq ] ;A
- movx m1, [srcq + sstrideq ] ;B
- movx m6, [srcq + sstride6q ] ;G
- punpcklbw m0, m1 ;A B
- movx m7, [src1q + sstride6q ] ;H
- pmaddubsw m0, k0k1
- movx m2, [srcq + sstrideq * 2 ] ;C
- punpcklbw m6, m7 ;G H
- movx m3, [src1q + sstrideq * 2] ;D
- pmaddubsw m6, k6k7
- movx m4, [srcq + sstrideq * 4 ] ;E
- punpcklbw m2, m3 ;C D
- movx m5, [src1q + sstrideq * 4] ;F
- punpcklbw m4, m5 ;E F
- pmaddubsw m2, k2k3
- pmaddubsw m4, k4k5
- paddsw m2, m6
- paddsw m0, m4
- paddsw m0, m2
- paddsw m0, krd
- psraw m0, 7
- packuswb m0, m0
-%ifidn %1, v8_avg
- movx m1, [dstq]
- pavgb m0, m1
-%endif
- movx [dstq], m0
-.done:
- RET
-%endm
-
-;-------------------------------------------------------------------------------
-%macro SUBPIX_VFILTER16 1
-cglobal filter_block1d16_%1, 6, 6+(ARCH_X86_64*3), 14, LOCAL_VARS_SIZE, \
- src, sstride, dst, dstride, height, filter
- mova m4, [filterq]
- SETUP_LOCAL_VARS
-%if ARCH_X86_64
- %define src1q r7
- %define sstride6q r8
- %define dst_stride dstrideq
-%else
- %define src1q filterq
- %define sstride6q dstrideq
- %define dst_stride dstridemp
-%endif
- mov src1q, srcq
- add src1q, sstrideq
- lea sstride6q, [sstrideq + sstrideq * 4]
- add sstride6q, sstrideq ;pitch * 6
-
-.loop:
- movh m0, [srcq ] ;A
- movh m1, [srcq + sstrideq ] ;B
- movh m2, [srcq + sstrideq * 2 ] ;C
- movh m3, [src1q + sstrideq * 2] ;D
- movh m4, [srcq + sstrideq * 4 ] ;E
- movh m5, [src1q + sstrideq * 4] ;F
-
- punpcklbw m0, m1 ;A B
- movh m6, [srcq + sstride6q] ;G
- punpcklbw m2, m3 ;C D
- movh m7, [src1q + sstride6q] ;H
- punpcklbw m4, m5 ;E F
- pmaddubsw m0, k0k1
- movh m3, [srcq + 8] ;A
- pmaddubsw m2, k2k3
- punpcklbw m6, m7 ;G H
- movh m5, [srcq + sstrideq + 8] ;B
- pmaddubsw m4, k4k5
- punpcklbw m3, m5 ;A B
- movh m7, [srcq + sstrideq * 2 + 8] ;C
- pmaddubsw m6, k6k7
- movh m5, [src1q + sstrideq * 2 + 8] ;D
- punpcklbw m7, m5 ;C D
- paddsw m2, m6
- pmaddubsw m3, k0k1
- movh m1, [srcq + sstrideq * 4 + 8] ;E
- paddsw m0, m4
- pmaddubsw m7, k2k3
- movh m6, [src1q + sstrideq * 4 + 8] ;F
- punpcklbw m1, m6 ;E F
- paddsw m0, m2
- paddsw m0, krd
- movh m2, [srcq + sstride6q + 8] ;G
- pmaddubsw m1, k4k5
- movh m5, [src1q + sstride6q + 8] ;H
- psraw m0, 7
- punpcklbw m2, m5 ;G H
- pmaddubsw m2, k6k7
-%ifidn %1, v8_avg
- mova m4, [dstq]
-%endif
- movh [dstq], m0
- paddsw m7, m2
- paddsw m3, m1
- paddsw m3, m7
- paddsw m3, krd
- psraw m3, 7
- packuswb m0, m3
-
- add srcq, sstrideq
- add src1q, sstrideq
-%ifidn %1, v8_avg
- pavgb m0, m4
-%endif
- mova [dstq], m0
- add dstq, dst_stride
- dec heightd
- jnz .loop
- RET
-%endm
-
-INIT_XMM ssse3
-SUBPIX_VFILTER16 v8
-SUBPIX_VFILTER16 v8_avg
-SUBPIX_VFILTER v8, 8
-SUBPIX_VFILTER v8_avg, 8
-SUBPIX_VFILTER v8, 4
-SUBPIX_VFILTER v8_avg, 4
diff --git a/thirdparty/libvpx/vpx_dsp/x86/vpx_subpixel_bilinear_sse2.asm b/thirdparty/libvpx/vpx_dsp/x86/vpx_subpixel_bilinear_sse2.asm
deleted file mode 100644
index a378dd0402..0000000000
--- a/thirdparty/libvpx/vpx_dsp/x86/vpx_subpixel_bilinear_sse2.asm
+++ /dev/null
@@ -1,448 +0,0 @@
-;
-; Copyright (c) 2014 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-%include "vpx_ports/x86_abi_support.asm"
-
-%macro GET_PARAM_4 0
- mov rdx, arg(5) ;filter ptr
- mov rsi, arg(0) ;src_ptr
- mov rdi, arg(2) ;output_ptr
- mov rcx, 0x0400040
-
- movdqa xmm3, [rdx] ;load filters
- pshuflw xmm4, xmm3, 11111111b ;k3
- psrldq xmm3, 8
- pshuflw xmm3, xmm3, 0b ;k4
- punpcklqdq xmm4, xmm3 ;k3k4
-
- movq xmm3, rcx ;rounding
- pshufd xmm3, xmm3, 0
-
- pxor xmm2, xmm2
-
- movsxd rax, DWORD PTR arg(1) ;pixels_per_line
- movsxd rdx, DWORD PTR arg(3) ;out_pitch
- movsxd rcx, DWORD PTR arg(4) ;output_height
-%endm
-
-%macro APPLY_FILTER_4 1
-
- punpckldq xmm0, xmm1 ;two row in one register
- punpcklbw xmm0, xmm2 ;unpack to word
- pmullw xmm0, xmm4 ;multiply the filter factors
-
- movdqa xmm1, xmm0
- psrldq xmm1, 8
- paddsw xmm0, xmm1
-
- paddsw xmm0, xmm3 ;rounding
- psraw xmm0, 7 ;shift
- packuswb xmm0, xmm0 ;pack to byte
-
-%if %1
- movd xmm1, [rdi]
- pavgb xmm0, xmm1
-%endif
-
- movd [rdi], xmm0
- lea rsi, [rsi + rax]
- lea rdi, [rdi + rdx]
- dec rcx
-%endm
-
-%macro GET_PARAM 0
- mov rdx, arg(5) ;filter ptr
- mov rsi, arg(0) ;src_ptr
- mov rdi, arg(2) ;output_ptr
- mov rcx, 0x0400040
-
- movdqa xmm7, [rdx] ;load filters
-
- pshuflw xmm6, xmm7, 11111111b ;k3
- pshufhw xmm7, xmm7, 0b ;k4
- punpcklwd xmm6, xmm6
- punpckhwd xmm7, xmm7
-
- movq xmm4, rcx ;rounding
- pshufd xmm4, xmm4, 0
-
- pxor xmm5, xmm5
-
- movsxd rax, DWORD PTR arg(1) ;pixels_per_line
- movsxd rdx, DWORD PTR arg(3) ;out_pitch
- movsxd rcx, DWORD PTR arg(4) ;output_height
-%endm
-
-%macro APPLY_FILTER_8 1
- punpcklbw xmm0, xmm5
- punpcklbw xmm1, xmm5
-
- pmullw xmm0, xmm6
- pmullw xmm1, xmm7
- paddsw xmm0, xmm1
- paddsw xmm0, xmm4 ;rounding
- psraw xmm0, 7 ;shift
- packuswb xmm0, xmm0 ;pack back to byte
-%if %1
- movq xmm1, [rdi]
- pavgb xmm0, xmm1
-%endif
- movq [rdi], xmm0 ;store the result
-
- lea rsi, [rsi + rax]
- lea rdi, [rdi + rdx]
- dec rcx
-%endm
-
-%macro APPLY_FILTER_16 1
- punpcklbw xmm0, xmm5
- punpcklbw xmm1, xmm5
- punpckhbw xmm2, xmm5
- punpckhbw xmm3, xmm5
-
- pmullw xmm0, xmm6
- pmullw xmm1, xmm7
- pmullw xmm2, xmm6
- pmullw xmm3, xmm7
-
- paddsw xmm0, xmm1
- paddsw xmm2, xmm3
-
- paddsw xmm0, xmm4 ;rounding
- paddsw xmm2, xmm4
- psraw xmm0, 7 ;shift
- psraw xmm2, 7
- packuswb xmm0, xmm2 ;pack back to byte
-%if %1
- movdqu xmm1, [rdi]
- pavgb xmm0, xmm1
-%endif
- movdqu [rdi], xmm0 ;store the result
-
- lea rsi, [rsi + rax]
- lea rdi, [rdi + rdx]
- dec rcx
-%endm
-
-global sym(vpx_filter_block1d4_v2_sse2) PRIVATE
-sym(vpx_filter_block1d4_v2_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 6
- push rsi
- push rdi
- ; end prolog
-
- GET_PARAM_4
-.loop:
- movd xmm0, [rsi] ;load src
- movd xmm1, [rsi + rax]
-
- APPLY_FILTER_4 0
- jnz .loop
-
- ; begin epilog
- pop rdi
- pop rsi
- UNSHADOW_ARGS
- pop rbp
- ret
-
-global sym(vpx_filter_block1d8_v2_sse2) PRIVATE
-sym(vpx_filter_block1d8_v2_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 6
- SAVE_XMM 7
- push rsi
- push rdi
- ; end prolog
-
- GET_PARAM
-.loop:
- movq xmm0, [rsi] ;0
- movq xmm1, [rsi + rax] ;1
-
- APPLY_FILTER_8 0
- jnz .loop
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-global sym(vpx_filter_block1d16_v2_sse2) PRIVATE
-sym(vpx_filter_block1d16_v2_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 6
- SAVE_XMM 7
- push rsi
- push rdi
- ; end prolog
-
- GET_PARAM
-.loop:
- movdqu xmm0, [rsi] ;0
- movdqu xmm1, [rsi + rax] ;1
- movdqa xmm2, xmm0
- movdqa xmm3, xmm1
-
- APPLY_FILTER_16 0
- jnz .loop
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-global sym(vpx_filter_block1d4_v2_avg_sse2) PRIVATE
-sym(vpx_filter_block1d4_v2_avg_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 6
- push rsi
- push rdi
- ; end prolog
-
- GET_PARAM_4
-.loop:
- movd xmm0, [rsi] ;load src
- movd xmm1, [rsi + rax]
-
- APPLY_FILTER_4 1
- jnz .loop
-
- ; begin epilog
- pop rdi
- pop rsi
- UNSHADOW_ARGS
- pop rbp
- ret
-
-global sym(vpx_filter_block1d8_v2_avg_sse2) PRIVATE
-sym(vpx_filter_block1d8_v2_avg_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 6
- SAVE_XMM 7
- push rsi
- push rdi
- ; end prolog
-
- GET_PARAM
-.loop:
- movq xmm0, [rsi] ;0
- movq xmm1, [rsi + rax] ;1
-
- APPLY_FILTER_8 1
- jnz .loop
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-global sym(vpx_filter_block1d16_v2_avg_sse2) PRIVATE
-sym(vpx_filter_block1d16_v2_avg_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 6
- SAVE_XMM 7
- push rsi
- push rdi
- ; end prolog
-
- GET_PARAM
-.loop:
- movdqu xmm0, [rsi] ;0
- movdqu xmm1, [rsi + rax] ;1
- movdqa xmm2, xmm0
- movdqa xmm3, xmm1
-
- APPLY_FILTER_16 1
- jnz .loop
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-global sym(vpx_filter_block1d4_h2_sse2) PRIVATE
-sym(vpx_filter_block1d4_h2_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 6
- push rsi
- push rdi
- ; end prolog
-
- GET_PARAM_4
-.loop:
- movdqu xmm0, [rsi] ;load src
- movdqa xmm1, xmm0
- psrldq xmm1, 1
-
- APPLY_FILTER_4 0
- jnz .loop
-
- ; begin epilog
- pop rdi
- pop rsi
- UNSHADOW_ARGS
- pop rbp
- ret
-
-global sym(vpx_filter_block1d8_h2_sse2) PRIVATE
-sym(vpx_filter_block1d8_h2_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 6
- SAVE_XMM 7
- push rsi
- push rdi
- ; end prolog
-
- GET_PARAM
-.loop:
- movdqu xmm0, [rsi] ;load src
- movdqa xmm1, xmm0
- psrldq xmm1, 1
-
- APPLY_FILTER_8 0
- jnz .loop
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-global sym(vpx_filter_block1d16_h2_sse2) PRIVATE
-sym(vpx_filter_block1d16_h2_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 6
- SAVE_XMM 7
- push rsi
- push rdi
- ; end prolog
-
- GET_PARAM
-.loop:
- movdqu xmm0, [rsi] ;load src
- movdqu xmm1, [rsi + 1]
- movdqa xmm2, xmm0
- movdqa xmm3, xmm1
-
- APPLY_FILTER_16 0
- jnz .loop
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-global sym(vpx_filter_block1d4_h2_avg_sse2) PRIVATE
-sym(vpx_filter_block1d4_h2_avg_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 6
- push rsi
- push rdi
- ; end prolog
-
- GET_PARAM_4
-.loop:
- movdqu xmm0, [rsi] ;load src
- movdqa xmm1, xmm0
- psrldq xmm1, 1
-
- APPLY_FILTER_4 1
- jnz .loop
-
- ; begin epilog
- pop rdi
- pop rsi
- UNSHADOW_ARGS
- pop rbp
- ret
-
-global sym(vpx_filter_block1d8_h2_avg_sse2) PRIVATE
-sym(vpx_filter_block1d8_h2_avg_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 6
- SAVE_XMM 7
- push rsi
- push rdi
- ; end prolog
-
- GET_PARAM
-.loop:
- movdqu xmm0, [rsi] ;load src
- movdqa xmm1, xmm0
- psrldq xmm1, 1
-
- APPLY_FILTER_8 1
- jnz .loop
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-global sym(vpx_filter_block1d16_h2_avg_sse2) PRIVATE
-sym(vpx_filter_block1d16_h2_avg_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 6
- SAVE_XMM 7
- push rsi
- push rdi
- ; end prolog
-
- GET_PARAM
-.loop:
- movdqu xmm0, [rsi] ;load src
- movdqu xmm1, [rsi + 1]
- movdqa xmm2, xmm0
- movdqa xmm3, xmm1
-
- APPLY_FILTER_16 1
- jnz .loop
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
diff --git a/thirdparty/libvpx/vpx_dsp/x86/vpx_subpixel_bilinear_ssse3.asm b/thirdparty/libvpx/vpx_dsp/x86/vpx_subpixel_bilinear_ssse3.asm
deleted file mode 100644
index 3c8cfd2253..0000000000
--- a/thirdparty/libvpx/vpx_dsp/x86/vpx_subpixel_bilinear_ssse3.asm
+++ /dev/null
@@ -1,422 +0,0 @@
-;
-; Copyright (c) 2014 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-%include "vpx_ports/x86_abi_support.asm"
-
-%macro GET_PARAM_4 0
- mov rdx, arg(5) ;filter ptr
- mov rsi, arg(0) ;src_ptr
- mov rdi, arg(2) ;output_ptr
- mov rcx, 0x0400040
-
- movdqa xmm3, [rdx] ;load filters
- psrldq xmm3, 6
- packsswb xmm3, xmm3
- pshuflw xmm3, xmm3, 0b ;k3_k4
-
- movq xmm2, rcx ;rounding
- pshufd xmm2, xmm2, 0
-
- movsxd rax, DWORD PTR arg(1) ;pixels_per_line
- movsxd rdx, DWORD PTR arg(3) ;out_pitch
- movsxd rcx, DWORD PTR arg(4) ;output_height
-%endm
-
-%macro APPLY_FILTER_4 1
- punpcklbw xmm0, xmm1
- pmaddubsw xmm0, xmm3
-
- paddsw xmm0, xmm2 ;rounding
- psraw xmm0, 7 ;shift
- packuswb xmm0, xmm0 ;pack to byte
-
-%if %1
- movd xmm1, [rdi]
- pavgb xmm0, xmm1
-%endif
- movd [rdi], xmm0
- lea rsi, [rsi + rax]
- lea rdi, [rdi + rdx]
- dec rcx
-%endm
-
-%macro GET_PARAM 0
- mov rdx, arg(5) ;filter ptr
- mov rsi, arg(0) ;src_ptr
- mov rdi, arg(2) ;output_ptr
- mov rcx, 0x0400040
-
- movdqa xmm7, [rdx] ;load filters
- psrldq xmm7, 6
- packsswb xmm7, xmm7
- pshuflw xmm7, xmm7, 0b ;k3_k4
- punpcklwd xmm7, xmm7
-
- movq xmm6, rcx ;rounding
- pshufd xmm6, xmm6, 0
-
- movsxd rax, DWORD PTR arg(1) ;pixels_per_line
- movsxd rdx, DWORD PTR arg(3) ;out_pitch
- movsxd rcx, DWORD PTR arg(4) ;output_height
-%endm
-
-%macro APPLY_FILTER_8 1
- punpcklbw xmm0, xmm1
- pmaddubsw xmm0, xmm7
-
- paddsw xmm0, xmm6 ;rounding
- psraw xmm0, 7 ;shift
- packuswb xmm0, xmm0 ;pack back to byte
-
-%if %1
- movq xmm1, [rdi]
- pavgb xmm0, xmm1
-%endif
- movq [rdi], xmm0 ;store the result
-
- lea rsi, [rsi + rax]
- lea rdi, [rdi + rdx]
- dec rcx
-%endm
-
-%macro APPLY_FILTER_16 1
- punpcklbw xmm0, xmm1
- punpckhbw xmm2, xmm1
- pmaddubsw xmm0, xmm7
- pmaddubsw xmm2, xmm7
-
- paddsw xmm0, xmm6 ;rounding
- paddsw xmm2, xmm6
- psraw xmm0, 7 ;shift
- psraw xmm2, 7
- packuswb xmm0, xmm2 ;pack back to byte
-
-%if %1
- movdqu xmm1, [rdi]
- pavgb xmm0, xmm1
-%endif
- movdqu [rdi], xmm0 ;store the result
-
- lea rsi, [rsi + rax]
- lea rdi, [rdi + rdx]
- dec rcx
-%endm
-
-global sym(vpx_filter_block1d4_v2_ssse3) PRIVATE
-sym(vpx_filter_block1d4_v2_ssse3):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 6
- push rsi
- push rdi
- ; end prolog
-
- GET_PARAM_4
-.loop:
- movd xmm0, [rsi] ;load src
- movd xmm1, [rsi + rax]
-
- APPLY_FILTER_4 0
- jnz .loop
-
- ; begin epilog
- pop rdi
- pop rsi
- UNSHADOW_ARGS
- pop rbp
- ret
-
-global sym(vpx_filter_block1d8_v2_ssse3) PRIVATE
-sym(vpx_filter_block1d8_v2_ssse3):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 6
- SAVE_XMM 7
- push rsi
- push rdi
- ; end prolog
-
- GET_PARAM
-.loop:
- movq xmm0, [rsi] ;0
- movq xmm1, [rsi + rax] ;1
-
- APPLY_FILTER_8 0
- jnz .loop
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-global sym(vpx_filter_block1d16_v2_ssse3) PRIVATE
-sym(vpx_filter_block1d16_v2_ssse3):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 6
- SAVE_XMM 7
- push rsi
- push rdi
- ; end prolog
-
- GET_PARAM
-.loop:
- movdqu xmm0, [rsi] ;0
- movdqu xmm1, [rsi + rax] ;1
- movdqa xmm2, xmm0
-
- APPLY_FILTER_16 0
- jnz .loop
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-global sym(vpx_filter_block1d4_v2_avg_ssse3) PRIVATE
-sym(vpx_filter_block1d4_v2_avg_ssse3):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 6
- push rsi
- push rdi
- ; end prolog
-
- GET_PARAM_4
-.loop:
- movd xmm0, [rsi] ;load src
- movd xmm1, [rsi + rax]
-
- APPLY_FILTER_4 1
- jnz .loop
-
- ; begin epilog
- pop rdi
- pop rsi
- UNSHADOW_ARGS
- pop rbp
- ret
-
-global sym(vpx_filter_block1d8_v2_avg_ssse3) PRIVATE
-sym(vpx_filter_block1d8_v2_avg_ssse3):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 6
- SAVE_XMM 7
- push rsi
- push rdi
- ; end prolog
-
- GET_PARAM
-.loop:
- movq xmm0, [rsi] ;0
- movq xmm1, [rsi + rax] ;1
-
- APPLY_FILTER_8 1
- jnz .loop
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-global sym(vpx_filter_block1d16_v2_avg_ssse3) PRIVATE
-sym(vpx_filter_block1d16_v2_avg_ssse3):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 6
- SAVE_XMM 7
- push rsi
- push rdi
- ; end prolog
-
- GET_PARAM
-.loop:
- movdqu xmm0, [rsi] ;0
- movdqu xmm1, [rsi + rax] ;1
- movdqa xmm2, xmm0
-
- APPLY_FILTER_16 1
- jnz .loop
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-global sym(vpx_filter_block1d4_h2_ssse3) PRIVATE
-sym(vpx_filter_block1d4_h2_ssse3):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 6
- push rsi
- push rdi
- ; end prolog
-
- GET_PARAM_4
-.loop:
- movdqu xmm0, [rsi] ;load src
- movdqa xmm1, xmm0
- psrldq xmm1, 1
-
- APPLY_FILTER_4 0
- jnz .loop
-
- ; begin epilog
- pop rdi
- pop rsi
- UNSHADOW_ARGS
- pop rbp
- ret
-
-global sym(vpx_filter_block1d8_h2_ssse3) PRIVATE
-sym(vpx_filter_block1d8_h2_ssse3):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 6
- SAVE_XMM 7
- push rsi
- push rdi
- ; end prolog
-
- GET_PARAM
-.loop:
- movdqu xmm0, [rsi] ;load src
- movdqa xmm1, xmm0
- psrldq xmm1, 1
-
- APPLY_FILTER_8 0
- jnz .loop
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-global sym(vpx_filter_block1d16_h2_ssse3) PRIVATE
-sym(vpx_filter_block1d16_h2_ssse3):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 6
- SAVE_XMM 7
- push rsi
- push rdi
- ; end prolog
-
- GET_PARAM
-.loop:
- movdqu xmm0, [rsi] ;load src
- movdqu xmm1, [rsi + 1]
- movdqa xmm2, xmm0
-
- APPLY_FILTER_16 0
- jnz .loop
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-global sym(vpx_filter_block1d4_h2_avg_ssse3) PRIVATE
-sym(vpx_filter_block1d4_h2_avg_ssse3):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 6
- push rsi
- push rdi
- ; end prolog
-
- GET_PARAM_4
-.loop:
- movdqu xmm0, [rsi] ;load src
- movdqa xmm1, xmm0
- psrldq xmm1, 1
-
- APPLY_FILTER_4 1
- jnz .loop
-
- ; begin epilog
- pop rdi
- pop rsi
- UNSHADOW_ARGS
- pop rbp
- ret
-
-global sym(vpx_filter_block1d8_h2_avg_ssse3) PRIVATE
-sym(vpx_filter_block1d8_h2_avg_ssse3):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 6
- SAVE_XMM 7
- push rsi
- push rdi
- ; end prolog
-
- GET_PARAM
-.loop:
- movdqu xmm0, [rsi] ;load src
- movdqa xmm1, xmm0
- psrldq xmm1, 1
-
- APPLY_FILTER_8 1
- jnz .loop
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-global sym(vpx_filter_block1d16_h2_avg_ssse3) PRIVATE
-sym(vpx_filter_block1d16_h2_avg_ssse3):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 6
- SAVE_XMM 7
- push rsi
- push rdi
- ; end prolog
-
- GET_PARAM
-.loop:
- movdqu xmm0, [rsi] ;load src
- movdqu xmm1, [rsi + 1]
- movdqa xmm2, xmm0
-
- APPLY_FILTER_16 1
- jnz .loop
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret